# Playground Season 3 Episode 9
## #35 Solution

This notebook outlines the relevant parts of the solution for Season 3 Episode 9's challenge of predicting the strength of concrete.


This notebook does not spend much time on Exploratory Data Analysis; it focuses on the technical side of finding the solution using Feature Engineering and machine learning models.

### Importing Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,  StratifiedKFold, KFold,GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import datasets, ensemble
import catboost as cat
import category_encoders as ce
import optuna
from sklearn.linear_model import LassoCV, Lasso
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
import warnings 
warnings.filterwarnings("ignore")

### Importing Data

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e9/train.csv',
        '/kaggle/input/playground-series-s3e9/test.csv',
    )
)

#### Dropping ID

In [None]:
# Remove the 'id' column from the train and test frames; each call returns a
# new frame that replaces the old one.
train_df = train_df.drop(columns=['id'])
test_df = test_df.drop(columns=['id'])

### Feature Engineering

**Time** - *lnAgeInDays* - After testing, the best way to represent time was its natural log, which helps rein in the outliers.

**Chemistry** - *CementToWaterRatio* - I remember my dad mixing concrete, and the amount of water was an important factor. After testing a number of candidate variables, this one also helped reduce the variance in RMSE.

**Charun Umesh** utilized *FlyAshComponent* as a dummy which worked well for them. They noticed "FlyAshComponent feature has more than 73% of 0 values. it will be good to categorize that feature to 0 and 1s"


In [None]:
def _add_engineered_features(frame):
    """Add the engineered columns to ``frame`` in place and return it.

    * ``lnAgeInDays``: natural log of ``AgeInDays``.
    * ``CementToWaterRatio``: ``CementComponent`` divided by ``WaterComponent``.
    * ``FlyAshComponent``: overwritten with an int64 flag that is 0 where the
      existing value equals 0.0 and 1 everywhere else.
    """
    frame['lnAgeInDays'] = np.log(frame['AgeInDays'])
    frame['CementToWaterRatio'] = frame['CementComponent'] / frame['WaterComponent']
    frame['FlyAshComponent'] = np.where(frame['FlyAshComponent'] != 0.0, 1, 0).astype('int64')
    return frame


# The same transformations are applied to train and test.
train_df = _add_engineered_features(train_df)
test_df = _add_engineered_features(test_df)

#### Capping Outliers

In [None]:
def iqr_capping(df, cols, factor):
    """Cap the values of the given columns at their IQR-based bounds.

    For each column in ``cols`` the bounds are
    ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1/Q3 are the 25th
    and 75th percentiles of that column in ``df``. Values above the upper
    bound are replaced by the upper bound, values below the lower bound by the
    lower bound. The column name and both bounds are printed for every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to cap. It is modified in place.
    cols : iterable of column labels
        Columns to cap.
    factor : float
        Multiplier applied to the IQR when computing the bounds.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for col in cols:
        print(f"column name is : {col}")
        # Quantiles do not depend on row order, so they are read straight from
        # the column; no per-column copy of the frame or sort is needed.
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - (factor * IQR)
        upper_bound = Q3 + (factor * IQR)
        print(f"lower_bound is : {lower_bound}")
        print(f"upper_bound is : {upper_bound}")
        df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])
        df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    return df

def outliers(data):
    """Return a copy of ``data`` whose int64/float64 columns are IQR-capped.

    The input frame is left untouched; the capping uses a factor of 1.5 and is
    delegated to ``iqr_capping``.
    """
    numeric_cols = data.select_dtypes(include=[np.int64, np.float64]).columns.tolist()
    return iqr_capping(data.copy(), numeric_cols, 1.5)

In [None]:
# Cap each frame independently: the bounds are computed from the frame that is
# passed in, so train and test each use their own quartiles.
# NOTE(review): train_df still holds the 'Strength' target at this point, so if
# it is an int64/float64 column it is capped too — confirm that is intended.
train_df, test_df = outliers(train_df), outliers(test_df)

### Model Creation

In [None]:
# Separate the 'Strength' target from the predictor columns.
y = train_df['Strength']
X = train_df.drop(columns='Strength')

#### Define Hyperparameters for the models

We will be using:
* XGB
* LGBM
* CatBoost

In [None]:
# Hyperparameter search for XGBoost and LightGBM with Optuna.
#
# Both objectives score a candidate on the same seeded 70/30 hold-out split of
# (X, y). The split never changes between trials, so it is built once here
# rather than once per trial.
x_tune_train, x_tune_valid, y_tune_train, y_tune_valid = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Set the log level before the studies are created so it applies to them too.
optuna.logging.set_verbosity(optuna.logging.WARNING)


def _holdout_rmse(model):
    """Fit ``model`` on the tuning-train split and return its hold-out RMSE."""
    model.fit(x_tune_train, y_tune_train)
    val_preds = model.predict(x_tune_valid)
    # RMSE is taken as sqrt(MSE) instead of `squared=False`, which newer
    # scikit-learn releases deprecate.
    return float(np.sqrt(mean_squared_error(y_tune_valid, val_preds)))


#XGB (Extreme Gradient Boost)

def objective_xgb(trial):
    """Optuna objective: hold-out RMSE of an XGBRegressor built from ``trial``."""
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True)
    }
    return _holdout_rmse(XGBRegressor(**params))


# A seeded sampler makes the 50-trial search repeatable from a fresh kernel.
study_xgb = optuna.create_study(
    direction='minimize', sampler=optuna.samplers.TPESampler(seed=42)
)
study_xgb.optimize(objective_xgb, n_trials=50)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

#LGBM (Light Gradient Boost Machine)

def objective_lgbm(trial):
    """Optuna objective: hold-out RMSE of an LGBMRegressor built from ``trial``."""
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        # NOTE(review): LightGBM documents row subsampling as active only when
        # `subsample_freq` > 0, which is not set here — confirm whether tuning
        # `subsample` has any effect.
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True)
    }
    # Logging is silenced through the estimator's `verbosity` parameter rather
    # than a `verbose` argument to fit(), which newer LightGBM does not accept.
    return _holdout_rmse(LGBMRegressor(verbosity=-1, **params))


study_lgbm = optuna.create_study(
    direction='minimize', sampler=optuna.samplers.TPESampler(seed=42)
)
study_lgbm.optimize(objective_lgbm, n_trials=50)

In [None]:
# Hand-set hyperparameters for the three boosters.
# NOTE(review): the cross-validation loop builds XGBoost and LightGBM from the
# Optuna studies' best_params; only cbr_params from this cell is used there.
xgb_params = dict(
    learning_rate=0.0225,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    max_depth=3,
    reg_alpha=0.8435,
    reg_lambda=0.823545,
    n_estimators=1000,
)

lgb_params = dict(
    learning_rate=0.05,
    max_depth=10,
    subsample_for_bin=20000,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='regression',
)

cbr_params = dict(
    iterations=2000,
    max_depth=7,  # alternatives noted by the author: 4, 10
    learning_rate=0.0036012104807528686,  # alternatives noted by the author: 0.03725416892898261, 0.01
    verbose=100,
    subsample=0.6215023014443006,
    l2_leaf_reg=0.7998684766493955,
)

### K-fold cross validation

In [None]:
# 10-fold cross-validation of XGBoost, CatBoost and LightGBM.
# Each fold's models also predict the test set; those predictions are averaged
# over the folds into pred_xgb / pred_cat / pred_lgb.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

pred_xgb = np.zeros(test_df.shape[0])
pred_cat = np.zeros(test_df.shape[0])
pred_lgb = np.zeros(test_df.shape[0])

# Per-fold validation RMSE of each model, and the mean of the three per fold.
rmse_xgb = []
rmse_cat = []
rmse_lgb = []
rmse = []


def _fold_rmse(y_true, y_pred):
    """Root mean squared error, computed as sqrt(MSE)."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# NOTE(review): the validation fold drives early stopping and is also the fold
# the RMSE is reported on, so the printed scores may be optimistic.
# NOTE(review): passing `early_stopping_rounds`/`verbose` to fit() depends on
# the installed XGBoost/LightGBM versions — confirm against the environment.
for fold, (train_index, valid_index) in enumerate(kfold.split(X, y), start=1):
    # KFold yields positional indices, so every subset is taken with .iloc;
    # label-based .loc would only agree on an untouched RangeIndex.
    x_train, y_train = X.iloc[train_index], y.iloc[train_index]
    x_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index]

    # XGBoost regressor
    model_xgb = XGBRegressor(**study_xgb.best_params)
    model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], early_stopping_rounds=100, verbose=False)
    pred_xgb += model_xgb.predict(test_df) / kfold.n_splits
    rmse_xgb.append(_fold_rmse(y_valid, model_xgb.predict(x_valid)))

    # CatBoost regressor
    model_cat = cat.CatBoostRegressor(**cbr_params)
    model_cat.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], early_stopping_rounds=100, verbose=False)
    pred_cat += model_cat.predict(test_df) / kfold.n_splits
    rmse_cat.append(_fold_rmse(y_valid, model_cat.predict(x_valid)))

    # LightGBM regressor
    model_lgb = LGBMRegressor(**study_lgbm.best_params)
    model_lgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], early_stopping_rounds=100, verbose=False)
    pred_lgb += model_lgb.predict(test_df) / kfold.n_splits
    rmse_lgb.append(_fold_rmse(y_valid, model_lgb.predict(x_valid)))

    # Unweighted mean of the three models' RMSE for this fold.
    rmse_val = (rmse_xgb[-1] + rmse_cat[-1] + rmse_lgb[-1]) / 3
    rmse.append(rmse_val)
    print(f"fold: {fold} rmse xgb: {rmse_xgb[-1]} | rmse cat: {rmse_cat[-1]} | rmse LGBM: {rmse_lgb[-1]} final rmse is: {rmse[-1]}")

print(np.mean(rmse))

### Submission

When evaluating against the target metric (RMSE), XGB consistently had a higher variance than the other two models combined, so it is removed from the final answer.

When testing, CatBoost outperformed LGBM most of the time, and a plain mean of the models was not the best option in this case. Instead, the final blend uses a 40/60 split (LGBM/CatBoost), leaning more towards CatBoost.

In [None]:
# Load the sample submission and attach each model's fold-averaged test predictions.
submission = pd.read_csv('/kaggle/input/playground-series-s3e9/sample_submission.csv')
submission = submission.assign(XGB=pred_xgb, CAT=pred_cat, LGB=pred_lgb)

# Final prediction: 40% LightGBM + 60% CatBoost (the XGB column is not blended in).
submission['Strength'] = 0.4 * submission['LGB'] + 0.6 * submission['CAT']

# Keep only the two columns of the final answer and display them.
# NOTE(review): nothing in this cell writes the frame to disk — confirm whether
# a submission CSV is produced elsewhere.
final_submission = submission.reindex(columns=['id', 'Strength'])
final_submission