In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
exp_data = pd.read_csv('../Data/CleanedData.csv')
inter_data = pd.read_csv('../Data/InterpolatedData.csv')

print(exp_data.columns)
inter_data.head(5)

Index(['helical_angle', 'helical_layer_count', 'hoop_layer_count',
       'bust_pressure', 'tensile_str', 'e1_gpa', 'youngs_modulus',
       'poision_ratio', 'yeild_strength', 'ult_tensile_strength',
       'liner_thickness', 'diameter', 'doily_layers'],
      dtype='object')


Unnamed: 0,helical_angle,helical_layer_count,hoop_layer_count,tensile_str,e1_gpa,youngs_modulus,poision_ratio,yeild_strength,ult_tensile_strength,liner_thickness,diameter,doily_layers,bust_pressure
0,20.45,17.0,7.0,1045.37,96.0,72.0,0.32,221.16,398.08,6.74,150.0,19.0,659.08
1,16.02,3.0,14.0,1087.71,94.0,135.0,0.32,209.12,317.55,6.13,140.0,18.0,656.02
2,14.47,10.0,14.0,1148.06,110.0,140.0,0.29,211.41,329.37,6.52,130.0,15.0,660.04
3,16.51,5.0,15.0,1098.74,103.0,73.0,0.28,221.32,389.34,7.78,126.0,13.0,662.92
4,18.53,9.0,9.0,1136.65,97.0,56.0,0.3,249.28,351.95,6.45,137.0,12.0,661.6


Making the Training and testing dataset
- Using the ratio - 75:15:10

In [6]:
input_cols = ['bust_pressure',  'tensile_str', 'e1_gpa', 'youngs_modulus', 'poision_ratio', 'yeild_strength', 'diameter', 'ult_tensile_strength']
output_cols = ['helical_angle', 'helical_layer_count', 'hoop_layer_count', 'liner_thickness', 'doily_layers']

In [7]:
exp_data_inp = exp_data[input_cols]
exp_data_out = exp_data[output_cols]
inter_data_inp = inter_data[input_cols]
inter_data_out = inter_data[output_cols]

In [14]:
x_train_exp, x_test_exp, y_train_exp, y_test_exp = train_test_split(exp_data_inp, exp_data_out, test_size=0.25, random_state=42)
x_train_inter, x_test_inter, y_train_inter, y_test_inter = train_test_split(inter_data_inp, inter_data_out, test_size=0.25, random_state=42)

x_val_exp, x_test_exp, y_val_exp, y_test_exp = train_test_split(x_test_exp, y_test_exp, train_size=0.6, random_state=49)
x_val_inter, x_test_inter, y_val_inter, y_test_inter = train_test_split(x_test_inter, y_test_inter, train_size=0.6, random_state=49)

I have 3 set - train, val, test
each for 2 section - exp and inter 
followed by their output

In [15]:
x_train_exp.shape, x_val_exp.shape, x_test_exp.shape, x_train_inter.shape, x_val_inter.shape, x_test_inter.shape

((36, 8), (7, 8), (6, 8), (150, 8), (30, 8), (20, 8))

In [16]:
x_train = pd.concat([x_train_exp, x_train_inter], axis=0)
y_train = pd.concat([y_train_exp, y_train_inter], axis=0)
x_val = pd.concat([x_val_exp, x_val_inter], axis=0)
y_val = pd.concat([y_val_exp, y_val_inter], axis=0)
x_test = pd.concat([x_test_exp, x_test_inter], axis=0)
y_test = pd.concat([y_test_exp, y_test_inter], axis=0)

x_train.shape, x_val.shape, x_test.shape, y_train.shape, y_val.shape, y_test.shape

((186, 8), (37, 8), (26, 8), (186, 5), (37, 5), (26, 5))

## Modeling and evaluation

## Base line models

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from sklearn.multioutput import MultiOutputRegressor

In [23]:
def evaluate(true, pred):
    ret = {}
    ret['MAE'] = mean_absolute_error(true, pred)
    ret['MSE'] = mean_squared_error(true, pred)
    ret['RMSE'] = np.sqrt(ret['MSE']).item()
    ret['R2S'] = r2_score(true, pred)
    return ret

In [34]:
models = {
    'Liner Regression' : LinearRegression,
    'Lasso'            : Lasso,
    'Ridge'            : Ridge,
    'KNN Regressor'    : KNeighborsRegressor,
    'Decision Tree'    : DecisionTreeRegressor,
    'Random Forest'    : RandomForestRegressor,
    'XG Boost'         : XGBRegressor,
    'Ada Boost'        : AdaBoostRegressor
}


params = {
    # 'Liner Regression' : dict(),
    # 'Lasso'            : dict(),
    # 'Ridge'            : dict(),
    # 'KNN Regressor'    : dict(),
    # 'Decision Tree'    : dict(),
    # 'Random Forest'    : dict(),
    # 'XG Boost'         : dict(),
    # 'Ada Boost'        : dict()
    'Liner Regression' : {'fit_intercept' : True},
    'Lasso'            : dict(),
    'Ridge'            : dict(),
    'KNN Regressor'    : {'n_neighbors': 10, 'n_jobs' : -1},
    'Decision Tree'    : {'max_depth' : 7, 'min_samples_split':2 },
    'Random Forest'    : {'n_estimators': 70, 'max_depth' : 5, },
    'XG Boost'         : {'n_estimators': 50, 'max_depth' : 5},
    'Ada Boost'        : {'n_estimators': 50, 'learning_rate':0.5}
}

In [None]:
result_list = []

for model_name, model_class in models.items():
    model_params = params[model_name]
    model = model_class(**model_params)

    # Wrap models that do NOT support multi-output regression natively
    if model_name in ['KNN Regressor', 'Decision Tree', 'Random Forest', 'Ada Boost', 'XG Boost']:
        model = MultiOutputRegressor(model)

    model.fit(x_train, y_train)

    # Predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Evaluation
    train_score = evaluate(y_train, y_train_pred)
    train_score['Type'] = 'Training'
    train_score['Model'] = model_name

    test_score = evaluate(y_test, y_test_pred)
    test_score['Type'] = 'Testing'
    test_score['Model'] = model_name

    print(model_name, test_score['R2S'])

    result_list.extend([train_score, test_score])

pd.DataFrame(result_list).sort_values(['Type', 'R2S'], ascending=[True, False])

Liner Regression 0.12098529124790916
Lasso 0.1183862537365272
Ridge 0.13590847057230102
KNN Regressor 0.18536991543042938
Decision Tree 0.0009028739699877386
Random Forest 0.3206207569046781
XG Boost 0.3469220697879791
Ada Boost 0.2888642695514054


Unnamed: 0,MAE,MSE,RMSE,R2S,Type,Model
13,2.369722,9.886594,3.144295,0.346922,Testing,XG Boost
11,2.465266,10.39155,3.223593,0.320621,Testing,Random Forest
15,2.581759,10.786268,3.284245,0.288864,Testing,Ada Boost
7,2.860531,14.080656,3.75242,0.18537,Testing,KNN Regressor
5,3.079325,16.339779,4.042249,0.135908,Testing,Ridge
1,3.101732,16.692884,4.085693,0.120985,Testing,Liner Regression
3,3.087926,16.341881,4.042509,0.118386,Testing,Lasso
9,2.677366,14.767013,3.842787,0.000903,Testing,Decision Tree
12,0.142019,0.053983,0.232343,0.997442,Training,XG Boost
8,1.536088,5.570194,2.360126,0.675446,Training,Decision Tree


### Scaling 

In [45]:
from sklearn.preprocessing import StandardScaler
scaler_x = StandardScaler()
scaler_y = StandardScaler()

In [46]:
x_train_scaled = scaler_x.fit_transform(x_train)
x_val_scaled = scaler_x.transform(x_val)
x_test_scaled = scaler_x.transform(x_test)


y_train_scaled = scaler_y.fit_transform(y_train)
y_val_scaled = scaler_y.transform(y_val)    
y_test_scaled = scaler_y.transform(y_test)

In [47]:
result_list = []

for model_name, model_class in models.items():
    model_params = params[model_name]
    model = model_class(**model_params)

    # Wrap models that do NOT support multi-output regression natively
    if model_name in ['KNN Regressor', 'Decision Tree', 'Random Forest', 'Ada Boost', 'XG Boost']:
        model = MultiOutputRegressor(model)

    model.fit(x_train, y_train)

    # Predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Evaluation
    train_score = evaluate(y_train, y_train_pred)
    train_score['Type'] = 'Training'
    train_score['Model'] = model_name

    test_score = evaluate(y_test, y_test_pred)
    test_score['Type'] = 'Testing'
    test_score['Model'] = model_name

    print(model_name, test_score['R2S'])

    result_list.extend([train_score, test_score])


pd.DataFrame(result_list).sort_values(['Type', 'R2S'], ascending=[True, False])

Liner Regression 0.12098529124790916
Lasso 0.1183862537365272
Ridge 0.13590847057230102
KNN Regressor 0.18536991543042938
Decision Tree 0.03921689976066869
Random Forest 0.3400305351660119
XG Boost 0.3469220697879791
Ada Boost 0.28347862005896135


Unnamed: 0,MAE,MSE,RMSE,R2S,Type,Model
13,2.369722,9.886594,3.144295,0.346922,Testing,XG Boost
11,2.428046,10.120278,3.181238,0.340031,Testing,Random Forest
15,2.597942,10.698604,3.270872,0.283479,Testing,Ada Boost
7,2.860531,14.080656,3.75242,0.18537,Testing,KNN Regressor
5,3.079325,16.339779,4.042249,0.135908,Testing,Ridge
1,3.101732,16.692884,4.085693,0.120985,Testing,Liner Regression
3,3.087926,16.341881,4.042509,0.118386,Testing,Lasso
9,2.67702,14.679477,3.831381,0.039217,Testing,Decision Tree
12,0.142019,0.053983,0.232343,0.997442,Training,XG Boost
8,1.536088,5.570194,2.360126,0.675446,Training,Decision Tree
