In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
exp_data = pd.read_csv('../Data/CleanedData.csv')
inter_data = pd.read_csv('../Data/InterpolatedData.csv')

print(exp_data.columns)
inter_data.head(5)

Index(['helical_angle', 'helical_layer_count', 'hoop_layer_count',
       'bust_pressure', 'tensile_str', 'e1_gpa', 'youngs_modulus',
       'poision_ratio', 'yeild_strength', 'ult_tensile_strength',
       'liner_thickness', 'diameter', 'doily_layers'],
      dtype='object')


Unnamed: 0,helical_angle,helical_layer_count,hoop_layer_count,tensile_str,e1_gpa,youngs_modulus,poision_ratio,yeild_strength,ult_tensile_strength,liner_thickness,diameter,doily_layers,bust_pressure
0,12.0,14.0,3.0,830.0,37.0,138.0,0.3,231.0,342.0,0.55,147.0,10.0,992.41
1,42.0,16.0,1.0,711.0,45.0,138.0,0.27,234.0,342.0,0.56,149.0,4.0,989.93
2,12.0,6.0,8.0,922.0,104.0,102.0,0.31,246.0,317.0,0.63,152.0,7.0,992.15
3,43.0,7.0,7.0,950.0,47.0,89.0,0.28,238.0,313.0,0.26,164.0,12.0,994.02
4,44.0,12.0,11.0,878.0,97.0,70.0,0.29,242.0,340.0,0.51,126.0,17.0,995.5


Making the Training and testing dataset
- Using the ratio - 75:15:10

In [3]:
input_cols = ['bust_pressure',  'tensile_str', 'e1_gpa', 'youngs_modulus', 'poision_ratio', 'yeild_strength', 'diameter', 'ult_tensile_strength']
output_cols = ['helical_angle', 'helical_layer_count', 'hoop_layer_count', 'liner_thickness', 'doily_layers']

In [4]:
exp_data_inp = exp_data[input_cols]
exp_data_out = exp_data[output_cols]
inter_data_inp = inter_data[input_cols]
inter_data_out = inter_data[output_cols]

In [5]:
x_train_exp, x_test_exp, y_train_exp, y_test_exp = train_test_split(exp_data_inp, exp_data_out, test_size=0.25, random_state=42)
x_train_inter, x_test_inter, y_train_inter, y_test_inter = train_test_split(inter_data_inp, inter_data_out, test_size=0.25, random_state=42)

x_val_exp, x_test_exp, y_val_exp, y_test_exp = train_test_split(x_test_exp, y_test_exp, train_size=0.6, random_state=49)
x_val_inter, x_test_inter, y_val_inter, y_test_inter = train_test_split(x_test_inter, y_test_inter, train_size=0.6, random_state=49)

I have 3 set - train, val, test
each for 2 section - exp and inter 
followed by their output

In [6]:
x_train_exp.shape, x_val_exp.shape, x_test_exp.shape, x_train_inter.shape, x_val_inter.shape, x_test_inter.shape

((32, 8), (6, 8), (5, 8), (337, 8), (67, 8), (46, 8))

In [7]:
x_train = pd.concat([x_train_exp, x_train_inter], axis=0)
y_train = pd.concat([y_train_exp, y_train_inter], axis=0)
x_val = pd.concat([x_val_exp, x_val_inter], axis=0)
y_val = pd.concat([y_val_exp, y_val_inter], axis=0)
x_test = pd.concat([x_test_exp, x_test_inter], axis=0)
y_test = pd.concat([y_test_exp, y_test_inter], axis=0)

x_train.shape, x_val.shape, x_test.shape, y_train.shape, y_val.shape, y_test.shape

((369, 8), (73, 8), (51, 8), (369, 5), (73, 5), (51, 5))

## Modeling and evaluation

## Base line models

In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from sklearn.multioutput import MultiOutputRegressor

In [14]:
def evaluate(true, pred):
    ret = {}
    ret['MAE'] = mean_absolute_error(true, pred)
    ret['MSE'] = mean_squared_error(true, pred)
    ret['RMSE'] = np.sqrt(ret['MSE']).item()
    ret['R2S'] = r2_score(true, pred)
    return ret

In [15]:
models = {
    'Liner Regression' : LinearRegression,
    'Lasso'            : Lasso,
    'Ridge'            : Ridge,
    'KNN Regressor'    : KNeighborsRegressor,
    'Decision Tree'    : DecisionTreeRegressor,
    'Random Forest'    : RandomForestRegressor,
    'XG Boost'         : XGBRegressor,
    'Ada Boost'        : AdaBoostRegressor
}


params = {
    # 'Liner Regression' : dict(),
    # 'Lasso'            : dict(),
    # 'Ridge'            : dict(),
    # 'KNN Regressor'    : dict(),
    # 'Decision Tree'    : dict(),
    # 'Random Forest'    : dict(),
    # 'XG Boost'         : dict(),
    # 'Ada Boost'        : dict()
    'Liner Regression' : {'fit_intercept' : True},
    'Lasso'            : dict(),
    'Ridge'            : dict(),
    'KNN Regressor'    : {'n_neighbors': 10, 'n_jobs' : -1},
    'Decision Tree'    : {'max_depth' : 7, 'min_samples_split':2 },
    'Random Forest'    : {'n_estimators': 70, 'max_depth' : 5, },
    'XG Boost'         : {'n_estimators': 50, 'max_depth' : 5},
    'Ada Boost'        : {'n_estimators': 50, 'learning_rate':0.5}
}

In [17]:
result_list = []

for model_name, model_class in models.items():
    model_params = params[model_name]
    model = model_class(**model_params)

    # Wrap models that do NOT support multi-output regression natively
    if model_name in ['KNN Regressor', 'Decision Tree', 'Random Forest', 'Ada Boost', 'XG Boost']:
        model = MultiOutputRegressor(model)

    model.fit(x_train, y_train)

    # Predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Evaluation
    train_score = evaluate(y_train, y_train_pred)
    train_score['Type'] = 'Training'
    train_score['Model'] = model_name

    test_score = evaluate(y_test, y_test_pred)
    test_score['Type'] = 'Testing'
    test_score['Model'] = model_name

    print(model_name, test_score['R2S'])

    result_list.extend([train_score, test_score])

pd.DataFrame(result_list).sort_values(['Type', 'R2S'], ascending=[True, False])

Liner Regression 0.03776297575304257
Lasso 0.028760810656782976
Ridge 0.029587818592534898
KNN Regressor 0.2504263457989119
Decision Tree -0.016002698792777447
Random Forest 0.41313979457096783
XG Boost 0.3658352494239807
Ada Boost 0.38216237227140615


Unnamed: 0,MAE,MSE,RMSE,R2S,Type,Model
11,3.767019,33.341883,5.774243,0.41314,Testing,Random Forest
15,3.962407,33.182276,5.760406,0.382162,Testing,Ada Boost
13,3.762515,37.94199,6.159707,0.365835,Testing,XG Boost
7,4.188592,38.466172,6.20211,0.250426,Testing,KNN Regressor
1,4.832675,43.185449,6.571564,0.037763,Testing,Liner Regression
5,4.870261,43.315254,6.581433,0.029588,Testing,Ridge
3,4.880483,43.335696,6.582985,0.028761,Testing,Lasso
9,4.484349,55.934872,7.478962,-0.016003,Testing,Decision Tree
12,0.395771,0.453369,0.673327,0.990555,Training,XG Boost
8,2.348659,18.827085,4.339019,0.722661,Training,Decision Tree


### Scaling 

In [18]:
from sklearn.preprocessing import StandardScaler
scaler_x = StandardScaler()
scaler_y = StandardScaler()

In [19]:
x_train_scaled = scaler_x.fit_transform(x_train)
x_val_scaled = scaler_x.transform(x_val)
x_test_scaled = scaler_x.transform(x_test)


y_train_scaled = scaler_y.fit_transform(y_train)
y_val_scaled = scaler_y.transform(y_val)    
y_test_scaled = scaler_y.transform(y_test)

In [20]:
result_list = []

for model_name, model_class in models.items():
    model_params = params[model_name]
    model = model_class(**model_params)

    # Wrap models that do NOT support multi-output regression natively
    if model_name in ['KNN Regressor', 'Decision Tree', 'Random Forest', 'Ada Boost', 'XG Boost']:
        model = MultiOutputRegressor(model)

    model.fit(x_train, y_train)

    # Predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Evaluation
    train_score = evaluate(y_train, y_train_pred)
    train_score['Type'] = 'Training'
    train_score['Model'] = model_name

    test_score = evaluate(y_test, y_test_pred)
    test_score['Type'] = 'Testing'
    test_score['Model'] = model_name

    print(model_name, test_score['R2S'])

    result_list.extend([train_score, test_score])


pd.DataFrame(result_list).sort_values(['Type', 'R2S'], ascending=[True, False])

Liner Regression 0.03776297575304257
Lasso 0.028760810656782976
Ridge 0.029587818592534898
KNN Regressor 0.2504263457989119
Decision Tree -0.0015531939746169065
Random Forest 0.39451869004407514
XG Boost 0.3658352494239807
Ada Boost 0.38664759166492996


Unnamed: 0,MAE,MSE,RMSE,R2S,Type,Model
11,3.804872,33.547783,5.792045,0.394519,Testing,Random Forest
15,3.973089,33.134546,5.756261,0.386648,Testing,Ada Boost
13,3.762515,37.94199,6.159707,0.365835,Testing,XG Boost
7,4.188592,38.466172,6.20211,0.250426,Testing,KNN Regressor
1,4.832675,43.185449,6.571564,0.037763,Testing,Liner Regression
5,4.870261,43.315254,6.581433,0.029588,Testing,Ridge
3,4.880483,43.335696,6.582985,0.028761,Testing,Lasso
9,4.290961,51.294848,7.162042,-0.001553,Testing,Decision Tree
12,0.395771,0.453369,0.673327,0.990555,Training,XG Boost
8,2.34884,18.827175,4.339029,0.722659,Training,Decision Tree
