# Lung cancer overall stage prediction

In [81]:
import radiomics as rm
import numpy as np
import nrrd
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

lung_df = pd.read_csv('features.csv', sep=';')
target_df = pd.read_csv('NSCLC_target.csv')
lung_df = lung_df.join(target_df)

## Dataset exploration

In [82]:
lung_df.head()

Unnamed: 0.1,Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,...,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event
0,0,0.730057,0.54739,46.111107,84.238088,94.916393,82.904318,95.048181,95.45335,156058.192253,...,LUNG1-001,78.7515,2.0,3,0,IIIb,large cell,male,2165,1
1,1,0.880026,0.755086,70.05051,92.771551,116.811054,101.661967,103.566608,125.188825,359165.698933,...,LUNG1-002,83.8001,2.0,0,0,I,squamous cell carcinoma,male,155,1
2,2,0.541368,0.349893,25.21624,72.068431,56.708364,83.506157,62.535632,83.967822,34629.83472,...,LUNG1-003,68.1807,2.0,3,0,IIIb,large cell,male,256,1
3,3,0.49082,0.287317,30.821869,107.274834,110.729228,110.003931,68.533539,117.543366,84214.687347,...,LUNG1-004,70.8802,2.0,1,0,II,squamous cell carcinoma,male,141,1
4,4,0.710922,0.448589,35.245066,78.568642,77.737209,87.455405,85.0944,103.629893,83150.790783,...,LUNG1-005,80.4819,4.0,2,0,IIIb,squamous cell carcinoma,male,353,1


In [83]:
lung_df = lung_df.drop(['Unnamed: 0'], axis=1)

In [84]:
lung_df.shape

(422, 111)

In [85]:
lung_df.describe()

Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,original_gldm_SmallDependenceEmphasis,original_gldm_SmallDependenceHighGrayLevelEmphasis,original_gldm_SmallDependenceLowGrayLevelEmphasis,metastases,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Survival.time,deadstatus.event
count,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,422.0,...,422.0,422.0,422.0,422.0,400.0,421.0,422.0,422.0,422.0,422.0
mean,0.728937,0.564163,31.579197,59.903315,61.618195,62.590186,58.770415,72.425552,72194.981,41.240452,...,0.242888,208.398794,0.001459,0.736967,68.0365,2.475059,1.35545,0.030806,988.85782,0.883886
std,0.159549,0.155409,15.240503,33.986929,31.548944,31.419757,28.662955,35.414637,93959.171485,19.420478,...,0.125994,80.947686,0.002055,0.863352,10.083498,1.130727,1.218116,0.294847,1035.567789,0.320742
min,0.062267,0.0,0.0,11.611576,11.301772,11.920146,12.079411,13.423194,472.784042,8.838781,...,0.033809,23.588675,3.2e-05,0.0,33.6849,1.0,0.0,0.0,10.0,0.0
25%,0.644885,0.471294,19.506015,34.690059,36.133637,36.631579,35.434087,42.90466,10188.102722,25.453748,...,0.148155,152.424461,0.00035,0.0,61.2348,2.0,0.0,0.0,261.0,1.0
50%,0.75251,0.575873,30.138991,54.390975,57.376514,57.874786,57.083719,68.111635,38690.883444,39.458662,...,0.213871,202.235113,0.000689,1.0,68.5834,2.0,2.0,0.0,545.5,1.0
75%,0.847179,0.668792,41.625214,75.877961,81.687375,82.227255,78.165686,95.151588,101446.032524,54.905546,...,0.326049,262.841738,0.001615,1.0,75.814475,4.0,2.0,0.0,1397.0,1.0
max,0.997176,0.915317,85.55639,304.261528,216.7939,187.719847,208.353677,216.848881,661001.068732,95.426431,...,0.634076,503.115123,0.0141,6.0,91.7043,5.0,4.0,3.0,4454.0,1.0


In [91]:
clean_lung = lung_df.drop(["PatientID"], axis=1)
label_encoder = LabelEncoder()
clean_lung["gender"] = label_encoder.fit_transform(clean_lung["gender"])
clean_lung = clean_lung.rename(columns={"clinical.T.Stage": "t_stage", "Clinical.N.Stage": "n_stage", 
                                        "Clinical.M.Stage": "m_stage", "Overall.Stage": "overall_stage"})
clean_lung = clean_lung.drop(labels=[271, 84], axis=0)
clean_lung["t_stage"] = clean_lung["t_stage"].dropna()
t_stage = clean_lung["t_stage"]
n_stage = clean_lung["n_stage"]
m_stage = clean_lung["m_stage"]

## Feature processing

In [92]:
from sklearn.preprocessing import LabelEncoder

predict_overall_stage = clean_lung.drop(["Histology", "Survival.time", "deadstatus.event", "m_stage", "n_stage", "t_stage"]
                                        , axis=1)

predict_overall_stage['overall_stage'].dropna(inplace = True)
target_overall = predict_overall_stage['overall_stage']

    
label_encoder = LabelEncoder()
target_overall = label_encoder.fit_transform(target_overall)

In [93]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer

features_overall = predict_overall_stage.drop(["overall_stage"], axis=1)
features_overall = features_overall.fillna(0)
features_overall

Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,original_gldm_LargeDependenceEmphasis,original_gldm_LargeDependenceHighGrayLevelEmphasis,original_gldm_LargeDependenceLowGrayLevelEmphasis,original_gldm_LowGrayLevelEmphasis,original_gldm_SmallDependenceEmphasis,original_gldm_SmallDependenceHighGrayLevelEmphasis,original_gldm_SmallDependenceLowGrayLevelEmphasis,metastases,age,gender
0,0.730057,0.547390,46.111107,84.238088,94.916393,82.904318,95.048181,95.453350,156058.192253,61.498644,...,64.282088,114939.612603,0.102407,0.002637,0.186916,206.427211,0.000523,0.0,78.7515,1
1,0.880026,0.755086,70.050510,92.771551,116.811054,101.661967,103.566608,125.188825,359165.698933,81.641365,...,76.437748,142846.540839,0.052014,0.001332,0.142663,165.984345,0.000334,0.0,83.8001,1
2,0.541368,0.349893,25.216240,72.068431,56.708364,83.506157,62.535632,83.967822,34629.834720,39.015548,...,25.568303,43154.784522,0.019421,0.001525,0.304123,311.508421,0.000710,2.0,68.1807,1
3,0.490820,0.287317,30.821869,107.274834,110.729228,110.003931,68.533539,117.543366,84214.687347,52.652619,...,40.228007,68201.600149,0.026679,0.001167,0.202153,253.308845,0.000398,2.0,70.8802,1
4,0.710922,0.448589,35.245066,78.568642,77.737209,87.455405,85.094400,103.629893,83150.790783,55.856187,...,43.938237,85015.211818,0.027079,0.001295,0.193209,277.260954,0.000468,0.0,80.4819,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,0.956116,0.520980,40.251075,77.260243,101.965908,118.000536,145.900529,146.334346,94588.637352,73.869738,...,101.637206,183819.319116,0.073470,0.001224,0.148707,207.179793,0.000237,1.0,53.6712,1
418,0.642440,0.497774,43.610094,87.610311,69.745562,108.443940,96.343719,109.304718,132211.446762,56.284411,...,107.774755,194589.231507,0.062325,0.000868,0.138558,167.362281,0.000204,1.0,66.5096,1
419,0.873311,0.683309,24.342849,35.624951,43.319309,41.255789,41.946741,43.713782,18872.380257,31.111663,...,88.975580,117774.434278,0.069912,0.001818,0.234923,169.101021,0.000843,1.0,73.3808,1
420,0.775330,0.640303,33.136846,51.751822,58.755344,58.028557,63.925695,67.862169,42064.547539,40.124730,...,95.202262,170635.632121,0.078652,0.002560,0.221144,222.123611,0.000645,1.0,61.7041,0


## Feature scaling

In [94]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaled_features = pd.DataFrame(scaler.fit_transform(features_overall), columns=features_overall.columns)
scaled_features

Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,original_gldm_LargeDependenceEmphasis,original_gldm_LargeDependenceHighGrayLevelEmphasis,original_gldm_LargeDependenceLowGrayLevelEmphasis,original_gldm_LowGrayLevelEmphasis,original_gldm_SmallDependenceEmphasis,original_gldm_SmallDependenceHighGrayLevelEmphasis,original_gldm_SmallDependenceLowGrayLevelEmphasis,metastases,age,gender
0,-0.110164,-0.143725,0.711566,0.725912,0.819325,0.536432,0.881228,0.514615,1.281673,0.739346,...,0.237301,0.531865,1.148874,0.277522,-0.150548,0.032335,-0.122403,-1.0,0.694847,0.0
1,0.625674,0.904322,1.788944,0.934564,1.298722,0.946217,1.081244,1.084371,3.501904,1.417899,...,0.466814,0.876915,0.083930,-0.210961,-0.397710,-0.331857,-0.269776,-1.0,1.020837,0.0
2,-1.035991,-1.140309,-0.228794,0.428351,-0.017264,0.549580,0.117821,0.294542,-0.045697,-0.018048,...,-0.493658,-0.355705,-0.604869,-0.138797,0.504077,0.978601,0.023316,1.0,0.012288,0.0
3,-1.284012,-1.456072,0.023483,1.289185,1.165557,1.128458,0.258654,0.937878,0.496330,0.441348,...,-0.216866,-0.046018,-0.451486,-0.273069,-0.065447,0.454508,-0.220040,1.0,0.186595,0.0
4,-0.204054,-0.642280,0.222547,0.587288,0.443176,0.635856,0.647509,0.671284,0.484700,0.549267,...,-0.146813,0.161870,-0.443028,-0.224984,-0.115399,0.670199,-0.165773,-1.0,0.806580,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,0.999018,-0.276990,0.447839,0.555296,0.973679,1.303155,2.075260,1.489537,0.609731,1.156094,...,0.942607,1.383515,0.537354,-0.251410,-0.363955,0.039112,-0.345602,0.0,-0.924595,0.0
416,-0.540067,-0.394094,0.599010,0.808367,0.268193,1.094378,0.911648,0.780018,1.020997,0.563693,...,1.058491,1.516678,0.301833,-0.384989,-0.420638,-0.319449,-0.371789,0.0,-0.095616,0.0
417,0.592727,0.542130,-0.268101,-0.462730,-0.310426,-0.373435,-0.365614,-0.476759,-0.217947,-0.284308,...,0.703542,0.566916,0.462158,-0.028947,0.117582,-0.303791,0.127179,0.0,0.348060,0.0
418,0.111970,0.325119,0.127668,-0.068411,0.027555,-0.007012,0.150460,-0.014055,0.035574,0.019318,...,0.821108,1.220508,0.646854,0.248808,0.040624,0.173682,-0.027374,0.0,-0.405908,-1.0


In [95]:
from sklearn.model_selection import train_test_split

overall_X_train, overall_X_test, overall_y_train, overall_y_test = train_test_split(scaled_features, target_overall, 
                                                                                    test_size=0.2, random_state=0)

In [96]:
t_X_train, t_X_test, t_y_train, t_y_test = train_test_split(scaled_features, t_stage, test_size=0.2, random_state=0)

In [97]:
m_X_train, m_X_test, m_y_train, m_y_test = train_test_split(scaled_features, m_stage, test_size=0.2, random_state=0)

In [98]:
n_X_train, n_X_test, n_y_train, n_y_test = train_test_split(scaled_features, n_stage, test_size=0.2, random_state=0)

## Feature selection

In [99]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

pipeline = Pipeline([
                     ('scaler',StandardScaler()),
                     ('model',Lasso())
])

search = GridSearchCV(pipeline,
                      {'model__alpha':np.arange(0.1,10,0.1)},
                      cv = 5, scoring="neg_mean_squared_error",verbose=3
                      )
search.fit(overall_X_train, overall_y_train)

coefficients_overall = search.best_estimator_.named_steps['model'].coef_

search.fit(t_X_train, t_y_train)
coefficients_t = search.best_estimator_.named_steps['model'].coef_

search.fit(n_X_train, n_y_train)
coefficients_n = search.best_estimator_.named_steps['model'].coef_

search.fit(m_X_train, m_y_train)
coefficients_m = search.best_estimator_.named_steps['model'].coef_

Fitting 5 folds for each of 99 candidates, totalling 495 fits
[CV] model__alpha=0.1 ................................................
[CV] ................... model__alpha=0.1, score=-1.173, total=   0.0s
[CV] model__alpha=0.1 ................................................
[CV] ................... model__alpha=0.1, score=-0.893, total=   0.0s
[CV] model__alpha=0.1 ................................................
[CV] ................... model__alpha=0.1, score=-1.089, total=   0.0s
[CV] model__alpha=0.1 ................................................
[CV] ................... model__alpha=0.1, score=-1.251, total=   0.0s
[CV] model__alpha=0.1 ................................................
[CV] ................... model__alpha=0.1, score=-1.098, total=   0.0s
[CV] model__alpha=0.2 ................................................
[CV] ................... model__alpha=0.2, score=-1.181, total=   0.0s
[CV] model__alpha=0.2 ................................................
[CV] ..........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ................... model__alpha=0.5, score=-1.035, total=   0.0s
[CV] model__alpha=0.5 ................................................
[CV] ................... model__alpha=0.5, score=-1.412, total=   0.0s
[CV] model__alpha=0.5 ................................................
[CV] ................... model__alpha=0.5, score=-1.636, total=   0.0s
[CV] model__alpha=0.5 ................................................
[CV] ................... model__alpha=0.5, score=-1.396, total=   0.0s
[CV] model__alpha=0.6 ................................................
[CV] ................... model__alpha=0.6, score=-1.236, total=   0.0s
[CV] model__alpha=0.6 ................................................
[CV] ................... model__alpha=0.6, score=-1.035, total=   0.0s
[CV] model__alpha=0.6 ................................................
[CV] ................... model__alpha=0.6, score=-1.412, total=   0.0s
[CV] model__alpha=0.6 ................................................
[CV] .

[CV] .... model__alpha=1.8000000000000003, score=-1.236, total=   0.0s
[CV] model__alpha=1.8000000000000003 .................................
[CV] .... model__alpha=1.8000000000000003, score=-1.035, total=   0.0s
[CV] model__alpha=1.8000000000000003 .................................
[CV] .... model__alpha=1.8000000000000003, score=-1.412, total=   0.0s
[CV] model__alpha=1.8000000000000003 .................................
[CV] .... model__alpha=1.8000000000000003, score=-1.636, total=   0.0s
[CV] model__alpha=1.8000000000000003 .................................
[CV] .... model__alpha=1.8000000000000003, score=-1.396, total=   0.0s
[CV] model__alpha=1.9000000000000001 .................................
[CV] .... model__alpha=1.9000000000000001, score=-1.236, total=   0.0s
[CV] model__alpha=1.9000000000000001 .................................
[CV] .... model__alpha=1.9000000000000001, score=-1.035, total=   0.0s
[CV] model__alpha=1.9000000000000001 .................................
[CV] .

[CV] model__alpha=3.0000000000000004 .................................
[CV] .... model__alpha=3.0000000000000004, score=-1.236, total=   0.0s
[CV] model__alpha=3.0000000000000004 .................................
[CV] .... model__alpha=3.0000000000000004, score=-1.035, total=   0.0s
[CV] model__alpha=3.0000000000000004 .................................
[CV] .... model__alpha=3.0000000000000004, score=-1.412, total=   0.0s
[CV] model__alpha=3.0000000000000004 .................................
[CV] .... model__alpha=3.0000000000000004, score=-1.636, total=   0.0s
[CV] model__alpha=3.0000000000000004 .................................
[CV] .... model__alpha=3.0000000000000004, score=-1.396, total=   0.0s
[CV] model__alpha=3.1 ................................................
[CV] ................... model__alpha=3.1, score=-1.236, total=   0.0s
[CV] model__alpha=3.1 ................................................
[CV] ................... model__alpha=3.1, score=-1.035, total=   0.0s
[CV] m

[CV] model__alpha=4.2 ................................................
[CV] ................... model__alpha=4.2, score=-1.636, total=   0.0s
[CV] model__alpha=4.2 ................................................
[CV] ................... model__alpha=4.2, score=-1.396, total=   0.0s
[CV] model__alpha=4.3 ................................................
[CV] ................... model__alpha=4.3, score=-1.236, total=   0.0s
[CV] model__alpha=4.3 ................................................
[CV] ................... model__alpha=4.3, score=-1.035, total=   0.0s
[CV] model__alpha=4.3 ................................................
[CV] ................... model__alpha=4.3, score=-1.412, total=   0.0s
[CV] model__alpha=4.3 ................................................
[CV] ................... model__alpha=4.3, score=-1.636, total=   0.0s
[CV] model__alpha=4.3 ................................................
[CV] ................... model__alpha=4.3, score=-1.396, total=   0.0s
[CV] m

[CV] ................... model__alpha=5.6, score=-1.236, total=   0.0s
[CV] model__alpha=5.6 ................................................
[CV] ................... model__alpha=5.6, score=-1.035, total=   0.0s
[CV] model__alpha=5.6 ................................................
[CV] ................... model__alpha=5.6, score=-1.412, total=   0.0s
[CV] model__alpha=5.6 ................................................
[CV] ................... model__alpha=5.6, score=-1.636, total=   0.0s
[CV] model__alpha=5.6 ................................................
[CV] ................... model__alpha=5.6, score=-1.396, total=   0.0s
[CV] model__alpha=5.7 ................................................
[CV] ................... model__alpha=5.7, score=-1.236, total=   0.0s
[CV] model__alpha=5.7 ................................................
[CV] ................... model__alpha=5.7, score=-1.035, total=   0.0s
[CV] model__alpha=5.7 ................................................
[CV] .

[CV] model__alpha=6.8 ................................................
[CV] ................... model__alpha=6.8, score=-1.396, total=   0.0s
[CV] model__alpha=6.9 ................................................
[CV] ................... model__alpha=6.9, score=-1.236, total=   0.0s
[CV] model__alpha=6.9 ................................................
[CV] ................... model__alpha=6.9, score=-1.035, total=   0.0s
[CV] model__alpha=6.9 ................................................
[CV] ................... model__alpha=6.9, score=-1.412, total=   0.0s
[CV] model__alpha=6.9 ................................................
[CV] ................... model__alpha=6.9, score=-1.636, total=   0.0s
[CV] model__alpha=6.9 ................................................
[CV] ................... model__alpha=6.9, score=-1.396, total=   0.0s
[CV] model__alpha=7.0 ................................................
[CV] ................... model__alpha=7.0, score=-1.236, total=   0.0s
[CV] m

[CV] ................... model__alpha=8.2, score=-1.412, total=   0.0s
[CV] model__alpha=8.2 ................................................
[CV] ................... model__alpha=8.2, score=-1.636, total=   0.0s
[CV] model__alpha=8.2 ................................................
[CV] ................... model__alpha=8.2, score=-1.396, total=   0.0s
[CV] model__alpha=8.3 ................................................
[CV] ................... model__alpha=8.3, score=-1.236, total=   0.0s
[CV] model__alpha=8.3 ................................................
[CV] ................... model__alpha=8.3, score=-1.035, total=   0.0s
[CV] model__alpha=8.3 ................................................
[CV] ................... model__alpha=8.3, score=-1.412, total=   0.0s
[CV] model__alpha=8.3 ................................................
[CV] ................... model__alpha=8.3, score=-1.636, total=   0.0s
[CV] model__alpha=8.3 ................................................
[CV] .

[CV] ................... model__alpha=9.6, score=-1.636, total=   0.0s
[CV] model__alpha=9.6 ................................................
[CV] ................... model__alpha=9.6, score=-1.396, total=   0.0s
[CV] model__alpha=9.700000000000001 ..................................
[CV] ..... model__alpha=9.700000000000001, score=-1.236, total=   0.0s
[CV] model__alpha=9.700000000000001 ..................................
[CV] ..... model__alpha=9.700000000000001, score=-1.035, total=   0.0s
[CV] model__alpha=9.700000000000001 ..................................
[CV] ..... model__alpha=9.700000000000001, score=-1.412, total=   0.0s
[CV] model__alpha=9.700000000000001 ..................................
[CV] ..... model__alpha=9.700000000000001, score=-1.636, total=   0.0s
[CV] model__alpha=9.700000000000001 ..................................
[CV] ..... model__alpha=9.700000000000001, score=-1.396, total=   0.0s
[CV] model__alpha=9.8 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done 495 out of 495 | elapsed:    4.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] model__alpha=0.2 ................................................
[CV] ................... model__alpha=0.2, score=-0.876, total=   0.0s
[CV] model__alpha=0.2 ................................................
[CV] ................... model__alpha=0.2, score=-1.077, total=   0.0s
[CV] model__alpha=0.2 ................................................
[CV] ................... model__alpha=0.2, score=-0.959, total=   0.0s
[CV] model__alpha=0.2 ................................................
[CV] ................... model__alpha=0.2, score=-1.048, total=   0.0s
[CV] model__alpha=0.30000000000000004 ................................
[CV] ... model__alpha=0.30000000000000004, score=-1.102, total=   0.0s
[CV] model__alpha=0.30000000000000004 ................................
[CV] ... model__alpha=0.30000000000000004, score=-0.961, total=   0.0s
[CV] model__alpha=0.30000000000000004 ................................
[CV] ... model__alpha=0.30000000000000004, score=-1.123, total=   0.0s
[CV] 

[CV] .... model__alpha=1.5000000000000002, score=-1.242, total=   0.0s
[CV] model__alpha=1.5000000000000002 .................................
[CV] .... model__alpha=1.5000000000000002, score=-1.328, total=   0.0s
[CV] model__alpha=1.6 ................................................
[CV] ................... model__alpha=1.6, score=-1.309, total=   0.0s
[CV] model__alpha=1.6 ................................................
[CV] ................... model__alpha=1.6, score=-1.145, total=   0.0s
[CV] model__alpha=1.6 ................................................
[CV] ................... model__alpha=1.6, score=-1.272, total=   0.0s
[CV] model__alpha=1.6 ................................................
[CV] ................... model__alpha=1.6, score=-1.242, total=   0.0s
[CV] model__alpha=1.6 ................................................
[CV] ................... model__alpha=1.6, score=-1.328, total=   0.0s
[CV] model__alpha=1.7000000000000002 .................................
[CV] .

[CV] .... model__alpha=2.9000000000000004, score=-1.272, total=   0.0s
[CV] model__alpha=2.9000000000000004 .................................
[CV] .... model__alpha=2.9000000000000004, score=-1.242, total=   0.0s
[CV] model__alpha=2.9000000000000004 .................................
[CV] .... model__alpha=2.9000000000000004, score=-1.328, total=   0.0s
[CV] model__alpha=3.0000000000000004 .................................
[CV] .... model__alpha=3.0000000000000004, score=-1.309, total=   0.0s
[CV] model__alpha=3.0000000000000004 .................................
[CV] .... model__alpha=3.0000000000000004, score=-1.145, total=   0.0s
[CV] model__alpha=3.0000000000000004 .................................
[CV] .... model__alpha=3.0000000000000004, score=-1.272, total=   0.0s
[CV] model__alpha=3.0000000000000004 .................................
[CV] .... model__alpha=3.0000000000000004, score=-1.242, total=   0.0s
[CV] model__alpha=3.0000000000000004 .................................
[CV] .

[CV] ................... model__alpha=4.2, score=-1.272, total=   0.0s
[CV] model__alpha=4.2 ................................................
[CV] ................... model__alpha=4.2, score=-1.242, total=   0.0s
[CV] model__alpha=4.2 ................................................
[CV] ................... model__alpha=4.2, score=-1.328, total=   0.0s
[CV] model__alpha=4.3 ................................................
[CV] ................... model__alpha=4.3, score=-1.309, total=   0.0s
[CV] model__alpha=4.3 ................................................
[CV] ................... model__alpha=4.3, score=-1.145, total=   0.0s
[CV] model__alpha=4.3 ................................................
[CV] ................... model__alpha=4.3, score=-1.272, total=   0.0s
[CV] model__alpha=4.3 ................................................
[CV] ................... model__alpha=4.3, score=-1.242, total=   0.0s
[CV] model__alpha=4.3 ................................................
[CV] .

[CV] ................... model__alpha=5.4, score=-1.328, total=   0.0s
[CV] model__alpha=5.5 ................................................
[CV] ................... model__alpha=5.5, score=-1.309, total=   0.0s
[CV] model__alpha=5.5 ................................................
[CV] ................... model__alpha=5.5, score=-1.145, total=   0.0s
[CV] model__alpha=5.5 ................................................
[CV] ................... model__alpha=5.5, score=-1.272, total=   0.0s
[CV] model__alpha=5.5 ................................................
[CV] ................... model__alpha=5.5, score=-1.242, total=   0.0s
[CV] model__alpha=5.5 ................................................
[CV] ................... model__alpha=5.5, score=-1.328, total=   0.0s
[CV] model__alpha=5.6 ................................................
[CV] ................... model__alpha=5.6, score=-1.309, total=   0.0s
[CV] model__alpha=5.6 ................................................
[CV] .

[CV] ................... model__alpha=6.8, score=-1.309, total=   0.0s
[CV] model__alpha=6.8 ................................................
[CV] ................... model__alpha=6.8, score=-1.145, total=   0.0s
[CV] model__alpha=6.8 ................................................
[CV] ................... model__alpha=6.8, score=-1.272, total=   0.0s
[CV] model__alpha=6.8 ................................................
[CV] ................... model__alpha=6.8, score=-1.242, total=   0.0s
[CV] model__alpha=6.8 ................................................
[CV] ................... model__alpha=6.8, score=-1.328, total=   0.0s
[CV] model__alpha=6.9 ................................................
[CV] ................... model__alpha=6.9, score=-1.309, total=   0.0s
[CV] model__alpha=6.9 ................................................
[CV] ................... model__alpha=6.9, score=-1.145, total=   0.0s
[CV] model__alpha=6.9 ................................................
[CV] .

[CV] ................... model__alpha=8.0, score=-1.328, total=   0.0s
[CV] model__alpha=8.1 ................................................
[CV] ................... model__alpha=8.1, score=-1.309, total=   0.0s
[CV] model__alpha=8.1 ................................................
[CV] ................... model__alpha=8.1, score=-1.145, total=   0.0s
[CV] model__alpha=8.1 ................................................
[CV] ................... model__alpha=8.1, score=-1.272, total=   0.0s
[CV] model__alpha=8.1 ................................................
[CV] ................... model__alpha=8.1, score=-1.242, total=   0.0s
[CV] model__alpha=8.1 ................................................
[CV] ................... model__alpha=8.1, score=-1.328, total=   0.0s
[CV] model__alpha=8.2 ................................................
[CV] ................... model__alpha=8.2, score=-1.309, total=   0.0s
[CV] model__alpha=8.2 ................................................
[CV] .

[CV] ................... model__alpha=9.4, score=-1.242, total=   0.0s
[CV] model__alpha=9.4 ................................................
[CV] ................... model__alpha=9.4, score=-1.328, total=   0.0s
[CV] model__alpha=9.5 ................................................
[CV] ................... model__alpha=9.5, score=-1.309, total=   0.0s
[CV] model__alpha=9.5 ................................................
[CV] ................... model__alpha=9.5, score=-1.145, total=   0.0s
[CV] model__alpha=9.5 ................................................
[CV] ................... model__alpha=9.5, score=-1.272, total=   0.0s
[CV] model__alpha=9.5 ................................................
[CV] ................... model__alpha=9.5, score=-1.242, total=   0.0s
[CV] model__alpha=9.5 ................................................
[CV] ................... model__alpha=9.5, score=-1.328, total=   0.0s
[CV] model__alpha=9.6 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done 495 out of 495 | elapsed:    4.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] model__alpha=0.4 ................................................
[CV] ................... model__alpha=0.4, score=-1.278, total=   0.0s
[CV] model__alpha=0.4 ................................................
[CV] ................... model__alpha=0.4, score=-1.325, total=   0.0s
[CV] model__alpha=0.5 ................................................
[CV] ................... model__alpha=0.5, score=-1.345, total=   0.0s
[CV] model__alpha=0.5 ................................................
[CV] ................... model__alpha=0.5, score=-1.260, total=   0.0s
[CV] model__alpha=0.5 ................................................
[CV] ................... model__alpha=0.5, score=-1.257, total=   0.0s
[CV] model__alpha=0.5 ................................................
[CV] ................... model__alpha=0.5, score=-1.394, total=   0.0s
[CV] model__alpha=0.5 ................................................
[CV] ................... model__alpha=0.5, score=-1.412, total=   0.0s
[CV] 

[CV] .... model__alpha=1.8000000000000003, score=-1.451, total=   0.0s
[CV] model__alpha=1.8000000000000003 .................................
[CV] .... model__alpha=1.8000000000000003, score=-1.437, total=   0.0s
[CV] model__alpha=1.8000000000000003 .................................
[CV] .... model__alpha=1.8000000000000003, score=-1.432, total=   0.0s
[CV] model__alpha=1.8000000000000003 .................................
[CV] .... model__alpha=1.8000000000000003, score=-1.581, total=   0.0s
[CV] model__alpha=1.8000000000000003 .................................
[CV] .... model__alpha=1.8000000000000003, score=-1.551, total=   0.0s
[CV] model__alpha=1.9000000000000001 .................................
[CV] .... model__alpha=1.9000000000000001, score=-1.451, total=   0.0s
[CV] model__alpha=1.9000000000000001 .................................
[CV] .... model__alpha=1.9000000000000001, score=-1.437, total=   0.0s
[CV] model__alpha=1.9000000000000001 .................................
[CV] .

[CV] model__alpha=3.1 ................................................
[CV] ................... model__alpha=3.1, score=-1.451, total=   0.0s
[CV] model__alpha=3.1 ................................................
[CV] ................... model__alpha=3.1, score=-1.437, total=   0.0s
[CV] model__alpha=3.1 ................................................
[CV] ................... model__alpha=3.1, score=-1.432, total=   0.0s
[CV] model__alpha=3.1 ................................................
[CV] ................... model__alpha=3.1, score=-1.581, total=   0.0s
[CV] model__alpha=3.1 ................................................
[CV] ................... model__alpha=3.1, score=-1.551, total=   0.0s
[CV] model__alpha=3.2 ................................................
[CV] ................... model__alpha=3.2, score=-1.451, total=   0.0s
[CV] model__alpha=3.2 ................................................
[CV] ................... model__alpha=3.2, score=-1.437, total=   0.0s
[CV] m

[CV] .... model__alpha=4.3999999999999995, score=-1.432, total=   0.0s
[CV] model__alpha=4.3999999999999995 .................................
[CV] .... model__alpha=4.3999999999999995, score=-1.581, total=   0.0s
[CV] model__alpha=4.3999999999999995 .................................
[CV] .... model__alpha=4.3999999999999995, score=-1.551, total=   0.0s
[CV] model__alpha=4.5 ................................................
[CV] ................... model__alpha=4.5, score=-1.451, total=   0.0s
[CV] model__alpha=4.5 ................................................
[CV] ................... model__alpha=4.5, score=-1.437, total=   0.0s
[CV] model__alpha=4.5 ................................................
[CV] ................... model__alpha=4.5, score=-1.432, total=   0.0s
[CV] model__alpha=4.5 ................................................
[CV] ................... model__alpha=4.5, score=-1.581, total=   0.0s
[CV] model__alpha=4.5 ................................................
[CV] .

[CV] ................... model__alpha=5.7, score=-1.437, total=   0.0s
[CV] model__alpha=5.7 ................................................
[CV] ................... model__alpha=5.7, score=-1.432, total=   0.0s
[CV] model__alpha=5.7 ................................................
[CV] ................... model__alpha=5.7, score=-1.581, total=   0.0s
[CV] model__alpha=5.7 ................................................
[CV] ................... model__alpha=5.7, score=-1.551, total=   0.0s
[CV] model__alpha=5.8 ................................................
[CV] ................... model__alpha=5.8, score=-1.451, total=   0.0s
[CV] model__alpha=5.8 ................................................
[CV] ................... model__alpha=5.8, score=-1.437, total=   0.0s
[CV] model__alpha=5.8 ................................................
[CV] ................... model__alpha=5.8, score=-1.432, total=   0.0s
[CV] model__alpha=5.8 ................................................
[CV] .

[CV] ................... model__alpha=7.0, score=-1.581, total=   0.0s
[CV] model__alpha=7.0 ................................................
[CV] ................... model__alpha=7.0, score=-1.551, total=   0.0s
[CV] model__alpha=7.1 ................................................
[CV] ................... model__alpha=7.1, score=-1.451, total=   0.0s
[CV] model__alpha=7.1 ................................................
[CV] ................... model__alpha=7.1, score=-1.437, total=   0.0s
[CV] model__alpha=7.1 ................................................
[CV] ................... model__alpha=7.1, score=-1.432, total=   0.0s
[CV] model__alpha=7.1 ................................................
[CV] ................... model__alpha=7.1, score=-1.581, total=   0.0s
[CV] model__alpha=7.1 ................................................
[CV] ................... model__alpha=7.1, score=-1.551, total=   0.0s
[CV] model__alpha=7.2 ................................................
[CV] .

[CV] ................... model__alpha=8.4, score=-1.432, total=   0.0s
[CV] model__alpha=8.4 ................................................
[CV] ................... model__alpha=8.4, score=-1.581, total=   0.0s
[CV] model__alpha=8.4 ................................................
[CV] ................... model__alpha=8.4, score=-1.551, total=   0.0s
[CV] model__alpha=8.5 ................................................
[CV] ................... model__alpha=8.5, score=-1.451, total=   0.0s
[CV] model__alpha=8.5 ................................................
[CV] ................... model__alpha=8.5, score=-1.437, total=   0.0s
[CV] model__alpha=8.5 ................................................
[CV] ................... model__alpha=8.5, score=-1.432, total=   0.0s
[CV] model__alpha=8.5 ................................................
[CV] ................... model__alpha=8.5, score=-1.581, total=   0.0s
[CV] model__alpha=8.5 ................................................
[CV] .

[CV] ..... model__alpha=9.700000000000001, score=-1.432, total=   0.0s
[CV] model__alpha=9.700000000000001 ..................................
[CV] ..... model__alpha=9.700000000000001, score=-1.581, total=   0.0s
[CV] model__alpha=9.700000000000001 ..................................
[CV] ..... model__alpha=9.700000000000001, score=-1.551, total=   0.0s
[CV] model__alpha=9.8 ................................................
[CV] ................... model__alpha=9.8, score=-1.451, total=   0.0s
[CV] model__alpha=9.8 ................................................
[CV] ................... model__alpha=9.8, score=-1.437, total=   0.0s
[CV] model__alpha=9.8 ................................................
[CV] ................... model__alpha=9.8, score=-1.432, total=   0.0s
[CV] model__alpha=9.8 ................................................
[CV] ................... model__alpha=9.8, score=-1.581, total=   0.0s
[CV] model__alpha=9.8 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done 495 out of 495 | elapsed:    4.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] model__alpha=0.2 ................................................
[CV] ................... model__alpha=0.2, score=-0.133, total=   0.0s
[CV] model__alpha=0.2 ................................................
[CV] ................... model__alpha=0.2, score=-0.133, total=   0.0s
[CV] model__alpha=0.2 ................................................
[CV] ................... model__alpha=0.2, score=-0.001, total=   0.0s
[CV] model__alpha=0.30000000000000004 ................................
[CV] ... model__alpha=0.30000000000000004, score=-0.001, total=   0.0s
[CV] model__alpha=0.30000000000000004 ................................
[CV] ... model__alpha=0.30000000000000004, score=-0.133, total=   0.0s
[CV] model__alpha=0.30000000000000004 ................................
[CV] ... model__alpha=0.30000000000000004, score=-0.133, total=   0.0s
[CV] model__alpha=0.30000000000000004 ................................
[CV] ... model__alpha=0.30000000000000004, score=-0.133, total=   0.0s
[CV] 

[CV] .... model__alpha=1.5000000000000002, score=-0.133, total=   0.0s
[CV] model__alpha=1.5000000000000002 .................................
[CV] .... model__alpha=1.5000000000000002, score=-0.133, total=   0.0s
[CV] model__alpha=1.5000000000000002 .................................
[CV] .... model__alpha=1.5000000000000002, score=-0.001, total=   0.0s
[CV] model__alpha=1.6 ................................................
[CV] ................... model__alpha=1.6, score=-0.001, total=   0.0s
[CV] model__alpha=1.6 ................................................
[CV] ................... model__alpha=1.6, score=-0.133, total=   0.0s
[CV] model__alpha=1.6 ................................................
[CV] ................... model__alpha=1.6, score=-0.133, total=   0.0s
[CV] model__alpha=1.6 ................................................
[CV] ................... model__alpha=1.6, score=-0.133, total=   0.0s
[CV] model__alpha=1.6 ................................................
[CV] .

[CV] .... model__alpha=2.8000000000000003, score=-0.133, total=   0.0s
[CV] model__alpha=2.8000000000000003 .................................
[CV] .... model__alpha=2.8000000000000003, score=-0.133, total=   0.0s
[CV] model__alpha=2.8000000000000003 .................................
[CV] .... model__alpha=2.8000000000000003, score=-0.001, total=   0.0s
[CV] model__alpha=2.9000000000000004 .................................
[CV] .... model__alpha=2.9000000000000004, score=-0.001, total=   0.0s
[CV] model__alpha=2.9000000000000004 .................................
[CV] .... model__alpha=2.9000000000000004, score=-0.133, total=   0.0s
[CV] model__alpha=2.9000000000000004 .................................
[CV] .... model__alpha=2.9000000000000004, score=-0.133, total=   0.0s
[CV] model__alpha=2.9000000000000004 .................................
[CV] .... model__alpha=2.9000000000000004, score=-0.133, total=   0.0s
[CV] model__alpha=2.9000000000000004 .................................
[CV] .

[CV] ................... model__alpha=4.2, score=-0.001, total=   0.0s
[CV] model__alpha=4.2 ................................................
[CV] ................... model__alpha=4.2, score=-0.133, total=   0.0s
[CV] model__alpha=4.2 ................................................
[CV] ................... model__alpha=4.2, score=-0.133, total=   0.0s
[CV] model__alpha=4.2 ................................................
[CV] ................... model__alpha=4.2, score=-0.133, total=   0.0s
[CV] model__alpha=4.2 ................................................
[CV] ................... model__alpha=4.2, score=-0.001, total=   0.0s
[CV] model__alpha=4.3 ................................................
[CV] ................... model__alpha=4.3, score=-0.001, total=   0.0s
[CV] model__alpha=4.3 ................................................
[CV] ................... model__alpha=4.3, score=-0.133, total=   0.0s
[CV] model__alpha=4.3 ................................................
[CV] .

[CV] ................... model__alpha=5.6, score=-0.001, total=   0.0s
[CV] model__alpha=5.6 ................................................
[CV] ................... model__alpha=5.6, score=-0.133, total=   0.0s
[CV] model__alpha=5.6 ................................................
[CV] ................... model__alpha=5.6, score=-0.133, total=   0.0s
[CV] model__alpha=5.6 ................................................
[CV] ................... model__alpha=5.6, score=-0.133, total=   0.0s
[CV] model__alpha=5.6 ................................................
[CV] ................... model__alpha=5.6, score=-0.001, total=   0.0s
[CV] model__alpha=5.7 ................................................
[CV] ................... model__alpha=5.7, score=-0.001, total=   0.0s
[CV] model__alpha=5.7 ................................................
[CV] ................... model__alpha=5.7, score=-0.133, total=   0.0s
[CV] model__alpha=5.7 ................................................
[CV] .

[CV] ................... model__alpha=6.9, score=-0.133, total=   0.0s
[CV] model__alpha=6.9 ................................................
[CV] ................... model__alpha=6.9, score=-0.001, total=   0.0s
[CV] model__alpha=7.0 ................................................
[CV] ................... model__alpha=7.0, score=-0.001, total=   0.0s
[CV] model__alpha=7.0 ................................................
[CV] ................... model__alpha=7.0, score=-0.133, total=   0.0s
[CV] model__alpha=7.0 ................................................
[CV] ................... model__alpha=7.0, score=-0.133, total=   0.0s
[CV] model__alpha=7.0 ................................................
[CV] ................... model__alpha=7.0, score=-0.133, total=   0.0s
[CV] model__alpha=7.0 ................................................
[CV] ................... model__alpha=7.0, score=-0.001, total=   0.0s
[CV] model__alpha=7.1 ................................................
[CV] .

[CV] ................... model__alpha=8.3, score=-0.133, total=   0.0s
[CV] model__alpha=8.3 ................................................
[CV] ................... model__alpha=8.3, score=-0.133, total=   0.0s
[CV] model__alpha=8.3 ................................................
[CV] ................... model__alpha=8.3, score=-0.133, total=   0.0s
[CV] model__alpha=8.3 ................................................
[CV] ................... model__alpha=8.3, score=-0.001, total=   0.0s
[CV] model__alpha=8.4 ................................................
[CV] ................... model__alpha=8.4, score=-0.001, total=   0.0s
[CV] model__alpha=8.4 ................................................
[CV] ................... model__alpha=8.4, score=-0.133, total=   0.0s
[CV] model__alpha=8.4 ................................................
[CV] ................... model__alpha=8.4, score=-0.133, total=   0.0s
[CV] model__alpha=8.4 ................................................
[CV] .

[CV] model__alpha=9.6 ................................................
[CV] ................... model__alpha=9.6, score=-0.133, total=   0.0s
[CV] model__alpha=9.6 ................................................
[CV] ................... model__alpha=9.6, score=-0.133, total=   0.0s
[CV] model__alpha=9.6 ................................................
[CV] ................... model__alpha=9.6, score=-0.133, total=   0.0s
[CV] model__alpha=9.6 ................................................
[CV] ................... model__alpha=9.6, score=-0.001, total=   0.0s
[CV] model__alpha=9.700000000000001 ..................................
[CV] ..... model__alpha=9.700000000000001, score=-0.001, total=   0.0s
[CV] model__alpha=9.700000000000001 ..................................
[CV] ..... model__alpha=9.700000000000001, score=-0.133, total=   0.0s
[CV] model__alpha=9.700000000000001 ..................................
[CV] ..... model__alpha=9.700000000000001, score=-0.133, total=   0.0s
[CV] m

[Parallel(n_jobs=1)]: Done 495 out of 495 | elapsed:    4.1s finished


In [100]:
importance = np.abs(coefficients_overall)
not_relevant_features_over = np.array(scaled_features.columns)[importance == 0]
not_relevant_features_over = list(not_relevant_features)

importance = np.abs(coefficients_t)
not_relevant_features_t = np.array(scaled_features.columns)[importance == 0]
not_relevant_features_t = list(not_relevant_features)

importance = np.abs(coefficients_n)
not_relevant_features_n = np.array(scaled_features.columns)[importance == 0]
not_relevant_features_n = list(not_relevant_features)

In [101]:
selected_features = scaled_features.drop(not_relevant_features, axis= 1)
overall_X_train_2, overall_X_test_2, overall_y_train_2, overall_y_test_2 = train_test_split(selected_features, target_overall, 
                                                                                    test_size=0.2, random_state=0)

selected_features_t = scaled_features.drop(not_relevant_features_t, axis= 1)
t_X_train_2, t_X_test_2, t_y_train_2, t_y_test_2 = train_test_split(selected_features_t, t_stage, test_size=0.2, random_state=0)

selected_features_n = scaled_features.drop(not_relevant_features_n, axis= 1)
n_X_train_2, n_X_test_2, n_y_train_2, n_y_test_2 = train_test_split(selected_features_n, n_stage, test_size=0.2, random_state=0)

## Random forest model

In [102]:
from sklearn.ensemble import RandomForestClassifier
param_grid = {"n_estimators": [50, 100, 150], "bootstrap": [True, False]}

grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search_rf.fit(overall_X_train, overall_y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False],
                         'n_estimators': [50, 100, 150]})

In [103]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
def print_score(clf, X_train, y_train, X_test, y_test):
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [104]:
print_score(grid_search_rf, overall_X_train, overall_y_train, overall_X_test, overall_y_test)

Test Result:
Accuracy Score: 53.57%
_______________________________________________
CLASSIFICATION REPORT:
                   0    1      2          3  accuracy  macro avg  weighted avg
precision   0.833333  0.0   0.45   0.500000  0.535714   0.445833      0.541667
recall      0.416667  0.0   0.45   0.787879  0.535714   0.413636      0.535714
f1-score    0.555556  0.0   0.45   0.611765  0.535714   0.404330      0.506209
support    24.000000  7.0  20.00  33.000000  0.535714  84.000000     84.000000
_______________________________________________
Confusion Matrix: 
 [[10  0  3 11]
 [ 0  0  1  6]
 [ 2  0  9  9]
 [ 0  0  7 26]]



  _warn_prf(average, modifier, msg_start, len(result))


## Random forest model over selected features

In [105]:
param_grid = {"n_estimators": [50, 100, 150], "bootstrap": [True, False]}

grid_search_rf_2 = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search_rf_2.fit(overall_X_train_2, overall_y_train_2)
print("Best parameters: {}".format(grid_search_rf.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search_rf.best_score_))

Best parameters: {'bootstrap': True, 'n_estimators': 100}
Best cross-validation score: 0.51


In [106]:
print_score(grid_search_rf_2, overall_X_train_2, overall_y_train_2, overall_X_test_2, overall_y_test_2)

Test Result:
Accuracy Score: 44.05%
_______________________________________________
CLASSIFICATION REPORT:
                   0    1          2          3  accuracy  macro avg  \
precision   0.400000  0.0   0.333333   0.522727  0.440476   0.314015   
recall      0.250000  0.0   0.400000   0.696970  0.440476   0.336742   
f1-score    0.307692  0.0   0.363636   0.597403  0.440476   0.317183   
support    24.000000  7.0  20.000000  33.000000  0.440476  84.000000   

           weighted avg  
precision      0.399008  
recall         0.440476  
f1-score       0.409186  
support       84.000000  
_______________________________________________
Confusion Matrix: 
 [[ 6  0 10  8]
 [ 0  0  1  6]
 [ 5  0  8  7]
 [ 4  1  5 23]]



## SVM model

In [107]:
from sklearn import svm

svm_clf_2 = svm.SVC(decision_function_shape='ovo')
svm_clf_2.fit(overall_X_train_2, overall_y_train_2)

lin_clf_2 = svm.LinearSVC(max_iter=100000)
lin_clf_2.fit(overall_X_train_2, overall_y_train_2)

LinearSVC(max_iter=100000)

In [108]:
print_score(svm_clf_2, overall_X_train_2, overall_y_train_2, overall_X_test_2, overall_y_test_2)

Test Result:
Accuracy Score: 44.05%
_______________________________________________
CLASSIFICATION REPORT:
                   0    1          2          3  accuracy  macro avg  \
precision   0.500000  0.0   0.250000   0.451613  0.440476   0.300403   
recall      0.291667  0.0   0.100000   0.848485  0.440476   0.310038   
f1-score    0.368421  0.0   0.142857   0.589474  0.440476   0.275188   
support    24.000000  7.0  20.000000  33.000000  0.440476  84.000000   

           weighted avg  
precision      0.379800  
recall         0.440476  
f1-score       0.370856  
support       84.000000  
_______________________________________________
Confusion Matrix: 
 [[ 7  0  5 12]
 [ 0  0  0  7]
 [ 3  0  2 15]
 [ 4  0  1 28]]



  _warn_prf(average, modifier, msg_start, len(result))


In [109]:
print_score(lin_clf_2, overall_X_train_2, overall_y_train_2, overall_X_test_2, overall_y_test_2)

Test Result:
Accuracy Score: 44.05%
_______________________________________________
CLASSIFICATION REPORT:
                   0    1          2          3  accuracy  macro avg  \
precision   0.578947  0.0   0.214286   0.450980  0.440476   0.311053   
recall      0.458333  0.0   0.150000   0.696970  0.440476   0.326326   
f1-score    0.511628  0.0   0.176471   0.547619  0.440476   0.308929   
support    24.000000  7.0  20.000000  33.000000  0.440476  84.000000   

           weighted avg  
precision      0.393605  
recall         0.440476  
f1-score       0.403332  
support       84.000000  
_______________________________________________
Confusion Matrix: 
 [[11  0  5  8]
 [ 0  0  1  6]
 [ 3  0  3 14]
 [ 5  0  5 23]]



  _warn_prf(average, modifier, msg_start, len(result))


## T_stage models

In [110]:
param_grid = {"n_estimators": [50, 100, 150], "bootstrap": [True, False]}

grid_search_rf_t = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search_rf_t.fit(t_X_train, t_y_train)
print_score(grid_search_rf_t, t_X_train, t_y_train, t_X_test, t_y_test)



Test Result:
Accuracy Score: 52.38%
_______________________________________________
CLASSIFICATION REPORT:
                 1.0        2.0        3.0        4.0  accuracy  macro avg  \
precision   0.666667   0.517241   1.000000   0.400000   0.52381   0.645977   
recall      0.761905   0.500000   0.100000   0.521739   0.52381   0.470911   
f1-score    0.711111   0.508475   0.181818   0.452830   0.52381   0.463559   
support    21.000000  30.000000  10.000000  23.000000   0.52381  84.000000   

           weighted avg  
precision      0.579967  
recall         0.523810  
f1-score       0.505010  
support       84.000000  
_______________________________________________
Confusion Matrix: 
 [[16  3  0  2]
 [ 5 15  0 10]
 [ 0  3  1  6]
 [ 3  8  0 12]]



In [111]:
param_grid = {"n_estimators": [50, 100, 150], "bootstrap": [True, False]}

grid_search_rf_t_2 = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search_rf_t_2.fit(t_X_train_2, t_y_train_2)
print_score(grid_search_rf_t_2, t_X_train_2, t_y_train_2, t_X_test_2, t_y_test_2)



Test Result:
Accuracy Score: 57.14%
_______________________________________________
CLASSIFICATION REPORT:
                 1.0        2.0        3.0        4.0  accuracy  macro avg  \
precision   0.695652   0.555556   1.000000   0.458333  0.571429   0.677385   
recall      0.761905   0.666667   0.100000   0.478261  0.571429   0.501708   
f1-score    0.727273   0.606061   0.181818   0.468085  0.571429   0.495809   
support    21.000000  30.000000  10.000000  23.000000  0.571429  84.000000   

           weighted avg  
precision      0.616869  
recall         0.571429  
f1-score       0.548080  
support       84.000000  
_______________________________________________
Confusion Matrix: 
 [[16  3  0  2]
 [ 4 20  0  6]
 [ 0  4  1  5]
 [ 3  9  0 11]]



In [112]:
svm_clf_t = svm.SVC(decision_function_shape='ovo')
svm_clf_t.fit(t_X_train, t_y_train)

svm_clf_2_t = svm.SVC(decision_function_shape='ovo')
svm_clf_2_t.fit(t_X_train_2, t_y_train_2)

lin_clf_t = svm.LinearSVC(max_iter=100000)
lin_clf_t.fit(t_X_train, t_y_train)

lin_clf_2_t = svm.LinearSVC(max_iter=100000)
lin_clf_2_t.fit(t_X_train_2, t_y_train_2)



LinearSVC(max_iter=100000)

In [113]:
print_score(svm_clf_t, t_X_train, t_y_train, t_X_test, t_y_test)
print_score(svm_clf_2_t, t_X_train_2, t_y_train_2, t_X_test_2, t_y_test_2)

Test Result:
Accuracy Score: 48.81%
_______________________________________________
CLASSIFICATION REPORT:
                 1.0        2.0   3.0        4.0  accuracy  macro avg  \
precision   0.642857   0.465517   0.0   0.416667  0.488095   0.381260   
recall      0.428571   0.900000   0.0   0.217391  0.488095   0.386491   
f1-score    0.514286   0.613636   0.0   0.285714  0.488095   0.353409   
support    21.000000  30.000000  10.0  23.000000  0.488095  84.000000   

           weighted avg  
precision      0.441058  
recall         0.488095  
f1-score       0.425959  
support       84.000000  
_______________________________________________
Confusion Matrix: 
 [[ 9 10  0  2]
 [ 2 27  0  1]
 [ 0  6  0  4]
 [ 3 15  0  5]]

Test Result:
Accuracy Score: 51.19%
_______________________________________________
CLASSIFICATION REPORT:
                 1.0        2.0   3.0        4.0  accuracy  macro avg  \
precision   0.750000   0.487805   0.0   0.407407  0.511905   0.411303   
recall      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [114]:
print_score(lin_clf_t, t_X_train, t_y_train, t_X_test, t_y_test)
print_score(lin_clf_2_t, t_X_train_2, t_y_train_2, t_X_test_2, t_y_test_2)

Test Result:
Accuracy Score: 44.05%
_______________________________________________
CLASSIFICATION REPORT:
                 1.0        2.0        3.0        4.0  accuracy  macro avg  \
precision   0.590909   0.466667   0.375000   0.291667  0.440476   0.431061   
recall      0.619048   0.466667   0.300000   0.304348  0.440476   0.422516   
f1-score    0.604651   0.466667   0.333333   0.297872  0.440476   0.425631   
support    21.000000  30.000000  10.000000  23.000000  0.440476  84.000000   

           weighted avg  
precision      0.438898  
recall         0.440476  
f1-score       0.439072  
support       84.000000  
_______________________________________________
Confusion Matrix: 
 [[13  5  1  2]
 [ 4 14  0 12]
 [ 0  4  3  3]
 [ 5  7  4  7]]

Test Result:
Accuracy Score: 46.43%
_______________________________________________
CLASSIFICATION REPORT:
                 1.0        2.0        3.0        4.0  accuracy  macro avg  \
precision   0.619048   0.434783   0.333333   0.357143  0.

## N_stage models

In [41]:
param_grid = {"n_estimators": [50, 100, 150], "bootstrap": [True, False]}

grid_search_rf_n = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search_rf_n.fit(n_X_train, n_y_train)
print_score(grid_search_rf_n, n_X_train, n_y_train, n_X_test, n_y_test)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False],
                         'n_estimators': [50, 100, 150]})

In [42]:
param_grid = {"n_estimators": [50, 100, 150], "bootstrap": [True, False]}

grid_search_rf_n_2 = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search_rf_n_2.fit(n_X_train_2, n_y_train_2)
print_score(grid_search_rf_n_2, n_X_train_2, n_y_train_2, n_X_test_2, n_y_test_2)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False],
                         'n_estimators': [50, 100, 150]})

In [None]:
svm_clf_n = svm.SVC(decision_function_shape='ovo')
svm_clf_n.fit(n_X_train, n_y_train)

svm_clf_2_n = svm.SVC(decision_function_shape='ovo')
svm_clf_2_n.fit(n_X_train_2, n_y_train_2)

lin_clf_n = svm.LinearSVC(max_iter=100000)
lin_clf_n.fit(n_X_train, n_y_train)

lin_clf_2_n = svm.LinearSVC(max_iter=100000)
lin_clf_2_n.fit(n_X_train_2, n_y_train_2)

In [None]:
print_score(svm_clf_n, n_X_train, n_y_train, n_X_test, n_y_test)
print_score(svm_clf_2_n, n_X_train_2, n_y_train_2, n_X_test_2, n_y_test_2)

In [None]:
print_score(lin_clf_n, n_X_train, n_y_train, n_X_test, n_y_test)
print_score(lin_clf_2_n, n_X_train_2, n_y_train_2, n_X_test_2, n_y_test_2)

## M_stage models

In [None]:
param_grid = {"n_estimators": [50, 100, 150], "bootstrap": [True, False]}

grid_search_rf_m = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search_rf_m.fit(m_X_train, m_y_train)
print_score(grid_search_rf_m, m_X_train, m_y_train, m_X_test, m_y_test)

In [None]:
param_grid = {"n_estimators": [50, 100, 150], "bootstrap": [True, False]}

grid_search_rf_m_2 = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search_rf_m_2.fit(m_X_train_2, m_y_train_2)
print_score(grid_search_rf_m_2, m_X_train_2, m_y_train_2, m_X_test_2, m_y_test_2)

In [None]:
svm_clf_m = svm.SVC(decision_function_shape='ovo')
svm_clf_m.fit(m_X_train, m_y_train)

svm_clf_2_m = svm.SVC(decision_function_shape='ovo')
svm_clf_2_m.fit(m_X_train_2, m_y_train_2)

lin_clf_m = svm.LinearSVC(max_iter=100000)
lin_clf_m.fit(m_X_train, m_y_train)

lin_clf_2_m = svm.LinearSVC(max_iter=100000)
lin_clf_2_m.fit(m_X_train_2, m_y_train_2)

In [None]:
print_score(svm_clf_m, m_X_train, m_y_train, m_X_test, m_y_test)
print_score(svm_clf_2_m, m_X_train_2, m_y_train_2, m_X_test_2, m_y_test_2)

In [None]:
print_score(lin_clf_m, m_X_train, m_y_train, m_X_test, m_y_test)
print_score(lin_clf_2_m, m_X_train_2, m_y_train_2, m_X_test_2, m_y_test_2)