# DOJO ML - 3rd Place solution
## team : curiosos_do_ml 
## Mateus C. Pedrino - Bruno Rasteiro - HelloToMyLittleFriend

## Training and tuning models

The aim of this notebook is to train models to classificate data that was generated with the first notebook (Preparing Data).

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn.datasets as skdata
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA
from xgboost import XGBClassifier

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

In [9]:
df=pd.read_csv('data_treino_0days.csv', header=(0))
#pd.concat([df[df.columns[0:11]],df[df.columns[-4:]]], axis=1).head()
df[df.columns[11:-4]].head()

Unnamed: 0,std_voltagem,std_rotacao,std_pressao,std_vibracao,min_voltagem,min_rotacao,min_pressao,min_vibracao,max_voltagem,max_rotacao,max_pressao,max_vibracao,rms_voltagem,rms_rotacao,rms_pressao,rms_vibracao
0,15.523593,58.018104,9.71554,4.960047,129.526479,323.885291,85.081021,28.745974,191.972061,573.84566,116.316702,47.091065,166.373541,439.923378,100.505567,38.362183
1,16.52383,50.504585,12.082691,3.608012,133.87287,367.179243,79.524147,33.438301,193.119235,566.234917,123.367616,46.253889,168.761455,451.922074,101.089575,40.333582
2,13.776066,43.845024,7.614312,6.009319,140.135108,369.586722,82.187235,23.524639,191.889417,516.402811,110.227672,49.146355,173.819984,463.107103,97.140708,39.573016
3,18.396645,53.864923,11.908198,5.874302,137.422712,317.393133,76.143593,28.074336,201.729701,574.805957,130.017462,50.041233,168.191389,453.567885,100.128199,41.232695
4,12.846186,60.070156,7.903329,5.866805,143.394542,338.67951,85.857917,30.340104,197.987199,567.14478,111.286537,53.820648,174.096593,445.283208,99.047389,39.855513


In [4]:
# Split into X and Y and standardize X
Y=np.array(df['falha'], dtype=str)
X=df.drop('falha', axis=1)
X_std=StandardScaler().fit_transform(X)

In [5]:
# Combinantions of parameters to be tested with each classifier

mlp_param = {
    'hidden_layer_sizes': [(10,10,10), (10,20,10), (20,), (10)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam', 'lbfgs'],
    'alpha': [0.001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

rf_param = {
    'n_estimators': [5, 10, 15, 30], 
    'max_depth': [3, 5, 10], 
    'min_samples_split': [2, 3],
}
    
knn_param = {
    'n_neighbors' : np.arange(2,11,1),
    'metric' : ['euclidean','manhattan','chebyshev'],
}

xgb_param = {
    'max_depth': [3, 7, 10],
    'max_features' : [5, 10],
    'learning_rate': [0.01, 0.05],
    'n_estimators': [10, 15, 30],
}

params = [mlp_param, rf_param, xgb_param]

In [6]:
# Initializing models (and parallelizing)

mlp = MLPClassifier()
rf = RandomForestClassifier(n_jobs=-1)
#knn = KNeighborsClassifier(n_jobs=-1)
xgb = XGBClassifier(n_jobs=-1)
# Models with hiperparameters to tune (except gnb)
models=[mlp, rf, xgb]

In [7]:
# Grid search with the hiperparameters

gs=[] # Grid searchs that will be executed with each metric

for i in range(len(params)):
    print(i) # track progression
    gs.append(GridSearchCV(estimator=models[i], param_grid=params[i], \
                           scoring='f1_micro', cv=10, n_jobs=-1).fit(X_std, Y))

0
1
2


In [8]:
# Best f1 score for each classificator
classificadores=['MLP','RF','XGB']

for i in range(len(classificadores)):
    print(classificadores[i])
    idx=gs[i].best_index_
    print([gs[i].cv_results_['mean_test_score'][idx],\
          gs[i].cv_results_['std_test_score'][idx]])

MLP
[0.9798830409356725, 0.0011907811608056517]
RF
[0.9796491228070175, 0.000943689460285855]
XGB
[0.9801559454191033, 0.0017502884163058997]


In [28]:
# Best parameters for RF
idx=gs[1].best_index_
gs[1].cv_results_['params'][idx]

{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 15}

In [30]:
# Feature selection
from sklearn.feature_selection import SelectKBest, chi2, RFE, RFECV

rfecv = RFECV(estimator=gs[1].best_estimator_, step=1, cv=10, scoring='f1_micro')
rfecv = rfecv.fit(X_std, Y)

In [32]:
input_labels = df.drop('falha', axis=1).columns # Features de X
print('Num. original de features : ',len(input_labels))
print('Melhor número de features : ', rfecv.n_features_)
print('Melhores features : ', input_labels[rfecv.support_])
scrs=rfecv.grid_scores_
print('Melhor score : ', [np.mean(scrs), np.std(scrs)])

Num. original de features :  30
Melhor número de features :  25
Melhores features :  Index(['idade', 'mean_voltagem', 'mean_rotacao', 'mean_pressao',
       'mean_vibracao', 'std_voltagem', 'std_rotacao', 'std_pressao',
       'std_vibracao', 'min_voltagem', 'min_rotacao', 'min_pressao',
       'min_vibracao', 'max_voltagem', 'max_rotacao', 'max_pressao',
       'max_vibracao', 'rms_voltagem', 'rms_rotacao', 'rms_pressao',
       'rms_vibracao', 'model1', 'model2', 'model3', 'model4'],
      dtype='object')
Melhor score :  [0.978915312438525, 0.0004579059639043891]


In [33]:
# Test model

#modelo = gs[1].best_estimator_

modelo = rfecv.estimator_

In [10]:
# Test
test=pd.read_csv('data_teste_0days.csv', header=(0))
test_no_idx=test.drop(['index', 'error1', 'error2', 'error3', 'error4', 'error5'], axis=1)
test_no_idx.head()

Unnamed: 0,idade,mean_voltagem,mean_rotacao,mean_pressao,mean_vibracao,std_voltagem,std_rotacao,std_pressao,std_vibracao,min_voltagem,...,max_pressao,max_vibracao,rms_voltagem,rms_rotacao,rms_pressao,rms_vibracao,model1,model2,model3,model4
0,1,174.930809,464.875981,103.30843,41.460377,13.501364,49.004705,11.557952,4.307224,140.535476,...,132.828593,49.092153,175.429415,467.34471,103.926184,41.674237,0,0,0,1
1,1,174.011837,448.216195,96.893716,40.030568,18.831546,37.098417,8.964805,4.859134,130.104711,...,115.442952,48.056273,174.985629,449.685117,97.290345,40.312204,0,0,0,1
2,1,168.249533,456.817587,102.344155,39.147756,12.83107,55.997355,8.625702,5.149518,140.255879,...,120.857067,49.706263,168.717758,460.094944,102.691912,39.470994,0,0,0,1
3,1,170.989978,434.912994,98.296343,40.799268,15.613689,47.521158,10.914604,4.506465,145.521888,...,118.811712,49.682059,171.671786,437.393963,98.875355,41.037085,0,0,0,1
4,1,173.194069,444.169114,98.639648,39.552186,15.512615,51.257369,11.033279,4.792396,138.85793,...,125.422011,49.662658,173.858563,446.994461,99.229235,39.829456,0,0,0,1


In [15]:
test.drop('index', axis=1).columns

Index(['idade', 'error1', 'error2', 'error3', 'error4', 'error5',
       'mean_voltagem', 'mean_rotacao', 'mean_pressao', 'mean_vibracao',
       'std_voltagem', 'std_rotacao', 'std_pressao', 'std_vibracao',
       'min_voltagem', 'min_rotacao', 'min_pressao', 'min_vibracao',
       'max_voltagem', 'max_rotacao', 'max_pressao', 'max_vibracao',
       'rms_voltagem', 'rms_rotacao', 'rms_pressao', 'rms_vibracao', 'model1',
       'model2', 'model3', 'model4'],
      dtype='object')

In [37]:
# Preparing test

X_teste=np.array(test_no_idx)
X_teste_std=StandardScaler().fit_transform(X_teste)

In [38]:
Y_pred=modelo.predict(X_teste_std)

In [39]:
np.unique(Y_pred)

array(['comp1', 'comp2', 'comp3', 'comp4', 'ok'], dtype='<U64')

In [40]:
df_sub=pd.concat([test['index'],pd.Series(Y_pred, name='falha')], axis=1)
df_sub.head()

Unnamed: 0,index,falha
0,10083,ok
1,9914,ok
2,9961,ok
3,9991,ok
4,10114,ok


In [41]:
df_sub.to_csv('submissao.csv', index=False)

In [None]:
### Saving grid search results to pickles (just choose the model !)
with open('model.pickle', 'wb') as model:
    pickle.dump(gs, model)