In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("../data_train_182ft.csv")
df.drop(columns=["Unnamed: 0"],inplace=True)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,171,172,173,174,175,176,177,178,179,180
0,3.0,57.0,1.0,18.0,550.000000,550.0,5.487955,18.0,-0.546652,-0.850098,...,552.0,5.357787,16.0,-0.660904,-1.051465,605.411357,1.000000,2.0,18.0,17.0
1,1.0,70.0,0.0,8.0,1110.000000,1118.0,29.684051,98.0,-0.546613,-0.531840,...,1118.0,29.684051,98.0,-0.546613,-0.531840,1340.012422,1.000000,0.0,8.0,7.0
2,0.0,91.0,1.0,10.0,895.777778,912.0,189.800935,548.0,-0.161693,-1.293806,...,872.0,203.207185,566.0,-0.019694,-1.412580,1211.265139,0.909091,-20.0,11.0,4.0
3,3.0,38.0,0.0,16.0,592.800000,590.0,29.237419,108.0,0.272553,-0.573703,...,590.0,29.319391,106.0,0.205445,-0.692952,882.174548,1.000000,4.0,16.0,15.0
4,1.0,78.0,0.0,10.0,1029.111111,1036.0,13.469673,34.0,-0.263671,-1.730131,...,1036.0,12.472191,34.0,-0.305441,-1.590514,590.264901,0.900000,-3.0,10.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7710,1.0,53.0,0.0,9.0,1078.750000,1088.0,22.403962,66.0,-0.612409,-1.044878,...,1087.0,22.224705,66.0,-0.567772,-1.079990,342.338532,1.000000,-1.0,8.0,8.0
7711,1.0,82.0,1.0,9.0,1033.500000,1033.0,8.703448,28.0,-0.331040,-0.880502,...,1034.0,8.996527,26.0,-0.412643,-1.180532,681.152117,1.000000,-3.0,9.0,8.0
7712,0.0,77.0,0.0,14.0,710.769231,628.0,153.204817,556.0,0.996355,0.207174,...,626.0,152.864716,552.0,0.990096,0.175894,794.307350,0.928571,-10.0,10.0,9.0
7713,1.0,67.0,0.0,7.0,1450.666667,1442.0,27.848798,72.0,0.445371,-1.456690,...,1442.0,26.095977,68.0,0.460966,-1.463724,568.329229,1.000000,-4.0,7.0,6.0


In [3]:
x_train = df.iloc[:,1:].values
y_train = df.iloc[:,0].values

In [4]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train , test_size=0.5, shuffle=True, stratify=y_train, random_state=119)
print(f"Train: {x_train.shape}")
print(f"Vallidation: {x_val.shape}")

Train: (3857, 180)
Vallidation: (3858, 180)


In [5]:
X = x_train
y = y_train

In [6]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
X = sc.fit_transform(X)

In [7]:
from sklearn.metrics import confusion_matrix
def confusion_matrix_scorer(clf=None,X=None, y=None,y_pred=None,y_prob=None):
    if clf != None:
        y_pred = clf.predict(X)
        y_prob = clf.predict_proba(X)[:,1]
    cm = confusion_matrix(y,y_pred)
    acc_arr = []
    precision_arr = []
    recall_arr = []
    specificity_arr = []
    f1_arr = []
    for c in range(0,len(cm)):
        c = len(cm)-1
        tp = cm[c,c]
        fp = sum(cm[:,c]) - cm[c,c]
        fn = sum(cm[c,:]) - cm[c,c]
        tn = sum(np.delete(sum(cm)-cm[c,:],c))
        acc_arr.append((tp+tn) / (tp+fn+tn+fp))
        recall_arr.append(tp/(tp+fn))
        precision_arr.append(tp/(tp+fp))
        specificity_arr.append(tn/(tn+fp))
        f1_arr.append(2*(((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))))
    return {'acc':np.nanmean(acc_arr),'precision':np.nanmean(precision_arr),'specificity':np.nanmean(specificity_arr),'recall':np.nanmean(recall_arr),'f1_score':np.nanmean(f1_arr)}

In [8]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
model = GradientBoostingClassifier()
params = {
    'loss':['log_loss', 'deviance', 'exponential'],
    'learning_rate':[0.001,0.01,0.1,1],
    'n_estimators':[100,200],
    'criterion':['friedman_mse', 'squared_error']
}
grid_search = GridSearchCV(estimator=model, param_grid=params, cv=3, verbose=2, return_train_score=True,refit=True)
grid_model = grid_search.fit(X,y)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END criterion=friedman_mse, learning_rate=0.001, loss=log_loss, n_estimators=100; total time=  32.6s
[CV] END criterion=friedman_mse, learning_rate=0.001, loss=log_loss, n_estimators=100; total time=  32.1s
[CV] END criterion=friedman_mse, learning_rate=0.001, loss=log_loss, n_estimators=100; total time=  32.0s


KeyboardInterrupt: 

In [None]:
grid_model.best_params_

{'criterion': 'squared_error',
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'n_estimators': 200}

In [None]:
grid_model.best_score_

0.9432207871621342