In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import Pipeline
import pickle
pd.set_option("display.max_columns", 101)

In [8]:
df_final = pd.read_html('cv_results_final.html')

In [9]:
df_final[0].columns

Index(['Unnamed: 0', 'mean_fit_time', 'std_fit_time', 'mean_score_time',
       'std_score_time', 'param_decomposition',
       'param_decomposition__n_components', 'param_decomposition__svd_solver',
       'param_decomposition__whiten', 'param_estimator',
       'param_estimator__class_weight', 'param_estimator__criterion',
       'param_estimator__max_depth', 'param_estimator__splitter',
       'param_scaler', 'param_decomposition__gamma',
       'param_decomposition__kernel', 'param_estimator__C',
       'param_estimator__penalty', 'param_estimator__solver',
       'param_estimator__gamma', 'param_estimator__kernel', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score'],
      dtype='object')

In [12]:
df_final[0].sort_values(by=[ 'mean_test_score'],inplace = True)

In [15]:
df_final = df_final[0].tail(1)

In [16]:
list(df_final.params)

["{'decomposition': KernelPCA(gamma=0.03), 'decomposition__gamma': 0.03, 'decomposition__kernel': 'linear', 'estimator': LogisticRegression(class_weight='balanced', penalty='l1', solver='liblinear'), 'estimator__C': 1.0, 'estimator__class_weight': 'balanced', 'estimator__penalty': 'l1', 'estimator__solver': 'liblinear', 'scaler': StandardScaler()}"]

In [25]:
LR = LogisticRegression(C= 1.0, class_weight= 'balanced', penalty= 'l1', solver= 'liblinear')
SC = StandardScaler()
KPCA = KernelPCA(gamma= 0.03, kernel= 'linear')

In [26]:
pipe = Pipeline(steps=[('scaler',SC),('decomposition',KPCA),('estimator',LR)])

### Getting data

In [19]:
encoder = lambda x:1 if x == -1 else 0
decoder = lambda x:-1 if x == 1 else 1

In [20]:
y = pd.read_csv('train_labels.csv', header=None, names=['y'])
X = pd.read_csv("train_data.csv",header=None, low_memory = False)

In [21]:
y= y['y'].apply(encoder)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [27]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('decomposition', KernelPCA(gamma=0.03)),
                ('estimator',
                 LogisticRegression(class_weight='balanced', penalty='l1',
                                    solver='liblinear'))])

In [28]:
y_pred = pipe.predict(X_test)

In [30]:
confusion_matrix(y_pred,y_test)

array([[836,  92],
       [  8,   2]], dtype=int64)

In [32]:
print(classification_report_imbalanced(y_pred,y_test))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.90      0.20      0.94      0.42      0.19       928
          1       0.02      0.20      0.90      0.04      0.42      0.17        10

avg / total       0.98      0.89      0.21      0.93      0.42      0.19       938



In [34]:
tn, fp, fn, tp = confusion_matrix(y_pred, y_test).ravel()

In [35]:
(tn, fp, fn, tp)

(836, 92, 8, 2)

In [38]:
type(y_pred)

numpy.ndarray

In [46]:
df = pd.DataFrame([y_pred]).T

In [51]:
df[0].to_csv('lol2.csv',index=False)

In [52]:
filename = 'finalized_model.sav'
pickle.dump(pipe, open(filename, 'wb'))