In [463]:
from tqdm import tqdm
import pandas as pd
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

from feature_engine.encoding import WoEEncoder

import warnings

warnings.filterwarnings('ignore')

In [464]:
train = pd.read_csv('data/train.csv')
label = train['failure']

In [465]:
#encode not digit column
labelencoder = LabelEncoder()
col_encode = ['product_code', 'attribute_0', 'attribute_1']
for col in col_encode:
    train[col] = labelencoder.fit_transform(train[col])
    print(col, train[col].value_counts())

product_code 2    5765
4    5343
1    5250
3    5112
0    5100
Name: product_code, dtype: int64
attribute_0 1    21320
0     5250
Name: attribute_0, dtype: int64
attribute_1 2    10865
0    10362
1     5343
Name: attribute_1, dtype: int64


In [466]:
#use feature 'product_code' to split data
train_product_code = []
for code in train.product_code.unique():
    cur_data = train.loc[train.product_code == code]
    train_product_code.append(cur_data)

In [467]:
# every feature select 10 higher correlation to use KNN imputer
i = 0
for d in train_product_code:
    index = i
    i += 1
    corr = d.drop(['id', 'failure', 'product_code'], axis=1).corr()
    cols_with_nan = d.columns[train.isnull().any()].tolist()
    for col in tqdm(cols_with_nan):
        cur_corr = corr[col]
        sorted_corrs = cur_corr.sort_values(kind="quicksort")
        largest_corr = sorted_corrs.abs().tail(11).tolist()
        largest_corr_index = sorted_corrs.abs().tail(11).index.tolist()
        cur_data = d.loc[:, largest_corr_index]
        imputer = KNNImputer(n_neighbors=100)
        train.loc[train.product_code == index, largest_corr_index] = imputer.fit_transform(
            train.loc[train.product_code == index, largest_corr_index])

100%|██████████| 16/16 [00:01<00:00, 11.18it/s]
100%|██████████| 16/16 [00:02<00:00,  7.71it/s]
100%|██████████| 16/16 [00:02<00:00,  7.68it/s]
100%|██████████| 16/16 [00:01<00:00, 10.89it/s]
100%|██████████| 16/16 [00:02<00:00,  7.39it/s]


In [468]:
woe_encoder = WoEEncoder(variables=['attribute_0'])
woe_encoder.fit(train.drop('failure', axis=1).astype("object"), label)
tr = woe_encoder.transform(train.drop('failure', axis=1))
train.loc[:, tr.columns] = tr
pickle.dump(woe_encoder, open('pickle_model/woe.pkl', 'wb'))

In [469]:
#select 2 features which is higher correlation to failure
select_n = 2
train_corr = train.corr()
failure_corr = train_corr['failure']
sorted_corrs = failure_corr.sort_values(kind="quicksort")
select_feature = sorted_corrs.drop('failure', axis=0).abs().tail(select_n).index.tolist()
print(select_feature)

['measurement_17', 'loading']


In [470]:
#add attribute_0 to select_feature
if 'attribute_0' not in select_feature:
    select_feature.append("attribute_0")
pickle.dump(select_feature, open('pickle_model/select_feature.pkl', 'wb'))

In [471]:
#scaler the data and train LR model
scaler = StandardScaler()
model = LogisticRegression(max_iter=500, C=0.00001, penalty='l2', solver='newton-cg')
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, val_idx in kfold.split(train, label):
    x_train, x_val = train.iloc[train_idx], train.iloc[val_idx]
    y_train, y_val = label.iloc[train_idx], label.iloc[val_idx]
    x_train = scaler.fit_transform(x_train[select_feature])
    x_val = scaler.transform(x_val[select_feature])
    x_train = pd.DataFrame(x_train, columns=select_feature)
    x_val = pd.DataFrame(x_val, columns=select_feature)

    model.fit(x_train[select_feature], y_train)
    val_preds = model.predict_proba(x_val[select_feature])[:, 1]
    print(round(roc_auc_score(y_val, val_preds), 5))
pickle.dump(model, open('pickle_model/LR.pkl', 'wb'))
pickle.dump(scaler, open('pickle_model/scaler.pkl', 'wb'))

0.60124
0.59256
0.58057
0.59377
0.58534
