In [176]:
import pickle
import warnings

import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Silence every warning for the rest of the session (this hides library
# deprecation and convergence warnings too, not only cosmetic ones).
warnings.filterwarnings(action='ignore')

In [177]:
# Load the raw test set; later cells modify this frame in place.
test = pd.read_csv(filepath_or_buffer='data/test.csv')

In [178]:
# Replace each categorical column with integer codes and print how many rows
# fall into every code.
# NOTE(review): the encoder is re-fitted on the test data for every column, so
# the integer assigned to a category depends only on the values present in
# this file -- confirm this matches the encoding used when the model was trained.
col_encode = ['product_code', 'attribute_0', 'attribute_1']
labelencoder = LabelEncoder()
for col in col_encode:
    encoded = labelencoder.fit_transform(test[col])
    test[col] = encoded
    counts = test[col].value_counts()
    print(col, counts)

product_code 0    5422
3    5228
1    5107
2    5018
Name: product_code, dtype: int64
attribute_0 0    10529
1    10246
Name: attribute_0, dtype: int64
attribute_1 1    10529
0     5228
2     5018
Name: attribute_1, dtype: int64


In [179]:
# One sub-frame per distinct product code, in order of first appearance in
# `test`. Each sub-frame is taken before any imputation has been applied.
test_product_code = [
    test.loc[test.product_code == code]
    for code in test.product_code.unique()
]

In [180]:
# KNN imputer
# Fill missing values in `test` one product code at a time. For every column
# that still contains NaNs, a KNNImputer is fitted on that column together
# with a small set of columns chosen from the per-product correlation matrix,
# and the imputed values are written back into `test` in place.
N_NEIGHBORS = 50       # neighbours used by each KNNImputer
N_CORR_COLUMNS = 11    # columns handed to the imputer per target column

for d in test_product_code:
    if d.empty:
        continue
    # Read the product code from the sub-frame itself instead of a running
    # counter, so the rows being imputed are always the rows `d` was built
    # from, even if the codes are not 0..n-1 in list order.
    code = d['product_code'].iloc[0]
    rows = test.product_code == code

    corr = d.drop(['id', 'product_code'], axis=1).corr()
    # The NaN mask is taken from the whole `test` frame as it stands now
    # (earlier product codes have already been written back), not from `d`.
    cols_with_nan = d.columns[test.isnull().any()].tolist()
    for col in tqdm(cols_with_nan):
        # NOTE(review): the correlations are sorted by their signed value, so
        # the tail holds the most positive ones rather than the largest in
        # absolute value, and NaN correlations (sorted last by default) are
        # selected as well. Left as-is so the imputation stays identical to
        # the original -- presumably the training pipeline did the same; confirm.
        sorted_corrs = corr[col].sort_values(kind="quicksort")
        largest_corr_index = sorted_corrs.tail(N_CORR_COLUMNS).index.tolist()
        imputer = KNNImputer(n_neighbors=N_NEIGHBORS)
        test.loc[rows, largest_corr_index] = imputer.fit_transform(
            test.loc[rows, largest_corr_index])

100%|██████████| 16/16 [00:02<00:00,  7.32it/s]
100%|██████████| 16/16 [00:01<00:00, 12.36it/s]
100%|██████████| 16/16 [00:01<00:00,  8.12it/s]
100%|██████████| 16/16 [00:01<00:00, 11.70it/s]


In [188]:
def _load_pickle(path):
    """Return the object unpickled from ``path``; the file is closed on exit."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Load the artefacts needed for inference: the list of feature columns, the
# fitted scaler and the fitted classifier.
# NOTE: unpickling can execute arbitrary code -- only load files from a
# trusted source.
select_feature = _load_pickle('pickle_model/select_feature.pkl')
scaler = _load_pickle('pickle_model/scaler.pkl')
model = _load_pickle('pickle_model/LR.pkl')

# Scale the selected features, then wrap the result in a DataFrame again so
# the model receives named columns.
x_test = scaler.transform(test[select_feature])
x_test = pd.DataFrame(x_test, columns=select_feature)
# Keep only column 1 of the predicted probabilities (presumably the positive
# "failure" class -- verify against model.classes_).
test_pred = model.predict_proba(x_test[select_feature])[:, 1]

In [189]:
# Build the submission table (row id + predicted probability) and write it to
# disk without the index column.
sub = test[['id']].assign(failure=test_pred)
sub.to_csv(path_or_buf="109550054_Final_submission.csv", index=False)

In [190]:
# Preview the last five rows of the submission.
sub.tail(5)

Unnamed: 0,id,failure
20770,47340,0.213468
20771,47341,0.209817
20772,47342,0.210146
20773,47343,0.212769
20774,47344,0.21049
