In [1]:
import numpy as np
import pandas as pd

import pickle

from sklearn import linear_model
from sklearn.linear_model import HuberRegressor
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

import warnings
warnings.filterwarnings('ignore')

# Load data

In [2]:
# All competition files live in the same Kaggle input directory.
_input_dir = '/kaggle/input/tabular-playground-series-aug-2022'

train_path = _input_dir + '/train.csv'
test_path = _input_dir + '/test.csv'
sample_submission_path = _input_dir + '/sample_submission.csv'

In [3]:
# Read the three CSV files in one pass: training data, test data, and the
# sample submission.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sample_submission_path)
)

In [4]:
# Split off the target: `pop` returns the 'failure' column and removes it
# from `train` in place, leaving only the non-target columns behind.
y = train.pop('failure')

In [5]:
# Sanity check: report the dimensions of the feature frames and the target.
for label, data in (
    ('train shape:', train),
    ('test_shape: ', test),
    ('train failure shape: ', y),
):
    print(label, data.shape)

train shape: (26570, 25)
test_shape:  (20775, 25)
train failure shape:  (26570,)


# Select features

In [6]:
# Add an interaction feature (attribute_2 x attribute_3) to both frames.
for frame in (train, test):
    frame['attribute_4'] = frame['attribute_2'] * frame['attribute_3']

In [7]:
# Features kept for imputation and modelling.
# The ORDER matters: later cells address these columns by position
# (index_x / index_y / index_z, and the `[:, 0:2]` slice after KNN imputation).
# Candidates left out: attribute_0..attribute_4, measurement_0..measurement_8,
# measurement_10..measurement_13, measurement_15, measurement_16.
select_feature = [
    'loading',
    'measurement_17',
    'measurement_9',
    'measurement_14',
]

In [8]:
# Work on independent copies of the selected columns: the imputation cells
# below write into x_train / x_test through `.iloc`, and writing into an
# un-copied selection raises SettingWithCopyWarning.
x_train = train[select_feature].copy()
x_test = test[select_feature].copy()

# Fill missing values with HuberRegressor

In [9]:
# Positional layout for the four "one feature missing" regressions.
# For each target column position t (taken in the order 3, 2, 1, 0):
#   index_x[j] -> positions of the three predictor columns (everything but t)
#   index_y[j] -> [t], the column being predicted
#   index_z[j] -> presence mask of a row missing only column t
#                 ('1' = present, '0' = missing)
_n_feat = 4
_targets = [3, 2, 1, 0]

index_x = [[k for k in range(_n_feat) if k != t] for t in _targets]
index_y = [[t] for t in _targets]
index_z = [''.join('0' if k == t else '1' for k in range(_n_feat)) for t in _targets]

In [10]:
# One slot per regression (4 in total):
#   df_x[j] / df_y[j] accumulate the predictor / target rows used to fit m[j].
df_x = [pd.DataFrame() for _ in range(4)]
df_y = [pd.DataFrame() for _ in range(4)]
m = [HuberRegressor(max_iter=400, epsilon=1.35) for _ in range(4)]

# Presence mask -> number of rows showing that mask.
miss_type = {}

In [11]:
# Build one presence mask per training row over select_feature
# ('1' = value present, '0' = NaN), e.g. '1101' means the third feature is missing.
train_present = x_train[select_feature].notna().to_numpy()
train_patterns = [
    ''.join('1' if ok else '0' for ok in row) for row in train_present
]

# Rows with every selected feature present become training data for the four
# regressors. They are appended in a single concat per regressor instead of
# one concat per row, which was quadratic in the number of rows.
train_complete = x_train[train_present.all(axis=1)]
for j in range(4):
    df_x[j] = pd.concat([df_x[j], train_complete.iloc[:, index_x[j]]])
    df_y[j] = pd.concat([df_y[j], train_complete.iloc[:, index_y[j]]])

# Tally how often each mask occurs (keys keep first-seen order).
for pattern in train_patterns:
    miss_type[pattern] = miss_type.get(pattern, 0) + 1

In [12]:
# Build one presence mask per test row over select_feature
# ('1' = value present, '0' = NaN), e.g. '1101' means the third feature is missing.
test_present = x_test[select_feature].notna().to_numpy()
test_patterns = [
    ''.join('1' if ok else '0' for ok in row) for row in test_present
]

# Complete test rows are also added to the regressors' training data.
# They are appended in a single concat per regressor instead of one concat
# per row, which was quadratic in the number of rows.
test_complete = x_test[test_present.all(axis=1)]
for j in range(4):
    df_x[j] = pd.concat([df_x[j], test_complete.iloc[:, index_x[j]]])
    df_y[j] = pd.concat([df_y[j], test_complete.iloc[:, index_y[j]]])

# Add the test rows to the running mask tally (keys keep first-seen order).
for pattern in test_patterns:
    miss_type[pattern] = miss_type.get(pattern, 0) + 1

In [13]:
# Each key is a presence mask over select_feature, in order
# ('1' = value present, '0' = NaN); each value is the number of rows
# with that mask, counted over x_train and x_test together.
print(miss_type)

{'1110': 2866, '1111': 38076, '1011': 3550, '1101': 1820, '1010': 269, '0111': 384, '1000': 19, '1001': 148, '0101': 19, '1100': 124, '0110': 32, '0011': 33, '0010': 4, '0001': 1}


In [14]:
# Fit regressor j on the complete rows: three predictor columns -> one target.
# The target is flattened to 1-D because df_y[j] is a single-column DataFrame
# and scikit-learn regressors expect y of shape (n_samples,); passing the
# (n, 1) frame only works through a conversion warning.
for regressor, features, target in zip(m, df_x, df_y):
    regressor.fit(features, target.to_numpy().ravel())

In [15]:
# Presence mask per training row over select_feature ('1' = present, '0' = NaN).
# The mask is built in full before any matching; comparing a half-built mask
# against the 4-character patterns in index_z can never succeed.
fill_train_present = x_train[select_feature].notna().to_numpy()
fill_train_patterns = [
    ''.join('1' if ok else '0' for ok in row) for row in fill_train_present
]

# Only regressors 0 and 1 are applied (masks '1110' and '1101'); rows with
# masks '1011' / '0111' are left untouched here.
for j in (0, 1):
    rows = [pos for pos, pattern in enumerate(fill_train_patterns)
            if pattern == index_z[j]]
    if not rows:
        # Nothing to fill for this mask; predict() rejects empty input.
        continue
    # Predict every row of this mask in one call and write the results into
    # the single missing column (index_y[j] holds exactly one position).
    x_train.iloc[rows, index_y[j][0]] = m[j].predict(x_train.iloc[rows, index_x[j]])

In [16]:
# Presence mask per test row over select_feature ('1' = present, '0' = NaN).
# The mask is built in full before any matching; comparing a half-built mask
# against the 4-character patterns in index_z can never succeed.
fill_test_present = x_test[select_feature].notna().to_numpy()
fill_test_patterns = [
    ''.join('1' if ok else '0' for ok in row) for row in fill_test_present
]

# Only regressors 0 and 1 are applied (masks '1110' and '1101'); rows with
# masks '1011' / '0111' are left untouched here.
for j in (0, 1):
    rows = [pos for pos, pattern in enumerate(fill_test_patterns)
            if pattern == index_z[j]]
    if not rows:
        # Nothing to fill for this mask; predict() rejects empty input.
        continue
    # Predict every row of this mask in one call and write the results into
    # the single missing column (index_y[j] holds exactly one position).
    x_test.iloc[rows, index_y[j][0]] = m[j].predict(x_test.iloc[rows, index_x[j]])

In [17]:
# Restart the tally so it reflects the frames after the Huber-based fill.
miss_type = dict()

# Presence mask per training row over select_feature ('1' = present, '0' = NaN),
# computed in one vectorized pass instead of per-cell `x_train[feat][i]` lookups.
recount_train_present = x_train[select_feature].notna().to_numpy()
for row in recount_train_present:
    pattern = ''.join('1' if ok else '0' for ok in row)
    miss_type[pattern] = miss_type.get(pattern, 0) + 1

In [18]:
# Add the test rows to the tally. Presence mask per row over select_feature
# ('1' = present, '0' = NaN), computed in one vectorized pass instead of
# per-cell `x_test[feat][i]` lookups.
recount_test_present = x_test[select_feature].notna().to_numpy()
for row in recount_test_present:
    pattern = ''.join('1' if ok else '0' for ok in row)
    miss_type[pattern] = miss_type.get(pattern, 0) + 1

In [19]:
# Mask counts over x_train and x_test after the Huber-based fill
# ('1' = value present, '0' = NaN, in select_feature order).
print(miss_type)

{'1111': 42762, '1011': 3550, '1010': 269, '0111': 384, '1000': 19, '1001': 148, '0101': 19, '1100': 124, '0110': 32, '0011': 33, '0010': 4, '0001': 1}


# Fill remaining missing values with KNNImputer

In [20]:
# Impute whatever is still missing with a 3-nearest-neighbour imputer, then
# keep only the first two columns of each result. fit_transform returns a
# NumPy array, so x_train / x_test stop being DataFrames from here on.
# Presumably the two kept columns are 'loading' and 'measurement_17'
# (the first two entries of select_feature) -- verify the column order.
# NOTE(review): the imputer is re-fitted on x_test, so test rows are imputed
# from test neighbours only, not from the training fit -- confirm this is intended.
my_imputer = KNNImputer(weights="uniform", n_neighbors=3)
x_train, x_test = (
    my_imputer.fit_transform(frame)[:, :2] for frame in (x_train, x_test)
)

# Train model

In [22]:
# L2-penalised logistic regression (C=0.0001), fitted on the imputed
# training features against the 'failure' target.
model = linear_model.LogisticRegression(
    penalty='l2',
    C=0.0001,
    solver='newton-cg',
    max_iter=500,
)
model.fit(x_train, y)

# Save the model and the imputed test data

In [22]:
# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling fails; a bare open() was never closed.
with open('final_model.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)

# Persist the imputed test features next to the model.
np.save('x_test.npy', x_test)

In [22]:
#l_model = pickle.load(open('final_model.pickle', 'rb'))
#l_x_test = np.load('x_test.npy')

#lr_test = np.zeros(len(test))
#lr_test += l_model.predict_proba(l_x_test)[:, 1]
#submission['failure'] = lr_test
#submission.to_csv(f"./109550206.csv", index=False)