# Importing Libraries and Loading Datasets

In [288]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression

import joblib

In [289]:
# Read the submission template, then the test features keyed by their `id` column.
sub = pd.read_csv("sample_submission.csv")
test = pd.read_csv("test.csv", index_col="id")

In [290]:
# Preprocess a deep copy so the frame loaded from disk is left untouched.
test_data = test.copy(deep=True)

## Numerical Features

In [291]:
# Collect the names of every numeric-dtype column, in their original column order,
# and preview the first rows of that numeric slice.
numeric_frame = test_data.select_dtypes(include=np.number)
numerical_cols = numeric_frame.columns.tolist()
numeric_frame.head()

Unnamed: 0_level_0,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26570,119.57,6,4,6,9,6,19.305,10.178,17.534,18.168,...,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612
26571,113.51,6,4,11,8,0,17.883,11.927,17.228,16.033,...,19.368,12.032,13.998,,12.473,17.468,16.708,14.776,14.102,537.037
26572,112.16,6,4,8,12,4,18.475,10.481,16.619,18.189,...,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995
26573,112.72,6,4,8,11,10,16.518,10.888,15.293,18.592,...,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301
26574,208.0,6,4,14,16,8,17.808,12.693,17.678,15.814,...,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044


## Categorical Features

In [292]:
# Every column that is not numeric is treated as categorical; column order is preserved.
numeric_lookup = set(numerical_cols)
categorical_cols = [name for name in test_data.columns if name not in numeric_lookup]

# Preprocessing

## Fill Missing Values

In [293]:
# Replace missing numeric values with the per-column mean.
# NOTE(review): the means are estimated from the test rows themselves rather than
# reused from training -- confirm this matches how the saved model was trained.
numeric_block = test_data[numerical_cols]
imputer = SimpleImputer(strategy='mean').fit(numeric_block)
test_data[numerical_cols] = imputer.transform(numeric_block)

## Feature Engineering


In [294]:
# Derived features. The missing-value indicator columns (m_3_missing / m_5_missing)
# are disabled in this notebook and therefore not built here.

# Interaction term: product of the two numeric attributes.
test_data['attribute_2*3'] = test_data['attribute_2'].mul(test_data['attribute_3'])
numerical_cols = [*numerical_cols, 'attribute_2*3']

# Two groups of measurement columns that are summarised row-wise.
meas_gr1_cols = [f"measurement_{i}" for i in (3, 5, 6, 8)]
meas_gr2_cols = [f"measurement_{i}" for i in (7, 9, 13)]

group_specs = (
    ('meas_gr1_avg', 'meas_gr1_std', meas_gr1_cols),
    ('meas_gr2_avg', 'meas_gr2_std', meas_gr2_cols),
)
for avg_name, std_name, member_cols in group_specs:
    group_values = test_data[member_cols]
    test_data[avg_name] = group_values.mean(axis=1)
    # ddof=0 (population std) is what np.std applies; pandas' own default is ddof=1.
    test_data[std_name] = group_values.std(axis=1, ddof=0)
    numerical_cols = [*numerical_cols, avg_name, std_name]

test_data.head()

Unnamed: 0_level_0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,attribute_2*3,meas_gr1_avg,meas_gr1_std,meas_gr2_avg,meas_gr2_std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26570,F,119.57,material_5,material_6,6.0,4.0,6.0,9.0,6.0,19.305,...,13.659,16.825,13.742,17.71,634.612,24.0,18.41525,0.649306,12.019667,1.203873
26571,F,113.51,material_5,material_6,6.0,4.0,11.0,8.0,0.0,17.883,...,17.468,16.708,14.776,14.102,537.037,24.0,17.628,1.203812,13.559667,2.785463
26572,F,112.16,material_5,material_6,6.0,4.0,8.0,12.0,4.0,18.475,...,13.363,15.737,17.065,16.021,658.995,24.0,17.76425,0.706624,12.410667,0.691316
26573,F,112.72,material_5,material_6,6.0,4.0,8.0,11.0,10.0,16.518,...,15.501,15.667,12.62,16.111,594.301,24.0,17.33775,1.501586,12.865,1.874464
26574,F,208.0,material_5,material_6,6.0,4.0,14.0,16.0,8.0,17.808,...,16.07,16.183,13.324,17.15,801.044,24.0,17.61025,1.184631,13.957,1.555636


# Encoding

In [295]:
# Integer-encode each categorical column with its own LabelEncoder.
# NOTE(review): each encoder is fitted on the test values, so the integer codes depend
# only on the categories present in this file -- confirm they agree with the codes
# the saved model saw during training.
for column in categorical_cols:
    raw_labels = test_data[column]
    label_encoder = LabelEncoder().fit(raw_labels)
    test_data[column] = label_encoder.transform(raw_labels)
test_data.head()

Unnamed: 0_level_0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,attribute_2*3,meas_gr1_avg,meas_gr1_std,meas_gr2_avg,meas_gr2_std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26570,0,119.57,0,1,6.0,4.0,6.0,9.0,6.0,19.305,...,13.659,16.825,13.742,17.71,634.612,24.0,18.41525,0.649306,12.019667,1.203873
26571,0,113.51,0,1,6.0,4.0,11.0,8.0,0.0,17.883,...,17.468,16.708,14.776,14.102,537.037,24.0,17.628,1.203812,13.559667,2.785463
26572,0,112.16,0,1,6.0,4.0,8.0,12.0,4.0,18.475,...,13.363,15.737,17.065,16.021,658.995,24.0,17.76425,0.706624,12.410667,0.691316
26573,0,112.72,0,1,6.0,4.0,8.0,11.0,10.0,16.518,...,15.501,15.667,12.62,16.111,594.301,24.0,17.33775,1.501586,12.865,1.874464
26574,0,208.0,0,1,6.0,4.0,14.0,16.0,8.0,17.808,...,16.07,16.183,13.324,17.15,801.044,24.0,17.61025,1.184631,13.957,1.555636


## Select the Features Used in Training

In [296]:
# Feature subset fed to the model, in the order listed here.
# NOTE(review): presumably this must match the column order used when the model was
# fitted -- verify against the training notebook.
columns = [
    'measurement_17',
    'attribute_0',
    'measurement_1',
    'product_code',
    'attribute_2',
    'attribute_1',
    'measurement_12',
    'measurement_2',
    'meas_gr2_avg',
    'measurement_7',
    'measurement_3',
    'loading',
]
test_X = test_data.loc[:, columns].copy()

In [297]:
# Load the fitted classifier, score the test rows and write the submission file.
# SECURITY: joblib.load unpickles the file and can execute arbitrary code -- only load
# model files that come from a trusted source.
model = joblib.load('109550039_model')

# Predictions are written into `sub` by row position, so the submission template must
# list exactly the same ids, in the same order, as the test frame. Fail loudly instead
# of silently attaching probabilities to the wrong ids.
if not np.array_equal(sub['id'].to_numpy(), test_X.index.to_numpy()):
    raise ValueError(
        "sample_submission.csv ids do not match the id order of test.csv; "
        "predictions cannot be assigned by position"
    )

# Column 1 of predict_proba is the probability of the second entry in model.classes_.
sub['failure'] = model.predict_proba(test_X)[:, 1]
sub.to_csv('109550039.csv', index=False)
sub

Unnamed: 0,id,failure
0,26570,0.190268
1,26571,0.163577
2,26572,0.177785
3,26573,0.181262
4,26574,0.338323
...,...,...
20770,47340,0.236140
20771,47341,0.142546
20772,47342,0.141043
20773,47343,0.211588
