In [77]:
'''
  Intall Package
'''
!pip install feature-engine
#!pip install -U scikit-learn #LogisticRegression newton-cholesky solver.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [78]:
'''
  Import Package
'''
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from feature_engine.encoding import WoEEncoder

import warnings; warnings.filterwarnings("ignore")

In [79]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [80]:
'''
  Load train/test data & Add Column
'''
train = pd.read_csv('/content/drive/MyDrive/ML/Project/train.csv')
test = pd.read_csv('/content/drive/MyDrive/ML/Project/test.csv')

print(train.shape)
print(test.shape)
train.head()

# add additional column
# the statistics are gained from insightful raw data analysis
# You can access it with the notebooks listed
# reference:
# 1. TPSAUG22 EDA which makes sense (loading, m3 m5 missing)
# https://www.kaggle.com/code/ambrosm/tpsaug22-eda-which-makes-sense/notebook
# 2. Less can be more: Feature Engineering Ideas (area, measuerment avg)
# https://www.kaggle.com/competitions/tabular-playground-series-aug-2022/discussion/342126
label = train.pop('failure')
data = pd.concat([train, test])
data['loading'] = np.log(data['loading'])
data['m_3_missing'] = data.measurement_3.isna().astype(np.int8)
data['m_5_missing'] = data.measurement_5.isna().astype(np.int8)

data['area'] = data['attribute_2'] * data['attribute_3']
#data['measurement_2'] = data['measurement_2'].clip(11, None)
#data['measurement_avg'] would be added after imputation
#data['m_3_5_missing'] would be added after WoEEncoder

print(data.shape)
data.head()

(26570, 26)
(20775, 25)
(47345, 28)


Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,m_3_missing,m_5_missing,area
0,0,A,4.383276,material_7,material_8,9,5,7,8,4,...,17.594,15.193,15.029,,13.034,14.684,764.1,0,0,45
1,1,A,4.441356,material_7,material_8,9,5,14,3,3,...,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0,0,45
2,2,A,4.411949,material_7,material_8,9,5,12,1,5,...,,13.798,16.711,18.631,14.094,17.946,663.376,0,0,45
3,3,A,4.615813,material_7,material_8,9,5,13,2,6,...,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0,0,45
4,4,A,5.236761,material_7,material_8,9,5,9,2,8,...,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0,0,45


In [81]:
'''
  Missing Value Imputation: Find the most correlated columns
                  for each product group
'''

# write the measurement_17 in correlation table Hard
full_fill_dict ={}
full_fill_dict['measurement_17'] = {
    'A':['measurement_5','measurement_6','measurement_8'],
    'B':['measurement_4','measurement_5','measurement_7'],
    'C':['measurement_5','measurement_7','measurement_8','measurement_9'],
    'D':['measurement_5','measurement_6','measurement_7','measurement_8'],
    'E':['measurement_4','measurement_5','measurement_6','measurement_8'],
    'F':['measurement_4','measurement_5','measurement_6','measurement_7'],
    'G':['measurement_4','measurement_6','measurement_8','measurement_9'],
    'H':['measurement_4','measurement_5','measurement_7','measurement_8',
        'measurement_9'],
    'I':['measurement_3','measurement_7','measurement_8']
}

# To calculate correlation between measurements,
# drop the column without prefix 'measurement'.
col = ([col for col in test.columns if 'measurement' not in col]
       + ['loading','m_3_missing','m_5_missing'])
corr_total = []
title_col =[]

# Find the 10 measurements that each of it is most correlated with others
# The Ranking condition is based on the sum of top 3 corr coef.
for x in range(3,17):
    corr = np.absolute(data.drop(col, axis=1).corr()[f'measurement_{x}'])
    corr = corr.sort_values(ascending=False)
    corr_total.append(np.round(np.sum(corr[1:4]),3))
    title_col.append(f'measurement_{x}')
corr_df = pd.DataFrame()
corr_df['Selected columns'] = title_col
corr_df['correlation total'] = corr_total
corr_df = corr_df.sort_values(by = 'correlation total', ascending=False)
corr_df = corr_df.reset_index(drop = True)
display(corr_df.head(10))

# For each measurement,
# calculate corr coef for each product group
for i in range(10):
    measure_col = 'measurement_' + corr_df.iloc[i,0][12:] 
    fill_dict ={}
    for x in data.product_code.unique() :
        corr = data[data.product_code == x].drop(col, axis=1)
        corr = corr.corr()[measure_col]
        corr = np.absolute(corr).sort_values(ascending=False)
        measure_col_dict = {}
        # select 4 column, the index 0 is itself
        measure_col_dict[measure_col] = corr[1:5].index.tolist()
        fill_dict[x] = measure_col_dict[measure_col]
    full_fill_dict[measure_col] = fill_dict
# Display/Dump
for key in full_fill_dict.keys():
  print(key)
  for product in full_fill_dict[key].keys():
    print(product, full_fill_dict[key][product])

Unnamed: 0,Selected columns,correlation total
0,measurement_8,0.448
1,measurement_11,0.395
2,measurement_5,0.376
3,measurement_6,0.359
4,measurement_7,0.33
5,measurement_4,0.328
6,measurement_15,0.301
7,measurement_10,0.3
8,measurement_16,0.252
9,measurement_14,0.225


measurement_17
A ['measurement_5', 'measurement_6', 'measurement_8']
B ['measurement_4', 'measurement_5', 'measurement_7']
C ['measurement_5', 'measurement_7', 'measurement_8', 'measurement_9']
D ['measurement_5', 'measurement_6', 'measurement_7', 'measurement_8']
E ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_8']
F ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_7']
G ['measurement_4', 'measurement_6', 'measurement_8', 'measurement_9']
H ['measurement_4', 'measurement_5', 'measurement_7', 'measurement_8', 'measurement_9']
I ['measurement_3', 'measurement_7', 'measurement_8']
measurement_8
A ['measurement_17', 'measurement_2', 'measurement_0', 'measurement_11']
B ['measurement_16', 'measurement_10', 'measurement_15', 'measurement_12']
C ['measurement_17', 'measurement_1', 'measurement_6', 'measurement_15']
D ['measurement_17', 'measurement_16', 'measurement_5', 'measurement_15']
E ['measurement_17', 'measurement_13', 'measurement_11', 'measurement_1

In [82]:
'''
  Missing Value Imputation: Impute all the missing value
'''

model1 = HuberRegressor(epsilon=1.9)
model2 = KNNImputer(n_neighbors=3)
#model2 = LGBMImputer(n_iter=50)
#model2 = IterativeImputer(random_state=0) 

feature = [f for f in data.columns 
           if f.startswith('measurement') or f=='loading']
nullValue_cols = [col for col in train.columns 
                    if train[col].isnull().sum()!=0]
# For each product code group,
# impute the missing value via HuberRegressor(model1) first
# then impute the rest via KNNImputer

for code in data.product_code.unique():
    # HuberRegressor
    for measurement_col in list(full_fill_dict.keys()):
        tmp = data[data.product_code==code]
        column = full_fill_dict[measurement_col][code]
        tmp_train = tmp[column+[measurement_col]].dropna(how='any')
        tmp_test = tmp[((tmp[column].isnull().sum(axis=1)==0)
                      &(tmp[measurement_col].isnull()))]

        model1.fit(tmp_train[column], tmp_train[measurement_col])
        data.loc[((data.product_code==code)
            &(data[column].isnull().sum(axis=1)==0)
            &(data[measurement_col].isnull())), measurement_col] = model1.predict(tmp_test[column])
    # KNNImputer
    data.loc[data.product_code==code, feature] = model2.fit_transform(data.loc[data.product_code==code, feature])
  
# After the missing value is imputed,
# we can calculate & add the column 'measurement_avg'
data['measurement_avg'] = data[[f'measurement_{i}' 
                    for i in range(3, 17)]].mean(axis=1)
#data['measurement_stddev'] = data[[f'measurement_{i}' 
#                for i in range(3, 17)]].std(axis=1, ddof=0)
data.head()

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,m_3_missing,m_5_missing,area,measurement_avg
0,0,A,4.383276,material_7,material_8,9,5,7.0,8.0,4.0,...,15.193,15.029,15.495868,13.034,14.684,764.1,0,0,45,15.360919
1,1,A,4.441356,material_7,material_8,9,5,14.0,3.0,3.0,...,11.755,14.732,15.425,14.395,15.631,682.057,0,0,45,15.446286
2,2,A,4.411949,material_7,material_8,9,5,12.0,1.0,5.0,...,13.798,16.711,18.631,14.094,17.946,663.376,0,0,45,16.09882
3,3,A,4.615813,material_7,material_8,9,5,13.0,2.0,6.0,...,10.02,15.25,15.562,16.154,17.172,826.282,0,0,45,15.5995
4,4,A,5.236761,material_7,material_8,9,5,9.0,2.0,8.0,...,12.428,16.182,12.76,13.153,16.412,579.885,0,0,45,15.194071


In [83]:
train = data.iloc[:train.shape[0],:]
test = data.iloc[train.shape[0]:,:]
print(train.shape, test.shape)

groups = train.product_code
X = train
y = label

(26570, 29) (20775, 29)


In [84]:
'''
  Encode the atrribute_0 to float number
'''
woe_encoder = WoEEncoder(variables=['attribute_0'])
woe_encoder.fit(X, y)
X = woe_encoder.transform(X)
test = woe_encoder.transform(test)

In [85]:
# I don't know why if a put the snippet here,
# I would get a better result on LB @@
X['m_3_5_missing'] = X['m_5_missing'] * (X['m_3_missing'])
test['m_3_5_missing'] =test['m_5_missing'] * (test['m_3_missing'])

In [86]:
data.info()
X.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47345 entries, 0 to 20774
Data columns (total 29 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               47345 non-null  int64  
 1   product_code     47345 non-null  object 
 2   loading          47345 non-null  float64
 3   attribute_0      47345 non-null  object 
 4   attribute_1      47345 non-null  object 
 5   attribute_2      47345 non-null  int64  
 6   attribute_3      47345 non-null  int64  
 7   measurement_0    47345 non-null  float64
 8   measurement_1    47345 non-null  float64
 9   measurement_2    47345 non-null  float64
 10  measurement_3    47345 non-null  float64
 11  measurement_4    47345 non-null  float64
 12  measurement_5    47345 non-null  float64
 13  measurement_6    47345 non-null  float64
 14  measurement_7    47345 non-null  float64
 15  measurement_8    47345 non-null  float64
 16  measurement_9    47345 non-null  float64
 17  measurement_

In [87]:
selected_cols = [
    'loading',
    'attribute_0',
    'measurement_17',
    'measurement_0',
    'measurement_1',
    'measurement_2',
    'area',
    'm_3_missing',
    'm_5_missing',
    'measurement_avg',
#    'measurement_stddev',
    'm_3_5_missing'
]

In [88]:
'''
  Normalize
'''
scaler = StandardScaler()
#scaler = RobustScaler()
#scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X[selected_cols])
#X_test = scaler.fit_transform(test[selected_cols])

In [89]:
'''
  Model 1: Logistic Regression
'''
lr_oof = np.zeros(len(train))
lr_test = np.zeros(len(test))
lr_auc = 0
lr_model = {}

#10 ^ 1/3 = 2.1544
#10 ^ 2/3 = 4.6415
#10 ^ 1/2 = 3.1622
#kf = GroupKFold(n_splits=5)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
c = 0.0008
factor = 0.00005
#while c < 0.002:
for c in [0.00125]:
    lr_auc = 0
    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        x_train, x_val = X_scaled[train_idx], X_scaled[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        #model = LogisticRegression(max_iter=10000, C=c, penalty='l2', solver='newton-cholesky')
        model = LogisticRegression(max_iter=10000, C=c, penalty='l2', solver='newton-cg')
        model.fit(x_train, y_train)

        preds_val = model.predict_proba(x_val)[:, 1]
        lr_auc += roc_auc_score(y_val, preds_val) / 5
        #lr_test += model.predict_proba(X_test)[:, 1] / 5
        lr_oof[val_idx] = preds_val
        lr_model[fold_idx] = model
    #print(iter, c)
    print(f"Average auc = {round(lr_auc, 5)}")
    print(f"OOF auc = {round(roc_auc_score(y, lr_oof), 5)}\n")
    #c += factor
'''
LR no Hyperparameter Tuning
Average auc = 0.59065
OOF auc = 0.59051
'''

Average auc = 0.59073
OOF auc = 0.59059



'\nLR no Hyperparameter Tuning\nAverage auc = 0.59065\nOOF auc = 0.59051\n'

In [90]:
'''
  Model 2: Support Vector Machine
'''
'''
from sklearn.svm import LinearSVC
svm_oof = np.zeros(len(train))
svm_test = np.zeros(len(test))
svm_auc = 0

#10 ^ 1/3 = 2.1544
#10 ^ 2/3 = 4.6415
#10 ^ 1/2 = 3.1622
#kf = GroupKFold(n_splits=5)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    x_train, x_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    #model = SVC(probability=True)
    model = LinearSVC(C=0.0001, probability=True)
    model.fit(x_train, y_train)

    preds_val = model.predict_proba(x_val)[:, 1]
    svm_auc += roc_auc_score(y_val, preds_val) / 5
    svm_test += model.predict_proba(X_test)[:, 1] / 5
    svm_oof[val_idx] = preds_val

print(f"Average auc = {round(svm_auc, 5)}")
print(f"OOF auc = {round(roc_auc_score(y, svm_oof), 5)}\n")
'''
'''
Average auc = 0.50926
OOF auc = 0.5079
'''

'\nAverage auc = 0.50926\nOOF auc = 0.5079\n'

In [91]:
'''
  Model 3: Naive Bayes
'''

from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
nb_oof = np.zeros(len(train))
nb_test = np.zeros(len(test))
nb_auc = 0

#10 ^ 1/3 = 2.1544
#10 ^ 2/3 = 4.6415
#10 ^ 1/2 = 3.1622
#kf = GroupKFold(n_splits=5)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    x_train, x_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    #model = MultinomialNB()
    #model = BernoulliNB()
    model = GaussianNB()
    model.fit(x_train, y_train)

    preds_val = model.predict_proba(x_val)[:, 1]
    nb_auc += roc_auc_score(y_val, preds_val) / 5
    #nb_test += model.predict_proba(X_test)[:, 1] / 5
    nb_oof[val_idx] = preds_val

print(f"Average auc = {round(nb_auc, 5)}")
print(f"OOF auc = {round(roc_auc_score(y, nb_oof), 5)}\n")

'''
MultinomialNB() MinMaxScaler
Average auc = 0.54283
OOF auc = 0.53971
'''
'''
BernoulliNB() StandardScaler
Average auc = 0.57341
OOF auc = 0.57322
'''
'''
GaussianNBNB() StandardScaler
Average auc = 0.58397
OOF auc = 0.55722
'''

Average auc = 0.58397
OOF auc = 0.55722



'\nGaussianNBNB() StandardScaler\nAverage auc = 0.58397\nOOF auc = 0.55722\n'

In [92]:
'''
  Model 4: DT: Adaboost, RandomForest
'''

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
dt_oof = np.zeros(len(train))
dt_test = np.zeros(len(test))
dt_auc = 0

#10 ^ 1/3 = 2.1544
#10 ^ 2/3 = 4.6415
#10 ^ 1/2 = 3.1622
#kf = GroupKFold(n_splits=5)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    x_train, x_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    #model = RandomForestClassifier(n_estimators=100, max_depth=6)
    model = GradientBoostingClassifier()
    model.fit(x_train, y_train)

    preds_val = model.predict_proba(x_val)[:, 1]
    dt_auc += roc_auc_score(y_val, preds_val) / 5
    #nb_test += model.predict_proba(X_test)[:, 1] / 5
    dt_oof[val_idx] = preds_val

print(f"Average auc = {round(dt_auc, 5)}")
print(f"OOF auc = {round(roc_auc_score(y, dt_oof), 5)}\n")
'''
default parameter
Average auc = 0.54735
OOF auc = 0.54728
RandomForestClassifier(n_estimators=100, max_depth=6)
Average auc = 0.58618
OOF auc = 0.58572
'''
'''
default parameter
Average auc = 0.5815
OOF auc = 0.58141
AdaBoostClassifier(n_estimators=50, learning_rate=0.1)
Average auc = 0.58818
OOF auc = 0.58742
'''
'''
GradientBoostingClassifier()
Average auc = 0.58294
OOF auc = 0.58276
'''

Average auc = 0.58281
OOF auc = 0.58263



'\nGradientBoostingClassifier()\nAverage auc = 0.58294\nOOF auc = 0.58276\n'

In [93]:
'''
  Save the models with Best Performance and the processed dataframe
'''
import joblib
path = '/content/drive/MyDrive/ML/Project/model'
for key in lr_model.keys():
  joblib.dump(lr_model[key], f'{path}{key}.sav')

test.to_csv("/content/drive/MyDrive/ML/Project/X_test.csv", index=False)
'''
submission = pd.read_csv('/content/drive/MyDrive/ML/Project/sample_submission.csv')
submission.failure = lr_test
submission.to_csv("0816201.csv", index=False)
submission
'''

'\nsubmission = pd.read_csv(\'/content/drive/MyDrive/ML/Project/sample_submission.csv\')\nsubmission.failure = lr_test\nsubmission.to_csv("0816201.csv", index=False)\nsubmission\n'