In [27]:
!pip install feature_engine

[0m

In [28]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import feature_engine as fe
from feature_engine.encoding import WoEEncoder
from colorama import Fore, Back, Style
import xgboost
import random
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.linear_model import LogisticRegression,HuberRegressor
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler,PowerTransformer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.metrics import r2_score,roc_auc_score
from imblearn.under_sampling import NearMiss, ClusterCentroids
from imblearn.over_sampling import SMOTE

sns.set()

warnings.filterwarnings("ignore")

In [29]:
train = pd.read_csv("../input/tabular-playground-series-aug-2022/train.csv")
test = pd.read_csv("../input/tabular-playground-series-aug-2022/test.csv")

In [30]:
target = train.pop('failure')
data = pd.concat([train, test])
feature = [f for f in data.columns if f.startswith('measurement') or f=='loading']
nullValue_cols = [col for col in train.columns if train[col].isnull().sum()!=0]

In [31]:
def data_pre_processing(data):

    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['area'] = data['attribute_2'] * data['attribute_3']
    data['loading'] = np.log(data['loading']) #The loading feature seems to have right skewed distribution.
                                              #Let's apply log transformation to make the distribution more normal.
    data['count_null'] = data.isnull().sum(axis=1)

    full_fill_dict ={}
    full_fill_dict['measurement_17'] = {
        'A': ['measurement_5','measurement_6','measurement_8','measurement_7'],
        'B': ['measurement_4','measurement_5','measurement_7','measurement_9'],
        'C': ['measurement_5','measurement_7','measurement_8','measurement_9'],
        'D': ['measurement_5','measurement_6','measurement_7','measurement_8'],
        'E': ['measurement_4','measurement_5','measurement_6','measurement_8'],
        'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
        'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
        'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
        'I': ['measurement_3','measurement_7','measurement_8','measurement_9']
    }


    # collect the name of the next 10 best measurement columns sorted by correlation (except 17 already done above):
    col = [col for col in test.columns if 'measurement' not in col]+ ['loading','m3_missing','m5_missing']
    a = []
    b =[]

    for x in range(3,17):
        corr = np.absolute(data.drop(col, axis=1).corr()[f'measurement_{x}']).sort_values(ascending=False)
        a.append(np.round(np.sum(corr[1:4]),3)) # we add the 3 first lines of the correlation values to get the "most correlated"
        b.append(f'measurement_{x}')

    c = pd.DataFrame()
    c['Selected columns'] = b
    c['correlation total'] = a
    c = c.sort_values(by = 'correlation total',ascending=False).reset_index(drop = True)
    # calculating top 4 correlated features for each measurement column w.r.t each product code

    for i in range(10):
        measurement_col = 'measurement_' + c.iloc[i,0][12:] # we select the next best correlated column 
        fill_dict = {}
        for x in data.product_code.unique() : 
            corr = np.absolute(data[data.product_code == x].drop(col, axis=1)
                               .corr()[measurement_col]).sort_values(ascending=False)
            measurement_col_dic = {}
            measurement_col_dic[measurement_col] = corr[1:5].index.tolist()
            fill_dict[x] = measurement_col_dic[measurement_col]
        full_fill_dict[measurement_col] =fill_dict

    for code in data.product_code.unique():
        total_na_filled_by_linear_model = 0
        for measurement_col in list(full_fill_dict.keys()):
            tmp = data[data.product_code == code]
            column = full_fill_dict[measurement_col][code]
            tmp_train = tmp[column+[measurement_col]].dropna(how='any')
            tmp_test = tmp[(tmp[column].isnull().sum(axis=1)==0)&(tmp[measurement_col].isnull())]

            model = HuberRegressor(epsilon=1.5)
            model.fit(tmp_train[column], tmp_train[measurement_col])
            data.loc[(data.product_code==code)&(data[column].isnull().sum(axis=1)==0)
                     &(data[measurement_col].isnull()),measurement_col] = model.predict(tmp_test[column])
            total_na_filled_by_linear_model += len(tmp_test)

        # others NA columns:
        NA = data.loc[data["product_code"] == code,nullValue_cols ].isnull().sum().sum()
        model1 = KNNImputer(n_neighbors=3)
        data.loc[data.product_code==code, feature] = model1.fit_transform(data.loc[data.product_code==code, feature])

    data['measurement_avg'] = data[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)
    df_train = data.iloc[:train.shape[0],:]
    df_test = data.iloc[train.shape[0]:,:]

    woe_encoder = WoEEncoder(variables=['attribute_0'])
    woe_encoder.fit(df_train, target)
    df_train = woe_encoder.transform(df_train)
    df_test = woe_encoder.transform(df_test)
    df_train['measurement(3*5)'] = df_train['measurement_3'] * df_train['measurement_5']
    df_test['measurement(3*5)'] = df_test['measurement_3'] * df_test['measurement_5']
    df_train['missing(3*5)'] = df_train['m5_missing'] * (df_train['m3_missing'])
    df_test['missing(3*5)'] = df_test['m5_missing'] * (df_test['m3_missing'])
    
    return df_train, df_test

In [32]:
df_train, df_test = data_pre_processing(data)
df_train['failure'] = target

In [33]:
def scale(train_data, val_data, test_data, feats):
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_data[feats])
    scaled_val = scaler.transform(val_data[feats])
    scaled_test = scaler.transform(test_data[feats])
    new_train = train_data.copy()
    new_val = val_data.copy()
    new_test = test_data.copy()
    new_train[feats] = scaled_train
    new_val[feats] = scaled_val
    new_test[feats] = scaled_test
    return new_train, new_val, new_test

In [34]:
# features = ['loading','attribute_0' , 'measurement_17', 'measurement_0', 'measurement_1','measurement_2','area', 'm3_missing', 'm5_missing', 
#         'measurement_avg','measurement(3*5)','missing(3*5)'] # ,'count_null','ohe_a_7', 'ohe_a_6', 'ohe_a_8','measurement_4','measurement_9','measurement_7','measurement_6','measurement_8'

features = ['loading', 'measurement_17', 'm3_missing', 'm5_missing']

N_FOLDS = 15
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=0)
y_oof = np.zeros(df_train[features].shape[0])
y_test = np.zeros(df_test[features].shape[0])
logistic_auc = 0
ix = 0
lg_model=[]
sm = SMOTE(random_state = 0, n_jobs = -1)
clf = LogisticRegression(max_iter=1000, C=0.0005, penalty='l2',solver='newton-cg')

for train_ind, val_ind in skf.split(df_train[features], df_train[['failure']]):
    print(f"******* Fold {ix} ******* ")
    tr_x, val_x = (
        df_train[features].iloc[train_ind].reset_index(drop=True),
        df_train[features].iloc[val_ind].reset_index(drop=True),
    )
    tr_y, val_y = (
        df_train['failure'].iloc[train_ind].reset_index(drop=True),
        df_train['failure'].iloc[val_ind].reset_index(drop=True),
    )
    
    tr_x,val_x,test_x = scale(tr_x, val_x, df_test[features], features)
    tr_x, tr_y = sm.fit_resample(tr_x, tr_y)
    
    clf.fit(tr_x, tr_y)
    
    preds = clf.predict_proba(val_x)[:,1]
    
    roc_score = roc_auc_score(val_y, preds)
    
    logistic_auc += roc_score/N_FOLDS

    print('VAL_ROC-AUC:', round(roc_score, 5))
    
    y_oof[val_ind] = y_oof[val_ind] + preds

    preds_test = clf.predict_proba(test_x)[:,1]
    lg_model.append(preds_test)
    y_test = y_test + preds_test / N_FOLDS
    ix = ix + 1
    
print(f"{Fore.GREEN}{Style.BRIGHT}Average auc = {round(logistic_auc, 5)}{Style.RESET_ALL}")
print(f"{Fore.BLUE}{Style.BRIGHT}OOF auc = {round(roc_auc_score(df_train[['failure']], y_oof), 5)}{Style.RESET_ALL}")

******* Fold 0 ******* 
VAL_ROC-AUC: 0.57099
******* Fold 1 ******* 
VAL_ROC-AUC: 0.61327
******* Fold 2 ******* 
VAL_ROC-AUC: 0.6161
******* Fold 3 ******* 
VAL_ROC-AUC: 0.58769
******* Fold 4 ******* 
VAL_ROC-AUC: 0.58217
******* Fold 5 ******* 
VAL_ROC-AUC: 0.59751
******* Fold 6 ******* 
VAL_ROC-AUC: 0.56965
******* Fold 7 ******* 
VAL_ROC-AUC: 0.58988
******* Fold 8 ******* 
VAL_ROC-AUC: 0.59358
******* Fold 9 ******* 
VAL_ROC-AUC: 0.59867
******* Fold 10 ******* 
VAL_ROC-AUC: 0.58264
******* Fold 11 ******* 
VAL_ROC-AUC: 0.59998
******* Fold 12 ******* 
VAL_ROC-AUC: 0.56863
******* Fold 13 ******* 
VAL_ROC-AUC: 0.57455
******* Fold 14 ******* 
VAL_ROC-AUC: 0.6107
[32m[1mAverage auc = 0.5904[0m
[34m[1mOOF auc = 0.59031[0m


In [35]:
y_pred = df_train[['failure']].copy(deep=True)
y_pred = y_pred.rename(columns={"failure": "prediction"})
y_pred["prediction"] = y_oof

roc_auc_score(df_train[['failure']],y_pred)

0.5903058051739523

In [36]:
sub_log = pd.read_csv('../input/tabular-playground-series-aug-2022/sample_submission.csv')
sub_log['failure'] = y_test

sub_log.to_csv("submission1.csv", index=False)


In [37]:
filename = f'model1.sav'
pickle.dump(clf, open(filename, 'wb'))

In [38]:
clf = LGBMClassifier()


for train_ind, val_ind in skf.split(df_train[features], df_train[['failure']]):
    print(f"******* Fold {ix} ******* ")
    tr_x, val_x = (
        df_train[features].iloc[train_ind].reset_index(drop=True),
        df_train[features].iloc[val_ind].reset_index(drop=True),
    )
    tr_y, val_y = (
        df_train['failure'].iloc[train_ind].reset_index(drop=True),
        df_train['failure'].iloc[val_ind].reset_index(drop=True),
    )
    
    tr_x,val_x,test_x = scale(tr_x, val_x, df_test[features], features)
    tr_x, tr_y = sm.fit_resample(tr_x, tr_y)
    
    clf.fit(tr_x, tr_y)
    preds = clf.predict_proba(val_x)[:,1]
    
    roc_score = roc_auc_score(val_y, preds)
    
    logistic_auc += roc_score/N_FOLDS

    print('VAL_ROC-AUC:', round(roc_score, 5))
    
    y_oof[val_ind] = y_oof[val_ind] + preds

    preds_test = clf.predict_proba(test_x)[:,1]
    lg_model.append(preds_test)
    y_test = y_test + preds_test / N_FOLDS
    ix = ix + 1
    
print(f"{Fore.GREEN}{Style.BRIGHT}Average auc = {round(logistic_auc, 5)}{Style.RESET_ALL}")
print(f"{Fore.BLUE}{Style.BRIGHT}OOF auc = {round(roc_auc_score(df_train[['failure']], y_oof), 5)}{Style.RESET_ALL}")

******* Fold 15 ******* 
VAL_ROC-AUC: 0.54163
******* Fold 16 ******* 
VAL_ROC-AUC: 0.56476
******* Fold 17 ******* 
VAL_ROC-AUC: 0.60704
******* Fold 18 ******* 
VAL_ROC-AUC: 0.55248
******* Fold 19 ******* 
VAL_ROC-AUC: 0.56755
******* Fold 20 ******* 
VAL_ROC-AUC: 0.57373
******* Fold 21 ******* 
VAL_ROC-AUC: 0.56863
******* Fold 22 ******* 
VAL_ROC-AUC: 0.5714
******* Fold 23 ******* 
VAL_ROC-AUC: 0.57593
******* Fold 24 ******* 
VAL_ROC-AUC: 0.55973
******* Fold 25 ******* 
VAL_ROC-AUC: 0.54134
******* Fold 26 ******* 
VAL_ROC-AUC: 0.58006
******* Fold 27 ******* 
VAL_ROC-AUC: 0.54875
******* Fold 28 ******* 
VAL_ROC-AUC: 0.55675
******* Fold 29 ******* 
VAL_ROC-AUC: 0.58939
[32m[1mAverage auc = 1.15701[0m
[34m[1mOOF auc = 0.5808[0m


In [39]:
sub_log = pd.read_csv('../input/tabular-playground-series-aug-2022/sample_submission.csv')
sub_log['failure'] = y_test

sub_log.to_csv("submission2.csv", index=False)

In [40]:
filename = f'model2.sav'
pickle.dump(clf, open(filename, 'wb'))