In [1]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, KFold
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn import svm

import xgboost as xgb
import lightgbm as lgb

from imblearn.over_sampling import RandomOverSampler

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 9485


In [2]:
train_raw = pd.read_csv("../input/tabular-playground-series-aug-2022/train.csv", index_col='id')
test_raw = pd.read_csv("../input/tabular-playground-series-aug-2022/test.csv", index_col='id')

X_train = train_raw.drop(columns=['failure'])
X_sub = test_raw

y_train = train_raw['failure']

In [3]:
int_cols = [f for f in train_raw.columns if train_raw[f].dtype == int and f != 'failure']
float_cols = [f for f in train_raw.columns if train_raw[f].dtype == float]
categorical_cols = ['attribute_0', 'attribute_1']

In [4]:
def ohe(X_train, X_test, columns):
    transformer = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore', 
                       drop='first', 
                       categories=[['material_5', 'material_7'],
                                   ['material_5', 'material_6', 'material_8']]), columns),
        remainder='passthrough')

    X_train = pd.DataFrame(
        transformer.fit_transform(X_train), 
        columns=transformer.get_feature_names()
    )
    X_test = pd.DataFrame(
        transformer.transform(X_test),
        columns=transformer.get_feature_names()
    )
    
    return X_train, X_test

In [5]:
def impute_per_product_code(X, imputer):
    """Impute missing values in TPS2208 data.
    
    Imputation is performed over separate "per-product-code" batches, and is designed to leave all non-imputed 
    data in the exact same format as before imputation."""
    # source: https://www.kaggle.com/code/purist1024/per-product-code-imputation
    def transform(X):
        return pd.DataFrame(imputer.fit_transform(X), index=X.index, columns=X.columns)

    cats = ["product_code", "attribute_0", "attribute_1", "attribute_2", "attribute_3"]
    ints = ["measurement_0", "measurement_1", "measurement_2"]
    right = pd.concat([transform(gdf.drop(columns=cats)) for g, gdf in X.groupby("product_code")],
                      axis="rows")
    right[ints] = right[ints].round().astype(int)
    return pd.concat([X[cats], right], axis="columns").reindex(columns=X.columns)

In [6]:
def prepare_data(X_train, X_test, y_train, *, imputer, oversampler):
    if imputer != None:
        print("Imputing...")
        X_train = impute_per_product_code(X_train, imputer)
        X_test = impute_per_product_code(X_test, imputer)
        
    print("Dropping product code...")
    X_train = X_train.drop(columns=['product_code'], axis=1)
    X_test = X_test.drop(columns=['product_code'], axis=1)
    
    print("OHE...")
    X_train, X_test = ohe(X_train, X_test, categorical_cols)
    
    if oversampler != None:
        print("Oversampling...")
        X_train, y_train = oversample(X_train, y_train, oversampler)
    
    return X_train, X_test, y_train

In [7]:
X_train, X_sub, y_train = prepare_data(X_train, X_sub, y_train, imputer=KNNImputer(n_neighbors=3), oversampler=None)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.25, random_state=RANDOM_STATE)

Imputing...
Dropping product code...
OHE...


In [8]:
def test_model(model):
    scorers = ['neg_mean_absolute_error', 'neg_root_mean_squared_error', 'r2', 'accuracy', 'roc_auc', 'neg_log_loss']
    scores = cross_validate(model, X=X_train, y=y_train, cv=5, n_jobs=-1, scoring=scorers)
    
    print("-------------")
    print(scores)
    print("-------------")

    print("METRICS: ")
    print("ROC_AUC \t", end="")
    print("Scores: ", scores['test_roc_auc'], end="\t")
    print("Mean: ", scores['test_roc_auc'].mean())
    
    print("LOG_LOSS \t", end="")
    print("Scores: ", -1*scores['test_neg_log_loss'], end='\t')
    print("Mean: ", -1*scores['test_neg_log_loss'].mean())

In [9]:
clf = lgb.LGBMClassifier()
lgb_model = make_pipeline(StandardScaler(), clf)
test_model(lgb_model)

-------------
{'fit_time': array([1.04648471, 0.9423418 , 0.92545056, 0.96483827, 0.72898126]), 'score_time': array([0.07289362, 0.07331777, 0.07782745, 0.06910467, 0.05734277]), 'test_neg_mean_absolute_error': array([-0.21525339, -0.21174109, -0.21329987, -0.21430364, -0.21455458]), 'test_neg_root_mean_squared_error': array([-0.46395408, -0.46015334, -0.46184399, -0.46292941, -0.46320037]), 'test_r2': array([-0.29078119, -0.26971949, -0.278832  , -0.28485004, -0.28635454]), 'test_accuracy': array([0.78474661, 0.78825891, 0.78670013, 0.78569636, 0.78544542]), 'test_roc_auc': array([0.5547061 , 0.57963827, 0.54237843, 0.56754883, 0.54522359]), 'test_neg_log_loss': array([-0.52088412, -0.51317996, -0.52367991, -0.51681232, -0.5224441 ])}
-------------
METRICS: 
ROC_AUC 	Scores:  [0.5547061  0.57963827 0.54237843 0.56754883 0.54522359]	Mean:  0.5578990456240358
LOG_LOSS 	Scores:  [0.52088412 0.51317996 0.52367991 0.51681232 0.5224441 ]	Mean:  0.519400080684699


In [10]:
svc_linear = svm.SVC(kernel='linear', verbose=True, probability=True)
svc_linear_model = make_pipeline(StandardScaler(), svc_linear)
test_model(svc_linear_model)

..........................................

*
optimization finished, #iter = 12369
obj = -5337.999934, rho = 1.000006
nSV = 7266, nBSV = 4102
Total nSV = 7266
*
optimization finished, #iter = 12337
obj = -5359.999817, rho = 1.000001
nSV = 7296, nBSV = 4081
Total nSV = 7296
....
*
optimization finished, #iter = 12562
obj = -5406.000432, rho = 1.000003
nSV = 7276, nBSV = 4184
Total nSV = 7276
....
.*
optimization finished, #iter = 12456
obj = -5384.000493, rho = 0.999998
nSV = 7235, nBSV = 4200
Total nSV = 7235
....................................
..*
optimization finished, #iter = 12391
obj = -5353.999503, rho = 0.999999
nSV = 7258, nBSV = 4113
Total nSV = 7258
.
.*
optimization finished, #iter = 12652
obj = -5447.999956, rho = 1.000004
nSV = 7302, nBSV = 4227
Total nSV = 7302
..
.*
optimization finished, #iter = 12218
obj = -5323.999631, rho = 1.000006
nSV = 7279, nBSV = 4048
Total nSV = 7279
...
.*
optimization finished, #iter = 12492
obj = -5388.001038, rho = 1.000005
nSV = 7308, nBS

In [11]:
svc_sigmoid = svm.SVC(kernel='sigmoid', probability=True)
svc_sigmoid_model = make_pipeline(StandardScaler(), svc_sigmoid)
test_model(svc_sigmoid_model)

-------------
{'fit_time': array([138.91631103, 136.46825552, 151.72241664, 137.0678637 ,
        77.68220735]), 'score_time': array([8.00008225, 9.22992563, 5.67079735, 8.50964689, 5.89509964]), 'test_neg_mean_absolute_error': array([-0.30481686, -0.30180632, -0.30489335, -0.3174404 , -0.29686324]), 'test_neg_root_mean_squared_error': array([-0.55210222, -0.54936902, -0.55217149, -0.5634185 , -0.54485157]), 'test_r2': array([-0.82785448, -0.80980159, -0.82797751, -0.90320292, -0.77983325]), 'test_accuracy': array([0.69518314, 0.69819368, 0.69510665, 0.6825596 , 0.70313676]), 'test_roc_auc': array([0.50995018, 0.51574589, 0.51831385, 0.49864726, 0.51096649]), 'test_neg_log_loss': array([-0.51613585, -0.51540741, -0.51591622, -0.51635904, -0.51620543])}
-------------
METRICS: 
ROC_AUC 	Scores:  [0.50995018 0.51574589 0.51831385 0.49864726 0.51096649]	Mean:  0.5107247341314514
LOG_LOSS 	Scores:  [0.51613585 0.51540741 0.51591622 0.51635904 0.51620543]	Mean:  0.5160047884697513


In [12]:
for d in range(2, 10):
    print("D=", d)
    svc_d = svm.SVC(kernel='poly', probability=True, degree=d)
    svc_d_model = make_pipeline(StandardScaler(), svc_d)
    test_model(svc_d_model)
    print("=============================================================================")

D= 2
-------------
{'fit_time': array([85.40615463, 74.04613543, 82.46758914, 79.12405825, 47.1870656 ]), 'score_time': array([5.11140323, 6.3988831 , 6.66240597, 7.02953053, 3.70147347]), 'test_neg_mean_absolute_error': array([-0.21149022, -0.21149022, -0.21154329, -0.21154329, -0.21154329]), 'test_neg_root_mean_squared_error': array([-0.45988065, -0.45988065, -0.45993835, -0.45993835, -0.45993835]), 'test_r2': array([-0.26821508, -0.26821508, -0.26830045, -0.26830045, -0.26830045]), 'test_accuracy': array([0.78850978, 0.78850978, 0.78845671, 0.78845671, 0.78845671]), 'test_roc_auc': array([0.48464588, 0.48755354, 0.48018504, 0.4884487 , 0.48797111]), 'test_neg_log_loss': array([-0.51663819, -0.5161002 , -0.51614519, -0.51638504, -0.51615596])}
-------------
METRICS: 
ROC_AUC 	Scores:  [0.48464588 0.48755354 0.48018504 0.4884487  0.48797111]	Mean:  0.4857608534800977
LOG_LOSS 	Scores:  [0.51663819 0.5161002  0.51614519 0.51638504 0.51615596]	Mean:  0.5162849152195446
D= 3
------------