In [1]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, KFold
from sklearn.compose import make_column_transformer
from sklearn.impute import  KNNImputer
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier

from sklearn.linear_model import LogisticRegression, PoissonRegressor


import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 1

In [2]:
train = pd.read_csv("../input/tabular-playground-series-aug-2022/train.csv", index_col='id')
test = pd.read_csv("../input/tabular-playground-series-aug-2022/test.csv", index_col='id')

X = train.drop(columns=['failure'])
y = train['failure']

In [3]:
# https://www.kaggle.com/competitions/tabular-playground-series-aug-2022/discussion/342319
X['m_3_missing'] = X.measurement_3.isna()
X['m_5_missing'] = X.measurement_5.isna()

test['m_3_missing'] = test.measurement_3.isna()
test['m_5_missing'] = test.measurement_5.isna()

In [4]:
int_cols = [f for f in train.columns if train[f].dtype == int and f != 'failure']
float_cols = [f for f in train.columns if train[f].dtype == float]
categorical_cols = ['attribute_0', 'attribute_1']

In [5]:
def ohe(X_train, X_test, columns):
    transformer = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore', 
                       drop='first', 
                       categories=[['material_5', 'material_7'],
                                   ['material_5', 'material_6', 'material_8']]), columns),
        remainder='passthrough')

    X_train = pd.DataFrame(
        transformer.fit_transform(X_train), 
        columns=transformer.get_feature_names()
    )
    X_test = pd.DataFrame(
        transformer.transform(X_test),
        columns=transformer.get_feature_names()
    )
    
    return X_train, X_test

def impute(X, imputer):
    # source: https://www.kaggle.com/code/purist1024/per-product-code-imputation
    
    def transform(X):
        return pd.DataFrame(imputer.fit_transform(X), index=X.index, columns=X.columns)

    cats = ["product_code", "attribute_0", "attribute_1", "attribute_2", "attribute_3"]
    ints = ["measurement_0", "measurement_1", "measurement_2"]
    right = pd.concat([transform(gdf.drop(columns=cats)) for g, gdf in X.groupby("product_code")],
                      axis="rows")
    right[ints] = right[ints].round().astype(int)
    return pd.concat([X[cats], right], axis="columns").reindex(columns=X.columns)

def oversample(X_train, y_train, oversampler):
    return oversampler.fit_resample(X_train, y_train)

def prepare_data(X_train, X_test, y_train, *, imputer, oversampler):
    if imputer != None:
        print("Imputing...")
        X_train = impute(X_train, imputer)
        X_test = impute(X_test, imputer)
    
    print("Dropping product code...")
    X_train = X_train.drop(columns=['product_code'], axis=1)
    X_test = X_test.drop(columns=['product_code'], axis=1)
    
    print("OHE...")
    X_train, X_test = ohe(X_train, X_test, categorical_cols)

    if oversampler != None:
        print("Oversampling...")
        X_train, y_train = oversampler(X_train, y_train, oversampler)
    
    return X_train, X_test, y_train

In [6]:
imputer = KNNImputer(n_neighbors=3)
oversampler = SMOTE(random_state=RANDOM_STATE)

X, test, y = prepare_data(X, test, y, imputer=imputer, oversampler=None)

print("Oversampling...")
X_oversampled, y_oversampled = oversampler.fit_resample(X, y)

Imputing...
Dropping product code...
OHE...
Oversampling...


In [7]:
def scale_data(data):
    scaler = StandardScaler()
    data.loc[:] = scaler.fit_transform(data)
    return data

X = scale_data(X)
X_oversampled = scale_data(X_oversampled)

test = scale_data(test)

In [8]:
# should've stratify
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train_oversampled, X_test_oversampled, y_train_oversampled, y_test_oversampled = train_test_split(X_oversampled, y_oversampled, test_size=0.2, random_state=RANDOM_STATE)

### Grid Search for Poisson Regressor

In [9]:
# https://www.kaggle.com/code/vencerlanz09/0-58624-poisson-regressor-eda-explanation
parameters = {
    "alpha":np.linspace(0.1, 5),
    "fit_intercept": [True],  # Important to be True since the PoissonRegressor do not have a predict_proba method
    "max_iter": [50*i for i in range(1, 29)],
    "tol":[1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    "verbose":[0],
}

In [10]:
%%time
model_poisson = PoissonRegressor()

model_poisson = GridSearchCV(model_poisson,
                             parameters,
                             cv=5,
                             scoring='roc_auc',
                             n_jobs=-1)
model_poisson.fit(X_train, y_train)

CPU times: user 50.4 s, sys: 3.38 s, total: 53.8 s
Wall time: 4min 37s


GridSearchCV(cv=5, estimator=PoissonRegressor(), n_jobs=-1,
             param_grid={'alpha': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
       2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9,
       4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. ]),
                         'fit_intercept': [True],
                         'max_iter': [50, 100, 150, 200, 250, 300, 350, 400,
                                      450, 500, 550, 600, 650, 700, 750, 800,
                                      850, 900, 950, 1000, 1050, 1100, 1150,
                                      1200, 1250, 1300, 1350, 1400],
                         'tol': [0.1, 0.01, 0.001, 0.0001, 1e-05],
                         'verbose': [0]},
             scoring='roc_auc')

In [11]:
print(f'Best parameters {model_poisson.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + 
    f'{model_poisson.best_score_:.3f}'
)

Best parameters {'alpha': 0.1, 'fit_intercept': True, 'max_iter': 50, 'tol': 0.001, 'verbose': 0}
Mean cross-validated accuracy score of the best_estimator: 0.589


In [12]:
%%time
model_poisson_oversampled = PoissonRegressor()

model_poisson_oversampled = GridSearchCV(model_poisson_oversampled,
                             parameters,
                             cv=5,
                             scoring='roc_auc',
                             n_jobs=-1)
model_poisson_oversampled.fit(X_train_oversampled, y_train_oversampled)

CPU times: user 1min 24s, sys: 7.52 s, total: 1min 32s
Wall time: 7min 3s


GridSearchCV(cv=5, estimator=PoissonRegressor(), n_jobs=-1,
             param_grid={'alpha': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
       2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9,
       4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. ]),
                         'fit_intercept': [True],
                         'max_iter': [50, 100, 150, 200, 250, 300, 350, 400,
                                      450, 500, 550, 600, 650, 700, 750, 800,
                                      850, 900, 950, 1000, 1050, 1100, 1150,
                                      1200, 1250, 1300, 1350, 1400],
                         'tol': [0.1, 0.01, 0.001, 0.0001, 1e-05],
                         'verbose': [0]},
             scoring='roc_auc')

In [13]:
print(f'Best parameters {model_poisson_oversampled.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + 
    f'{model_poisson_oversampled.best_score_:.3f}'
)

Best parameters {'alpha': 0.2, 'fit_intercept': True, 'max_iter': 50, 'tol': 0.01, 'verbose': 0}
Mean cross-validated accuracy score of the best_estimator: 0.592


In [14]:
from sklearn.metrics import roc_auc_score
# 0.5954237526605417
y_pred_over = model_poisson_oversampled.predict(X_test_oversampled)
roc_auc_score(y_test_oversampled, y_pred_over)

0.5890320067801101

### LogisticRegression

In [15]:
parameters = {
    'penalty' : ['l1','l2'], 
    'C'       : np.linspace(-0.5, 0.5, 6),
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [500 + 50*i for i in range(1, 6)]
}

In [16]:
model_logistic = LogisticRegression()

model_logistic = GridSearchCV(model_logistic,
                             parameters,
                             cv=5,
                             scoring='roc_auc',
                             n_jobs=-1)
model_logistic.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([-0.5, -0.3, -0.1,  0.1,  0.3,  0.5]),
                         'max_iter': [550, 600, 650, 700, 750],
                         'penalty': ['l1', 'l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']},
             scoring='roc_auc')

In [17]:
print(f'Best parameters {model_logistic.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + 
    f'{model_logistic.best_score_:.3f}'
)

Best parameters {'C': 0.10000000000000009, 'max_iter': 750, 'penalty': 'l1', 'solver': 'saga'}
Mean cross-validated accuracy score of the best_estimator: 0.590


In [18]:
%%time
model_logistic_oversampled = LogisticRegression()

model_logistic_oversampled = GridSearchCV(model_logistic_oversampled,
                             parameters,
                             cv=5,
                             scoring='roc_auc',
                             n_jobs=-1)
model_logistic_oversampled.fit(X_train_oversampled, y_train_oversampled)

CPU times: user 3.79 s, sys: 270 ms, total: 4.07 s
Wall time: 1min 3s


GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([-0.5, -0.3, -0.1,  0.1,  0.3,  0.5]),
                         'max_iter': [550, 600, 650, 700, 750],
                         'penalty': ['l1', 'l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']},
             scoring='roc_auc')

In [19]:
print(f'Best parameters {model_logistic_oversampled.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + 
    f'{model_logistic_oversampled.best_score_:.3f}'
)

Best parameters {'C': 0.10000000000000009, 'max_iter': 750, 'penalty': 'l1', 'solver': 'liblinear'}
Mean cross-validated accuracy score of the best_estimator: 0.592


### Voting classifier without oversampling

In [20]:
poisson_best_params = {'alpha': 0.1, 'fit_intercept': True, 'max_iter': 50, 'tol': 0.001, 'verbose': 0}
poisson_over_best_params = {'alpha': 0.2, 'fit_intercept': True, 'max_iter': 50, 'tol': 0.01, 'verbose': 0}

logistic_best_params = {'C': 0.10000000000000009, 'max_iter': 750, 'penalty': 'l1', 'solver': 'saga'}
logistic_over_best_params = {'C': 0.10000000000000009, 'max_iter': 700, 'penalty': 'l1', 'solver': 'liblinear'}

In [21]:
poisson = PoissonRegressor(**poisson_best_params)
poisson.fit(X_train, y_train)

poisson_over = PoissonRegressor(**poisson_over_best_params)
poisson_over.fit(X_train_oversampled, y_train_oversampled)

logistic = LogisticRegression(**logistic_best_params)
logistic.fit(X_train, y_train)

logistic_over = LogisticRegression(**logistic_over_best_params)
logistic_over.fit(X_train_oversampled, y_train_oversampled)

LogisticRegression(C=0.10000000000000009, max_iter=700, penalty='l1',
                   solver='liblinear')

In [22]:
for clf in [logistic, logistic_over]:
    print(clf)
    y_pred = clf.predict(X_test)

    print(roc_auc_score(y_pred, y_test))

LogisticRegression(C=0.10000000000000009, max_iter=750, penalty='l1',
                   solver='saga')
0.6162844276887632
LogisticRegression(C=0.10000000000000009, max_iter=700, penalty='l1',
                   solver='liblinear')
0.5457107399407236


### Submission

In [23]:
poisson = PoissonRegressor(**poisson_best_params)
poisson.fit(X, y)

poisson_over = PoissonRegressor(**poisson_over_best_params)
poisson_over.fit(X_oversampled, y_oversampled)

logistic = LogisticRegression(**logistic_best_params)
logistic.fit(X, y)

logistic_over = LogisticRegression(**logistic_over_best_params)
logistic_over.fit(X_oversampled, y_oversampled)

LogisticRegression(C=0.10000000000000009, max_iter=700, penalty='l1',
                   solver='liblinear')

In [24]:
def create_submission_data(model):
    probs = model.predict_proba(test)
    
    return probs[:, 1]

def create_submission_data_regression(model):
    probs = model.predict(test)
    
    return probs

def create_submission(model, filename, sub_method):
    test_probs = sub_method(model)
    
    submission = pd.DataFrame({'id': test.index+26570,
                               'failure': test_probs})
    submission.to_csv(filename, index=False)

In [25]:
create_submission(poisson_over, 'poiss_over.csv', create_submission_data_regression)

In [26]:
prob_log = create_submission_data(logistic)
prob_log_over = create_submission_data(logistic_over)

prob_poiss = create_submission_data_regression(poisson)
prob_poiss_over = create_submission_data_regression(poisson_over)

In [27]:
mean_probs = (prob_log*0.58468 + prob_log_over*0.58468 + prob_poiss*0.58565 + prob_poiss_over*0.58662)/(0.58468 + 0.58468 + 0.58565 + 0.58662)

In [28]:
submission = pd.DataFrame({'id': test.index+26570,
                               'failure': mean_probs})
submission.to_csv('mean', index=False)