In [1]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, KFold
from sklearn.compose import make_column_transformer
from sklearn.impute import  KNNImputer
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier

from sklearn.linear_model import LogisticRegression, PoissonRegressor


import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 1

In [2]:
train = pd.read_csv("../input/tabular-playground-series-aug-2022/train.csv", index_col='id')
test = pd.read_csv("../input/tabular-playground-series-aug-2022/test.csv", index_col='id')

X = train.drop(columns=['failure'])
y = train['failure']

In [3]:
# https://www.kaggle.com/competitions/tabular-playground-series-aug-2022/discussion/342319
X['m_3_missing'] = X.measurement_3.isna()
X['m_5_missing'] = X.measurement_5.isna()

test['m_3_missing'] = test.measurement_3.isna()
test['m_5_missing'] = test.measurement_5.isna()

In [4]:
int_cols = [f for f in train.columns if train[f].dtype == int and f != 'failure']
float_cols = [f for f in train.columns if train[f].dtype == float]
categorical_cols = ['attribute_0', 'attribute_1']

In [5]:
def ohe(X_train, X_test, columns):
    transformer = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore', 
                       drop='first', 
                       categories=[['material_5', 'material_7'],
                                   ['material_5', 'material_6', 'material_8']]), columns),
        remainder='passthrough')

    X_train = pd.DataFrame(
        transformer.fit_transform(X_train), 
        columns=transformer.get_feature_names()
    )
    X_test = pd.DataFrame(
        transformer.transform(X_test),
        columns=transformer.get_feature_names()
    )
    
    return X_train, X_test

def impute(X, imputer):
    # source: https://www.kaggle.com/code/purist1024/per-product-code-imputation
    
    def transform(X):
        return pd.DataFrame(imputer.fit_transform(X), index=X.index, columns=X.columns)

    cats = ["product_code", "attribute_0", "attribute_1", "attribute_2", "attribute_3"]
    ints = ["measurement_0", "measurement_1", "measurement_2"]
    right = pd.concat([transform(gdf.drop(columns=cats)) for g, gdf in X.groupby("product_code")],
                      axis="rows")
    right[ints] = right[ints].round().astype(int)
    return pd.concat([X[cats], right], axis="columns").reindex(columns=X.columns)

def oversample(X_train, y_train, oversampler):
    return oversampler.fit_resample(X_train, y_train)

def prepare_data(X_train, X_test, y_train, *, imputer, oversampler):
    if imputer != None:
        print("Imputing...")
        X_train = impute(X_train, imputer)
        X_test = impute(X_test, imputer)
    
    print("Dropping product code...")
    X_train = X_train.drop(columns=['product_code'], axis=1)
    X_test = X_test.drop(columns=['product_code'], axis=1)
    
    print("OHE...")
    X_train, X_test = ohe(X_train, X_test, categorical_cols)

    if oversampler != None:
        print("Oversampling...")
        X_train, y_train = oversampler(X_train, y_train, oversampler)
    
    return X_train, X_test, y_train

In [6]:
imputer = KNNImputer(n_neighbors=3)
oversampler = SMOTE(random_state=RANDOM_STATE)

X, test, y = prepare_data(X, test, y, imputer=imputer, oversampler=None)

X['m_5_3_mult_null'] = X['m_3_missing']*X['m_5_missing']
X['measurement_3_5'] = X['measurement_3']*X['measurement_5']

test['m_5_3_mult_null'] = test['m_3_missing']*test['m_5_missing']
test['measurement_3_5'] = test['measurement_3']*test['measurement_5']

print("Oversampling...")
X_oversampled, y_oversampled = oversampler.fit_resample(X, y)

Imputing...
Dropping product code...
OHE...
Oversampling...


In [7]:
X.describe()

Unnamed: 0,onehotencoder__x0_material_7,onehotencoder__x1_material_6,onehotencoder__x1_material_8,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,m_3_missing,m_5_missing,m_5_3_mult_null,measurement_3_5
count,26570.0,26570.0,26570.0,26570.0,26570.0,26570.0,26570.0,26570.0,26570.0,26570.0,...,26570.0,26570.0,26570.0,26570.0,26570.0,26570.0,26570.0,26570.0,26570.0,26570.0
mean,0.802409,0.201091,0.40892,127.825788,6.754046,7.240459,7.415883,8.232518,6.256568,17.790553,...,11.704226,15.648266,16.042048,14.9995,16.458011,701.532666,0.014339,0.025442,0.000414,304.722203
std,0.39819,0.400824,0.491644,38.90244,1.471852,1.456493,4.11669,4.199401,3.309109,0.996526,...,1.459269,1.129384,1.458258,1.510824,1.661219,119.617351,0.118888,0.157467,0.020343,24.639553
min,0.0,0.0,0.0,33.16,5.0,5.0,0.0,0.0,0.0,13.968,...,5.167,10.89,9.14,9.104,9.701,196.787,0.0,0.0,0.0,217.050246
25%,1.0,0.0,0.0,100.1225,6.0,6.0,4.0,5.0,4.0,17.121,...,10.742,14.911,15.091,13.997,15.322,623.79425,0.0,0.0,0.0,287.842455
50%,1.0,0.0,0.0,122.45,6.0,8.0,7.0,8.0,6.0,17.784,...,11.717,15.624,16.028,14.978,16.438333,701.409167,0.0,0.0,0.0,304.2067
75%,1.0,0.0,1.0,149.02,8.0,8.0,10.0,11.0,8.0,18.461917,...,12.685,16.337,17.032,15.971,17.565,779.6255,0.0,0.0,0.0,320.9812
max,1.0,1.0,1.0,385.86,9.0,9.0,29.0,29.0,24.0,21.499,...,17.663,22.713,22.303,21.626,24.094,1312.794,1.0,1.0,1.0,403.084284


In [8]:
def scale_data(data):
    scaler = StandardScaler()
    data.loc[:] = scaler.fit_transform(data)
    return data

X = scale_data(X)
X_oversampled = scale_data(X_oversampled)

test = scale_data(test)

In [9]:
cols_to_drop = ["onehotencoder__x0_material_7", "onehotencoder__x1_material_6", "onehotencoder__x1_material_8", "attribute_2", "attribute_3"]
X = X.drop(columns=cols_to_drop, axis=1)
X_oversampled = X_oversampled.drop(columns=cols_to_drop, axis=1)
test = test.drop(columns=cols_to_drop, axis=1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)
X_train_oversampled, X_test_oversampled, y_train_oversampled, y_test_oversampled = train_test_split(X_oversampled, y_oversampled, test_size=0.2, random_state=RANDOM_STATE, stratify=y_oversampled)

In [11]:
parameters = {
    'penalty' : ['l1','l2'], 
    'C'       : np.linspace(-1, 1, 10),
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
    'max_iter': [500 + 50*i for i in range(1, 3)]
}

In [12]:
model_logistic = LogisticRegression()

model_logistic = GridSearchCV(model_logistic,
                             parameters,
                             cv=5,
                             scoring='roc_auc',
                             n_jobs=-1)
model_logistic.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([-1.        , -0.77777778, -0.55555556, -0.33333333, -0.11111111,
        0.11111111,  0.33333333,  0.55555556,  0.77777778,  1.        ]),
                         'max_iter': [550, 600], 'penalty': ['l1', 'l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear']},
             scoring='roc_auc')

In [13]:
print(f'Best parameters {model_logistic.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + 
    f'{model_logistic.best_score_:.3f}'
)


Best parameters {'C': 0.11111111111111116, 'max_iter': 600, 'penalty': 'l1', 'solver': 'liblinear'}
Mean cross-validated accuracy score of the best_estimator: 0.589


In [14]:
best_params = model_logistic.best_params_

submission_model = LogisticRegression(**best_params)
submission_model.fit(X_train, y_train)

LogisticRegression(C=0.11111111111111116, max_iter=600, penalty='l1',
                   solver='liblinear')

In [15]:
def create_submission_data(model):
    probs = model.predict_proba(test)
    
    return probs[:, 1]

def create_submission_data_regression(model):
    probs = model.predict(test)
    
    return probs

def create_submission(model, filename, sub_method):
    test_probs = sub_method(model)
    
    submission = pd.DataFrame({'id': test.index+26570,
                               'failure': test_probs})
    submission.to_csv(filename, index=False)

In [16]:
create_submission(submission_model, 'two_more_features_dropedalot.csv', create_submission_data)