In [1]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer

import xgboost as xgb
import lightgbm as lgb

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
train_raw = pd.read_csv("../input/tabular-playground-series-aug-2022/train.csv", index_col='id')
test_raw = pd.read_csv("../input/tabular-playground-series-aug-2022/test.csv", index_col='id')

X = train_raw.drop(columns=['failure'])
X_test = test_raw

y = train_raw['failure']

In [3]:
X.columns

Index(['product_code', 'loading', 'attribute_0', 'attribute_1', 'attribute_2',
       'attribute_3', 'measurement_0', 'measurement_1', 'measurement_2',
       'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6',
       'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10',
       'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14',
       'measurement_15', 'measurement_16', 'measurement_17'],
      dtype='object')

In [4]:
int_cols = [f for f in train_raw.columns if train_raw[f].dtype == int and f != 'failure']
float_cols = [f for f in train_raw.columns if train_raw[f].dtype == float]
categorical_cols = ['attribute_0', 'attribute_1']

In [5]:
X_train_copy = X.copy()
X_test_copy = X_test.copy()

In [6]:
def preprocess(X_train, X_test):
    transformer = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore', 
                       drop='first', 
                       categories=[['material_5', 'material_7'],
                                   ['material_5', 'material_6', 'material_8']]), categorical_cols),
        remainder='passthrough')

    X_train = pd.DataFrame(
        transformer.fit_transform(X_train), 
        columns=transformer.get_feature_names()
    )
    X_test = pd.DataFrame(
        transformer.transform(X_test),
        columns=transformer.get_feature_names()
    )
    
    return X_train, X_test

In [7]:
a, b = preprocess(X_train_copy, X_test_copy)

In [8]:
X_train_impute = X.copy()
X_test_impute = X_test.copy()

In [9]:
X_train_impute.head()

Unnamed: 0_level_0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,A,80.1,material_7,material_8,9,5,7,8,4,18.04,...,20.155,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1
1,A,84.89,material_7,material_8,9,5,14,3,3,18.213,...,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057
2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,...,18.288,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376
3,A,101.07,material_7,material_8,9,5,13,2,6,17.295,...,19.06,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282
4,A,188.06,material_7,material_8,9,5,9,2,8,19.346,...,18.093,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885


In [10]:
pd.concat([X_train_impute[float_cols].isna().sum().rename('missing values in train'),
           X_test_impute[float_cols].isna().sum().rename('missing values in test')],
          axis=1)

Unnamed: 0,missing values in train,missing values in test
loading,250,223
measurement_3,381,329
measurement_4,538,409
measurement_5,676,508
measurement_6,796,624
measurement_7,937,720
measurement_8,1048,846
measurement_9,1227,904
measurement_10,1300,1067
measurement_11,1468,1136


In [11]:
def impute(X_train, X_valid):
    my_imputer = SimpleImputer()
    
    X_train["measurement_3"] = my_imputer.fit_transform(X_train[["measurement_3"]]).ravel()
    return X_train, X_valid
#     imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
#     imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

#     # Imputation removed column names; put them back
#     imputed_X_train.columns = X_train.columns
#     imputed_X_valid.columns = X_valid.columns
    
#     return imputed_X_train, imputed_X_valid
#     transformer = make_column_transformer(
#         (SimpleImputer(), ['measurement_3']),
#         remainder='passthrough')
#     print(X_train.columns)
#     print(X_test.columns)
    
#     print(X_train)
#     print(transformer.fit_transform(X_train))
#     X_train = pd.DataFrame(
#         transformer.fit_transform(X_train), 
#         columns=X_train.columns
#     )
#     X_test = pd.DataFrame(
#         transformer.transform(X_test),
#         columns=X_test.columns
#     )
    
#     return X_train, X_test

a, b = impute(X_train_impute, X_train_impute)

In [12]:
X_train_impute.columns

Index(['product_code', 'loading', 'attribute_0', 'attribute_1', 'attribute_2',
       'attribute_3', 'measurement_0', 'measurement_1', 'measurement_2',
       'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6',
       'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10',
       'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14',
       'measurement_15', 'measurement_16', 'measurement_17'],
      dtype='object')

In [13]:
pd.concat([X_train_impute[float_cols].isna().sum().rename('missing values in train'),
           X_test_impute[float_cols].isna().sum().rename('missing values in test')],
          axis=1)

Unnamed: 0,missing values in train,missing values in test
loading,250,223
measurement_3,0,329
measurement_4,538,409
measurement_5,676,508
measurement_6,796,624
measurement_7,937,720
measurement_8,1048,846
measurement_9,1227,904
measurement_10,1300,1067
measurement_11,1468,1136


In [14]:
float_cols = [f for f in a.columns if a[f].dtype == float]
pd.concat([a[float_cols].isna().sum().rename('missing values in train'),
           b[float_cols].isna().sum().rename('missing values in test')],
          axis=1)

Unnamed: 0,missing values in train,missing values in test
loading,250,250
measurement_3,0,0
measurement_4,538,538
measurement_5,676,676
measurement_6,796,796
measurement_7,937,937
measurement_8,1048,1048
measurement_9,1227,1227
measurement_10,1300,1300
measurement_11,1468,1468


In [15]:
def impute(X_train, X_test, imputer, cols):
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_test[cols] = imputer.transform(X_test[cols])
    
    return X_train, X_test
    
X_train_impute = X.copy()
X_test_impute = X_test.copy()

a, b = impute(X_train_impute, X_test_impute, SimpleImputer(), float_cols)
display(a)
# transformer = make_column_transformer(
#         (SimpleImputer(), float_cols),
#         remainder='passthrough')
# X_train_impute.head()
# print(transformer.fit_transform(X_train_impute)[0])

# X_train = pd.DataFrame(
#     transformer.fit_transform(X_train_impute), 
#     columns=transformer.get_feature_names()
# )
# X_test = pd.DataFrame(
#     transformer.transform(X_test_impute),
#     columns=transformer.get_feature_names()
# )

# X_train.head()

Unnamed: 0_level_0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,A,80.10,material_7,material_8,9,5,7,8,4,18.040,...,20.155,10.672000,15.859,17.594000,15.193,15.029,16.048444,13.034,14.684000,764.100
1,A,84.89,material_7,material_8,9,5,14,3,3,18.213,...,17.889,12.448000,17.947,17.915000,11.755,14.732,15.425000,14.395,15.631000,682.057
2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,...,18.288,12.715000,15.607,19.172085,13.798,16.711,18.631000,14.094,17.946000,663.376
3,A,101.07,material_7,material_8,9,5,13,2,6,17.295,...,19.060,12.471000,16.346,18.377000,10.020,15.250,15.562000,16.154,17.172000,826.282
4,A,188.06,material_7,material_8,9,5,9,2,8,19.346,...,18.093,10.337000,17.082,19.932000,12.428,16.182,12.760000,13.153,16.412000,579.885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26565,E,158.95,material_7,material_6,6,9,6,16,4,16.301,...,19.354,11.430725,12.177,17.942000,10.112,15.795,18.572000,16.144,16.460727,729.131
26566,E,146.02,material_7,material_6,6,9,10,12,8,17.543,...,19.563,11.242000,14.179,20.564000,10.234,14.450,14.322000,13.146,16.471000,853.924
26567,E,115.62,material_7,material_6,6,9,1,10,1,15.670,...,19.279,11.407000,16.437,17.476000,8.668,15.069,16.599000,15.590,14.065000,750.364
26568,E,106.38,material_7,material_6,6,9,2,9,4,18.059,...,19.358,11.392000,17.064,17.814000,14.928,16.273,15.485000,13.624,12.865000,730.156


In [16]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

def oversample(X_train, y_train):
    oversample = RandomOverSampler()
    return oversample.fit_resample(X_train, y_train)

print(Counter(y))
X_over, y_over = oversample(X, y)
print(Counter(y_over))


Counter({0: 20921, 1: 5649})
Counter({0: 20921, 1: 20921})
