In [1]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, KFold
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

import xgboost as xgb
import lightgbm as lgb

from imblearn.over_sampling import RandomOverSampler

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 9485

In [2]:
train = pd.read_csv("../input/tabular-playground-series-aug-2022/train.csv", index_col='id')
test = pd.read_csv("../input/tabular-playground-series-aug-2022/test.csv", index_col='id')

In [3]:
train.head()

Unnamed: 0_level_0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,A,80.1,material_7,material_8,9,5,7,8,4,18.04,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,A,84.89,material_7,material_8,9,5,14,3,3,18.213,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,A,101.07,material_7,material_8,9,5,13,2,6,17.295,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,A,188.06,material_7,material_8,9,5,9,2,8,19.346,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


In [4]:
test.head()

Unnamed: 0_level_0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26570,F,119.57,material_5,material_6,6,4,6,9,6,19.305,...,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612
26571,F,113.51,material_5,material_6,6,4,11,8,0,17.883,...,19.368,12.032,13.998,,12.473,17.468,16.708,14.776,14.102,537.037
26572,F,112.16,material_5,material_6,6,4,8,12,4,18.475,...,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995
26573,F,112.72,material_5,material_6,6,4,8,11,10,16.518,...,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301
26574,F,208.0,material_5,material_6,6,4,14,16,8,17.808,...,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044


In [5]:
y = train['failure']
X = train.drop(columns=['failure'], axis=1)

In [6]:
int_cols = [f for f in train.columns if train[f].dtype == int and f != 'failure']
float_cols = [f for f in train.columns if train[f].dtype == float]
categorical_cols = ['attribute_0', 'attribute_1']

In [7]:
X['m_3_missing'] = X.measurement_3.isna()
X['m_5_missing'] = X.measurement_5.isna()

test['m_3_missing'] = test.measurement_3.isna()
test['m_5_missing'] = test.measurement_5.isna()


In [8]:
def ohe(X_train, X_test, columns):
    display(X_test)
    transformer = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore', 
                       drop='first', 
                       categories=[['material_5', 'material_7'],
                                   ['material_5', 'material_6', 'material_8']]), columns),
        remainder='passthrough')

    X_train = pd.DataFrame(
        transformer.fit_transform(X_train), 
        columns=transformer.get_feature_names()
    )
    X_test = pd.DataFrame(
        transformer.transform(X_test),
        columns=transformer.get_feature_names()
    )
    display(X_test)

    
    return X_train, X_test

def impute_per_product_code(X, imputer):
    """Impute missing values in TPS2208 data.
    
    Imputation is performed over separate "per-product-code" batches, and is designed to leave all non-imputed 
    data in the exact same format as before imputation."""
    # source: https://www.kaggle.com/code/purist1024/per-product-code-imputation
    def transform(X):
        return pd.DataFrame(imputer.fit_transform(X), index=X.index, columns=X.columns)

    cats = ["product_code", "attribute_0", "attribute_1", "attribute_2", "attribute_3"]
    ints = ["measurement_0", "measurement_1", "measurement_2"]
    right = pd.concat([transform(gdf.drop(columns=cats)) for g, gdf in X.groupby("product_code")],
                      axis="rows")
    right[ints] = right[ints].round().astype(int)
    return pd.concat([X[cats], right], axis="columns").reindex(columns=X.columns)

def oversample(X_train, y_train, oversampler):
    return oversampler.fit_resample(X_train, y_train)


In [9]:
agg_dict = {key: ['mean', 'std'] for key in float_cols}
grouped_features_train = X.groupby("product_code").agg(agg_dict)
grouped_features_test = test.groupby("product_code").agg(agg_dict)

grouped_features = grouped_features_train.append(grouped_features_test, ignore_index=False)

In [10]:
grouped_features.loc['A']

loading         mean    127.469361
                std      39.093590
measurement_3   mean     17.808524
                std       0.998438
measurement_4   mean     11.716103
                std       0.998254
measurement_5   mean     17.143195
                std       0.990529
measurement_6   mean     17.505004
                std       1.009220
measurement_7   mean     11.735274
                std       1.001037
measurement_8   mean     19.034251
                std       1.009841
measurement_9   mean     11.436477
                std       0.991172
measurement_10  mean     16.124386
                std       0.995921
measurement_11  mean     19.439558
                std       1.638846
measurement_12  mean     12.249168
                std       1.425327
measurement_13  mean     15.585325
                std       0.997598
measurement_14  mean     16.110886
                std       1.546100
measurement_15  mean     14.456503
                std       1.592045
measurement_16  mean

In [11]:
X.apply(lambda row: grouped_features.loc[row['product_code']]['measurement_6']['mean'], axis=1)

id
0        17.505004
1        17.505004
2        17.505004
3        17.505004
4        17.505004
           ...    
26565    17.521759
26566    17.521759
26567    17.521759
26568    17.521759
26569    17.521759
Length: 26570, dtype: float64

In [12]:
def append_features(df):
    for feature in float_cols:
        mean_feature = f'{feature}_mean'
        std_feature = f'{feature}_std'
        area_feature = f'area'

        df[mean_feature] = df.apply(lambda row: grouped_features.loc[row['product_code']][feature]['mean'], axis=1)
        df[std_feature] = df.apply(lambda row: grouped_features.loc[row.product_code][feature]['std'], axis=1)
    df[area_feature] = df.apply(lambda row: row['attribute_2']*row['attribute_3'], axis=1)


In [13]:
def prepare_data(X_train, X_test, y_train, imputer, oversampler):
    if imputer != None:
        print("Imputing...")
        X_train = impute_per_product_code(X_train, imputer)
        X_test = impute_per_product_code(X_test, imputer)
    if oversampler != None:
        print("Oversampling...")
        X_train, y_train = oversample(X_train, y_train, oversampler)
    print("Appending new features...")
    append_features(X_train)
    append_features(X_test)
    
    print("Dropping product code...")
    X_train = X_train.drop(columns=['product_code'], axis=1)
    X_test = X_test.drop(columns=['product_code'], axis=1)

    print("OHE...")
    X_train, X_test = ohe(X_train, X_test, categorical_cols)

    return X_train, X_test, y_train

In [14]:
def create_submission_data(model, X_sub):
    probs = model.predict_proba(X_sub)
    
    return probs[:, 1]

def create_submission(trained_model, X_sub, filename):
    sub_probs = create_submission_data(trained_model, X_sub)
    
    submission = pd.DataFrame({'id': X_sub.index + 26570,
                               'failure': sub_probs})
    submission.to_csv(filename, index=False)

In [15]:
imputer = KNNImputer(n_neighbors=7)
oversampler = None
X_train, X_sub, y_train = prepare_data(X, test, y, imputer, oversampler)

Imputing...
Appending new features...
Dropping product code...
OHE...


Unnamed: 0_level_0,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,...,measurement_13_std,measurement_14_mean,measurement_14_std,measurement_15_mean,measurement_15_std,measurement_16_mean,measurement_16_std,measurement_17_mean,measurement_17_std,area
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26570,119.57,material_5,material_6,6,4,6,9,6,19.305,10.178000,...,1.691449,16.116106,1.007899,14.444293,1.381430,16.389055,1.637560,701.706242,165.424865,24
26571,113.51,material_5,material_6,6,4,11,8,0,17.883,11.927000,...,1.691449,16.116106,1.007899,14.444293,1.381430,16.389055,1.637560,701.706242,165.424865,24
26572,112.16,material_5,material_6,6,4,8,12,4,18.475,10.481000,...,1.691449,16.116106,1.007899,14.444293,1.381430,16.389055,1.637560,701.706242,165.424865,24
26573,112.72,material_5,material_6,6,4,8,11,10,16.518,10.888000,...,1.691449,16.116106,1.007899,14.444293,1.381430,16.389055,1.637560,701.706242,165.424865,24
26574,208.00,material_5,material_6,6,4,14,16,8,17.808,12.693000,...,1.691449,16.116106,1.007899,14.444293,1.381430,16.389055,1.637560,701.706242,165.424865,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47340,144.74,material_7,material_5,9,5,0,4,9,18.465,12.570000,...,1.430253,16.442150,1.765719,15.129570,1.007715,16.706054,1.677022,701.336936,93.396601,45
47341,74.53,material_7,material_5,9,5,4,8,7,18.900,9.896000,...,1.430253,16.442150,1.765719,15.129570,1.007715,16.706054,1.677022,701.336936,93.396601,45
47342,67.73,material_7,material_5,9,5,10,11,2,18.656,11.401571,...,1.430253,16.442150,1.765719,15.129570,1.007715,16.706054,1.677022,701.336936,93.396601,45
47343,126.15,material_7,material_5,9,5,8,16,11,16.536,11.226000,...,1.430253,16.442150,1.765719,15.129570,1.007715,16.706054,1.677022,701.336936,93.396601,45


Unnamed: 0,onehotencoder__x0_material_7,onehotencoder__x1_material_6,onehotencoder__x1_material_8,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_13_std,measurement_14_mean,measurement_14_std,measurement_15_mean,measurement_15_std,measurement_16_mean,measurement_16_std,measurement_17_mean,measurement_17_std,area
0,0.0,1.0,0.0,119.57,6.0,4.0,6.0,9.0,6.0,19.305,...,1.691449,16.116106,1.007899,14.444293,1.381430,16.389055,1.637560,701.706242,165.424865,24.0
1,0.0,1.0,0.0,113.51,6.0,4.0,11.0,8.0,0.0,17.883,...,1.691449,16.116106,1.007899,14.444293,1.381430,16.389055,1.637560,701.706242,165.424865,24.0
2,0.0,1.0,0.0,112.16,6.0,4.0,8.0,12.0,4.0,18.475,...,1.691449,16.116106,1.007899,14.444293,1.381430,16.389055,1.637560,701.706242,165.424865,24.0
3,0.0,1.0,0.0,112.72,6.0,4.0,8.0,11.0,10.0,16.518,...,1.691449,16.116106,1.007899,14.444293,1.381430,16.389055,1.637560,701.706242,165.424865,24.0
4,0.0,1.0,0.0,208.00,6.0,4.0,14.0,16.0,8.0,17.808,...,1.691449,16.116106,1.007899,14.444293,1.381430,16.389055,1.637560,701.706242,165.424865,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20770,1.0,0.0,0.0,144.74,9.0,5.0,0.0,4.0,9.0,18.465,...,1.430253,16.442150,1.765719,15.129570,1.007715,16.706054,1.677022,701.336936,93.396601,45.0
20771,1.0,0.0,0.0,74.53,9.0,5.0,4.0,8.0,7.0,18.900,...,1.430253,16.442150,1.765719,15.129570,1.007715,16.706054,1.677022,701.336936,93.396601,45.0
20772,1.0,0.0,0.0,67.73,9.0,5.0,10.0,11.0,2.0,18.656,...,1.430253,16.442150,1.765719,15.129570,1.007715,16.706054,1.677022,701.336936,93.396601,45.0
20773,1.0,0.0,0.0,126.15,9.0,5.0,8.0,16.0,11.0,16.536,...,1.430253,16.442150,1.765719,15.129570,1.007715,16.706054,1.677022,701.336936,93.396601,45.0


In [16]:
def train_model(model, X_train, X_sub, y_train, filename="additional_features.csv"):
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.25, random_state=RANDOM_STATE)
    
    scores = cross_validate(model, X=X_train, y=y_train, cv=5, n_jobs=-1, scoring=('roc_auc', 'neg_log_loss'))
    
    print("-------------")
    print(scores)
    print("-------------")

    print("METRICS: ")
    print("ROC_AUC \t", end="")
    print("Scores: ", scores['test_roc_auc'], end="\t")
    print("Mean: ", scores['test_roc_auc'].mean())
    
    print("LOG_LOSS \t", end="")
    print("Scores: ", -1*scores['test_neg_log_loss'], end='\t')
    print("Mean: ", -1*scores['test_neg_log_loss'].mean())
    
    model.fit(X_train, y_train)
    roc_scores = scores['test_roc_auc']
    
    create_submission(model, X_sub, filename)
    
    return (model, roc_scores)

In [17]:
%%time

log_reg_clf = LogisticRegression(penalty='l1', C=0.01, solver='liblinear', random_state=RANDOM_STATE)
log_reg_model = make_pipeline(StandardScaler(), log_reg_clf)

trained_model, roc_scores = train_model(log_reg_model, X_train, X_sub, y_train, 'logreg.csv')
# 0.588045898170332

-------------
{'fit_time': array([0.09832311, 0.09049869, 0.10038471, 0.09463787, 0.09528279]), 'score_time': array([0.01198173, 0.01149917, 0.00902867, 0.01320338, 0.00905895]), 'test_roc_auc': array([0.57778512, 0.5966038 , 0.58100975, 0.60170211, 0.5908942 ]), 'test_neg_log_loss': array([-0.51003483, -0.5062727 , -0.50830697, -0.5073569 , -0.50822397])}
-------------
METRICS: 
ROC_AUC 	Scores:  [0.57778512 0.5966038  0.58100975 0.60170211 0.5908942 ]	Mean:  0.5895989958973786
LOG_LOSS 	Scores:  [0.51003483 0.5062727  0.50830697 0.5073569  0.50822397]	Mean:  0.5080390740650091
CPU times: user 243 ms, sys: 245 ms, total: 488 ms
Wall time: 1.65 s


In [18]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

QDA = QuadraticDiscriminantAnalysis()
QDA_model = make_pipeline(StandardScaler(), QDA)

trained_model, roc_scores = train_model(QDA_model, X_train, X_sub, y_train)

-------------
{'fit_time': array([0.13956332, 0.1396327 , 0.1304028 , 0.11171699, 0.07004809]), 'score_time': array([0.02981281, 0.03305364, 0.0291667 , 0.04239392, 0.01867533]), 'test_roc_auc': array([0.52154008, 0.49828744, 0.49377017, 0.51508812, 0.51164682]), 'test_neg_log_loss': array([-10.97541239,  -5.75799506, -16.67602236,  -6.97227774,
        -7.63606946])}
-------------
METRICS: 
ROC_AUC 	Scores:  [0.52154008 0.49828744 0.49377017 0.51508812 0.51164682]	Mean:  0.5080665264771642
LOG_LOSS 	Scores:  [10.97541239  5.75799506 16.67602236  6.97227774  7.63606946]	Mean:  9.603555401845336


after removing some features

In [19]:
len(X_train.columns) // 1.5

39.0

In [20]:
len(X_train.columns)


59

In [21]:
%%time
clf_reg = LogisticRegression()
log_reg_model_pca = make_pipeline(StandardScaler(), PCA(int(len(X_train.columns) // 2)), clf_reg)

# print(X_train.columns)
# X_train_dropped = X_train.drop(columns=['onehotencoder__x0_material_7', 'onehotencoder__x1_material_6', 'onehotencoder__x1_material_8', 'attribute_2', 'attribute_3'], axis=1)
# X_sub_dropped = X_sub.drop(columns=['onehotencoder__x0_material_7', 'onehotencoder__x1_material_6', 'onehotencoder__x1_material_8', 'attribute_2', 'attribute_3'], axis=1)

trained_model, roc_scores = train_model(log_reg_model_pca, X_train, X_sub, y_train)
# 0.588045898170332

-------------
{'fit_time': array([0.25507808, 0.26366711, 0.23897791, 0.23778725, 0.13810253]), 'score_time': array([0.01812696, 0.01395535, 0.01868796, 0.0193665 , 0.01165342]), 'test_roc_auc': array([0.58448023, 0.60235987, 0.57504532, 0.59726787, 0.58107619]), 'test_neg_log_loss': array([-0.5098627 , -0.50414167, -0.50943746, -0.50648637, -0.51028029])}
-------------
METRICS: 
ROC_AUC 	Scores:  [0.58448023 0.60235987 0.57504532 0.59726787 0.58107619]	Mean:  0.588045898170332
LOG_LOSS 	Scores:  [0.5098627  0.50414167 0.50943746 0.50648637 0.51028029]	Mean:  0.5080416981534861
CPU times: user 595 ms, sys: 876 ms, total: 1.47 s
Wall time: 960 ms


Try oversample

In [22]:
from imblearn.over_sampling import SMOTE

oversampler = SMOTE(sampling_strategy=0.4)
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)

log_reg_clf_over = LogisticRegression(penalty='l1', C=0.01, solver='liblinear', random_state=RANDOM_STATE)
log_reg_model_over = make_pipeline(StandardScaler(), log_reg_clf_over)

trained_model, roc_scores = train_model(log_reg_clf_over, X_train_oversampled, X_sub, y_train_oversampled, 'oversampled.csv')

# 0.5852244617767834

-------------
{'fit_time': array([12.31027198,  3.85189223,  3.05867982,  3.68068695,  4.89397001]), 'score_time': array([0.00889611, 0.00735211, 0.01008701, 0.010818  , 0.00786328]), 'test_roc_auc': array([0.58252776, 0.59803905, 0.59407789, 0.5928351 , 0.59857874]), 'test_neg_log_loss': array([-0.59001709, -0.58792855, -0.58765961, -0.58946923, -0.58740764])}
-------------
METRICS: 
ROC_AUC 	Scores:  [0.58252776 0.59803905 0.59407789 0.5928351  0.59857874]	Mean:  0.5932117095092686
LOG_LOSS 	Scores:  [0.59001709 0.58792855 0.58765961 0.58946923 0.58740764]	Mean:  0.5884964228720873


powered loading

In [23]:
from sklearn.preprocessing import PowerTransformer

X_train_powered = X_train.copy()
X_sub_powered = X_sub.copy()

display(X_train_powered)

pt = PowerTransformer()
print(X_train_powered['loading'].values)

X_train_powered['loading'] = pt.fit_transform(X_train_powered[['loading']])
X_sub_powered['loading'] = pt.transform(X_sub_powered[['loading']])

Unnamed: 0,onehotencoder__x0_material_7,onehotencoder__x1_material_6,onehotencoder__x1_material_8,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_13_std,measurement_14_mean,measurement_14_std,measurement_15_mean,measurement_15_std,measurement_16_mean,measurement_16_std,measurement_17_mean,measurement_17_std,area
0,1.0,0.0,1.0,80.10,9.0,5.0,7.0,8.0,4.0,18.040,...,0.997598,16.110886,1.546100,14.456503,1.592045,16.410397,1.634601,702.646048,106.622368,45.0
1,1.0,0.0,1.0,84.89,9.0,5.0,14.0,3.0,3.0,18.213,...,0.997598,16.110886,1.546100,14.456503,1.592045,16.410397,1.634601,702.646048,106.622368,45.0
2,1.0,0.0,1.0,82.43,9.0,5.0,12.0,1.0,5.0,18.057,...,0.997598,16.110886,1.546100,14.456503,1.592045,16.410397,1.634601,702.646048,106.622368,45.0
3,1.0,0.0,1.0,101.07,9.0,5.0,13.0,2.0,6.0,17.295,...,0.997598,16.110886,1.546100,14.456503,1.592045,16.410397,1.634601,702.646048,106.622368,45.0
4,1.0,0.0,1.0,188.06,9.0,5.0,9.0,2.0,8.0,19.346,...,0.997598,16.110886,1.546100,14.456503,1.592045,16.410397,1.634601,702.646048,106.622368,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26565,1.0,1.0,0.0,158.95,6.0,9.0,6.0,16.0,4.0,16.301,...,0.981313,15.811408,1.608324,15.111993,1.009718,16.066552,1.676261,701.852697,140.141663,54.0
26566,1.0,1.0,0.0,146.02,6.0,9.0,10.0,12.0,8.0,17.543,...,0.981313,15.811408,1.608324,15.111993,1.009718,16.066552,1.676261,701.852697,140.141663,54.0
26567,1.0,1.0,0.0,115.62,6.0,9.0,1.0,10.0,1.0,15.670,...,0.981313,15.811408,1.608324,15.111993,1.009718,16.066552,1.676261,701.852697,140.141663,54.0
26568,1.0,1.0,0.0,106.38,6.0,9.0,2.0,9.0,4.0,18.059,...,0.981313,15.811408,1.608324,15.111993,1.009718,16.066552,1.676261,701.852697,140.141663,54.0


[ 80.1   84.89  82.43 ... 115.62 106.38 131.2 ]


In [24]:
display(X_train_powered)

Unnamed: 0,onehotencoder__x0_material_7,onehotencoder__x1_material_6,onehotencoder__x1_material_8,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_13_std,measurement_14_mean,measurement_14_std,measurement_15_mean,measurement_15_std,measurement_16_mean,measurement_16_std,measurement_17_mean,measurement_17_std,area
0,1.0,0.0,1.0,-1.427932,9.0,5.0,7.0,8.0,4.0,18.040,...,0.997598,16.110886,1.546100,14.456503,1.592045,16.410397,1.634601,702.646048,106.622368,45.0
1,1.0,0.0,1.0,-1.231648,9.0,5.0,14.0,3.0,3.0,18.213,...,0.997598,16.110886,1.546100,14.456503,1.592045,16.410397,1.634601,702.646048,106.622368,45.0
2,1.0,0.0,1.0,-1.331022,9.0,5.0,12.0,1.0,5.0,18.057,...,0.997598,16.110886,1.546100,14.456503,1.592045,16.410397,1.634601,702.646048,106.622368,45.0
3,1.0,0.0,1.0,-0.642441,9.0,5.0,13.0,2.0,6.0,17.295,...,0.997598,16.110886,1.546100,14.456503,1.592045,16.410397,1.634601,702.646048,106.622368,45.0
4,1.0,0.0,1.0,1.447925,9.0,5.0,9.0,2.0,8.0,19.346,...,0.997598,16.110886,1.546100,14.456503,1.592045,16.410397,1.634601,702.646048,106.622368,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26565,1.0,1.0,0.0,0.883053,6.0,9.0,6.0,16.0,4.0,16.301,...,0.981313,15.811408,1.608324,15.111993,1.009718,16.066552,1.676261,701.852697,140.141663,54.0
26566,1.0,1.0,0.0,0.597676,6.0,9.0,10.0,12.0,8.0,17.543,...,0.981313,15.811408,1.608324,15.111993,1.009718,16.066552,1.676261,701.852697,140.141663,54.0
26567,1.0,1.0,0.0,-0.188674,6.0,9.0,1.0,10.0,1.0,15.670,...,0.981313,15.811408,1.608324,15.111993,1.009718,16.066552,1.676261,701.852697,140.141663,54.0
26568,1.0,1.0,0.0,-0.469632,6.0,9.0,2.0,9.0,4.0,18.059,...,0.981313,15.811408,1.608324,15.111993,1.009718,16.066552,1.676261,701.852697,140.141663,54.0


In [25]:
log_reg_clf_powered = LogisticRegression(penalty='l1', C=0.01, solver='liblinear', random_state=RANDOM_STATE)
log_reg_model_powered = make_pipeline(StandardScaler(), log_reg_clf_powered)

trained_model, roc_scores = train_model(log_reg_model_powered, X_train_powered, X_sub_powered, y_train, 'power_transformed.csv')

-------------
{'fit_time': array([0.28931975, 0.20993137, 0.22634602, 0.23090339, 0.18235612]), 'score_time': array([0.02938342, 0.03080511, 0.03104448, 0.02378273, 0.02184319]), 'test_roc_auc': array([0.57685289, 0.59594482, 0.58197475, 0.60076883, 0.58963018]), 'test_neg_log_loss': array([-0.50986879, -0.5063662 , -0.50843251, -0.50654488, -0.50849172])}
-------------
METRICS: 
ROC_AUC 	Scores:  [0.57685289 0.59594482 0.58197475 0.60076883 0.58963018]	Mean:  0.589034293087295
LOG_LOSS 	Scores:  [0.50986879 0.5063662  0.50843251 0.50654488 0.50849172]	Mean:  0.5079408217215186
