In [1]:
# Viz
import matplotlib.pyplot as plt
import seaborn as sns

# Data Processing
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer

# Modeling 
from sklearn.ensemble import VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import cluster, accuracy_score, roc_auc_score
from sklearn.model_selection import cross_validate, GridSearchCV, cross_val_score, StratifiedKFold, GroupKFold
from sklearn.linear_model import HuberRegressor
from colorama import Fore, Back, Style
# Other
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the competition files.
train = pd.read_csv('../input/tabular-playground-series-aug-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-aug-2022/test.csv')
sample_submission = pd.read_csv('../input/tabular-playground-series-aug-2022/sample_submission.csv')

# Split the label off the training frame. The test frame has no label, so a
# constant placeholder Series is kept for it instead.
target = train.pop('failure')
test_target = pd.Series(1, index=test.index, name='failure')

# Stack train on top of test so feature engineering and imputation see both.
data = pd.concat((train, test), axis=0)
train.head()

## Pre-Processing 

In [3]:
# Indicator flags (0/1) marking rows where measurement_3 / measurement_5 are missing.
for _source_col, _flag_col in (('measurement_3', 'm3_missing'), ('measurement_5', 'm5_missing')):
    data[_flag_col] = data[_source_col].isnull().astype(np.int8)

# Feature interaction: product of attribute_2 and attribute_3.
data['area'] = data['attribute_2'].mul(data['attribute_3'])

# Columns considered "features": loading plus every measurement_* column.
feature = [f for f in test.columns if f == 'loading' or f.startswith('measurement')]

In [4]:
# Display attribute_2, one of the two columns multiplied into `area`.
data['attribute_2']

In [5]:
# Display the list of loading / measurement_* column names.
feature

In [6]:
# Dictionary of dictionaries: target measurement column -> product code ->
# predictor columns used to fill that target's missing values.
# Entries for the best-correlated measurement columns are added further below;
# only 'measurement_17' gets a manual selection, written out here.
full_fill_dict = {
    'measurement_17': {
        'A': ['measurement_5', 'measurement_6', 'measurement_8'],
        'B': ['measurement_4', 'measurement_5', 'measurement_7'],
        'C': ['measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'],
        'D': ['measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'],
        'E': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_8'],
        'F': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_7'],
        'G': ['measurement_4', 'measurement_6', 'measurement_8', 'measurement_9'],
        'H': ['measurement_4', 'measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'],
        'I': ['measurement_3', 'measurement_7', 'measurement_8'],
    },
}

In [7]:
# Columns excluded from the correlation analysis: every non-measurement column
# of the test frame, plus 'loading' and the two missing-value flags.
col = [name for name in test.columns if 'measurement' not in name]
col = col + ['loading', 'm3_missing', 'm5_missing']
col

In [8]:
# Rank measurement_3 .. measurement_16 by the summed absolute correlation with
# their three most correlated companion columns.
col = [col for col in test.columns if 'measurement' not in col] + ['loading', 'm3_missing', 'm5_missing']

# The correlation matrix does not depend on the loop variable, so it is
# computed once here instead of once per measurement column.
abs_corr_matrix = data.drop(col, axis=1).corr().abs()

a = []  # summed correlation per column
b = []  # matching column names
for x in range(3, 17):
    corr = abs_corr_matrix[f'measurement_{x}'].sort_values(ascending=False)
    # Position 0 is normally the column's correlation with itself, so the sum
    # takes positions 1..3.
    a.append(np.round(np.sum(corr.iloc[1:4]), 3))
    b.append(f'measurement_{x}')

c = pd.DataFrame({'Selected columns': b, 'correlation total': a})
c = c.sort_values(by='correlation total', ascending=False).reset_index(drop=True)
print('Columns selected by correlation sum of the 3 first rows : ')
display(c.head(10))


In [9]:
# Inspect the sorted absolute correlations of a single column (measurement_3).
x = 3
corr = (
    data.drop(col, axis=1)
    .corr()[f'measurement_{x}']
    .abs()
    .sort_values(ascending=False)
)
corr

In [10]:
# Display the full ranking table of measurement columns by summed correlation.
c

In [11]:
# Display the predictor-column dictionary as it currently stands.
full_fill_dict

In [12]:
# For each of the 10 top-ranked measurement columns in `c`, record per product
# code the 4 columns most correlated with it. These become the predictors used
# to fill that column's missing values.

# The per-product correlation matrix is the same for every target column, so
# it is built once per product code rather than once per (target, code) pair.
abs_corr_by_code = {
    code: data[data.product_code == code].drop(col, axis=1).corr().abs()
    for code in data.product_code.unique()
}

for i in range(10):
    # `c` already stores the full 'measurement_<n>' column name.
    measurement_col = c.iloc[i, 0]
    fill_dict = {}
    for x, abs_corr in abs_corr_by_code.items():
        corr = abs_corr[measurement_col].sort_values(ascending=False)
        # Position 0 is normally the column itself; keep positions 1..4.
        fill_dict[x] = corr.iloc[1:5].index.tolist()
    full_fill_dict[measurement_col] = fill_dict

In [13]:
# Fill missing measurements product code by product code:
#   1. for each target column in full_fill_dict, fit a HuberRegressor on the
#      rows where the target and its predictor columns are all present, then
#      predict the target where only the target is missing;
#   2. run a KNNImputer over all feature columns for that product code.
feature = [name for name in data.columns if name == 'loading' or name.startswith('measurement')]
nullValue_cols = [name for name in train.columns if train[name].isnull().sum() != 0]

for code in data.product_code.unique():
    total_na_filled_by_linear_model = 0
    print(f'\n-------- Product code {code} ----------\n')
    print(f'filled by linear model :')
    for measurement_col in list(full_fill_dict.keys()):
        # Re-read the rows for this code on every pass: earlier passes may have
        # filled columns that serve as predictors here.
        tmp = data[data.product_code == code]
        column = full_fill_dict[measurement_col][code]  # most relevant predictor columns
        # Training rows: target and every predictor present.
        tmp_train = tmp[column + [measurement_col]].dropna(how='any')
        # Rows to fill: every predictor present, target missing.
        tmp_test = tmp[tmp[column].notnull().all(axis=1) & tmp[measurement_col].isnull()]

        model = HuberRegressor(epsilon=1.9)
        model.fit(tmp_train[column], tmp_train[measurement_col])

        fill_mask = (
            (data.product_code == code)
            & data[column].notnull().all(axis=1)
            & data[measurement_col].isnull()
        )
        data.loc[fill_mask, measurement_col] = model.predict(tmp_test[column])

        total_na_filled_by_linear_model += tmp_test.shape[0]

    # Whatever is still missing for this product code is left to KNN.
    code_rows = data.product_code == code
    NA = data.loc[code_rows, nullValue_cols].isnull().sum().sum()
    model1 = KNNImputer(n_neighbors=3)
    data.loc[code_rows, feature] = model1.fit_transform(data.loc[code_rows, feature])
    print(f'\n{total_na_filled_by_linear_model} filled by linear model ')
    print(f'{NA} filled by KNN ')

# Row-wise mean of measurement_3 .. measurement_16.
measurement_avg_inputs = [f'measurement_{i}' for i in range(3, 17)]
data['measurement_avg'] = data[measurement_avg_inputs].mean(axis=1)

In [14]:
# Names of the float64 columns of the training frame.
float_cols = train.select_dtypes(include=['float64']).columns.tolist()

In [15]:
# log(1 + x) transform of every float column, applied to whole columns with
# np.log1p instead of a Python lambda per element.
# NOTE(review): `train` and `test` are re-created as slices of `data` in a
# later cell, so this transform does not appear to reach the model inputs.
# Confirm whether it should be applied to `data` instead, or removed.
for i in float_cols:
    train[i] = np.log1p(train[i])
    test[i] = np.log1p(test[i])

In [16]:
!pip install feature_engine
from feature_engine.encoding import WoEEncoder

In [17]:
# Set the id column aside and remove it from the feature frame in one step.
data_id = data.pop('id')

In [18]:
# Display the combined frame after the id column has been removed.
data

In [19]:
# Turn attribute_2 and attribute_3 into strings, element by element.
for _attribute in ('attribute_2', 'attribute_3'):
    data[_attribute] = data[_attribute].map(str)

In [20]:
# Numeric model inputs.
NUMERIC_FEATURE_NAMES = ['loading', 'measurement_0', 'measurement_1', 'measurement_2',
       'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6',
       'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10',
       'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14',
       'measurement_15', 'measurement_16', 'measurement_17', 'm3_missing',
       'm5_missing', 'area', 'measurement_avg']

# Cast each column with astype so the resulting dtype is float32 by
# construction, rather than whatever pandas infers from a per-element lambda.
for i in NUMERIC_FEATURE_NAMES:
    data[i] = data[i].astype(np.float32)

In [21]:
# Cut the combined frame back into its train / test parts by row position.
n_train_rows = train.shape[0]
train, test = data.iloc[:n_train_rows, :], data.iloc[n_train_rows:, :]
print(train.shape, test.shape)

# Model inputs: features, labels, and the product code of each training row.
groups = train.product_code
X, y = train, target

In [22]:
# Display the column names of the training feature frame.
X.columns

In [23]:
# Display the training feature frame.
X

In [24]:
# Columns fed to the logistic-regression model.
select_feature = [
    'loading', 'attribute_0', 'measurement_17',
    'measurement_0', 'measurement_1', 'measurement_2',
    'area', 'm3_missing', 'm5_missing', 'measurement_avg',
]

## Model Training

# TabTransformer

In [25]:
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_addons as tfa
import matplotlib.pyplot as plt

In [None]:
# Display the columns of the frame that is fed to the TabTransformer.
# (`df` is not defined anywhere in this notebook; `X` is the training frame.)
X.columns

In [26]:
# Feature schema for the TabTransformer, derived from the prepared frames so
# that every listed name is a real column of the model input `X`.
# NOTE(review): this selection replaces a reference to an undefined `df` and a
# placeholder vocabulary; confirm it matches the intended experiment.

# Numeric inputs: every numeric column of X.
NUMERIC_FEATURE_NAMES = X.select_dtypes(include='number').columns.tolist()

# Categorical inputs: the remaining columns, each with the values observed over
# train + test, because the lookup layer is built with num_oov_indices=0.
# product_code is left out: it is the grouping key of the cross-validation and
# is not part of `select_feature` either.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    name: sorted(data[name].unique())
    for name in X.columns
    if name not in NUMERIC_FEATURE_NAMES and name != 'product_code'
}

CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
# Per-feature default value: [0.0] for numeric features, ["NA"] otherwise.
COLUMN_DEFAULTS = [
    [0.0] if feature_name in NUMERIC_FEATURE_NAMES else ["NA"]
    for feature_name in FEATURE_NAMES
]
TARGET_FEATURE_NAME = "label"
TARGET_LABELS = ["0", "1"]

# Prepare input

In [28]:
def get_dataset_from_dataframe(data, batch_size=128, shuffle=False):
    """Turn an ``(features, labels)`` pair into a batched ``tf.data.Dataset``.

    Args:
        data: two-item sequence ``(x, y)``. ``x`` is converted with ``dict(x)``
            into a name -> column mapping; ``y`` holds the labels.
        batch_size: number of examples per batch.
        shuffle: when true, shuffle with a buffer of ``len(x)`` elements.

    Returns:
        A dataset yielding ``(feature_dict, label)`` batches.
    """
    x, y = data

    # Cache before shuffling: a cache placed after shuffle() replays the same
    # element order on every pass, which would disable per-epoch reshuffling.
    dataset = tf.data.Dataset.from_tensor_slices((dict(x), y)).cache()

    if shuffle:
        dataset = dataset.shuffle(len(x))

    return dataset.batch(batch_size)

## Configure the hyperparameters

In [None]:
## optuna

In [29]:
# Optimizer / training settings.
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
DROPOUT_RATE = 0.2
BATCH_SIZE = 265  # NOTE(review): possibly meant to be 256 - confirm
NUM_EPOCHS = 15

# Model architecture settings.
NUM_TRANSFORMER_BLOCKS = 3  # number of transformer blocks in the model
NUM_HEADS = 4  # attention heads per multi-head attention layer
EMBEDDING_DIMS = 16  # embedding size of each categorical feature
# Hidden layer widths of the final MLP, as multiples of its input width.
MLP_HIDDEN_UNITS_FACTORS = [
    2,
    1,
]
NUM_MLP_BLOCKS = 2  # not referenced by the cells visible in this notebook

In [31]:
def create_model_inputs():
    """Create one scalar Keras ``Input`` per entry of ``FEATURE_NAMES``.

    Features listed in ``NUMERIC_FEATURE_NAMES`` get dtype ``tf.float32``;
    every other feature gets ``tf.string``.

    Returns:
        dict mapping feature name -> ``layers.Input`` tensor.
    """
    return {
        feature_name: layers.Input(
            name=feature_name,
            shape=(),
            dtype=tf.float32 if feature_name in NUMERIC_FEATURE_NAMES else tf.string,
        )
        for feature_name in FEATURE_NAMES
    }

# Implement a training and evaluation procedure

In [344]:
def run_experiment(
    model,
    train_data,
    val_data,
    test_data,
    num_epochs,
    learning_rate,
    weight_decay,
    batch_size,
):
    """Compile, train and evaluate ``model``, then predict on ``test_data``.

    Args:
        model: Keras model to train; it is compiled here with AdamW, binary
            cross-entropy and an AUC metric.
        train_data: ``(features, labels)`` pair used for fitting (shuffled).
        val_data: ``(features, labels)`` pair used for validation.
        test_data: features only; converted with ``dict(test_data)``.
        num_epochs: number of training epochs.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.
        batch_size: batch size used for the train, validation and test datasets.

    Returns:
        ``(history, predictions)`` where ``history`` is the object returned by
        ``model.fit`` and ``predictions`` is a flattened NumPy array.
    """
    optimizer = tfa.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=weight_decay
    )

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.BinaryCrossentropy(),
        metrics=[keras.metrics.AUC()],
        run_eagerly=True,
    )

    train_dataset = get_dataset_from_dataframe(train_data, batch_size, shuffle=True)
    validation_dataset = get_dataset_from_dataframe(val_data, batch_size)
    # Use the batch_size argument here as well, not the global BATCH_SIZE, so
    # the caller's setting applies to all three datasets.
    test_dataset = (
        tf.data.Dataset.from_tensor_slices(dict(test_data)).batch(batch_size).cache()
    )

    print("Start training the model...")
    history = model.fit(
        train_dataset, epochs=num_epochs, validation_data=validation_dataset
    )
    print("Model training finished")

    # evaluate() returns [loss, AUC] for the metrics configured above.
    _, score = model.evaluate(validation_dataset, verbose=0)

    print(f"Validation AUC: {round(score * 100, 2)}%")

    predict = model.predict(test_dataset)
    return history, np.array(predict).reshape(-1)

# Create model inputs

In [345]:
def create_model_inputs():
    """Return a dict of scalar Keras ``Input`` tensors keyed by feature name.

    Numeric features (those in ``NUMERIC_FEATURE_NAMES``) are ``tf.float32``
    inputs; all remaining entries of ``FEATURE_NAMES`` are ``tf.string`` inputs.

    NOTE(review): a function with this name and the same behavior is already
    defined in an earlier cell; one of the two definitions can be removed.
    """
    inputs = {}
    for feature_name in FEATURE_NAMES:
        is_numeric = feature_name in NUMERIC_FEATURE_NAMES
        inputs[feature_name] = layers.Input(
            name=feature_name,
            shape=(),
            dtype=tf.float32 if is_numeric else tf.string,
        )
    return inputs

# Encode features

In [346]:
def encode_inputs(inputs, embedding_dims):
    """Split model inputs into embedded categorical and raw numeric tensors.

    Args:
        inputs: dict of feature name -> Keras input tensor.
        embedding_dims: output size of each categorical embedding.

    Returns:
        ``(encoded_categorical_feature_list, numerical_feature_list)``: the
        first holds one embedding tensor per categorical feature, the second
        one tensor per other feature with a trailing axis of size 1 added.
    """
    categorical_embeddings = []
    numeric_columns = []

    for feature_name, feature_input in inputs.items():
        if feature_name not in CATEGORICAL_FEATURE_NAMES:
            # Non-categorical feature: only add a trailing axis.
            numeric_columns.append(tf.expand_dims(feature_input, -1))
            continue

        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]

        # Map each string to its integer index in the vocabulary; no mask
        # token and no out-of-vocabulary bucket are reserved.
        string_to_index = layers.StringLookup(
            vocabulary=vocabulary,
            mask_token=None,
            num_oov_indices=0,
            output_mode="int",
        )
        token_ids = string_to_index(feature_input)

        # One embedding row per vocabulary entry.
        index_to_vector = layers.Embedding(
            input_dim=len(vocabulary), output_dim=embedding_dims
        )
        categorical_embeddings.append(index_to_vector(token_ids))

    return categorical_embeddings, numeric_columns

# Implement an MLP block

In [347]:
def create_mlp(hidden_units, dropout_rate, activation, normalization_layer, name=None):
    """Build a feed-forward stack of (normalization, Dense, Dropout) groups.

    Args:
        hidden_units: iterable of Dense layer widths; one group per entry.
        dropout_rate: rate passed to every ``Dropout`` layer.
        activation: activation passed to every ``Dense`` layer.
        normalization_layer: either a Keras layer instance or a zero-argument
            callable (for example a layer class) returning a new layer.
        name: optional name for the returned model.

    Returns:
        A ``keras.Sequential`` holding the stacked layers.
    """

    def _normalization_for(position):
        """Return the normalization layer for the group at ``position``."""
        if not isinstance(normalization_layer, layers.Layer):
            # A factory or layer class was passed: build a new layer each time.
            return normalization_layer()
        if position == 0:
            return normalization_layer
        # Groups see inputs of different widths, so each one needs its own
        # layer. Clone the configuration without the name so the copy gets a
        # unique auto-generated one.
        config = normalization_layer.get_config()
        config.pop("name", None)
        return type(normalization_layer).from_config(config)

    mlp_layers = []
    for position, units in enumerate(hidden_units):
        mlp_layers.append(_normalization_for(position))
        mlp_layers.append(layers.Dense(units, activation=activation))
        mlp_layers.append(layers.Dropout(dropout_rate))

    return keras.Sequential(mlp_layers, name=name)

In [348]:
def create_tabtransformer_classifier(
    num_transformer_blocks,
    num_heads,
    embedding_dims,
    mlp_hidden_units_factors,
    dropout_rate,
    use_column_embedding=False,
):
    """Assemble the TabTransformer binary classifier as a Keras model.

    Args:
        num_transformer_blocks: number of attention + feed-forward blocks
            applied to the categorical embeddings.
        num_heads: attention heads per ``MultiHeadAttention`` layer.
        embedding_dims: embedding size of each categorical feature; also used
            as the attention ``key_dim`` and the feed-forward width.
        mlp_hidden_units_factors: multipliers of the concatenated feature width
            giving the hidden layer sizes of the final MLP.
        dropout_rate: dropout used in attention and in both MLPs.
        use_column_embedding: when true, add a learned per-column embedding to
            the categorical embeddings.

    Returns:
        ``keras.Model`` mapping the input dict to a single sigmoid output.
    """


    # Build the inputs and split them into categorical embeddings / numeric tensors.
    inputs = create_model_inputs()
    encoded_categorical_feature_list, numerical_feature_list = encode_inputs(
        inputs, embedding_dims
    )
    # Stack the per-feature embeddings along a new axis 1 (one slot per categorical feature).
    encoded_categorical_features = tf.stack(encoded_categorical_feature_list, axis=1)
    # Join the numeric tensors along the last axis.
    numerical_features = layers.concatenate(numerical_feature_list)


    # Optionally add an embedding indexed by column position.
    if use_column_embedding:
        num_columns = encoded_categorical_features.shape[1]
        column_embedding = layers.Embedding(
            input_dim=num_columns, output_dim=embedding_dims
        )
        column_indices = tf.range(start=0, limit=num_columns, delta=1)
        encoded_categorical_features = encoded_categorical_features + column_embedding(
            column_indices
        )

    # Transformer blocks over the categorical embeddings.
    for block_idx in range(num_transformer_blocks):
        # Self-attention: the embeddings serve as both query and value.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embedding_dims,
            dropout=dropout_rate,
            name=f"multihead_attention_{block_idx}",
        )(encoded_categorical_features, encoded_categorical_features)
        # First skip connection followed by layer normalization.
        x = layers.Add(name=f"skip_connection1_{block_idx}")(
            [attention_output, encoded_categorical_features]
        )
        x = layers.LayerNormalization(name=f"layer_norm1_{block_idx}", epsilon=1e-6)(x)
        # Single-layer feed-forward network of width embedding_dims.
        feedforward_output = create_mlp(
            hidden_units=[embedding_dims],
            dropout_rate=dropout_rate,
            activation=keras.activations.gelu,
            normalization_layer=layers.LayerNormalization(epsilon=1e-6),
            name=f"feedforward_{block_idx}",
        )(x)
        # Second skip connection followed by layer normalization; the result feeds the next block.
        x = layers.Add(name=f"skip_connection2_{block_idx}")([feedforward_output, x])
        encoded_categorical_features = layers.LayerNormalization(
            name=f"layer_norm2_{block_idx}", epsilon=1e-6
        )(x)


    # Flatten the contextual embeddings and join them with the normalized numeric features.
    categorical_features = layers.Flatten()(encoded_categorical_features)
    numerical_features = layers.LayerNormalization(epsilon=1e-6)(numerical_features)
    features = layers.concatenate([categorical_features, numerical_features])

    # Hidden sizes of the final MLP are multiples of the concatenated feature width.
    mlp_hidden_units = [
        factor * features.shape[-1] for factor in mlp_hidden_units_factors
    ]
    features = create_mlp(
        hidden_units=mlp_hidden_units,
        dropout_rate=dropout_rate,
        activation=keras.activations.selu,
        normalization_layer=layers.BatchNormalization(),
        name="MLP",
    )(features)

    # Single sigmoid unit for the binary target.
    outputs = layers.Dense(units=1, activation="sigmoid", name="sigmoid")(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


# Instantiate the classifier from the hyperparameter constants, then report its
# size and draw its layer graph.
_tabtransformer_settings = dict(
    num_transformer_blocks=NUM_TRANSFORMER_BLOCKS,
    num_heads=NUM_HEADS,
    embedding_dims=EMBEDDING_DIMS,
    mlp_hidden_units_factors=MLP_HIDDEN_UNITS_FACTORS,
    dropout_rate=DROPOUT_RATE,
)
tabtransformer_model = create_tabtransformer_classifier(**_tabtransformer_settings)

print("Total model weights:", tabtransformer_model.count_params())
keras.utils.plot_model(tabtransformer_model, show_shapes=True, rankdir="LR")

In [349]:
# Print the layer-by-layer summary of the TabTransformer model.
tabtransformer_model.summary()

In [None]:
# Train the TabTransformer under two cross-validation strategies and collect
# one test-set prediction vector per fold in `result`.
result = []
for step, cross_val in enumerate([StratifiedKFold(n_splits=5, shuffle=True, random_state=0), GroupKFold(n_splits=5)]):
    print(f'\n******** cross validation strategy : {cross_val} ********\n')

    kf = cross_val
    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y, groups=train.product_code)):

        x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Build a new, untrained model for every fold. Reusing one model
        # instance would carry weights fitted on earlier folds (which include
        # this fold's validation rows) into the current fold.
        fold_model = create_tabtransformer_classifier(
            num_transformer_blocks=NUM_TRANSFORMER_BLOCKS,
            num_heads=NUM_HEADS,
            embedding_dims=EMBEDDING_DIMS,
            mlp_hidden_units_factors=MLP_HIDDEN_UNITS_FACTORS,
            dropout_rate=DROPOUT_RATE,
        )

        history, prdict = run_experiment(
            model=fold_model,
            train_data=(x_train, y_train),
            val_data=(x_val, y_val),
            test_data=test,
            num_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            weight_decay=WEIGHT_DECAY,
            batch_size=BATCH_SIZE,
        )
        result.append(prdict)

In [359]:
# Shape of the first fold's test prediction vector.
result[0].shape

In [357]:
# Average the per-fold TabTransformer test predictions element-wise.
result_tab = np.asarray(result).mean(axis=0)

In [358]:
# Shape of the averaged TabTransformer prediction vector.
result_tab.shape

# Logistic Regression

In [360]:
# library for coding string values :
! pip install feature_engine
from feature_engine.encoding import WoEEncoder

In [361]:
# Thanks to @MAXSARMENTO
# Fit a weight-of-evidence encoder for attribute_0 on the training data, then
# apply the fitted encoder to both frames.
woe_encoder = WoEEncoder(variables=['attribute_0'])
woe_encoder.fit(X, y)
test = woe_encoder.transform(test)
X = woe_encoder.transform(X)

In [363]:

# Logistic regression under two cross-validation strategies. For each strategy
# the test predictions are averaged over the 5 folds and stored in result_0
# (first strategy) or result_1 (second strategy).
result_0 = []
result_1 = []
cv_strategies = [StratifiedKFold(n_splits=5, shuffle=True, random_state=0), GroupKFold(n_splits=5)]
for step, cross_val in enumerate(cv_strategies):
    print(f'\n******** cross validation strategy : {cross_val} ********\n')
    lr_oof = np.zeros(len(train))   # out-of-fold predictions for the training rows
    lr_test = np.zeros(len(test))   # running fold-average of test predictions
    lr_auc = 0                      # running fold-average of validation AUC
    importance_list = []            # fitted coefficients, one array per fold

    kf = cross_val
    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y, groups=train.product_code)):
        x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = LogisticRegression(max_iter=200, C=0.0001, penalty='l2', solver='newton-cg')
        model.fit(x_train[select_feature], y_train)
        importance_list.append(model.coef_.ravel())

        val_preds = model.predict_proba(x_val[select_feature])[:, 1]
        fold_auc = roc_auc_score(y_val, val_preds)
        print("FOLD: ", fold_idx+1, " ROC-AUC:", round(fold_auc, 5))
        # Each of the 5 folds contributes one fifth of the averages.
        lr_auc += fold_auc / 5
        lr_test += model.predict_proba(test[select_feature])[:, 1] / 5
        lr_oof[val_idx] = val_preds

    if step != 0:
        result_1 = lr_test
    else:
        result_0 = lr_test
    print(f"\n{Fore.GREEN}{Style.BRIGHT}Average auc = {round(lr_auc, 5)}{Style.RESET_ALL}")
    print(f"{Fore.BLUE}{Style.BRIGHT}OOF auc     = {round(roc_auc_score(y, lr_oof), 5)}{Style.RESET_ALL}\n")

## Submission

In [365]:
# LR GS  0.58786
# Blend: equal-weight average of the two logistic-regression runs and the
# averaged TabTransformer predictions, written out as the submission file.
blended_failure = (result_0 + result_1 + result_tab) / 3
sub1 = sample_submission.copy()
sub1['failure'] = blended_failure
sub1.to_csv('submission.csv', index=False)

In [367]:
# Summary statistics of the blended predictions as a sanity check.
sub1.failure.describe()

In [368]:
# Display the fold-averaged test predictions from the first CV strategy.
result_0