In [1]:
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from itertools import combinations
import tensorflow as tf

In [2]:
# ====================================================
# Read data
# ====================================================
def read_data(input_dir=''):
    """Read the train and test feature tables from parquet files.

    Parameters
    ----------
    input_dir : str or os.PathLike, optional
        Directory that contains ``train_df_final.parquet`` and
        ``test_df_final.parquet``. The default ``''`` resolves both file
        names relative to the current working directory.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, in that order.
    """
    # os.path.join('', name) == name, so the default reads the same paths
    # as plain file names would.
    train = pd.read_parquet(os.path.join(input_dir, 'train_df_final.parquet'))
    test = pd.read_parquet(os.path.join(input_dir, 'test_df_final.parquet'))
    return train, test

In [3]:
# ====================================================
# Amex metric
# ====================================================
def amex_metric(y_true, y_pred):
    """Compute the competition score ``0.5 * (G + D)`` for binary predictions.

    * ``D`` is the share of all positive labels found among the
      highest-scored rows that make up the first 4% of total weight.
    * ``G`` is the weighted Gini of the predictions divided by the weighted
      Gini obtained when rows are ordered by the true label.

    In both parts rows with label 0 carry weight 20 and all other rows
    carry weight 1.

    Parameters
    ----------
    y_true : array-like
        Binary labels (0 / 1). Lists, NumPy arrays and pandas Series are
        accepted; the values are flattened to one dimension.
    y_pred : array-like
        Predicted scores, higher meaning more likely positive. An ``(n, 1)``
        column (e.g. raw model output) is flattened as well.

    Returns
    -------
    float
        The metric value. If ``y_true`` contains no positive label the
        divisions by the positive count yield NaN.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same number of elements.
    """
    # Flatten up front so column-vector predictions and Series inputs are
    # stacked as two plain 1-D columns instead of a ragged array.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )
    pairs = np.column_stack((y_true, y_pred))  # column 0: label, column 1: score

    # --- top-4% capture rate -------------------------------------------
    labels = pairs[pairs[:, 1].argsort()[::-1]]  # highest score first
    weights = np.where(labels[:, 0] == 0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:, 0]) / np.sum(labels[:, 0])

    # --- normalised weighted Gini --------------------------------------
    # gini[1]: rows ordered by prediction; gini[0]: rows ordered by label
    # (the best achievable ordering), used as the normaliser.
    gini = [0, 0]
    for i in [1, 0]:
        labels = pairs[pairs[:, i].argsort()[::-1]]
        weight = np.where(labels[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] * weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)
    return 0.5 * (gini[1] / gini[0] + top_four)


In [4]:
# ====================================================
# tf amex metric
# ====================================================
def tf_amex_metric(y_pred, y_true):
    """Wrap ``amex_metric`` and return a ``(name, value, flag)`` triple.

    Note the argument order here is ``(y_pred, y_true)``, the reverse of
    ``amex_metric``. ``y_true`` is used exactly as passed; no ``get_label()``
    call is made on it.

    Returns
    -------
    tuple
        ``('amex_metric', score, True)``.
        NOTE(review): the trailing ``True`` presumably follows the
        ``(name, value, is_higher_better)`` convention of custom eval
        functions -- confirm against the intended consumer.
    """
    score = amex_metric(y_true, y_pred)
    return ('amex_metric', score, True)

In [5]:
# ====================================================
# Training
# ====================================================
def train_(train, test, n_splits=5, epochs=10, random_state=42):
    """Preprocess the feature tables and fit one dense network on a single split.

    Steps, in order:

    1. Label-encode the ``<name>_last`` categorical columns (encoder fit on
       ``train``, applied to ``test``).
    2. Add a ``<col>_round2`` copy, rounded to 2 decimals, of every float
       column whose name contains ``'last'``.
    3. Replace NaN with 0 in both tables.
    4. Build a shuffled ``StratifiedKFold`` and keep only its LAST split as
       the train / hold-out partition. No cross-validation is performed:
       exactly one model is trained.
    5. Build, compile and fit the network on the training part.

    Parameters
    ----------
    train : pandas.DataFrame
        Training table; must contain ``customer_ID``, ``target`` and the
        ``<name>_last`` categorical columns listed below.
    test : pandas.DataFrame
        Test table with the same feature columns.
    n_splits : int, optional
        Number of stratified folds; the hold-out is the last fold. Default 5.
    epochs : int, optional
        Number of training epochs passed to ``model.fit``. Default 10.
    random_state : int, optional
        Seed for the fold shuffling. Default 42.

    Returns
    -------
    tuple
        ``(model, X_test, y_test, test, features)``: the fitted Keras model,
        the hold-out features and labels, the NaN-filled test table, and the
        list of feature column names.

    Notes
    -----
    Steps 1 and 2 assign columns on the ``train`` / ``test`` objects that
    were passed in, so the caller's frames are modified. Step 3 produces new
    frames, so the caller's frames keep their NaNs; use the returned ``test``
    for prediction.
    """
    # Label encode categorical features
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]
    print("Encoding categorical variables")
    cat_features = [f"{cf}_last" for cf in cat_features]
    for cat_col in cat_features:
        # NOTE(review): the encoder is fit on train only; a category present
        # in test but absent from train would make transform() fail --
        # confirm the data guarantees this cannot happen.
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col])
        test[cat_col] = encoder.transform(test[cat_col])

    # Round last float features to 2 decimal place
    print("Rounding float features")
    num_cols = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)
    # Skip columns generated by this very step: because the caller's frames
    # are modified in place, a second call on the same frames would otherwise
    # match '<col>_last_round2' and add '<col>_last_round2_round2' features.
    num_cols = [col for col in num_cols if 'last' in col and not col.endswith('_round2')]
    for col in num_cols:
        train[col + '_round2'] = train[col].round(2)
        test[col + '_round2'] = test[col].round(2)

    # Filling NaN
    print("printing NaN values: ")
    print(train.isna().sum())
    print(test.isna().sum())

    train = train.replace(np.nan, 0)
    test = test.replace(np.nan, 0)

    print("After the fillna")
    print(train.isna().sum())
    print(test.isna().sum())

    # Get feature list
    print("Getting feature list")
    features = [col for col in train.columns if col not in ['customer_ID', "target"]]

    print("KFOLD Algorithm")
    kfold = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = random_state)
    # Only the last split is ever used, so keep just its indices and slice
    # the (large) frame once instead of materialising every fold.
    trn_ind, val_ind = list(kfold.split(train, train["target"]))[-1]
    feature_frame = train[features]
    X_train, X_test = feature_frame.iloc[trn_ind], feature_frame.iloc[val_ind]
    y_train, y_test = train["target"].iloc[trn_ind], train["target"].iloc[val_ind]

    print("Creating model")
    model = _build_model()

    print("Compiling model")
    model.compile(loss = "binary_crossentropy", optimizer = "adam",
                 metrics = ["accuracy"])

    print("\n\nFitting model")
    model.fit(X_train, y_train, epochs = epochs)

    return model, X_test, y_test, test, features


def _build_model():
    """Return the uncompiled dense network used by ``train_`` (sigmoid output)."""
    return tf.keras.Sequential([
        tf.keras.layers.Dense(224, activation = "relu"),
        tf.keras.layers.Dense(300, activation = 'relu'),
        tf.keras.layers.Dense(368, activation = "relu"),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(240, activation = 'relu'),
        tf.keras.layers.Dense(180, activation = 'relu'),
        tf.keras.layers.Dense(164, activation = "relu"),
        tf.keras.layers.Dense(128, activation = "relu"),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(64, activation = "relu"),
        tf.keras.layers.Dense(32, activation = "relu"),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(8, activation = "relu"),
        tf.keras.layers.Dense(1, activation = "sigmoid")
    ])

In [6]:
def evaluate(estimator=None, X_val=None, y_val=None):
    """Score a fitted model on a hold-out set with ``tf_amex_metric``.

    Parameters
    ----------
    estimator : object with ``predict``, optional
        Fitted model. Defaults to the notebook-level ``model``.
    X_val : array-like, optional
        Hold-out features. Defaults to the notebook-level ``X_test``.
    y_val : array-like, optional
        Hold-out labels. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    tuple
        The value returned by ``tf_amex_metric``; it is also printed.
    """
    # Fall back to the notebook globals only for arguments that were not
    # supplied, so the bare call ``evaluate()`` still works.
    estimator = model if estimator is None else estimator
    X_val = X_test if X_val is None else X_val
    y_val = y_test if y_val is None else y_val

    preds = estimator.predict(X_val)

    print("Calculating metric....")
    # predict() yields an (n, 1) column; drop the trailing axis before scoring.
    metric = tf_amex_metric(preds.squeeze(axis = 1), y_val)
    print(metric)
    return metric

In [26]:
# ====================================================
# Predict
# ====================================================
def predict(test, estimator=None, feature_names=None):
    """Predict on ``test`` and assemble the submission table.

    Parameters
    ----------
    test : pandas.DataFrame
        Table containing ``customer_ID`` and every feature column.
    estimator : object with ``predict``, optional
        Fitted model. Defaults to the notebook-level ``model``.
    feature_names : list of str, optional
        Columns fed to the model. Defaults to the notebook-level ``features``.

    Returns
    -------
    tuple of (pandas.DataFrame, numpy.ndarray)
        ``sub_df`` with columns ``customer_ID`` and ``prediction``, and the
        1-D array of predictions.
    """
    # Fall back to the notebook globals only for arguments that were not
    # supplied, so the existing call ``predict(test)`` still works.
    estimator = model if estimator is None else estimator
    feature_names = features if feature_names is None else feature_names

    # predict() yields an (n, 1) column; drop the trailing axis.
    predictions = estimator.predict(test[feature_names]).squeeze(axis = 1)

    sub_df = pd.DataFrame({"customer_ID": test["customer_ID"], "prediction":predictions})
    return sub_df, predictions

In [8]:
# ====================================================
# to_csv
# ====================================================
def to_csv(sub_df, name):
    """Write ``sub_df`` to ``name`` in CSV format without the index column."""
    sub_df.to_csv(path_or_buf=name, index=False)

In [11]:
# Load the train and test tables from the parquet files named in read_data().
train_df, test_df = read_data()

In [12]:
# Preprocess and fit. Note that train_() assigns encoded / rounded columns on
# train_df and test_df themselves; the returned `test` is the NaN-filled frame
# and is the one to use for prediction.
model, X_test, y_test, test, features = train_(train_df, test_df)

Encoding categorical variables
Rounding float features
printing NaN values: 
customer_ID               0
P_2_mean               2434
P_2_std                7829
P_2_min                2434
P_2_max                2434
                      ...  
D_141_last_round2      2532
D_142_last_round2    373333
D_143_last_round2      2532
D_144_last_round2         0
D_145_last_round2      2532
Length: 1096, dtype: int64
customer_ID               0
P_2_mean               4175
P_2_std                9937
P_2_min                4175
P_2_max                4175
                      ...  
D_141_last_round2      4607
D_142_last_round2    752615
D_143_last_round2      4607
D_144_last_round2         0
D_145_last_round2      4607
Length: 1095, dtype: int64
After the fillna
customer_ID          0
P_2_mean             0
P_2_std              0
P_2_min              0
P_2_max              0
                    ..
D_141_last_round2    0
D_142_last_round2    0
D_143_last_round2    0
D_144_last_round2    0
D_145_

In [13]:
# Score the fitted model on the hold-out split; with no arguments evaluate()
# uses the notebook-level `model`, `X_test` and `y_test` from the training cell.
evaluate()

Calculating metric....
('amex_metric', 0.7862040881991049, True)


In [27]:
# Predict on the processed test frame returned by train_() and build the
# submission table.
sub_df, predictions = predict(test)



In [17]:
# Row count of the test frame's customer_ID column.
test["customer_ID"].shape

(924621,)

In [30]:
# Display the submission table (customer_ID, prediction).
sub_df

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.040084
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.012515
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.052648
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.281016
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.751047
...,...,...
924616,ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...,0.029792
924617,ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...,0.699897
924618,ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...,0.388129
924619,ffffddef1fc3643ea179c93245b68dca0f36941cd83977...,0.225238


In [31]:
# Write the submission table to disk without the index column.
to_csv(sub_df, "tensorflowclassification(withALLDATA).csv")

In [33]:
# Count missing values per column of the submission table.
sub_df.isna().sum()

customer_ID    0
prediction     0
dtype: int64