In [1]:
#Importing the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score as roc
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
import keras_tuner as kt
import keras as kr
import gc
# Directory that holds the parquet input files, relative to the notebook's
# working directory. Note that the value already ends with a '/' separator.
dataPath = '../Data/parquet_files/'

In [2]:
# dataPath already ends with '/', so strip it before joining to avoid a doubled
# separator ('parquet_files//train_data.parquet') in the resulting path.
data = pd.read_parquet(dataPath.rstrip('/') + '/train_data.parquet')

In [3]:
def set_table_dtypes(df):
    """Cast every column whose name ends in "P" or "A" to float.

    The frame is modified in place and also returned, so both
    ``set_table_dtypes(df)`` and ``df = set_table_dtypes(df)`` work.

    Args:
        df: DataFrame whose columns are inspected by name.

    Returns:
        The same DataFrame object, with the matching columns cast to float.

    Raises:
        ValueError / TypeError: propagated from ``astype(float)`` when a
            matching column holds values that cannot be converted.
    """
    for col_name in df.columns:
        # isinstance + endswith instead of col_name[-1]: indexing raised
        # IndexError on an empty column name and TypeError on non-string
        # labels (e.g. integer column names). Such columns are now skipped.
        if isinstance(col_name, str) and col_name.endswith(("P", "A")):
            df[col_name] = df[col_name].astype(float)
    return df

def convert_strings_pandas(df: pd.DataFrame) -> pd.DataFrame:
    """Convert object/string columns to ordered categoricals with an "Unknown" level.

    Each selected column keeps its observed categories (in the order pandas
    infers them) and gets one extra category, "Unknown", appended at the end.
    No value is mapped to "Unknown" here; missing values stay missing.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to convert.

    Returns:
        The same DataFrame object, with its object/string columns converted.
    """
    # Select columns that are of type 'object' or 'string'
    string_cols = df.select_dtypes(include=['object', 'string']).columns

    for col in string_cols:
        # Categories as pandas infers them for this column
        categories = df[col].astype("category").cat.categories.tolist()

        # CategoricalDtype rejects duplicate categories, so only append
        # "Unknown" when the column does not already contain that value.
        if "Unknown" not in categories:
            categories.append("Unknown")

        new_dtype = pd.CategoricalDtype(categories=categories, ordered=True)
        df[col] = df[col].astype(new_dtype)

    return df

In [4]:
# Cast the columns whose names end in "P" or "A" to float.
# set_table_dtypes modifies `data` in place and returns the same object.
data = set_table_dtypes(data)

In [5]:
# Sanity check: (rows, columns) of the loaded table
data.shape

(1526659, 58)

In [6]:
# Turn object/string columns into ordered categoricals (with an extra "Unknown" level).
# convert_strings_pandas modifies `data` in place and returns the same object.
data = convert_strings_pandas(data)

In [7]:
def prepar_data_set(data_df):
    """Label-encode the categorical columns of ``data_df`` in place.

    Both feature lists are collected before any encoding happens, so the
    encoded (now integer) columns appear only in the categorical list and not
    in the numeric one.

    Args:
        data_df: DataFrame containing 'category' and numeric columns.

    Returns:
        A tuple ``(data_df, category_features, numeric_features)`` where
        ``data_df`` is the same object that was passed in.
    """
    numeric_features = list(data_df.select_dtypes('number').columns)
    category_features = list(data_df.select_dtypes('category').columns)

    for name in category_features:
        # Encode the string form of each value; a fresh encoder is fitted per
        # column and is not kept afterwards.
        as_text = data_df[name].astype(str)
        data_df[name] = LabelEncoder().fit_transform(as_text)

    return data_df, category_features, numeric_features

In [8]:
# Label-encode the categorical columns and collect the feature-name lists.
# prepar_data_set returns the frame it was given, so `train` and `data` are the same object.
train,cat_features,num_feature = prepar_data_set(data)

In [9]:
# Split the unique case_ids 60% / 20% / 20%: first carve off the training ids,
# then halve the remainder into validation and test ids.
case_ids = train["case_id"].unique()
case_ids_train, case_ids_rest = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_rest, train_size=0.5, random_state=1)

# Predictors are the numeric features minus the identifier, time and label columns.
# num_feature was collected before label encoding, so encoded categorical
# columns are not part of cols_pred.
non_predictors = {'case_id', 'WEEK_NUM', 'target'}
cols_pred = [col for col in num_feature if col not in non_predictors]

def from_ids_to_dataframes(case_ids):
    """Build the base / feature / label tables for a set of case ids.

    Reads the module-level ``train`` frame and ``cols_pred`` column list.

    Args:
        case_ids: Iterable of case_id values to keep.

    Returns:
        A tuple ``(base_data, X_data, y_data)``:
        ``base_data`` holds 'case_id', 'WEEK_NUM' and 'target',
        ``X_data`` holds the ``cols_pred`` columns, and
        ``y_data`` is the 'target' column.
    """
    filtered_data = train[train["case_id"].isin(case_ids)]
    # Return independent copies: callers assign into these frames later
    # (scaling, fillna, adding a "score" column), and writing to a slice of
    # `train` triggers SettingWithCopyWarning.
    base_data = filtered_data[['case_id', 'WEEK_NUM', 'target']].copy()
    X_data = filtered_data[cols_pred].copy()
    y_data = filtered_data["target"].copy()
    return base_data, X_data, y_data

# Materialise the three splits in one pass (train, validation, test order).
(base_train, X_train, y_train), (base_valid, X_valid, y_valid), (base_test, X_test, y_test) = (
    from_ids_to_dataframes(ids)
    for ids in (case_ids_train, case_ids_valid, case_ids_test)
)

# Keep only the feature names that actually made it into the predictor frame.
predictor_columns = X_train.columns
num_feature = [name for name in num_feature if name in predictor_columns]
cat_features = [name for name in cat_features if name in predictor_columns]

In [10]:
# Fit the scaler on the training split only, then apply the same
# transformation to every split so validation/test use the training statistics.
scaler = StandardScaler().fit(X_train[num_feature])
for features in (X_train, X_valid, X_test):
    features[num_feature] = scaler.transform(features[num_feature])

In [11]:
# Impute missing values with 0 in every split, not only the training one:
# NaNs left in X_valid / X_test propagate to NaN predictions and break the
# evaluation metrics. Reassigning (instead of inplace=True) keeps the cell
# safe to re-run and avoids writing into a possible view.
X_train = X_train.fillna(0)
X_valid = X_valid.fillna(0)
X_test = X_test.fillna(0)

In [14]:
def generate_categorical_feature_tf(categorical_features, num_features, data,
                                    embedding_dim=200, hidden_units=1000,
                                    hidden_activation='relu'):
    """Build and compile a binary classifier with one embedding per categorical feature.

    The model has one scalar input per categorical feature (named
    ``'input_<feature>'`` with spaces replaced by underscores) followed by a
    single input named ``'input_number_features'`` that carries all numeric
    features. Embedded categoricals and the numeric block are concatenated and
    passed through two Dense + BatchNormalization stages to a sigmoid output.

    Args:
        categorical_features: Names of the integer-encoded categorical columns
            in ``data``. May be empty.
        num_features: Names of the numeric columns; only their count is used.
        data: DataFrame used to size each embedding table. Assumes the
            categorical columns hold non-negative integer codes.
        embedding_dim: Width of every embedding vector (default 200).
        hidden_units: Units in each hidden Dense layer (default 1000).
        hidden_activation: Activation for the hidden Dense layers. Without an
            activation the stacked Dense/BatchNormalization layers reduce to an
            affine map; pass ``None`` to get that linear behaviour.

    Returns:
        A compiled ``tf.keras`` Model (binary cross-entropy loss, Adam
        optimizer, 'accuracy' metric).
    """
    branches = []
    inputs = []
    for cat in categorical_features:
        # Embedding indices must be < input_dim. nunique() alone is too small
        # when `data` is a subset that misses some codes, so also cover the
        # largest code present.
        vocab_size = int(data[cat].nunique())
        max_code = data[cat].max()
        if pd.notna(max_code):
            vocab_size = max(vocab_size, int(max_code) + 1)

        inpt = tf.keras.layers.Input(shape=(1,), name='input_' + '_'.join(cat.split(' ')))
        inputs.append(inpt)
        embed = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            trainable=True,
            embeddings_initializer=tf.initializers.random_normal,
        )(inpt)
        # (batch, 1, embedding_dim) -> (batch, embedding_dim)
        embed_reshaped = tf.keras.layers.Reshape(target_shape=(embedding_dim,))(embed)
        branches.append(embed_reshaped)

    # shape must be a tuple; `(len(num_features))` without the trailing comma
    # is just an int.
    num_input = tf.keras.layers.Input(shape=(len(num_features),),
                                      name='input_number_features')
    inputs.append(num_input)
    branches.append(num_input)

    # With no categorical features there is a single branch and nothing to concatenate.
    if len(branches) == 1:
        merged = branches[0]
    else:
        merged = tf.keras.layers.concatenate(branches)

    pre_preds = tf.keras.layers.Dense(hidden_units, activation=hidden_activation)(merged)
    pre_preds = tf.keras.layers.BatchNormalization()(pre_preds)
    pre_preds = tf.keras.layers.Dense(hidden_units, activation=hidden_activation)(pre_preds)
    pre_preds = tf.keras.layers.BatchNormalization()(pre_preds)
    pred = tf.keras.layers.Dense(1, activation='sigmoid')(pre_preds)

    model_full = tf.keras.models.Model(inputs=inputs, outputs=pred)
    model_full.compile(loss=tf.keras.losses.binary_crossentropy,
                       metrics=['accuracy'],
                       optimizer='adam')
    return model_full

In [None]:
# The builder's signature is (categorical_features, num_features, data);
# calling it with only X_train raises TypeError for the missing arguments.
model = generate_categorical_feature_tf(cat_features, num_feature, X_train)

In [None]:
# Train the model on the training split for 100 epochs, starting at epoch 0.
# No validation_data is passed, so only training-split metrics are reported here.
fit_model = model.fit(X_train, y_train, epochs=100, initial_epoch= 0)

In [None]:
# Evaluate the model on the validation split (X_valid / y_valid).
# Two values are unpacked: the loss plus the single 'accuracy' metric the model was compiled with.
model_loss, model_accuracy = model.evaluate(X_valid,y_valid,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Score the held-out test split; the model's final layer is a single sigmoid unit.
predictions = model.predict(X_test)

In [None]:
# ROC AUC on the test split (`roc` is sklearn's roc_auc_score, aliased in the import cell)
auc_roc = roc(y_test, predictions)
print(f'AUC-ROC score: {auc_roc}')

In [None]:
# Persist the trained model; the relative path resolves against the current working directory.
# NOTE(review): the '.h5' extension presumably selects the legacy HDF5 format — confirm
# it is still the preferred format for the installed Keras version.
model.save('home_credit_risk_model.h5')

In [None]:
# Attach the model's prediction for every row to the matching base table.
for base, X in zip((base_train, base_valid, base_test), (X_train, X_valid, X_test)):
    y_pred = model.predict(X)
    base["score"] = y_pred

# Report ROC AUC per split from the stored scores.
print(f'The AUC score on the train set is: {roc(base_train["target"], base_train["score"])}')
print(f'The AUC score on the valid set is: {roc(base_valid["target"], base_valid["score"])}')
print(f'The AUC score on the test set is: {roc(base_test["target"], base_test["score"])}')

In [None]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    """Compute a stability-adjusted Gini score over weeks.

    For each WEEK_NUM the Gini coefficient (2 * AUC - 1) of ``score`` against
    ``target`` is computed. A straight line is fitted through the weekly Gini
    values; the result is the mean weekly Gini, plus ``w_fallingrate`` times
    the slope when the slope is negative, plus ``w_resstd`` times the standard
    deviation of the residuals around the fitted line.

    Args:
        base: DataFrame with 'WEEK_NUM', 'target' and 'score' columns.
        w_fallingrate: Weight applied to a negative slope (default 88.0).
        w_resstd: Weight applied to the residual standard deviation (default -0.5).

    Returns:
        The combined stability score as a float.
    """
    def _weekly_gini(week_rows):
        return 2*roc(week_rows["target"], week_rows["score"])-1

    ordered = base.loc[:, ["WEEK_NUM", "target", "score"]].sort_values("WEEK_NUM")
    weekly = ordered.groupby("WEEK_NUM")[["target", "score"]]
    gini_in_time = weekly.apply(_weekly_gini).tolist()

    # Linear trend of Gini over the week index (0, 1, 2, ...)
    week_index = np.arange(len(gini_in_time))
    slope, intercept = np.polyfit(week_index, gini_in_time, 1)
    trend = slope*week_index + intercept

    res_std = np.std(gini_in_time - trend)
    avg_gini = np.mean(gini_in_time)

    # Only a falling trend (negative slope) is penalised.
    return avg_gini + w_fallingrate * min(0, slope) + w_resstd * res_std

# Stability score for each split, computed in train / valid / test order.
stability_score_train, stability_score_valid, stability_score_test = map(
    gini_stability, (base_train, base_valid, base_test)
)

print(f'The stability score on the train set is: {stability_score_train}')
print(f'The stability score on the valid set is: {stability_score_valid}')
print(f'The stability score on the test set is: {stability_score_test}')