In [1]:
#Importing the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score as roc
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras import regularizers, optimizers
import tensorflow as tf
import keras_tuner as kt
import keras as kr
import gc
#Setting the data path
dataPath = '../Data/parquet_files/'

In [19]:
def set_table_dtypes(df):
    for col_name in df.columns:
        if col_name[-1] in ("P", "A"):
            df[col_name] = df[col_name].astype(float)
    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    # Select columns that are of type 'object' or 'string'
    string_cols = df.select_dtypes(include=['object', 'string']).columns
    
    # Apply transformation to each selected column
    for col in string_cols:
        # Convert column to 'category' type
        df[col] = df[col].astype("category")
        
        # Get current categories and add "Unknown"
        new_categories = df[col].cat.categories.tolist() + ["Unknown"]
        
        # Define new CategoricalDtype with "Unknown" included
        new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
        
        # Convert column to new dtype
        df[col] = df[col].astype(new_dtype)
    
    return df

In [24]:
# Reading the base table
train_base_table = pd.read_parquet(dataPath + 'train/train_base.parquet')

# Reading the first part of the static table
train_1 = pd.read_parquet(dataPath + 'train/train_static_0_0.parquet')

# Reading the second part of the static table
train_2 = pd.read_parquet(dataPath + 'train/train_static_0_1.parquet')

# Combining the two parts of the static table
# Ensure that both DataFrames have the same columns, filling missing ones with NaN
columns_union = train_1.columns.union(train_2.columns, sort=False)
train_1_aligned = train_1.reindex(columns=columns_union, fill_value=pd.NA)
train_2_aligned = train_2.reindex(columns=columns_union, fill_value=pd.NA)

# Concatenating aligned DataFrames
train_static = pd.concat([train_1_aligned, train_2_aligned], ignore_index=True)

# Reading additional tables
train_static_cb = pd.read_parquet(dataPath + 'train/train_static_cb_0.parquet')
train_person_1 = pd.read_parquet(dataPath + 'train/train_person_1.parquet')
train_credit_bureau_b_2 = pd.read_parquet(dataPath + 'train/train_credit_bureau_b_2.parquet')

In [25]:
# Aggregations for train_person_1_feats_1
train_person_1_feats_1 = train_person_1.groupby('case_id').agg(
    mainoccupationinc_384A_max=('mainoccupationinc_384A', 'max'),
    mainoccupationinc_384A_any_selfemployed=('incometype_1044T', lambda x: np.max(np.where(x == "SELFEMPLOYED", 1, 0)))
).reset_index()

# Filtering and renaming for train_person_1_feats_2
train_person_1_feats_2 = train_person_1[train_person_1['num_group1'] == 0][['case_id', 'housetype_905L']] \
                                        .rename(columns={'housetype_905L': 'person_housetype'})

# Aggregations for train_credit_bureau_b_2_feats
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.groupby('case_id').agg(
    pmts_pmtsoverdue_635A_max=('pmts_pmtsoverdue_635A', 'max'),
    pmts_dpdvalue_108P_over31=('pmts_dpdvalue_108P', lambda x: np.max(np.where(x > 31, 1, 0)))
).reset_index()

# Selecting columns that end with 'A' or 'M'
selected_static_cols = [col for col in train_static.columns if col.endswith('A') or col.endswith('M')]
selected_static_cb_cols = [col for col in train_static_cb.columns if col.endswith('A') or col.endswith('M')]

# Joining DataFrames
data = train_base_table.merge(train_static[['case_id'] + selected_static_cols], on='case_id', how='left') \
                       .merge(train_static_cb[['case_id'] + selected_static_cb_cols], on='case_id', how='left') \
                       .merge(train_person_1_feats_1, on='case_id', how='left') \
                       .merge(train_person_1_feats_2, on='case_id', how='left') \
                       .merge(train_credit_bureau_b_2_feats, on='case_id', how='left')

In [26]:
data = set_table_dtypes(data)

In [27]:
data.select_dtypes('number').columns

Index(['case_id', 'MONTH', 'WEEK_NUM', 'target',
       'amtinstpaidbefduel24m_4187115A', 'annuity_780A',
       'annuitynextmonth_57A', 'avginstallast24m_3658937A',
       'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A',
       'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A',
       'currdebtcredtyperange_828A', 'disbursedcredamount_1113A',
       'downpmt_116A', 'inittransactionamount_650A', 'lastapprcredamount_781A',
       'lastotherinc_902A', 'lastotherlnsexpense_631A',
       'lastrejectcredamount_222A', 'maininc_215A', 'maxannuity_159A',
       'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A',
       'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A',
       'maxpmtlast3m_4525190A', 'price_1097A', 'sumoutstandtotal_3546847A',
       'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A',
       'totinstallast1m_4525188A', 'pmtaverage_3A', 'pmtaverage_4527227A',
       'pmtaverage_4955615A', 'pmtssum_45A', 'main

In [28]:
data = convert_strings(data)

In [None]:
data.shape

In [None]:
data.to_csv('../Data/train.csv', index=False)

In [2]:
data = pd.read_csv('../Data/train.csv')
data.shape

  data = pd.read_csv('../Data/train.csv')


(1526659, 58)

In [3]:
cat_list = data.select_dtypes('category').columns
cat_list

Index([], dtype='object')

In [3]:
def prepar_data_set(data_df):
    category_features = data_df.select_dtypes('category').columns.tolist()
    numeric_features = data_df.select_dtypes('number').columns.tolist()
    
    for col in category_features:
        encoder = LabelEncoder()
        # Use 'fit_transform' to transform the column
        data_df[col] = encoder.fit_transform(data_df[col].astype(str))
    
    # Return the modified DataFrame and lists of features
    return data_df, category_features, numeric_features

In [4]:
train,cat_features,num_feature = prepar_data_set(data)

In [6]:
train.head()

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avglnamtstart24m_4525187A,...,maritalst_893M,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtssum_45A,mainoccupationinc_384A_max,mainoccupationinc_384A_any_selfemployed,person_housetype,pmts_pmtsoverdue_635A_max,pmts_dpdvalue_108P_over31
0,0,2019-01-03,201901,0,0,,1917.6,0.0,,,...,,,,,,10800.0,0,,,
1,1,2019-01-03,201901,0,0,,3134.0,0.0,,,...,,,,,,10000.0,0,,,
2,2,2019-01-04,201901,0,0,,4937.0,0.0,,,...,,,,,,14000.0,0,,,
3,3,2019-01-03,201901,0,0,,4643.6,0.0,,,...,,,,,,10000.0,0,,,
4,4,2019-01-04,201901,0,1,,3390.2,0.0,,,...,,,,,,24000.0,0,,,


In [5]:
# Unique case_ids
case_ids = train["case_id"].unique()

# Initialize TimeSeriesSplit
tss = TimeSeriesSplit(n_splits=2)  # Adjust n_splits based on your requirement
# Split the dataset into training and a combined test/validation set
train_indices, temp_test_indices = next(iter(tss.split(case_ids)))
case_ids_train = case_ids[train_indices]
temp_case_ids_test = case_ids[temp_test_indices]

# Split the temporary test set further into validation and test sets
valid_indices, test_indices = next(iter(tss.split(temp_case_ids_test)))
case_ids_valid = temp_case_ids_test[valid_indices]
case_ids_test = temp_case_ids_test[test_indices]

cols_pred = [col for col in num_feature if col not in ['case_id', 'WEEK_NUM', 'target']]

def from_ids_to_dataframes(case_ids, train_df, cols_pred):
    filtered_data = train_df[train_df["case_id"].isin(case_ids)]
    base_data = filtered_data[['case_id', 'WEEK_NUM', 'target']]
    X_data = filtered_data[cols_pred]
    y_data = filtered_data["target"]
    return base_data, X_data, y_data

base_train, X_train, y_train = from_ids_to_dataframes(case_ids_train, train, cols_pred)
base_valid, X_valid, y_valid = from_ids_to_dataframes(case_ids_valid, train, cols_pred)
base_test, X_test, y_test = from_ids_to_dataframes(case_ids_test, train, cols_pred)

num_feature = [col for col in num_feature if col in X_train.columns]
cat_features = [col for col in cat_features if col in X_train.columns]




In [6]:
base_test.fillna(0, inplace=True)
base_train.fillna(0, inplace=True)
base_valid.fillna(0, inplace=True)
X_valid.fillna(0, inplace=True)
X_train.fillna(0,inplace=True)
X_test.fillna(0, inplace=True)
y_train.fillna(0, inplace=True)
y_valid.fillna(0, inplace=True)
y_test.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_valid.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test.fillna(0, inplace=True)


In [7]:
scaler = StandardScaler()
X_train[num_feature] = scaler.fit_transform(X_train[num_feature])
X_valid[num_feature] = scaler.transform(X_valid[num_feature])
X_test[num_feature] = scaler.transform(X_test[num_feature])

In [9]:
def generate_categorical_feature_tf(categorical_features, num_features, data):
    models = []
    inputs = []

    # Existing code to create embedding layers for categorical features
    for cat in categorical_features:
        vocab_size = data[cat].nunique()
        inpt = tf.keras.layers.Input(shape=(1,), name='input_'+'_'.join(cat.split(' ')))
        inputs.append(inpt)
        embed = tf.keras.layers.Embedding(vocab_size, 500, trainable=True, embeddings_initializer=tf.initializers.random_normal())(inpt)
        embed_reshaped = tf.keras.layers.Reshape(target_shape=(500,))(embed)
        models.append(embed_reshaped)

    # Existing code to handle numerical features
    num_input = tf.keras.layers.Input(shape=(len(num_features)), name='input_number_features')
    inputs.append(num_input)
    models.append(num_input)

    # Merging categorical and numerical features
    merge_models = tf.keras.layers.concatenate(models)

    # Adjusted model architecture
    pre_preds = tf.keras.layers.Dense(512, kernel_regularizer=regularizers.l2(1e-4))(merge_models)
    pre_preds = tf.keras.layers.BatchNormalization()(pre_preds)
    pre_preds = tf.keras.layers.ReLU()(pre_preds)
    pre_preds = tf.keras.layers.Dropout(0.25)(pre_preds)  # Adjusted dropout

    pre_preds = tf.keras.layers.Dense(256, kernel_regularizer=regularizers.l2(1e-4))(pre_preds)
    pre_preds = tf.keras.layers.BatchNormalization()(pre_preds)
    pre_preds = tf.keras.layers.ReLU()(pre_preds)
    pre_preds = tf.keras.layers.Dropout(0.25)(pre_preds)  # Adjusted dropout

    # Final prediction layer remains unchanged
    pred = tf.keras.layers.Dense(1, activation='sigmoid')(pre_preds)

    # Creating the full model
    model_full = tf.keras.models.Model(inputs=inputs, outputs=pred)

    # Learning rate scheduler
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=1e-3,
        decay_steps=10000,
        decay_rate=0.9)

    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

    model_full.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=optimizer)
    
    return model_full


In [10]:
model = generate_categorical_feature_tf(cat_features,num_feature,X_train)

In [11]:
# Train the model
fit_model = model.fit(X_train, y_train, epochs=20, initial_epoch= 0)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [12]:
total_params = model.count_params()
print(f'Total parameters in the model: {total_params}')

Total parameters in the model: 155649


In [13]:
# Evaluate the model using the test data
model_loss, model_accuracy = model.evaluate(X_valid,y_valid,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

5301/5301 - 15s - loss: 0.1247 - accuracy: 0.9711 - 15s/epoch - 3ms/step
Loss: 0.12470719963312149, Accuracy: 0.9711077213287354


In [14]:
predictions = model.predict(X_test)



In [15]:
auc_roc = roc(y_test, predictions)
print(f'AUC-ROC score: {auc_roc}')

AUC-ROC score: 0.7121192081317493


In [23]:
model.save('home_credit_risk_model.h5')

In [None]:
model = tf.keras.models.load_model('home_credit_risk_model.h5')

In [16]:
for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    y_pred = model.predict(X)
    base["score"] = y_pred

print(f'The AUC score on the train set is: {roc(base_train["target"], base_train["score"])}') 
print(f'The AUC score on the valid set is: {roc(base_valid["target"], base_valid["score"])}') 
print(f'The AUC score on the test set is: {roc(base_test["target"], base_test["score"])}')  

The AUC score on the train set is: 0.6871881128228371
The AUC score on the valid set is: 0.6876219875756466
The AUC score on the test set is: 0.7121192081317493


In [17]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    # Define a safe ROC calculation with a check for a single class
    def safe_roc(target, score):
        if len(np.unique(target)) < 2:  # Check if there's only one class
            return 0  # Return a default value or handle as needed
        else:
            return 2 * roc(target, score) - 1

    # Calculate Gini coefficients over time
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]] \
        .sort_values("WEEK_NUM") \
        .groupby("WEEK_NUM")[["target", "score"]] \
        .apply(lambda x: safe_roc(x["target"], x["score"])).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)  # Linear fit to the Gini coefficients over time
    y_hat = a * x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    
    # Calculate the stability score considering falling rate and residual std
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

# Assume base_train, base_valid, and base_test are previously defined DataFrame variables
stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_valid)
stability_score_test = gini_stability(base_test)

print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the valid set is: {stability_score_valid}') 
print(f'The stability score on the test set is: {stability_score_test}')


The stability score on the train set is: 0.3353169522396298
The stability score on the valid set is: 0.34364977174952804
The stability score on the test set is: 0.33572815572533565


In [20]:
# Reading the base table
test_basetable = pd.read_parquet(dataPath + "test/test_base.parquet")

# Reading and concatenating the static tables
test_1 = pd.read_parquet(dataPath + "test/test_static_0_0.parquet")
test_2 = pd.read_parquet(dataPath + "test/test_static_0_1.parquet")
test_3 = pd.read_parquet(dataPath + "test/test_static_0_2.parquet")
test_1 = set_table_dtypes(test_1)
test_2 = set_table_dtypes(test_2)
test_3 = set_table_dtypes(test_3)
# Concatenating test_1, test_2, test_3 with alignment on column names and allowing for missing columns
test_static = pd.concat([test_1, test_2, test_3], axis=0, ignore_index=True, join='outer')

# Reading additional tables
test_static_cb = pd.read_parquet(dataPath + "test/test_static_cb_0.parquet")
test_person_1 = pd.read_parquet(dataPath + "test/test_person_1.parquet")
test_credit_bureau_b_2 = pd.read_parquet(dataPath + "test/test_credit_bureau_b_2.parquet")

In [29]:
# GroupBy and aggregation for test_person_1_feats_1
test_person_1_feats_1 = test_person_1.groupby("case_id").agg(
    mainoccupationinc_384A_max=('mainoccupationinc_384A', 'max'),
    mainoccupationinc_384A_any_selfemployed=('incometype_1044T', lambda x: (x == 'SELFEMPLOYED').max().astype(int))
).reset_index()

# Filtering, dropping, and renaming for test_person_1_feats_2
test_person_1_feats_2 = test_person_1.loc[test_person_1["num_group1"] == 0, ["case_id", "housetype_905L"]] \
                                        .rename(columns={"housetype_905L": "person_housetype"})

# GroupBy and aggregation for test_credit_bureau_b_2_feats
test_credit_bureau_b_2_feats = test_credit_bureau_b_2.groupby("case_id").agg(
    pmts_pmtsoverdue_635A_max=('pmts_pmtsoverdue_635A', 'max'),
    pmts_dpdvalue_108P_over31=('pmts_dpdvalue_108P', lambda x: (x > 31).max().astype(int))
).reset_index()

# Joining DataFrames for data_submission
data_submission = test_basetable.merge(
    test_static[["case_id"] + selected_static_cols], on="case_id", how="left"
).merge(
    test_static_cb[["case_id"] + selected_static_cb_cols], on="case_id", how="left"
).merge(
    test_person_1_feats_1, on="case_id", how="left"
).merge(
    test_person_1_feats_2, on="case_id", how="left"
).merge(
    test_credit_bureau_b_2_feats, on="case_id", how="left"
)

In [36]:
data_submission

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgoutstandbalancel6m_4187114A,...,maritalst_893M,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtssum_45A,mainoccupationinc_384A_max,mainoccupationinc_384A_any_selfemployed,person_housetype,pmts_pmtsoverdue_635A_max,pmts_dpdvalue_108P_over31
0,57543,2020-10-06,202010,92,,7637.2,0.0,,,,...,a55475b1,,,,,36000.0,0.0,,,
1,57549,2020-10-06,202010,92,,902.60004,0.0,,,,...,a55475b1,,,,,15000.0,0.0,,,
2,57551,2020-10-06,202010,92,,3610.2,0.0,,,,...,a55475b1,,,,,24000.0,0.0,,,
3,57552,2020-10-07,202010,92,,6964.4,0.0,,,,...,a55475b1,,,16327.0,,,,,,
4,57569,2020-10-06,202010,92,,5553.4,0.0,,,,...,a55475b1,,,16303.4,,,,,,
5,57630,2020-10-06,202010,92,,7404.8003,0.0,,,,...,a55475b1,,,,,,,,,
6,57631,2020-10-06,202010,92,,2872.8,0.0,,,,...,a55475b1,,,16863.0,,,,,,
7,57632,2020-10-06,202010,92,,6225.8003,0.0,,,,...,a55475b1,,,24565.8,,,,,,
8,57633,2020-10-06,202010,92,,7917.0,0.0,,,,...,a55475b1,,,,,,,,,
9,57634,2020-10-06,202010,92,,5894.0,0.0,,,,...,a55475b1,,,6917.0,,,,,,


In [30]:
def preprocess_data_for_model(data_df, train_df, cat_list, cols_pred):
    # Copy the data to avoid modifying the original dataframe
    X_submission = data_df[cols_pred].copy()

    # Initialize a LabelEncoder
    encoder = LabelEncoder()

    for col in cat_list:
        # Fit the encoder on the training data
        train_unique_values = train_df[col].astype(str).unique()
        encoder.fit(train_unique_values)
        
        # Transform both train and submission data to ensure consistency
        train_df[col] = encoder.transform(train_df[col].astype(str))
        
        # For submission data, transform known categories and set unknown to a specific code
        # Here, we handle unknown categories by first checking if they are in the encoder classes_
        known_classes = set(encoder.classes_)
        X_submission[col] = X_submission[col].astype(str).apply(lambda x: x if x in known_classes else 'Unknown')
        
        # Update encoder with 'Unknown' to handle unseen categories
        encoder_classes = encoder.classes_.tolist()
        if 'Unknown' not in encoder_classes:
            encoder_classes.append('Unknown')
            encoder.classes_ = np.array(encoder_classes)
        
        # Transform submission data
        X_submission[col] = encoder.transform(X_submission[col])
    
    return X_submission

In [31]:
X_submission = preprocess_data_for_model(data_submission, train, cat_features, cols_pred)

In [32]:
X_submission.fillna(0, inplace=True)

In [33]:
y_submission_pred = model.predict(X_submission)



In [34]:
y_submission_pred = y_submission_pred.ravel()

In [None]:
submission = pd.DataFrame({
    "case_id": data_submission["case_id"].to_numpy(),
    "score": y_submission_pred
}).set_index('case_id')
submission.to_csv("./submission.csv")