In [1]:
#Importing the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score as roc
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import keras_tuner as kt
import keras as kr
import gc
#Setting the data path
dataPath = '../Data/parquet_files/'

In [2]:
def set_table_dtypes(df):
    for col_name in df.columns:
        if col_name[-1] in ("P", "A"):
            df[col_name] = df[col_name].astype(float)
    return df

def dummy_data(df, categorical_cols):
    for categorical_col in categorical_cols:
        dummies = pd.get_dummies(df[categorical_col], prefix=categorical_col)
        df = pd.concat([df, dummies], axis=1).drop(columns=[categorical_col])
    return df

In [3]:
# Reading the base table
train_base_table = pd.read_parquet(dataPath + 'train/train_base.parquet')

# Reading the first part of the static table
train_1 = pd.read_parquet(dataPath + 'train/train_static_0_0.parquet')

# Reading the second part of the static table
train_2 = pd.read_parquet(dataPath + 'train/train_static_0_1.parquet')

# Combining the two parts of the static table
# Ensure that both DataFrames have the same columns, filling missing ones with NaN
columns_union = train_1.columns.union(train_2.columns, sort=False)
train_1_aligned = train_1.reindex(columns=columns_union, fill_value=pd.NA)
train_2_aligned = train_2.reindex(columns=columns_union, fill_value=pd.NA)

# Concatenating aligned DataFrames
train_static = pd.concat([train_1_aligned, train_2_aligned], ignore_index=True)

# Reading additional tables
train_static_cb = pd.read_parquet(dataPath + 'train/train_static_cb_0.parquet')
train_person_1 = pd.read_parquet(dataPath + 'train/train_person_1.parquet')
train_credit_bureau_b_2 = pd.read_parquet(dataPath + 'train/train_credit_bureau_b_2.parquet')

In [None]:
# test_basetable = spark.read.parquet(dataPath + "test/test_base.parquet")

# test_1 = spark.read.parquet(dataPath + "test/test_static_0_0.parquet")
# test_2 = spark.read.parquet(dataPath + "test/test_static_0_1.parquet")
# test_3 = spark.read.parquet(dataPath + "test/test_static_0_2.parquet")

# test_1 = set_table_dtypes(test_1)
# test_2 = set_table_dtypes(test_2)
# test_3 = set_table_dtypes(test_3)

# test_static = test_1.unionByName(test_2, allowMissingColumns=True)
# test_static = test_static.unionByName(test_3, allowMissingColumns=True)

# test_static_cb = spark.read.parquet(dataPath + "test/test_static_cb_0.parquet")
# test_person_1 = spark.read.parquet(dataPath + "test/test_person_1.parquet")
# test_credit_bureau_b_2 = spark.read.parquet(dataPath + "test/test_credit_bureau_b_2.parquet")

In [4]:
train_person_1_feats_1 = train_person_1.groupby("case_id").agg(
    mainoccupationinc_384A_max=pd.NamedAgg(column="mainoccupationinc_384A", aggfunc="max"),
    mainoccupationinc_384A_any_selfemployed=pd.NamedAgg(column="incometype_1044T", aggfunc=lambda x: np.max(np.where(x == "SELFEMPLOYED", 1, 0)))
).reset_index()

train_person_1_feats_2 = train_person_1.loc[train_person_1["num_group1"] == 0, ["case_id", "housetype_905L"]] \
                                        .rename(columns={"housetype_905L": "person_housetype"})



train_credit_bureau_b_2_feats = train_credit_bureau_b_2.groupby("case_id").agg(
    pmts_pmtsoverdue_635A_max=pd.NamedAgg(column="pmts_pmtsoverdue_635A", aggfunc="max"),
    pmts_dpdvalue_108P_over31=pd.NamedAgg(column="pmts_dpdvalue_108P", aggfunc=lambda x: np.max(np.where(x > 31, 1, 0)))
).reset_index()


# Selecting columns that end with 'A' or 'M'
selected_static_cols = [col for col in train_static.columns if col.endswith('A') or col.endswith('M')]
selected_static_cb_cols = [col for col in train_static_cb.columns if col.endswith('A') or col.endswith('M')]

# Joining DataFrames
data = train_base_table.merge(train_static[["case_id"] + selected_static_cols], on="case_id", how="left") \
                       .merge(train_static_cb[["case_id"] + selected_static_cb_cols], on="case_id", how="left") \
                       .merge(train_person_1_feats_1, on="case_id", how="left") \
                       .merge(train_person_1_feats_2, on="case_id", how="left") \
                       .merge(train_credit_bureau_b_2_feats, on="case_id", how="left")



In [5]:
data = set_table_dtypes(data)

In [6]:
data.shape

(1526659, 58)

In [None]:
# test_person_1_feats_1 = test_person_1_df.groupBy("case_id").agg(
#     F.max("mainoccupationinc_384A").alias("mainoccupationinc_384A_max"),
#     F.max(F.when(F.col("incometype_1044T") == "SELFEMPLOYED", 1).otherwise(0)).alias("mainoccupationinc_384A_any_selfemployed")
# )

# test_person_1_feats_2 = test_person_1_df.select(["case_id", "num_group1", "housetype_905L"]) \
#                                       .filter(F.col("num_group1") == 0) \
#                                       .drop("num_group1") \
#                                       .withColumnRenamed("housetype_905L", "person_housetype")

# test_credit_bureau_b_2_feats = test_credit_bureau_b_2_df.groupBy("case_id").agg(
#     F.max("pmts_pmtsoverdue_635A").alias("pmts_pmtsoverdue_635A_max"),
#     F.max(F.when(F.col("pmts_dpdvalue_108P") > 31, 1).otherwise(0)).alias("pmts_dpdvalue_108P_over31")
# )

# data_submission = test_basetable_df.join(
#     test_static_df.select(["case_id"] + selected_static_cols), on="case_id", how="left"
# ).join(
#     test_static_cb_df.select(["case_id"] + selected_static_cb_cols), on="case_id", how="left"
# ).join(
#     test_person_1_feats_1, on="case_id", how="left"
# ).join(
#     test_person_1_feats_2, on="case_id", how="left"
# ).join(
#     test_credit_bureau_b_2_feats, on="case_id", how="left"
# )

In [None]:
data.to_parquet(dataPath + '/train_data.parquet')

In [2]:
data = pd.read_parquet(dataPath + '/train_data.parquet')

In [3]:
data = pd.get_dummies(data)

In [4]:
y = data["target"]
X = data.drop(columns=["target"])

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42) 

In [5]:
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

MemoryError: Unable to allocate 12.2 GiB for an array with shape (1068661, 1536) and data type float64

In [None]:
def create_model(hp):
    nn_model = tf.keras.models.Sequential()
    activation = hp.Choice('activation', ['tanh', 'sigmoid', 'leaky_relu', 'elu', 'selu', 'PReLU'])
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
                                        min_value=1,
                                        max_value=100,
                                        step=2), activation=activation, input_dim=len(X_scaled[0])))
    for i in range(hp.Int('num_layers', 1, 10)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
                                        min_value=1,
                                        max_value=100,
                                        step=2), activation=activation))
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
    loss = hp.Choice('loss', ['binary_crossentropy', 'mse'])
    optimizer = hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd', 'adagrad', 'adadelta', 'adamax', 'nadam'])
    nn_model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
    
    return nn_model

In [None]:
tuner = kt.Hyperband(create_model, objective='val_accuracy', max_epochs=20, hyperband_iterations=10, directory='my_dir', project_name='home_credit_risk')

In [None]:
tuner.search(X_scaled, y_train, epochs=20, validation_data=(X_test_scaled, y_test))

In [None]:
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

In [None]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_scaled[0])
hidden_nodes_layer1 =  21
hidden_nodes_layer2 = 17
hidden_nodes_layer3 = 23
hidden_nodes_layer4 = 23
hidden_nodes_layer5 = 15
hidden_nodes_layer6 = 11
hidden_nodes_layer7 = 1




output_dim = 1



nn_model = tf.keras.models.Sequential()

# First hidden layer
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="selu"))

# Second hidden layer
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="selu"))
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="selu"))
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="selu"))
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer5, activation="selu"))
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer6, activation="selu"))
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer7, activation="selu"))








# Output layer
nn_model.add(tf.keras.layers.Dense(output_dim, activation="sigmoid"))

# Check the structure of the model
nn_model.summary()

In [None]:
nn_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=["accuracy"])

In [None]:
# Train the model
fit_model = nn_model.fit(X_scaled, y_train, epochs=100, initial_epoch= 0)

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
predictions = nn_model.predict(X_test_scaled)

In [None]:
auc_roc = roc(y_test, predictions)
print(f'AUC-ROC score: {auc_roc}')

In [None]:
nn_model.save('home_credit_risk_model.h5')

In [None]:
for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    y_pred = xgb.predict(X)
    base["score"] = y_pred

print(f'The AUC score on the train set is: {roc(base_train["target"], base_train["score"])}') 
print(f'The AUC score on the valid set is: {roc(base_valid["target"], base_valid["score"])}') 
print(f'The AUC score on the test set is: {roc(base_test["target"], base_test["score"])}')  

In [None]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_valid)
stability_score_test = gini_stability(base_test)

print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the valid set is: {stability_score_valid}') 
print(f'The stability score on the test set is: {stability_score_test}') 