# Setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [None]:
!pip install numpy pandas matplotlib seaborn plotly scipy statsmodels scikit-learn xgboost category_encoders wandb optuna lightgbm catboost tensorflow tensorboard shap dask

In [None]:
# TODO: Change the paths below to your actual directory.

# Switch the working directory to the submission folder; the relative file names
# defined further down ('joined_dataset.csv', ...) resolve against it.
%cd '/content/drive/MyDrive/Colab Notebooks/[Dreamweavers] Workspace/Round 2_RBAC 2024/Xuân Thành/submit_code'
# Absolute locations of the two raw input CSV files on the mounted Drive.
loan_df_path = '/content/drive/MyDrive/Colab Notebooks/[Dreamweavers] Workspace/Round 2_RBAC 2024/[RBAC 2024] Case Study Package/dataset/loan_origin.csv'
demographic_df_path = '/content/drive/MyDrive/Colab Notebooks/[Dreamweavers] Workspace/Round 2_RBAC 2024/[RBAC 2024] Case Study Package/dataset/demographic.csv'

# Raw (uncleaned) copies, used by the data-understanding cells.
loan_df = pd.read_csv(loan_df_path)
demographic_df = pd.read_csv(demographic_df_path)


# Relative paths (inside the working directory) of the joined dataset files.
joined_df_path = 'joined_dataset.csv'
joined_df_remap_path = 'joined_dataset_remap.csv'

from cleaner.check_data import *
from cleaner.remove_dup import *
from cleaner.standardize import *
from cleaner.impute import *
from cleaner.outlier import *
from cleaner.consistency import *
from cleaner.remap_values import *

# Clean Data

In [None]:
# Profile the raw demographic table and collect its data-quality findings.
# The frame passed in must match the "demographic" output directory and label.
understand_df(demographic_df, output_dir="demographic")
demographic_quality_issues = validate_data_quality(demographic_df, "Demographic Dataset")

In [None]:
# Profile the raw loan table and collect its data-quality findings.
# understand_df / validate_data_quality are not defined in this notebook;
# presumably they come from the star-imported cleaner modules — verify there.
understand_df(loan_df, output_dir="loan_origin")
loan_quality_issues = validate_data_quality(loan_df, "Loan Dataset")

## Demographic

In [None]:
dataset_type = 'demographic'
logger = setup_logger()
logger.info(f"Processing {dataset_type} dataset")

# Start again from the raw file so that re-running this cell gives the same result.
demographic_df = pd.read_csv(demographic_df_path)

# Standardize data formats
demographic_df = standardize_data_formats(demographic_df, dataset_name=dataset_type)

# Identify and handle duplicates
duplicate_summary = identify_duplicates(demographic_df, logger)
demographic_df = handle_duplicates(demographic_df, duplicate_summary, logger)

# Normalize missing values
demographic_df = normalize_missing_values(demographic_df)

# Initialize and apply advanced imputer
imputer = AdvancedImputer(logger)
demographic_df_imputed = imputer.fit_transform(demographic_df)

# Apply domain-specific rules
rules = get_domain_specific_imputation_rules()
demographic_df_imputed = apply_domain_specific_rules(demographic_df_imputed, rules, logger)

# Apply constraints
demographic_df_imputed = apply_constraints(demographic_df_imputed)

# Final cleaning: fill any remaining gaps from the previous row, then from the
# next row for gaps at the very top. ffill()/bfill() replace the deprecated
# fillna(method=...) spelling and avoid in-place mutation.
demographic_df_imputed = demographic_df_imputed.ffill().bfill()

# Identify and handle outliers (the second value returned by handle_outliers is not needed here)
outlier_summary = detect_outliers(demographic_df_imputed)
demographic_df, _ = handle_outliers(demographic_df_imputed, outlier_summary, strategy='cap')

# Check for consistency
# NOTE(review): the return value of remove_exact_duplicates is discarded; this only
# has an effect if the helper modifies the frame in place — confirm in cleaner.
remove_exact_duplicates(demographic_df, 'contract_no')
demographic_df = check_consistency(demographic_df, logger, dataset_type)

logger.info("Dataset processing completed")

## Loan Origin

In [None]:
dataset_type = 'loan_origin'
logger = setup_logger()
logger.info(f"Processing {dataset_type} dataset")

# Start again from the raw file so that re-running this cell gives the same result.
loan_df = pd.read_csv(loan_df_path)

# Standardize data formats
loan_df = standardize_data_formats(loan_df, dataset_name=dataset_type)

# Identify and handle duplicates
duplicate_summary = identify_duplicates(loan_df, logger)
loan_df = handle_duplicates(loan_df, duplicate_summary, logger)

# Normalize missing values
loan_df = normalize_missing_values(loan_df)

# Initialize and apply advanced imputer.
# The imputer must be fed the loan table: passing demographic_df here would
# replace the whole loan pipeline with demographic data.
imputer = AdvancedImputer(logger)
loan_df_imputed = imputer.fit_transform(loan_df)

# Apply domain-specific rules
rules = get_domain_specific_imputation_rules()
loan_df_imputed = apply_domain_specific_rules(loan_df_imputed, rules, logger)

# Apply constraints
loan_df_imputed = apply_constraints(loan_df_imputed)

# Final cleaning: fill any remaining gaps from the previous row, then from the
# next row for gaps at the very top. ffill()/bfill() replace the deprecated
# fillna(method=...) spelling and avoid in-place mutation.
loan_df_imputed = loan_df_imputed.ffill().bfill()

# Identify and handle outliers (the second value returned by handle_outliers is not needed here)
outlier_summary = detect_outliers(loan_df_imputed)
loan_df, _ = handle_outliers(loan_df_imputed, outlier_summary, strategy='cap')

# Check for consistency
# NOTE(review): the return value of remove_exact_duplicates is discarded; this only
# has an effect if the helper modifies the frame in place — confirm in cleaner.
remove_exact_duplicates(loan_df, 'contract_no')
loan_df = check_consistency(loan_df, logger, dataset_type)

logger.info("Dataset processing completed")

## Join Dataset

In [None]:
def inner_join_datasets(loan_df, demographic_df, on_column='contract_no', output_path='joined_dataset.csv'):
  """
  Inner-joins the loan and demographic tables and saves the result as CSV.

  Args:
    loan_df (pd.DataFrame): The loan origin dataset.
    demographic_df (pd.DataFrame): The demographic dataset.
    on_column (str): Column name to join the datasets on.
    output_path (str): File the joined dataset is written to.

  Returns:
    pd.DataFrame | None: The joined dataset, or None when the join produced
    no rows or an error occurred (errors are printed, not raised).
  """
  try:
    joined_df = pd.merge(loan_df, demographic_df, on=on_column, how='inner')
    # NOTE(review): the return value is discarded; this only has an effect if
    # remove_exact_duplicates modifies the frame in place — confirm in cleaner.
    remove_exact_duplicates(joined_df, on_column)

    # pd.merge never returns None, so "no result" has to be detected via .empty.
    if joined_df.empty:
      print("Error: Joined dataset is empty.")
      return None

    joined_df.to_csv(output_path, index=False)
    print(joined_df.head())
    # sample() raises when asked for more rows than exist, so cap the preview size.
    print(joined_df.sample(min(5, len(joined_df))))
    print(f"Joined dataset saved as '{output_path}'")
    return joined_df

  except OSError as e:
    # No input file is read here; the only file access is writing output_path.
    print(f"Error: could not write '{output_path}'. {e}")
    return None
  except Exception as e:
    print(f"Error during inner join: {e}")
    return None

# Join the two cleaned tables (the function also writes 'joined_dataset.csv'),
# then load that saved file into joined_df. joined_df_path must name the same
# file the function writes.
inner_join_datasets(loan_df, demographic_df)
joined_df = pd.read_csv(joined_df_path)

In [None]:
def plot_correlation_matrix(df, title="Correlation Matrix Heatmap", save_path=None):
    """
    Draw an annotated heatmap of the pairwise correlations between the
    numeric columns of `df`; 'contract_no' is left out when present.

    Args:
        df: DataFrame whose numeric columns are correlated.
        title: Text shown above the heatmap.
        save_path: Optional image path. When given, the figure is written
            there at 600 dpi and then closed; otherwise it stays open as the
            current matplotlib figure.
    """
    # Numeric columns only, minus the identifier column if it exists.
    numeric_only = df.select_dtypes(include=np.number)
    numeric_only = numeric_only.drop(columns=['contract_no'], errors='ignore')
    corr = df[numeric_only.columns].corr()

    heatmap_options = dict(annot=True, cmap='coolwarm', fmt=".3f", linewidths=.5)

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, **heatmap_options)
    plt.title(title, fontsize=16)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()

    # Nothing to persist: leave the figure open for the notebook to render.
    if not save_path:
        return

    plt.savefig(save_path, dpi=600)
    plt.close()

plot_correlation_matrix(joined_df, save_path='correlation_matrix.png')

# Analyse Questions

In [None]:
# question_support is a project-local module. run_analysis is not defined in this
# notebook, so it presumably comes from this star import; it is given the path of
# the joined CSV rather than the in-memory frame.
from question_support import *
run_analysis(joined_df_path)

In [None]:
# Reload the joined dataset from disk and print its column dtypes and non-null counts.
joined_df = pd.read_csv(joined_df_path)
# joined_df.head()
joined_df.info()
# joined_df.describe()

In [None]:
# NOTE(review): plot_correlation_matrix is already defined in the "Join Dataset"
# section of this notebook; this second definition shadows it, and the call at
# the bottom writes to the same 'correlation_matrix.png' again. Consider keeping
# a single definition once it is confirmed that `from question_support import *`
# does not rebind the name in between.
def plot_correlation_matrix(df, title="Correlation Matrix Heatmap", save_path=None):
    """
    Plots a correlation matrix heatmap of numerical features,
    excluding 'contract_no'.

    Args:
        df: The input DataFrame.
        title: The title of the plot.
        save_path: Optional; path to save the plot as an image file. When
            given, the figure is saved at 600 dpi and then closed.
    """
    # Exclude 'contract_no' and select only numerical features
    numerical_features = df.select_dtypes(include=np.number).drop(columns=['contract_no'], errors='ignore').columns
    correlation_matrix = df[numerical_features].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".3f", linewidths=.5)
    plt.title(title, fontsize=16)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()

    # Save the figure if a save_path is provided
    if save_path:
        plt.savefig(save_path, dpi=600)
        plt.close()

    # plt.show()

plot_correlation_matrix(joined_df, save_path='correlation_matrix.png')

# Modelling

## Time-Series Classification and Regression

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import optuna
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

# Evaluation helpers in this cell append their metrics to output_file;
# fitted models and the accuracy plot are saved under models_dir.
output_file = 'model_evaluation_results.txt'
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

def preprocess_data(joined_df):
    """Preprocess the dataset for combined-year modeling with time-aware splits.

    Steps: parse 'disbursement_date', order rows chronologically, derive
    'disbursement_year', one-hot encode every object column, drop
    'disbursement_date' and 'age', standardize the listed numeric columns and
    remove low-variance feature columns.

    Args:
        joined_df: Joined loan/demographic DataFrame. It is not modified.

    Returns:
        Tuple ``(X, y, y_loan, y_rate, product_category_columns)``:
        X is the feature frame (all columns except the one-hot product
        category columns, after low-variance filtering); y holds the one-hot
        product category columns; y_loan is the standardized 'loan_amount'
        column; y_rate is the 'rate' column; product_category_columns lists
        the names of the columns in y.
    """
    # Work on a copy so the caller's frame keeps its original columns and dtypes.
    joined_df = joined_df.copy()
    joined_df['disbursement_date'] = pd.to_datetime(joined_df['disbursement_date'], errors='coerce')

    # TimeSeriesSplit cuts folds purely by row position, so the rows have to be
    # in chronological order for the folds to be time-aware. Unparseable dates
    # (NaT) end up last; a stable sort keeps the original order among ties.
    joined_df = joined_df.sort_values('disbursement_date', kind='stable').reset_index(drop=True)
    joined_df['disbursement_year'] = joined_df['disbursement_date'].dt.year

    # Identify categorical columns and apply one-hot encoding
    categorical_columns = joined_df.select_dtypes(include='object').columns
    joined_df = pd.get_dummies(joined_df, columns=categorical_columns, drop_first=True)

    # Define target columns after encoding
    product_category_columns = [col for col in joined_df.columns if 'product_category_' in col]

    # Drop unnecessary columns (ignored when absent)
    joined_df = joined_df.drop(columns=['disbursement_date', 'age'], errors='ignore')

    # Standardize numerical features
    # NOTE(review): the scaler is fitted on all rows, including those that later
    # land in validation folds — consider fitting it per training fold.
    numerical_features = ['loan_amount', 'customer_income', 'insurance_rate', 'month_interest', 'weight', 'height']
    scaler = StandardScaler()
    joined_df[numerical_features] = scaler.fit_transform(joined_df[numerical_features])

    # Remove low-variance features
    X = joined_df.drop(columns=product_category_columns)
    X = drop_low_variance_features(X)
    y = joined_df[product_category_columns]
    y_loan = joined_df['loan_amount']
    y_rate = joined_df['rate']

    return X, y, y_loan, y_rate, product_category_columns

def drop_low_variance_features(df, threshold=0.01):
    """Keep only the columns of `df` that scikit-learn's VarianceThreshold retains.

    Args:
        df: Feature DataFrame.
        threshold: Variance threshold handed to VarianceThreshold.

    Returns:
        `df` restricted to the retained columns, in their original order.
    """
    variance_filter = VarianceThreshold(threshold).fit(df)
    kept_positions = variance_filter.get_support(indices=True)
    kept_columns = df.columns[kept_positions]
    return df[kept_columns]

def tune_lightgbm(X, y, n_trials=50):
    """Hyperparameter tuning for LightGBM using Optuna with time-series cross-validation.

    Each trial fits a MultiOutputClassifier of LGBMClassifier on 5
    TimeSeriesSplit folds and is scored by the mean, over folds, of the mean
    per-label ROC AUC.

    Args:
        X: Feature DataFrame, rows in chronological order.
        y: DataFrame with one binary column per label.
        n_trials: Number of Optuna trials to run.

    Returns:
        dict: The best trial's parameters merged with the fixed settings used
        during tuning, ready to be passed to ``lgb.LGBMClassifier(**params)``.
    """
    # Settings held constant across trials. They are merged into the returned
    # dict so that a model built from it matches the configuration that was
    # actually tuned (study.best_params alone only holds the suggested values).
    fixed_params = {
        'objective': 'binary',
        'metric': 'auc',
        'is_unbalance': True,
        'min_gain_to_split': 0.01,
    }

    def objective(trial):
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
        param = {
            **fixed_params,
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        }

        model = MultiOutputClassifier(lgb.LGBMClassifier(**param))
        tscv = TimeSeriesSplit(n_splits=5)
        aucs = []

        for train_index, valid_index in tscv.split(X):
            X_tr, X_val = X.iloc[train_index], X.iloc[valid_index]
            y_tr, y_val = y.iloc[train_index], y.iloc[valid_index]
            model.fit(X_tr, y_tr)
            # One (n_samples, n_classes) array per label.
            y_val_pred_proba = model.predict_proba(X_val)

            fold_aucs = []
            for i in range(y_val.shape[1]):
                positive_class_probs = y_val_pred_proba[i][:, 1]
                auc = roc_auc_score(y_val.iloc[:, i], positive_class_probs)
                fold_aucs.append(auc)

            aucs.append(np.mean(fold_aucs))

        return np.mean(aucs)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    best_params = {**fixed_params, **study.best_params}
    print("Best parameters from Optuna:", best_params)
    return best_params

def build_nn(input_shape):
    """Build and compile a fully connected network for regression.

    Args:
        input_shape: Number of input features per sample.

    Returns:
        A compiled Keras Sequential model: ReLU layers of 128, 64 and 32
        units followed by a single linear output unit, trained with Adam on
        mean squared error.
    """
    network = models.Sequential()
    # First layer declares the input width.
    network.add(layers.Dense(128, activation='relu', input_shape=(input_shape,)))
    for units in (64, 32):
        network.add(layers.Dense(units, activation='relu'))
    network.add(layers.Dense(1))

    network.compile(optimizer='adam', loss='mse')
    return network

def train_nn_loan_model(X_train, y_train, X_test, y_test):
    """Train the loan-amount neural network, save it and print its test RMSE.

    Args:
        X_train: Training features (DataFrame or array of numeric/bool values).
        y_train: Training targets.
        X_test: Held-out features, also used as Keras validation data.
        y_test: Held-out targets.

    Returns:
        The fitted Keras model.
    """
    # A frame mixing bool (one-hot) and float columns converts to an object
    # array; hand Keras plain float32 matrices instead.
    X_train_arr = np.asarray(X_train, dtype='float32')
    X_test_arr = np.asarray(X_test, dtype='float32')

    nn_loan = build_nn(X_train_arr.shape[1])
    tensorboard_callback = TensorBoard(log_dir="./logs", histogram_freq=1)
    early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5)
    # NOTE(review): early stopping monitors the loss on the test fold, so the
    # reported RMSE is optimistic — consider a separate validation split.
    nn_loan.fit(
        X_train_arr, y_train,
        epochs=50, batch_size=32,
        validation_data=(X_test_arr, y_test),
        verbose=1,
        callbacks=[tensorboard_callback, early_stopping_callback],
    )
    # The file name is fixed, so every call overwrites the previously saved model.
    nn_loan.save(os.path.join(models_dir, 'nn_loan_model.h5'))
    nn_loan_preds = nn_loan.predict(X_test_arr)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): that keyword
    # was deprecated and later removed from scikit-learn.
    rmse = float(np.sqrt(mean_squared_error(y_test, nn_loan_preds)))
    print("Neural Network RMSE for Loan Amount:", rmse)
    return nn_loan

def evaluate_classification_model(y_test, y_pred, y_pred_proba, product_category_columns):
    """Append per-label AUC and accuracy figures to the results file.

    Args:
        y_test: DataFrame of true labels, one column per product category.
        y_pred: 2-D array of predicted labels, columns aligned with y_test.
        y_pred_proba: Sequence with one probability array per label; column 1
            is used as the positive-class score.
        product_category_columns: Label column names, in the order of y_pred_proba.
    """
    with open(output_file, 'a') as report:
        # One AUC line per label.
        for label_idx, label in enumerate(product_category_columns):
            positive_scores = y_pred_proba[label_idx][:, 1]
            label_auc = roc_auc_score(y_test[label], positive_scores)
            report.write(f"AUC for {label}: {label_auc}\n")

        # A single line listing the accuracy of every label column.
        per_label_accuracy = []
        for label_idx in range(y_test.shape[1]):
            true_col = y_test.iloc[:, label_idx]
            per_label_accuracy.append(accuracy_score(true_col, y_pred[:, label_idx]))
        report.write("Multi-Label Accuracy for each product category: " + str(per_label_accuracy) + "\n")

def evaluate_regression_model(y_test, y_preds, metric_name):
    """Append the RMSE of a regression model to the results file.

    Args:
        y_test: True target values.
        y_preds: Predicted target values.
        metric_name: Label written in front of the RMSE value.
    """
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): that keyword
    # was deprecated and later removed from scikit-learn.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_preds)))
    with open(output_file, 'a') as f:
        f.write(f"{metric_name} RMSE: {rmse}\n")

joined_df = pd.read_csv(joined_df_path)

# Preprocess the data (y_rate is returned but not used in this cell)
X, y, y_loan, y_rate, product_category_columns = preprocess_data(joined_df)

# X still contains 'loan_amount', which is exactly what the regressors below
# predict. Exclude it from their inputs, otherwise they just copy the target.
regression_features = [col for col in X.columns if col != 'loan_amount']

# TimeSeriesSplit for cross-validation
tscv = TimeSeriesSplit(n_splits=5)

train_accuracies, test_accuracies = [], []

for fold, (train_index, test_index) in enumerate(tscv.split(X), 1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    y_loan_train, y_loan_test = y_loan.iloc[train_index], y_loan.iloc[test_index]
    X_train_reg, X_test_reg = X_train[regression_features], X_test[regression_features]

    # Tune LightGBM for classification (tuning only sees this fold's training rows)
    best_lgbm_params = tune_lightgbm(X_train, y_train)
    base_model = lgb.LGBMClassifier(**best_lgbm_params)
    multi_target_model = MultiOutputClassifier(base_model)
    multi_target_model.fit(X_train, y_train)
    y_pred_proba = multi_target_model.predict_proba(X_test)
    y_pred = multi_target_model.predict(X_test)

    # Save each LightGBM model for each target label
    for i, estimator in enumerate(multi_target_model.estimators_):
        booster = estimator.booster_
        booster.save_model(os.path.join(models_dir, f'lgbm_classifier_model_target_{i}_fold_{fold}.txt'))

    # Evaluate and save model results
    train_accuracy = multi_target_model.score(X_train, y_train)
    test_accuracy = multi_target_model.score(X_test, y_test)
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)
    evaluate_classification_model(y_test, y_pred, y_pred_proba, product_category_columns)

    # Train and evaluate Gradient Boosting Regressor for Loan Amount
    gbr_loan = xgb.XGBRegressor(learning_rate=0.05, n_estimators=200, max_depth=7, subsample=0.8)
    gbr_loan.fit(X_train_reg, y_loan_train)
    loan_preds = gbr_loan.predict(X_test_reg)
    evaluate_regression_model(y_loan_test, loan_preds, "Gradient Boosting")
    gbr_loan.save_model(os.path.join(models_dir, f'xgb_regressor_model_fold_{fold}.json'))

    # Train and evaluate Neural Network for Loan Amount
    nn_loan = train_nn_loan_model(X_train_reg, y_loan_train, X_test_reg, y_loan_test)

# Plot training and validation accuracy across folds
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(train_accuracies) + 1), train_accuracies, label='Training Accuracy')
plt.plot(range(1, len(test_accuracies) + 1), test_accuracies, label='Validation Accuracy')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.title('LightGBM Classifier Accuracy During Cross-Validation')
plt.legend()
plt.savefig(os.path.join(models_dir, 'training_validation_accuracy.png'))
plt.show()

## RandomForest & Decision Tree Classification

In [None]:
def feature_importance_examination():
    """Print and plot feature importances for predicting 'product_category'.

    Reads the module-level ``joined_df``, keeps the rows whose
    'product_category' is 0, 1 or 2, and reports
    (1) logistic-regression coefficients per class, fitted on standardized
    features, and (2) random-forest feature importances, which are also drawn
    as a horizontal bar chart. Both models are fitted and inspected on the
    same rows; nothing is returned.

    NOTE(review): the filter compares 'product_category' with integers, and
    every remaining column must be numeric for the scaler. That looks like it
    expects the remapped dataset (joined_df_remap_path) rather than the raw
    joined CSV — confirm which frame ``joined_df`` should hold here.
    """
    # Imported here because no visible cell of the notebook imports these estimators.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression

    df = joined_df
    # Filter and drop in one expression: yields a new frame instead of
    # modifying a slice of joined_df in place.
    question_3_data = df[df['product_category'].isin([0, 1, 2])].drop(
        columns=['contract_no', 'permanent_address_province']
    )

    y = question_3_data['product_category']
    X = question_3_data.drop(['product_category', 'creditibility'], axis=1)
    feature_names = X.columns

    # Standardize the predictors so that coefficient magnitudes are comparable
    # across features; the target is deliberately left out of the scaling.
    scaler_1 = StandardScaler()
    X_scaled = pd.DataFrame(scaler_1.fit_transform(X), columns=feature_names, index=X.index)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_scaled, y)
    coefficients = model.coef_

    # With only two classes present, coef_ has a single row (for classes_[1]).
    if coefficients.shape[0] == len(model.classes_):
        class_columns = list(model.classes_)
    else:
        class_columns = list(model.classes_[1:])

    feature_importance = pd.DataFrame(coefficients.T, index=feature_names, columns=class_columns)
    feature_importance['Absolute Importance'] = feature_importance.abs().max(axis=1)
    feature_importance = feature_importance.sort_values(by='Absolute Importance', ascending=False)

    print("Feature Importance for Each Class:")
    print(feature_importance[['Absolute Importance'] + class_columns])

    # Tree splits do not depend on feature scale, so the forest uses the raw features.
    model_1 = RandomForestClassifier()
    model_1.fit(X, y)

    feature_importances = model_1.feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances
    })
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
    print("Feature Importance in Random Forest Classifier:")
    print(feature_importance_df)

    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title('Feature Importance in Random Forest Classifier')
    # Most important feature at the top of the chart.
    plt.gca().invert_yaxis()
    plt.show()

feature_importance_examination()