In [42]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE

In [43]:
application_record = pd.read_csv("application_record.csv")
application_record = application_record[["ID", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "CNT_CHILDREN", "AMT_INCOME_TOTAL", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE", "DAYS_BIRTH", "DAYS_EMPLOYED", "CNT_FAM_MEMBERS"]]

application_record["FLAG_OWN_CAR"] = application_record["FLAG_OWN_CAR"].map(lambda x: x == "Y")
application_record["FLAG_OWN_REALTY"] = application_record["FLAG_OWN_REALTY"].map(lambda x: x == "Y")

application_record["CNT_FAM_MEMBERS"] = application_record["CNT_FAM_MEMBERS"].astype(int)

application_record = pd.get_dummies(application_record, columns=["NAME_INCOME_TYPE"], prefix="INCOME_TYPE")
application_record = pd.get_dummies(application_record, columns=["NAME_EDUCATION_TYPE"], prefix="EDUCATION_TYPE")
application_record = pd.get_dummies(application_record, columns=["NAME_FAMILY_STATUS"], prefix="FAMILY_STATUS")
application_record = pd.get_dummies(application_record, columns=["NAME_HOUSING_TYPE"], prefix="HOUSING_TYPE")

bool_columns = application_record.select_dtypes(include="bool").columns
application_record[bool_columns] = application_record[bool_columns].astype(int)

application_record

Unnamed: 0,ID,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,CNT_FAM_MEMBERS,INCOME_TYPE_Commercial associate,INCOME_TYPE_Pensioner,...,FAMILY_STATUS_Married,FAMILY_STATUS_Separated,FAMILY_STATUS_Single / not married,FAMILY_STATUS_Widow,HOUSING_TYPE_Co-op apartment,HOUSING_TYPE_House / apartment,HOUSING_TYPE_Municipal apartment,HOUSING_TYPE_Office apartment,HOUSING_TYPE_Rented apartment,HOUSING_TYPE_With parents
0,5008804,1,1,0,427500.0,-12005,-4542,2,0,0,...,0,0,0,0,0,0,0,0,1,0
1,5008805,1,1,0,427500.0,-12005,-4542,2,0,0,...,0,0,0,0,0,0,0,0,1,0
2,5008806,1,1,0,112500.0,-21474,-1134,2,0,0,...,1,0,0,0,0,1,0,0,0,0
3,5008808,0,1,0,270000.0,-19110,-3051,1,1,0,...,0,0,1,0,0,1,0,0,0,0
4,5008809,0,1,0,270000.0,-19110,-3051,1,1,0,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438552,6840104,0,1,0,135000.0,-22717,365243,1,0,1,...,0,1,0,0,0,1,0,0,0,0
438553,6840222,0,0,0,103500.0,-15939,-3007,1,0,0,...,0,0,1,0,0,1,0,0,0,0
438554,6841878,0,0,0,54000.0,-8169,-372,1,1,0,...,0,0,1,0,0,0,0,0,0,1
438555,6842765,0,1,0,72000.0,-21673,365243,2,0,1,...,1,0,0,0,0,1,0,0,0,0


In [44]:
credit_record = pd.read_csv("credit_record.csv")
credit_record

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C
...,...,...,...
1048570,5150487,-25,C
1048571,5150487,-26,C
1048572,5150487,-27,C
1048573,5150487,-28,C


In [45]:
def group_is_good(group):
    values = {'C': 1, 'X': 0, '0': -1, '1': -2, '2': -3, '3': -4, '4': -5, '5': -6}
    threshold = -0.75
    return int(group["STATUS"].map(lambda x: values[x]).mean() >= threshold)

is_good_map = credit_record.groupby("ID").apply(group_is_good).reset_index(name="IS_GOOD")

  is_good_map = credit_record.groupby("ID").apply(group_is_good).reset_index(name="IS_GOOD")


In [46]:
application_record = pd.merge(application_record, is_good_map, on="ID")
application_record

Unnamed: 0,ID,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,CNT_FAM_MEMBERS,INCOME_TYPE_Commercial associate,INCOME_TYPE_Pensioner,...,FAMILY_STATUS_Separated,FAMILY_STATUS_Single / not married,FAMILY_STATUS_Widow,HOUSING_TYPE_Co-op apartment,HOUSING_TYPE_House / apartment,HOUSING_TYPE_Municipal apartment,HOUSING_TYPE_Office apartment,HOUSING_TYPE_Rented apartment,HOUSING_TYPE_With parents,IS_GOOD
0,5008804,1,1,0,427500.0,-12005,-4542,2,0,0,...,0,0,0,0,0,0,0,1,0,1
1,5008805,1,1,0,427500.0,-12005,-4542,2,0,0,...,0,0,0,0,0,0,0,1,0,1
2,5008806,1,1,0,112500.0,-21474,-1134,2,0,0,...,0,0,0,0,1,0,0,0,0,1
3,5008808,0,1,0,270000.0,-19110,-3051,1,1,0,...,0,1,0,0,1,0,0,0,0,1
4,5008809,0,1,0,270000.0,-19110,-3051,1,1,0,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,5149828,1,1,0,315000.0,-17348,-2420,2,0,0,...,0,0,0,0,1,0,0,0,0,0
36453,5149834,0,1,0,157500.0,-12387,-1325,2,1,0,...,0,0,0,0,1,0,0,0,0,0
36454,5149838,0,1,0,157500.0,-12387,-1325,2,0,1,...,0,0,0,0,1,0,0,0,0,0
36455,5150049,0,1,0,283500.0,-17958,-655,2,0,0,...,0,0,0,0,1,0,0,0,0,0


In [47]:
application_record[["IS_GOOD"]].mean()

IS_GOOD    0.713964
dtype: float64

In [48]:
X, Y = np.array(application_record.iloc[:, 1:-1]), np.array(application_record.iloc[:, -1:]).ravel()
X = np.hstack((X, np.ones((X.shape[0], 1))))

In [49]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

smote = SMOTE(random_state=42)
X_train_smote, Y_train_smote = smote.fit_resample(X_train, Y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

In [50]:
scores = cross_val_score(RandomForestClassifier(random_state=42), X_train_scaled, Y_train_smote, cv=5)
print(f"Cross-validated accuracy: {scores.mean()}")

KeyboardInterrupt: 

In [None]:
scores = cross_val_score(SVC(random_state=42, max_iter=1000), X_train_scaled, Y_train_smote, cv=5)
print(f"Cross-validated accuracy: {scores.mean()}")



Cross-validated accuracy: 0.5310510941095933


In [None]:
scores = cross_val_score(LogisticRegression(random_state=42, max_iter=2000), X_train, Y_train, cv=5)
print(f"Cross-validated accuracy: {scores.mean()}")

Cross-validated accuracy: 0.7143304970891178


In [None]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42, class_weight='balanced'), param_grid, cv=2)
grid_search.fit(X_train_scaled, Y_train_smote)

Y_pred = grid_search.predict(X_test_scaled)
accuracy = accuracy_score(Y_test, Y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(Y_test, Y_pred))
print(grid_search.best_params_)

KeyboardInterrupt: 

In [None]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}

grid_search = GridSearchCV(SVC(random_state=42, class_weight='balanced'), param_grid, cv=5)
grid_search.fit(X_train_scaled, Y_train_smote)

Y_pred_svm = grid_search.predict(X_test_scaled)
accuracy_svm = accuracy_score(Y_test, Y_pred_svm)

print(f"SVM Accuracy: {accuracy_svm}")
print("SVM Classification Report:")
print(classification_report(Y_test, Y_pred_svm))
print(grid_search.best_params_)



SVM Accuracy: 0.5885902358749314
SVM Classification Report:
              precision    recall  f1-score   support

          -1       0.28      0.28      0.28      3138
           1       0.71      0.71      0.71      7800

    accuracy                           0.59     10938
   macro avg       0.50      0.50      0.50     10938
weighted avg       0.59      0.59      0.59     10938



In [None]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}
grid_search = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'), param_grid, cv=5)
grid_search.fit(X_train_smote, Y_train_smote)

Y_pred_logreg = grid_search.predict(X_test_scaled)
accuracy_logreg = accuracy_score(Y_test, Y_pred_logreg)

print(f"Logistic Regression Accuracy: {accuracy_logreg}")
print("Logistic Regression Classification Report:")
print(classification_report(Y_test, Y_pred_logreg))
print(grid_search.best_params_)

Logistic Regression Accuracy: 0.7131102578167855
Logistic Regression Classification Report:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      3138
           1       0.71      1.00      0.83      7800

    accuracy                           0.71     10938
   macro avg       0.36      0.50      0.42     10938
weighted avg       0.51      0.71      0.59     10938



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
from xgboost import XGBClassifier

model = XGBClassifier(max_depth=12,
                      n_estimators=250,
                      min_child_weight=8, 
                      subsample=0.8, 
                      learning_rate =0.02,    
                      seed=42)

model.fit(X_train, Y_train)
y_predict = model.predict(X_test)

print('Accuracy Score is {:.5}'.format(accuracy_score(Y_test, y_predict)))
print(pd.DataFrame(confusion_matrix(Y_test, y_predict)))

Accuracy Score is 0.72618
     0     1
0  297  2841
1  154  7646
