In [46]:
import pandas as pd
import numpy as np
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification



In [47]:
df_dev  = pd.read_csv("../dados/ready/dev.csv", sep=",")
df_comp = pd.read_csv("../dados/ready/comp.csv", sep=",")

In [48]:
df_dev.columns

Index(['loan_id', 'amount', 'duration', 'payments', 'status',
       'acc_age_at_loan', 'is_account_shared', 'sex', 'age', 'num_inhab',
       'perc_urban_inhab', 'avg_salary', 'enterp_per_1000', 'num_crimes',
       'unemployment', 'unemployment_delta', 'crimes_delta',
       'type_card_classic', 'type_card_gold', 'type_card_junior',
       'type_card_other', 'has_card', 'min_balance', 'avg_balance',
       'max_balance', 'sanctions', 'only_to_na'],
      dtype='object')

In [49]:
def get_x(df: pd.DataFrame) -> None:
    condition = (df.columns != 'status') & (df.columns != 'account_id') & (df.columns != 'loan_id') & (
        df.columns != 'client_id') & (df.columns != 'district_id') & (df.columns != 'disp_id') & (df.columns != 'district_id_y') & (
        df.columns != 'district_id_x') & (df.columns != 'id')
    return df.loc[:, condition]

def get_y(df: pd.DataFrame) -> None:
    return df.status

In [50]:
X = get_x(df_dev)
y = df_dev

In [51]:
X, y = make_classification(n_classes = 2, class_sep = 2,
weights=[0.85, 0.15], n_informative=3, n_redundant=1, flip_y=0,
n_features=25, n_clusters_per_class=1, n_samples = 248, random_state=10)


X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123)
np.bincount(y_train)

array([171,  27])

In [52]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [53]:

model = DecisionTreeClassifier(max_depth=15)
model.fit(X_res, y_res)


pred_competition = model.predict_proba(get_x(df_comp))

prediction = pd.DataFrame({'Id': df_comp['loan_id'], 'Predicted': pred_competition[::, 1]})

predicted = model.predict_proba(X_test)[::, 1]
expected  = y_test
score = roc_auc_score(expected, predicted)

print(f"Score is {score}")
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



Score is 0.95
[[40  0]
 [ 1  9]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        40
           1       1.00      0.90      0.95        10

    accuracy                           0.98        50
   macro avg       0.99      0.95      0.97        50
weighted avg       0.98      0.98      0.98        50



  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [54]:
prediction.to_csv("../predictions/decision-tree_balanced.csv", index = False)