In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# We're using scikit so load in all the required dependencies
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

In [None]:
# Read the training set, the test set and the sample submission from the data directory
data_dir = "input/gmsc"
train, test, submission = map(
    pd.read_csv,
    (
        f"{data_dir}/cs-training.csv",
        f"{data_dir}/cs-test.csv",
        f"{data_dir}/sampleEntry.csv",
    ),
)

In [None]:
# Report the (rows, columns) dimensions, then the per-column dtype / non-null summary
n_rows, n_cols = train.shape
print("shape of dataframe is : ", (n_rows, n_cols))
train.info()

In [None]:
# Give the unnamed leading column of both frames the name "ID".
# The rename is applied in place so `train` and `test` themselves are updated.
id_column_rename = {'Unnamed: 0': 'ID'}
data = [train, test]
for df in data:
    df.rename(columns=id_column_rename, inplace=True)

In [None]:
# Impute missing values in the training frame:
#   - MonthlyIncome      -> column mean
#   - NumberOfDependents -> most frequent value (first mode)
# The filled column is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form raises a FutureWarning in pandas 2.x
# and does not update the parent frame once Copy-on-Write is active.
train['MonthlyIncome'] = train['MonthlyIncome'].fillna(train['MonthlyIncome'].mean())
train['NumberOfDependents'] = train['NumberOfDependents'].fillna(train['NumberOfDependents'].mode()[0])

# Remaining null count per column
train.isnull().sum()

In [None]:
# Impute missing values in the test frame:
#   - MonthlyIncome      -> column mean
#   - NumberOfDependents -> most frequent value (first mode)
# The filled column is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form raises a FutureWarning in pandas 2.x
# and does not update the parent frame once Copy-on-Write is active.
# NOTE(review): the fill values are computed from the test frame itself; consider
# reusing the statistics of the training frame so both sets are imputed identically.
test['MonthlyIncome'] = test['MonthlyIncome'].fillna(test['MonthlyIncome'].mean())
test['NumberOfDependents'] = test['NumberOfDependents'].fillna(test['NumberOfDependents'].mode()[0])

# Remaining null count per column
test.isnull().sum()

In [None]:
# Build feature matrices and label vectors.
# The row identifier and the target column are excluded from the features;
# the target column alone becomes the label.
non_feature_columns = ['ID', 'SeriousDlqin2yrs']
target_column = 'SeriousDlqin2yrs'

X = train.drop(columns=non_feature_columns)
y = train[target_column]

X_test = test.drop(columns=non_feature_columns)
y_test = test[target_column]

In [None]:
# Standardise the int64/float64 feature columns.
# The scaler is fitted on the training features only; the same fitted
# transform is then applied to the test features.
num_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
scale = StandardScaler()
X[num_columns] = scale.fit_transform(X[num_columns])
X_test[num_columns] = scale.transform(X_test[num_columns])

In [None]:
# Show the class balance (relative frequency of each label value) for both label vectors.
# Only the last expression of a cell is displayed automatically, so the training
# distribution is printed explicitly; otherwise its result is silently discarded.
print(y.value_counts(normalize=True))
# NOTE(review): value_counts skips NaN by default, so this output is empty if the
# test file carries no labels — confirm against the data.
y_test.value_counts(normalize=True)

In [None]:
# Decision tree capped at 10 leaf nodes with a fixed seed.
# (Author's rationale: 10 leaves was judged to predict well without overfitting.)
tree_params = {"max_leaf_nodes": 10, "random_state": 42}
model = DecisionTreeClassifier(**tree_params)
model.fit(X, y)

In [None]:
# Stratified 10-fold cross-validation (shuffled, fixed seed) of the model on the
# training data, scored by ROC AUC; the cell output is the mean score over the folds.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)
roc_auc_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
np.mean(roc_auc_scores)

In [None]:
# Predictions from the fitted model.
# Column index 1 of predict_proba is kept as the score for each row.
train_proba = model.predict_proba(X)
test_proba = model.predict_proba(X_test)

y_pred = model.predict(X)                 # hard labels on the training features
y_pred_proba = train_proba[:, 1]          # scores on the training features
y_test_pred_proba = test_proba[:, 1]      # scores on the test features

In [None]:
# ROC AUC of the scores computed on the training features against the training labels
roc_auc_score(y_true=y, y_score=y_pred_proba)

In [None]:
# Plot the ROC curve
def plot_roc(y, y_pred_proba):
    """Draw the ROC curve for the given labels and scores on the current axes.

    Parameters
    ----------
    y : true labels, forwarded to ``roc_curve``.
    y_pred_proba : predicted scores, forwarded to ``roc_curve``.

    The curve is drawn together with a dashed black diagonal reference line,
    axis labels and a title. Nothing is returned.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y, y_pred_proba)
    ax = plt.gca()
    ax.plot(false_pos_rate, true_pos_rate)
    # Diagonal reference: false positive rate plotted against itself
    ax.plot(false_pos_rate, false_pos_rate, linestyle='--', color='k')
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True positive rate')
    ax.set_title('ROC curve')

In [None]:
# ROC curve of the training-set scores against the training labels
plot_roc(y=y, y_pred_proba=y_pred_proba)

In [None]:
# Text report comparing the training labels with the hard predictions on the training features
report_text = classification_report(y_true=y, y_pred=y_pred)
print(report_text)

In [None]:
# Render the fitted decision tree on a large figure, with nodes colour-filled
# and class names shown.
from sklearn import tree

fig = plt.figure(figsize=(25, 20))
# feature_names is passed as a plain list: some scikit-learn releases validate this
# parameter strictly and reject a pandas Index.
# The trailing semicolon suppresses the long list of annotation reprs in the cell output.
tree.plot_tree(
    model,
    feature_names=list(X.columns),
    class_names=True,
    filled=True,
);
