In [36]:
# Emit a greeting (presumably a quick check that the kernel is running).
print("Hello World!")

Hello World!


# Import Data

In [None]:
# Load the raw dataset from a CSV path via the data connector.
# NOTE(review): `CADataConnector` is not imported anywhere in this notebook;
# presumably the hosting notebook environment injects it -- confirm, otherwise
# this cell raises NameError on a fresh kernel.
# NOTE(review): later cells pass `data` to pandas functions, so this is assumed
# to return a pandas DataFrame -- TODO confirm.
data = CADataConnector.read_data(path=".my_folders/Default.csv")

In [None]:
# Preview the first rows of the loaded data to eyeball columns and values.
data.head()

# Import Modules

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline

# Modelling

In [None]:
# Encode the Yes/No columns as 1/0 integers. This is a binary label mapping,
# not a true one-hot encoding; pd.get_dummies below expands whatever
# non-numeric columns remain.
# NOTE(review): every value other than the exact string "Yes" becomes 0. Confirm
# each column listed here really is a Yes/No field (e.g. 'Internet Service'),
# otherwise distinct categories are collapsed into 0.
one_hot_cols = [
    'Default', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines',
    'Internet Service', 'Online Security', 'Online Backup', 'Device Protection',
    'Tech Support', 'Streaming TV', 'Streaming Movies', 'Paperless Billing',
]

for col in one_hot_cols:
    # `data` is modified in place, so only encode columns that are not numeric
    # yet. Without this guard, re-running the cell would map the already
    # encoded 1s to 0 (1 != "Yes") and silently wipe the features and target.
    if not pd.api.types.is_numeric_dtype(data[col]):
        data[col] = data[col].apply(lambda x: 1 if x == "Yes" else 0)

# Create X and y vars: features are every column except the identifier and the
# target, with remaining categorical columns expanded into dummy indicators.
X = pd.get_dummies(data.drop(['Customer ID', 'Default'], axis=1))
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `X[...]` selection: the chained in-place form operates on a temporary object
# and does not update X under pandas Copy-on-Write.
X['Total Charges'] = X['Total Charges'].fillna(0)
y = data['Default']

# Create train test splits (70% train / 30% test, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

# Build pipelines: one scaler + classifier pipeline per algorithm, keyed by a
# short algorithm code that is reused for the hyperparameter grids below.
pipelines = {}
pipelines['rf'] = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=1234))
pipelines['gb'] = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=1234))

# Setup hyperparameters. Keys follow make_pipeline's "<lowercased class name>__<param>"
# naming so GridSearchCV can route each value to the right pipeline step.
hyperparams = {}
hyperparams['rf'] = {'randomforestclassifier__n_estimators':[100,200,300]}
hyperparams['gb'] = {'gradientboostingclassifier__n_estimators':[100,200,300]}

# Fit models: grid-search every pipeline with 10-fold cross-validation on the
# training split (n_jobs=-1 runs the search in parallel) and keep the fitted
# search object per algorithm for evaluation.
fit_models = {}
for algo, pipeline in pipelines.items():
    model = GridSearchCV(pipeline, hyperparams[algo], n_jobs=-1, cv=10)
    model.fit(X_train, y_train)
    fit_models[algo] = model

# Evaluate Models

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve

In [None]:
# Score every tuned model on the held-out test split and print one summary
# line per algorithm with its F1, precision and recall.
for algorithm, model in fit_models.items():
    yhat = model.predict(X_test)
    print(
        "{} Metrics - F1: {}, Precision: {}, Recall: {}".format(
            algorithm,
            # str.format() stringifies each score exactly as str() would.
            *(metric(y_test, yhat) for metric in (f1_score, precision_score, recall_score)),
        )
    )

In [None]:
from matplotlib import pyplot as plt

# Plot one ROC curve per fitted model on the held-out test split.
# roc_curve needs continuous scores to sweep the decision threshold; feeding it
# the hard 0/1 labels from .predict() collapses each curve to a single
# operating point joined to the corners. Use the predicted probability of the
# positive class instead (column 1, since the target is encoded as 0/1).
for algo_key, curve_label in (('rf', 'Random Forest'), ('gb', 'Gradient Boosted')):
    scores = fit_models[algo_key].predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test.values, scores)
    plt.plot(fpr, tpr, marker='.', label=curve_label)

# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()