In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.metrics import plot_roc_curve, plot_confusion_matrix, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
%matplotlib inline

In [2]:
# Load the raw churn dataset. The path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('bigml_59c28831336c6604c800002a.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'bigml_59c28831336c6604c800002a.csv'

In [None]:
# Column dtypes and non-null counts, as a first look at the schema.
df.info()

In [None]:
# Quick data-quality report: nulls per column, duplicated rows, class balance.
null_counts = df.isnull().sum()
duplicate_rows = df.loc[df.duplicated()]
target_counts = df.churn.value_counts()

print(f"\n {'Number of null values in every column'.title()} \n {null_counts}")
print(f"\n {'number of duplicate values'.title()} :- {len(duplicate_rows)}")
print(f"\n {'count of each value of target column'.title()} \n {target_counts}")

In [None]:
# Class balance of the target: number of rows for each churn value.
sns.countplot(x='churn', data=df);

In [None]:
# All "charge" columns are highly correlated with their matching "minutes"
# columns, so the charge columns are dropped during data cleaning.
# Area code is a column that has inaccurate data, so it is dropped as well.
#
# Only numeric/boolean columns are passed to corr(): older pandas skipped the
# string columns silently, but pandas >= 2.0 raises on them.
df.select_dtypes(include=['number', 'bool']).corr()

## Data Cleaning

In [None]:
# Replace spaces in the column names with underscores so the name checks below
# (and attribute-style access) work.
df.columns = df.columns.str.replace(' ', '_')

# Drop the redundant "*_charge" columns together with the phone number and
# area code. The drop list is built from the columns that are actually present,
# so re-running this cell is a no-op instead of a KeyError.
cols_to_drop = [
    col for col in df.columns
    if col.endswith('_charge') or col in ('phone_number', 'area_code')
]
df = df.drop(columns=cols_to_drop)

## Defining X and y as Our Features and Target

In [None]:
# Separate the churn label (target) from every other column (features).
X = df.drop(columns=['churn'])
y = df.churn

## Train Test Split

In [None]:
# Split with the default test size. Stratifying on y keeps the churn ratio the
# same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30, stratify=y)

## Grabbing All the Categorical Columns

In [None]:
# Any feature not stored as float64/int64 is treated as categorical.
cat_cols = [
    name
    for name, dtype in X_train.dtypes.items()
    if dtype not in ('float64', 'int64')
]
cat_cols

## Now for the Numerical Columns

In [None]:
# Every feature that was not classified as categorical is numeric.
num_cols = [name for name in X_train if name not in cat_cols]
num_cols

## Setting Up a Categorical Pipe 

In [None]:
# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
# Categories unseen during fit are ignored at transform time instead of raising.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

## Now Let's Do the Same for a Numerical Pipe 

In [None]:
# Numeric branch: mean-impute missing values (NaN), then standardise.
# The imputer previously used missing_values=0, which treats every genuine
# zero in a numeric column as missing and overwrites it with the column mean.
# The default (NaN) only touches values that are actually absent.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

## Creating a Column Transformer 

In [None]:
# Route each column group through its own cleaning pipeline. The transformer
# names ('cat_cleaning', 'num_cleaning') are looked up again when the feature
# importances are extracted, so they must stay as they are.
preprocess = ColumnTransformer([
    ('cat_cleaning', cat_pipe, cat_cols),
    ('num_cleaning', num_pipe, num_cols),
])

In [None]:
# End-to-end pipeline: preprocessing -> SMOTE oversampling -> logistic regression.
# SMOTE is seeded so that re-running the cell reproduces the same synthetic
# samples and therefore the same fitted model.
our_pipe = imbPipeline(steps=[
    ('preprocess', preprocess),
    ('smote', SMOTE(random_state=42)),
    ('log_reg', LogisticRegression())
])

our_pipe.fit(X_train, y_train)

# Functions For Fitting Training Data and Printing Scores

In [None]:
def pipe_model_choice(abrv, model, random_state=42):
    """Wrap ``model`` in the notebook's standard modelling pipeline.

    The pipeline runs the notebook-level ``preprocess`` transformer, then SMOTE
    oversampling, then ``model`` as the final step. The same ``preprocess``
    object is placed in every pipeline built here; it is not copied.

    Parameters
    ----------
    abrv : str
        Name given to the final (estimator) step of the pipeline.
    model : estimator
        Classifier used as the final step.
    random_state : int or None, default 42
        Seed handed to SMOTE so that the synthetic samples are the same on
        every run. Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    imblearn.pipeline.Pipeline
        The assembled pipeline; this function does not fit it.
    """
    model_choice = imbPipeline(steps=[
        ('preprocess', preprocess),
        ('smote', SMOTE(random_state=random_state)),
        (abrv, model),
    ])

    return model_choice

In [None]:
def fit_and_print_scores(model_choice):
    """Fit ``model_choice`` on the training split and print its evaluation.

    Uses the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    Prints a classification report for both splits, the estimator's own
    ``score`` on the test split, and the mean of ``cross_val_score`` on the
    training split. Nothing is returned.

    Parameters
    ----------
    model_choice : estimator
        Estimator exposing ``fit``, ``predict`` and ``score``; it is fitted
        in place.
    """
    fitted = model_choice.fit(X_train, y_train)

    # Predict on train first, then test, before anything is printed.
    reports = (
        ('\t\tThe Train Results', y_train, fitted.predict(X_train)),
        ('\n\t\tThe Test Results', y_test, fitted.predict(X_test)),
    )
    for heading, y_true, y_pred in reports:
        print(heading)
        print(classification_report(y_true, y_pred))

    test_score = model_choice.score(X_test, y_test)
    print('Test Score: ', test_score)

    fold_scores = cross_val_score(model_choice, X_train, y_train)
    print('Cross Validation Score: ', fold_scores.mean())

# BASELINE MODEL

In [None]:
# Baseline: 'stratified' was chosen because it respects class distributions,
# i.e. predictions are drawn at random following the class frequencies seen in fit.
dummy_model = DummyClassifier(strategy="stratified")

fit_and_print_scores(
    pipe_model_choice(abrv='baseline', model=dummy_model)
)

In [None]:
# Confusion matrix of the baseline classifier on the training split.
# NOTE(review): this passes the bare dummy_model, not the pipeline it was
# fitted in, and scores the training data rather than the test data — confirm
# both are intended.
fig, ax = plt.subplots()
fig.suptitle("Dummy Model")
plot_confusion_matrix(dummy_model, X_train, y_train, ax=ax, cmap="plasma");

In [None]:
# ROC curve of the baseline classifier on the training split.
# NOTE(review): uses the bare dummy_model rather than a full pipeline — confirm intended.
plot_roc_curve(dummy_model,X_train,y_train);

# First Simple Model (Logistic Regression)

In [None]:
# Logistic regression with default hyper-parameters, run through the standard
# preprocess -> SMOTE pipeline, as the first real model after the baseline.
fit_and_print_scores(pipe_model_choice('log_reg', LogisticRegression()))

# Decision Tree

In [None]:
# A shallow decision tree: depth capped at 5, and a node needs at least
# 4 samples before it may be split.
dt = DecisionTreeClassifier(
    min_samples_split=4,
    max_depth=5,
)

fit_and_print_scores(pipe_model_choice(abrv='dt', model=dt))

# =================================================

# Random Forest Classifier

In [None]:
# Hand-picked random forest, evaluated through the standard pipeline.
rft = RandomForestClassifier(
    n_estimators=75,
    max_depth=6,
    max_features=11,
    min_samples_split=12,
    min_samples_leaf=5,
)

fit_and_print_scores(pipe_model_choice(abrv='rf', model=rft))
# TODO: pull out feature importance

## Making a RandomForest Gridsearch

In [None]:
# Base pipeline for the random-forest grid search; the step name 'rft' is the
# prefix used by the parameter grid.
# SMOTE and the forest are seeded so the search is repeatable. The forest's own
# verbose logging is left off because it prints on every single fit of the
# search; GridSearchCV's verbose output already reports progress.
our_pipe = imbPipeline(steps=[
    ('preprocess', preprocess),
    ('smote', SMOTE(random_state=42)),
    ('rft', RandomForestClassifier(max_depth=6,
                                   max_features=11,
                                   min_samples_leaf=4,
                                   min_samples_split=11,
                                   n_estimators=58,
                                   n_jobs=-1,
                                   random_state=42)),
])

## Setting Parameters

In [None]:
# Grid of random-forest settings to search. The 'rft__' prefix routes each
# entry to the pipeline step named 'rft'.
params = dict(
    rft__max_depth=[5, 6],
    rft__max_features=[11, 12],
    rft__min_samples_leaf=[4, 5],
    rft__min_samples_split=[10, 11],
    rft__n_estimators=[58, 75],
)

## Instantiate GridSearchCV

In [None]:
# Exhaustive search over `params` with 2-fold cross-validation on all available
# cores. No `scoring` is passed, so candidates are ranked by the pipeline's own
# score method.
grid_pipe = GridSearchCV(our_pipe, params, n_jobs = -1, verbose=3, cv=2) 

## Fit Grid_Pipe to X_train, y_train

In [None]:
# Run the search on the training split: every parameter combination is
# cross-validated, then (GridSearchCV's default refit) the best one is refitted
# on the whole training split.
grid_pipe.fit(X_train, y_train)

## The Best Estimator Found by the Grid Search. This is Our Model

In [None]:
# Show the refitted pipeline with the best mean cross-validated score.
grid_pipe.best_estimator_

In [None]:
# From here on `grid_pipe` refers to the winning pipeline, not the search object.
# getattr keeps the cell safe to re-run: on a second run `grid_pipe` is already
# the pipeline, which has no `best_estimator_`, so it is left untouched.
grid_pipe = getattr(grid_pipe, 'best_estimator_', grid_pipe)

#  Machine Modeling on a Tuesday Afternoon 

## Confusion Matrix for Gridsearch Random Forest

In [None]:
# Test-split predictions from the tuned random-forest pipeline.
y_predict = grid_pipe.predict(X_test)

In [None]:
# Confusion matrix of the tuned random-forest pipeline on the held-out test split.
fig, ax = plt.subplots()
fig.suptitle("Random Forest Model")
plot_confusion_matrix(grid_pipe, X_test, y_test, ax=ax, cmap="plasma");

# ==============================================================

# The Multi-Layered Model

In [None]:
# First layer: three base learners whose predictions feed the second layer.
layer_one_estimators = [
    ('rf_1', RandomForestClassifier(
        n_estimators=58,
        max_features=11,
        min_samples_split=13,
        min_samples_leaf=4,
        random_state=42,
    )),
    ('knn_1', KNeighborsClassifier(n_neighbors=13)),
    ('Ada_1', AdaBoostClassifier(
        n_estimators=50,
        learning_rate=1,
        random_state=42,
    )),
]

# Second layer: a decision tree and a random forest, themselves stacked.
layer_two_estimators = [
    ('dt_2', DecisionTreeClassifier(
        criterion='entropy',
        max_depth=25,
        max_leaf_nodes=15,
        min_samples_split=2,
        min_samples_leaf=2,
        random_state=42,
    )),
    ('rf_2', RandomForestClassifier(
        n_estimators=58,
        max_features=2,
        min_samples_split=11,
        min_samples_leaf=4,
        random_state=42,
    )),
]

# The second-layer stack is combined by a logistic regression ...
layer_two = StackingClassifier(
    estimators=layer_two_estimators,
    final_estimator=LogisticRegression(
        C=0.9,
        penalty='l2',
        solver='saga',
        max_iter=10000,
        warm_start=True,
        random_state=42,
    ),
)

# ... and that whole stack is the final estimator of the first layer, giving
# the multi-layered model.
layers = StackingClassifier(
    estimators=layer_one_estimators,
    final_estimator=layer_two,
)

## Getting the Scores and Fitting 

In [None]:
# Fit and evaluate the stacked ("multi-layered") model inside the standard
# preprocess -> SMOTE pipeline.
fit_and_print_scores(pipe_model_choice('layers', layers))

In [None]:
# Dedicated pipeline for plotting the stacked model's confusion matrices.
# It is fitted here explicitly so that the plots do not depend on `preprocess`
# and `layers` having been fitted as a side effect of an earlier cell. SMOTE is
# seeded for repeatability.
# NOTE: this refits the nested stacking model, which may take a while.
our_layers_pipe = imbPipeline(steps=[
    ('preprocess', preprocess),
    ('smote', SMOTE(random_state=42)),
    ('layers', layers),
])
our_layers_pipe.fit(X_train, y_train);

# Multi-layered Confusion Matrices

In [None]:
# One row-normalised confusion matrix per split, train first and then test.
for title, features, labels in (
    ('Train Confusion Matrix', X_train, y_train),
    ('Test Confusion Matrix', X_test, y_test),
):
    fig, ax = plt.subplots()
    fig.suptitle(title)
    plot_confusion_matrix(our_layers_pipe, features, labels, ax=ax, normalize='true')

# Visualizations

In [None]:
#Getting the features that impact Churn the most
cat_ohe_cols = grid_pipe.named_steps['preprocess'].named_transformers_['cat_cleaning'].named_steps['ohe'].get_feature_names(cat_cols).tolist()
feat_cols = cat_ohe_cols + num_cols
X_train_tf = pd.DataFrame(grid_pipe.named_steps['preprocess'].fit_transform(X_train), columns=feat_cols)
important_feats = pd.Series(grid_pipe.named_steps['rft'].feature_importances_, index=feat_cols)
top_feats = important_feats.sort_values(ascending=False).head(10)

In [None]:
# Horizontal bar chart of the ten most important features, largest at the top.
# plt.Figure(...) only built a detached Figure object, so its figsize never
# applied; plt.subplots creates the figure the plot is actually drawn on.
fig, ax = plt.subplots(figsize=(5, 5))
top_feats.sort_values().plot(kind='barh', ax=ax)
ax.set_xlabel('Level of Importance')
ax.set_ylabel('Features')
ax.set_title('Most Important Features that Affect Churn');

In [None]:
# Recall per model and churn class, used by the comparison chart.
# NOTE(review): the figures are typed in by hand — confirm they match the
# classification reports of the current run.
# The frame is built in one call rather than grown row by row, which also gives
# the numeric columns numeric dtypes.
metrics = pd.DataFrame(
    [
        ('Simple', .78, 0),
        ('Simple', .81, 1),
        ('Complex', .94, 0),
        ('Complex', .85, 1),
    ],
    columns=['Model', 'Recall', 'Churn'],
)

In [None]:
# Grouped bars: recall per model, split by the churn class being predicted.
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(data=metrics, x='Model', y='Recall', hue='Churn',
            palette='rocket', ci=None, ax=ax)
ax.set_title("Metrics Comparison Between More Complex Models")
ax.legend(['Predicting No Churn', 'Predicting Churn'])
ax.set_ylabel("Predictability")
# The values are fractions in the 0-1 range, so PercentFormatter(1) renders
# the ticks as 0%-100%.
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1))

In [None]:
# Per-state churn rate = churned customers / all customers in that state.
# (The duplicated state_totals line and a sort whose result was thrown away
# have been removed.)
state_totals = df.groupby('state').count()['churn']
state_trues = df.groupby('state')['churn'].sum().sort_values()
churn_perc = state_trues / state_totals

# Churn rates in percent for the three states shown in the chart.
# NOTE(review): these figures are hard-coded rather than read from churn_perc —
# confirm they still match whenever the data changes.
areas = pd.DataFrame(
    [('California', 26.4), ('Colorado', 13.6), ('Hawaii', 5.6)],
    columns=['State', 'Churn Rate'],
)

fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x='State', y='Churn Rate', ax=ax, data=areas, palette='rocket', ci=None)
ax.set_title("Churn Rate of Highest, Lowest, and Average State");

In [None]:
# Churned customers only: daytime minutes per number of customer service calls.
df_churn = df.loc[df['churn'] == 1]

fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(data=df_churn, x='customer_service_calls', y='total_day_minutes',
            palette='rocket', ax=ax)
ax.set_title("Customers who Churn")
ax.set_xlabel("Number of Customer Service Calls")
ax.set_ylabel("Minutes Used");

In [None]:
# Daytime minutes for customers who churned versus those who stayed.
# ci=None is the documented way to switch the error bars off and matches the
# other bar charts in this notebook; ci=False is not a documented value.
ax = sns.barplot(x="churn", y="total_day_minutes", data=df, ci=None, palette='rocket')
ax.set_title('Customers that Churn based on their Minutes Usage')
ax.set_xlabel('Churn')
ax.set_ylabel('Minutes Used');