In [None]:
import pandas as pd
pd.set_option('display.max_columns', None) # to display all columns
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_score, recall_score, f1_score
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from sklearn import tree
from sklearn.metrics import plot_confusion_matrix

In [None]:
# Load the raw Telco customer-churn export (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [None]:
# Preview the first rows of the raw frame
df.head()

In [None]:
# (number of rows, number of columns) of the raw frame
df.shape

In [None]:
# Column dtypes and non-null counts; used below to spot columns stored with an unexpected type
df.info()

In [None]:
# Summary statistics of the raw frame
df.describe()

#### Key Takeaways:
###### The SeniorCitizen column is stored as an integer. It is the only binary column stored numerically (0/1) rather than as text.
###### TotalCharges is marked as a 'non-null object' and should be a float64

In [None]:
# Show how pandas reports the dtype of a text, a float and an integer column
for example_column in ('PaymentMethod', 'MonthlyCharges', 'SeniorCitizen'):
    print(df[example_column].dtype)

In [None]:
def preliminary_research(df, max_unique=10):
    """Print a one-line summary of every column in ``df``.

    Columns with fewer than ``max_unique`` distinct values have those values
    listed; every other column is described only by its kind of dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.
    max_unique : int, default 10
        Columns with strictly fewer distinct values than this have their
        values printed.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    for col in df.columns:
        unique_vals = df[col].unique()
        if len(unique_vals) < max_unique:
            print("Unique values for column {}: {}".format(col, unique_vals))
        elif pd.api.types.is_numeric_dtype(df[col]):
            # Covers every integer/float width, not only int64 and float64.
            print("column {} is numerical".format(col))
        elif pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
            print("column {} has values string type".format(col))
        else:
            # Any other dtype is reported explicitly instead of being skipped silently.
            print("column {} has dtype {}".format(col, df[col].dtype))

In [None]:
# Summarise every column of the raw frame
preliminary_research(df=df)

In [None]:
# TotalCharges is reported as text instead of a number, so inspect its summary
df.TotalCharges.describe()

## A Breakdown of the Columns

##### 'CustomerID'
* ID numbers which have no impact on churn

##### 'Gender' is binary
* Male
* Female

##### 'SeniorCitizen' is binary
* 0 - no
* 1 - yes

##### 'Partner' is binary
* Yes - customer has a partner
* No - customer does not have a partner

##### 'Dependents' is binary
* Yes - customer has dependent(s)
* No - customer does not have dependents

##### 'Tenure' is numerical
* Represents how long the customer has been using the service

##### 'PhoneService' is binary
* Yes - customer has phone service with company
* No - customer does not have phone service with company

##### 'MultipleLines' is categorical
* Yes - customer has multiple line subscriptions
* No - customer has only 1 line subscription
* No phone service - customer does not have phone service with company

##### 'InternetService' is categorical
* DSL
* Fiber optic
* No - customer does not have internet service with company

##### 'OnlineSecurity' is categorical
* Yes - customer has online security with company
* No - customer does not have online security with company
* No internet service - customer does not have internet service with company

##### 'OnlineBackup' is categorical
* Yes - customer has online backup with company
* No - customer does not have online backup with company
* No internet service - customer does not have internet service with company

##### 'DeviceProtection' is categorical
* Yes - customer has device protection with company
* No - customer does not have device protection with company
* No internet service - customer does not have internet service with company

##### 'TechSupport' is categorical
* Yes - customer has technical support with company
* No - customer does not have technical support with company
* No internet service - customer does not have internet service with company

##### 'StreamingTV' is categorical
* Yes - customer has streaming TV service with company
* No - customer does not have streaming TV service with company
* No internet service - customer does not have internet service with company

##### 'StreamingMovies' is categorical
* Yes - customer has streaming movies service with company
* No - customer does not have streaming movies with company
* No internet service - customer does not have internet service with company

##### 'Contract' is categorical
* Month-to-month - customer is on a no-commitment plan
* One year - customer is on a 1-year contract commitment
* Two year - customer is on a 2-year contract commitment

##### 'PaperlessBilling' is binary
* Yes - only receives bills via email
* No - receives letters in mail with bill

##### 'PaymentMethod' is categorical 
* Electronic check
* Mailed check 
* Bank transfer (automatic)
* Credit card (automatic)

##### 'MonthlyCharges' is numerical
* count    7043.000000
* mean       64.761692
* std        30.090047
* min        18.250000
* 25%        35.500000
* 50%        70.350000
* 75%        89.850000
* max       118.750000

##### 'TotalCharges' -- NEEDS ATTENTION
* Says that dtype is a string. However, this should be numerical...

##### 'Churn' is binary
* No - customer is still an active customer
* Yes - customer has left service

## Work that is cut out for me in the cleaning process:
1. change column names to all lowercase and no spaces
2. convert all "yes / no" binary options to numerical 0s and 1s
3. convert df['totalcharges'] values from objects to floats
4. drop the 'customerid' column

In [None]:
# Normalise every column name to lowercase
df.columns = df.columns.str.lower()

In [None]:
# Two-level "No / Yes" text columns become 0 / 1 flags
yes_no_codes = {'No': 0, 'Yes': 1}
for flag_column in ('partner', 'dependents', 'phoneservice', 'paperlessbilling', 'churn'):
    df[flag_column] = df[flag_column].map(yes_no_codes)

# gender uses its own pair of labels
df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})

# Three-level columns with a natural ordering become 0 / 1 / 2
df['multiplelines'] = df['multiplelines'].map({'No phone service': 0, 'No': 1, 'Yes': 2})
df['contract'] = df['contract'].map({'Month-to-month': 0, 'One year': 1, 'Two year': 2})

In [None]:
# Create dummy variables for the multi-level service/payment columns only.
# 'customerid' and 'totalcharges' are still stored as text at this point; naming the
# columns explicitly keeps get_dummies from turning each of their distinct values into
# its own indicator column and from removing the 'totalcharges' column itself.
dummy_columns = ['internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection',
                 'techsupport', 'streamingtv', 'streamingmovies', 'paymentmethod']
# Only encode columns that are still present so the cell can be re-run safely.
dummy_columns = [col for col in dummy_columns if col in df.columns]
df = pd.get_dummies(df, columns=dummy_columns, drop_first=True, dtype=int)

In [None]:
# Preview the frame after the encoding steps above
df.head()

In [None]:
# convert "No / Yes" binary options to numerical 0s and 1s
binary_columns = ['partner', 'dependents', 'phoneservice', 'paperlessbilling', 'churn']
for col in binary_columns:
    # Skip columns that are already numeric: comparing 0/1 values with 'Yes' would
    # turn every entry (including the churn target) into 0.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].eq('Yes').mul(1)

In [None]:
# convert "Male / Female" binary options to numerical 0s and 1s
# Only convert while the column still holds text; on an already-encoded 0/1 column
# the comparison with 'Female' would overwrite every value with 0.
if not pd.api.types.is_numeric_dtype(df['gender']):
    df['gender'] = df['gender'].eq('Female').mul(1)

In [None]:
# THIS INTENTIALLY PRODUCES AN ERROR CODE TO DEMONSTRATE MY THOUGHT PROCESS
# convert df['totalcharges'] values from objects to a floats
df['totalcharges'] = df.totalcharges.astype(float)

> Notes: Unable to convert to a float. This could be because there are null values. Let's check.

In [None]:
# True if at least one totalcharges entry is null
df.totalcharges.isnull().to_numpy().any()

> Notes: Strange. There appear to be none. Let's check if there are any "blank" values.

In [None]:
# Count the entries that consist of a single space
print((df['totalcharges'] == ' ').sum())

> Notes: There appear to be 11 blank values. Let's replace them with NaN and then convert the column to dtype float.

In [None]:
# Replace blank entries with NaN, then convert the column to float.
# The pattern is anchored so that only empty / whitespace-only strings match; an
# unanchored ' ' would null out any value that merely contains a space.
blank_pattern = r'^\s*$'
df['totalcharges'] = df['totalcharges'].replace(blank_pattern, np.nan, regex=True)
df['totalcharges'] = df['totalcharges'].astype(float)

> Notes: Regardless, let's drop the 11 rows. Dropping 11 rows out of 7k+ will have very little statistical impact on the model and will make processing and modeling substantially easier.

In [None]:
# Remove every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

In [None]:
# The customer identifier carries no predictive information, so remove it
df = df.drop('customerid', axis=1)

## Run a vanilla model based upon the information given. Use that as a benchmark.


In [None]:
# convert all categorical string variables into numerical values using LabelEncoder()
categorical_strings_columns = ['multiplelines', 'internetservice', 'onlinesecurity',
                               'onlinebackup', 'deviceprotection', 'techsupport',
                               'streamingtv', 'streamingmovies', 'contract', 'paymentmethod']

def labelencoder_conversion(df):
    """Label-encode the columns named in ``categorical_strings_columns`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose categorical columns are overwritten with integer codes.

    Returns
    -------
    None
        ``df`` is modified in place and a confirmation message is printed.
        Listed columns that are not present in ``df`` are skipped and reported
        instead of raising a KeyError.
    """
    present = [col for col in categorical_strings_columns if col in df.columns]
    skipped = [col for col in categorical_strings_columns if col not in df.columns]
    for col in present:
        # A fresh encoder per column keeps each column's fitted classes separate.
        df[col] = preprocessing.LabelEncoder().fit_transform(df[col].values)
    if skipped:
        print('Skipped columns that are not in the frame: {}'.format(skipped))
    print('Columns with dtype objects have successfully been encoded as integers.')

labelencoder_conversion(df)

In [None]:
# Re-check dtypes and non-null counts after cleaning and encoding
df.info()

In [None]:
# Separate the target (churn) from the predictor columns
y = df.churn
X = df.drop(columns=['churn'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [None]:
# Baseline model: logistic regression without an intercept and with a very large C
logreg = LogisticRegression(solver='liblinear', C=1e12, fit_intercept=False)

# Train on the training split and keep a handle on the fitted model
model_log = logreg.fit(X_train, y_train)

# Class predictions for the held-out split
y_hat_test = logreg.predict(X_test)

In [None]:
# Decision-function scores (continuous confidence values, not probabilities) for both splits
y_train_score = model_log.decision_function(X_train)
y_test_score = model_log.decision_function(X_test)

# False-positive rate, true-positive rate and thresholds for the training split
train_fpr, train_tpr, thresholds = roc_curve(y_train, y_train_score)

# The same three arrays for the test split
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, y_test_score)

In [None]:
# Hard class predictions for both splits
y_pred_test = logreg.predict(X_test)
y_pred_train = logreg.predict(X_train)

# Print one training/testing pair per metric, separated by blank lines
metric_functions = [('Precision', precision_score), ('Recall', recall_score),
                    ('Accuracy', accuracy_score), ('F1-Score', f1_score)]
for position, (metric_name, metric_function) in enumerate(metric_functions):
    if position:
        print('\n\n')
    print('Training {}: '.format(metric_name), metric_function(y_train, y_pred_train))
    print('Testing {}: '.format(metric_name), metric_function(y_test, y_pred_test))

# Check the AUC on the test split.
# roc_curve needs continuous scores: passing the 0/1 predictions reduces the curve to a
# single operating point and misreports the AUC, so the decision-function values are used.
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    y_test, logreg.decision_function(X_test))
roc_auc = auc(false_positive_rate, true_positive_rate)
print('\nAUC is :{0}'.format(round(roc_auc, 2)))

# Create and print a confusion matrix
print('\nConfusion Matrix')
print('----------------')
pd.crosstab(y_test, y_pred_test, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
# Plot the confusion matrix on the held-out split only, so that it matches the test
# metrics reported above; the full X, y would mix in rows the model was trained on.
plot_confusion_matrix(logreg, X_test, y_test, values_format='.4g')
plt.grid(False)
plt.show()

In [None]:
sns.set_style('darkgrid', {'axes.facecolor': '0.9'})  # seaborn styling

# AUC of the training-split ROC curve
print('AUC: {}'.format(auc(train_fpr, train_tpr)))

lw = 2
tick_positions = [i/20.0 for i in range(21)]
roc_curves = [
    (train_fpr, train_tpr, 'darkorange', 'ROC curve for the trained data'),
    (test_fpr, test_tpr, 'red', 'ROC curve for the test data'),
]

plt.figure(figsize=(10, 8))
for curve_fpr, curve_tpr, curve_color, curve_label in roc_curves:
    plt.plot(curve_fpr, curve_tpr, color=curve_color,
             lw=lw, label=curve_label)

# Diagonal reference line
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks(tick_positions)
plt.xticks(tick_positions)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

> Notes: Beyond accuracy, I think that it would be beneficial to minimize the instances where the model predicts someone who will not churn, but actually leaves the service [churns].

### Goals:
* Increase Recall. Reduce Type II errors (i.e. false negatives). We want to be able to identify everyone who will likely churn and prevent them from churning before the action occurs
* Simplify the model by reducing features (20 features is a lot imo[?!])

### Methods:
1. Address class imbalance (SMOTE)
2. Simplify the model by identifying and reducing unimportant features (LASSO - least absolute shrinkage and selection operator - L1 Regularization)
3. Attempt different types of modeling techniques (KNN, Decision Trees, and Random Forests)

In [None]:
# How many customers fall into each churn class
df.churn.value_counts()

In [None]:
# Oversample the minority class of the training split until both classes are even.
# fit_resample is the supported method name; fit_sample is a legacy alias that newer
# imbalanced-learn releases no longer provide.
smote = SMOTE(random_state=0, sampling_strategy=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
# Preview synthetic sample class distribution
print('Synthetic sample class distribution: \n')
print(pd.Series(y_train_smote).value_counts())

In [None]:
# Comparing a few different ratios of minority class to majority class
ratios = [0.4, 0.5, 0.7, 1]
names = ['0.4', '0.5', '0.7','even']
colors = sns.color_palette('Set2')
lw = 2  # defined before the loop because the reference line below also uses it

plt.figure(figsize=(10, 8))

for n, ratio in enumerate(ratios):
    # Resample the training split; the fixed seed makes the curves reproducible,
    # and fit_resample replaces the legacy fit_sample alias.
    smote = SMOTE(random_state=0, sampling_strategy=ratio)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Fit a model on the resampled data
    logreg = LogisticRegression(fit_intercept=False, C=1e12, solver ='liblinear')
    model_log = logreg.fit(X_train_smote, y_train_smote)
    print(model_log)

    # Predict on the untouched test split
    y_hat_test = logreg.predict(X_test)

    y_score = logreg.decision_function(X_test)

    fpr, tpr, thresholds = roc_curve(y_test, y_score)

    print('AUC for {}: {}'.format(names[n], auc(fpr, tpr)))
    print('-------------------------------------------------------------------------------------')
    plt.plot(fpr, tpr, color=colors[n],
             lw=lw, label='ROC curve {}'.format(names[n]))

# Diagonal reference line
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# NOTE(review): disabled experiment, nothing in this cell executes.
# The bare name OneHotEncoder is not imported in the import cell, so it would need an import before re-enabling.
# One-hot encode the training data and show the resulting DataFrame with proper column names
# ohe = OneHotEncoder()

# ohe.fit(X_train)
# X_train_ohe = ohe.transform(X_train).toarray()

In [None]:
# NOTE(review): disabled; relies on `ohe` and `X_train_ohe`, which are only assigned in commented-out code.
# ohe_df = pd.DataFrame(X_train_ohe, columns=ohe.get_feature_names(X_train.columns))

In [None]:
# Fit a decision tree limited to depth 4 on the original (non-resampled) training split
classifier = DecisionTreeClassifier(max_depth=4, random_state=0)
classifier.fit(X_train, y_train)

In [None]:
# Draw the fitted tree.
# Feature names must come from the matrix the tree was trained on: df.columns also
# contains the 'churn' target, so its names do not correspond to the fitted features.
fig, axes = plt.subplots(nrows = 1,ncols = 1, figsize = (12,5), dpi=300)
tree.plot_tree(classifier, fontsize=3,
               feature_names = list(X_train.columns),
               class_names=np.unique(y).astype('str'),
               filled = True,
               ax = axes)
plt.show()

In [None]:
# predict_proba returns one column per class; roc_curve expects a single score per
# sample, so keep only the probability of the positive class (churn == 1).
y_score = classifier.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_score)

## Visualizing Relationships

In [None]:
# Compare the total revenue collected from customers who stayed against those who churned
plot1 = pd.concat([df.totalcharges, df.churn], axis=1)
f, ax = plt.subplots(figsize=(8, 8))
fig = sns.boxplot(data=plot1, x='churn', y='totalcharges')
plt.title("Total Amount of Revenue Collected vs. Churn");

# Summary statistics of totalcharges for each churn class
df.groupby('churn')[['totalcharges']].describe()

In [None]:
# Compare the monthly bill of customers who stayed against those who churned
plot1 = pd.concat([df.monthlycharges, df.churn], axis=1)
f, ax = plt.subplots(figsize=(8, 8))
fig = sns.boxplot(data=plot1, x='churn', y='monthlycharges')
plt.title("Monthly Rate vs. Churn");

# Summary statistics of monthlycharges for each churn class
df.groupby('churn')[['monthlycharges']].describe()

## Looking at class imbalance

In [None]:
# gender: binary, male or female
# Pass the Series as x=: a positional Series is deprecated and newer seaborn treats it as `data`.
sns.countplot(x=df['gender']);

In [None]:
# SeniorCitizen: binary, 0 (no) or 1 (yes)
# Pass the Series as x=: a positional Series is deprecated and newer seaborn treats it as `data`.
sns.countplot(x=df['seniorcitizen']);
# note: this is a class imbalance

In [None]:
# Pass the Series as x=: a positional Series is deprecated and newer seaborn treats it as `data`.
sns.countplot(x=df['partner']);
# looks good

In [None]:
# Pass the Series as x=: a positional Series is deprecated and newer seaborn treats it as `data`.
sns.countplot(x=df['dependents']);
# class imbalance

In [None]:
# Distribution of customer tenure
sns.distplot(df.tenure);
# The data looks relatively good. The bimodal shape is a little surprising:
# I would have expected the longest tenures to be the least dense.