In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20 and
# nothing visible in this notebook uses it; this import fails on current releases.
from sklearn import cross_validation
from sklearn import tree
from sklearn import svm
from sklearn import ensemble
from sklearn import neighbors
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
%matplotlib inline 

In [None]:
# Read the telco churn dataset from the relative data/ directory and
# print its (rows, columns) shape as a quick sanity check.
df = pd.read_csv('data/telco-churn.csv')
print (df.shape)

In [None]:
# Preview the first rows of the loaded frame
df.head()

In [None]:
# Preview the first three rows (the data itself was loaded in an earlier cell)
df.head(3)

In [None]:
# Count customers per churn label and draw the class balance as a bar chart.
# seaborn >= 0.12 rejects positional data arguments, so x/y are passed by
# keyword (the keyword form also works on older releases).
cnt = df["churn"].value_counts()
sns.barplot(x=cnt.index, y=cnt.values)

In [None]:
# Rows whose churn flag is True, expressed as a percentage of all rows.
cnt_True = df.loc[df["churn"] == True, "churn"]
print("Churn Percentage = " + str(len(cnt_True) / len(df) * 100))

In [None]:
# Summary statistics for the frame's columns
df.describe()

### Churn By Area Code

In [None]:
# Stacked bars: number of customers per area code, split by churn outcome.
(
    df.groupby(["area code", "churn"])
    .size()
    .unstack()
    .plot.bar(stacked=True, figsize=(5, 5))
)

### Churn By Customers with International plan

In [None]:
# Stacked bars: customers with/without an international plan, split by churn outcome.
(
    df.groupby(["international plan", "churn"])
    .size()
    .unstack()
    .plot.bar(stacked=True, figsize=(5, 5))
)

### Churn By Customers with Voice mail plan 

In [None]:
# Stacked bars: customers with/without a voice mail plan, split by churn outcome.
(
    df.groupby(["voice mail plan", "churn"])
    .size()
    .unstack()
    .plot.bar(stacked=True, figsize=(5, 5))
)

### Handle Categorical Cols - Label Encode

In [None]:
# Discrete-value integer encoder, shared by the encoding cell below
label_encoder = preprocessing.LabelEncoder()

In [None]:
# Replace each categorical column with discrete integer codes. The single
# shared encoder is refit for every column, so once this cell finishes it
# only holds the mapping of the last column ('churn').
for column in ('state', 'international plan', 'voice mail plan', 'churn'):
    df[column] = label_encoder.fit_transform(df[column])

# Show the resulting dtype of every column
print(df.dtypes)

### Tackling imbalanced classes

In [None]:
# Divide by class (churn was label-encoded to 0/1 above)
df_class_0 = df[df['churn'] == 0]
df_class_1 = df[df['churn'] == 1]

# Class count, taken from the per-class frames themselves. Unpacking
# value_counts() depends on its descending-frequency ordering and would
# silently swap the two counts whenever class 1 is not the minority.
count_class_0, count_class_1 = len(df_class_0), len(df_class_1)

In [None]:
# Undersampling: draw as many class-0 rows as there are class-1 rows, then
# stack both classes. A fixed random_state makes the drawn subset (and every
# number derived from it) identical across Restart & Run All.
df_class_0_under = df_class_0.sample(count_class_1, random_state=42)
df_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_under.churn.value_counts())

df_under.churn.value_counts().plot(kind='bar', title='Count (churn)');

In [None]:
# Oversampling: draw class-1 rows with replacement until they match the
# class-0 count, then stack both classes. A fixed random_state makes the
# resampled rows identical across Restart & Run All.
df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=42)
df_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_over.churn.value_counts())

df_over.churn.value_counts().plot(kind='bar', title='Count (churn)');

In [None]:
# (rows, columns) of the oversampled frame
df_over.shape

In [None]:
# Target vector: the churn column of the oversampled frame.
# Must be taken before 'churn' is dropped from df_over in the next cell.
y = df_over['churn']

In [None]:
# Remove the phone-number column and the target so only predictor columns remain.
# Mutates df_over in place, so this cell cannot be run twice without rebuilding df_over.
df_over.drop(columns=["phone number", "churn"], inplace=True)

In [None]:
# Feature matrix: an alias of df_over (same object, no copy) after the
# 'phone number' and 'churn' columns were dropped.
X = df_over

In [None]:
# creating the train and validation split
from sklearn.model_selection import train_test_split

# Split the ORIGINAL (pre-resampling) rows first. Splitting the oversampled
# frame places copies of the same customer on both sides of the split, so the
# test scores would partly be measured on rows the model was trained on.
features_all = df.drop(columns=["phone number", "churn"])
target_all = df["churn"]
X_train, X_test, y_train, y_test = train_test_split(
    features_all,
    target_all,
    test_size=0.30,
    random_state=42,
    stratify=target_all,  # keep the churn ratio equal in both parts
)

# Balance the training part only: resample class-1 rows with replacement up to
# the number of class-0 rows (churn was label-encoded to 0/1 above). The test
# set keeps its natural class distribution. Relies on df having a unique index,
# which is what read_csv produces by default.
majority_idx = y_train.index[y_train == 0]
minority_idx = (
    y_train[y_train == 1]
    .sample(len(majority_idx), replace=True, random_state=42)
    .index
)
train_idx = majority_idx.append(minority_idx)
X_train, y_train = X_train.loc[train_idx], y_train.loc[train_idx]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,fbeta_score

In [None]:
# pass 1 : Logistic Regression
# Fit on the training split, predict the held-out split, report both metrics.
model1 = LogisticRegression()
y_pred = model1.fit(X_train, y_train).predict(X_test)  # fit() returns the fitted estimator

lr_accuracy = accuracy_score(y_test, y_pred)
lr_fbeta = fbeta_score(y_test, y_pred, beta=0.5)
print("The accuracy score of Logistic Regression model is %s" % lr_accuracy)
print("The fbeta score of Logistic Regression model is %s" % lr_fbeta)

In [None]:
# pass 2 : Decision Tree
# DecisionTreeClassifier is randomized unless random_state is set; pinning it
# makes the reported scores reproducible across re-runs.
model2 = DecisionTreeClassifier(random_state=42)
model2.fit(X_train,y_train)
y_pred = model2.predict(X_test)
print("The accuracy score of DecisionTreeClassifier model is %s" %(accuracy_score(y_test,y_pred)))
print("The fbeta score of DecisionTreeClassifier model is %s" %(fbeta_score(y_test,y_pred,beta=0.5)))

In [None]:
# pass 3 : Random Forest
# The forest draws bootstrap samples and feature subsets at random; a fixed
# random_state makes the reported scores reproducible across re-runs.
model3 = RandomForestClassifier(random_state=42)
model3.fit(X_train,y_train)
y_pred = model3.predict(X_test)
print("The accuracy score of RandomForestClassifier model is %s" %(accuracy_score(y_test,y_pred)))
print("The fbeta score of RandomForestClassifier model is %s" %(fbeta_score(y_test,y_pred,beta=0.5)))

In [None]:
# Import 'make_scorer', 'classification_report', 'confusion_matrix', 'GridSearchCV', and 'ShuffleSplit'
from sklearn.metrics import make_scorer,classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV,ShuffleSplit

In [None]:
def fit_model(X, y, random_state=0):
    """Grid-search a random forest classifier on [X, y] and return the best one.

    Every combination of 'max_depth', 'min_samples_split' and
    'min_samples_leaf' taken from range(2, 12, 2) is scored with the F-beta
    score (beta=0.5) on 10 shuffled splits that each hold out 20% of the data.

    Parameters
    ----------
    X : feature matrix passed to GridSearchCV.fit.
    y : target labels aligned with X.
    random_state : int, default 0
        Seed used for both the shuffled splits and the forest, so repeated
        calls on the same data select the same model.

    Returns
    -------
    The `best_estimator_` of the fitted grid search.
    """

    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits=10, random_state=random_state, test_size=0.2, train_size=None)

    # Create a random forest classifier object (seeded, otherwise the search
    # result changes from run to run)
    model = RandomForestClassifier(random_state=random_state)

    # Create a dictionary for the parameters 'max_depth', 'min_samples_split' and 'min_samples_leaf'
    params = {'max_depth': list(range(2, 12, 2)),
              'min_samples_split': list(range(2, 12, 2)),
              'min_samples_leaf': list(range(2, 12, 2))}

    # Turn fbeta_score into a scoring function using 'make_scorer'
    scoring_fnc = make_scorer(fbeta_score, beta=0.5)

    # Create the grid search cv object --> GridSearchCV()
    # 'scoring' must be passed by keyword: it is keyword-only in current scikit-learn.
    grid = GridSearchCV(model, params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_

In [None]:
# Fit the training data to the model using grid search
reg = fit_model(X_train, y_train)

# Produce the values for 'max_depth', 'min_samples_split' and 'min_samples_leaf'
# (the former bare `reg.score` line was a no-op and has been removed)
best_params = reg.get_params()
for param_name in ('max_depth', 'min_samples_split', 'min_samples_leaf'):
    print("Parameter '{}' is {} for the optimal model.".format(param_name, best_params[param_name]))

In [None]:
# applying best hyperparameter values to RandomForestClassifier
# The values are read from the grid-search winner (`reg`) instead of being
# typed in by hand, so this model always matches what the search selected.
tuned_params = {
    name: reg.get_params()[name]
    for name in ('max_depth', 'min_samples_split', 'min_samples_leaf')
}
rf_clf = RandomForestClassifier(random_state=42, **tuned_params)  # seeded for reproducible scores
rf_clf.fit(X_train,y_train)
y_pred = rf_clf.predict(X_test)
print("The accuracy score of hypertuned RandomForestClassifier model is %s" %(accuracy_score(y_test,y_pred)))
print("The fbeta score of hypertuned RandomForestClassifier model is %s" %(fbeta_score(y_test,y_pred,beta=0.5)))

In [None]:
# Text classification report for the most recent y_pred (the tuned random forest's predictions)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the same predictions (true labels passed first, predictions second)
print(confusion_matrix(y_test, y_pred))

In [None]:
# exhibiting feature importance
# Horizontal bars of the tuned forest's feature importances, drawn on an
# explicit Axes rather than through pyplot's implicit current figure.
features = X.columns
importances = rf_clf.feature_importances_
indices = np.argsort(importances)  # ascending, so the largest bar ends up on top

fig, ax = plt.subplots()
ax.barh(range(len(indices)), importances[indices], color='b', align='center')
ax.set_yticks(range(len(indices)))
ax.set_yticklabels([features[i] for i in indices])
ax.set_title('Feature Importances')
ax.set_xlabel('Relative Importance')
plt.show()