# Project

In this project, the aim is to build a model for predicting churn. Churn is the percentage of customers who stopped using a company's product or service during a certain time frame. Thus, in the given dataset, the label (target variable) is the `Churn` column.

## Steps
- Read the `churn.csv` file and describe it.
- Perform at least 4 different analyses in the Exploratory Data Analysis section.
- Pre-process the dataset to get it ready for ML application. (Check for missing data and handle it; decide whether scaling, feature extraction, etc. are needed.)
- Define appropriate evaluation metric for our case (classification).
- Train and evaluate Logistic Regression, Decision Trees and one other appropriate algorithm which you can choose from scikit-learn library.
- Is there any overfitting or underfitting? Interpret your results and, if there is a problem, try to overcome it in a new section.
- Create confusion matrices for each algorithm and display Accuracy, Recall, Precision and F1-Score values.
- Analyse and compare results of 3 algorithms.
- Select the best-performing model based on the evaluation metric you chose, measured on the test dataset.

# MARK SILAS

# Exploratory Data Analysis

In [None]:
#Importing the necessary libraries for data analysis and visualisation
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
#Read csv
data = pd.read_csv("churn.csv")
data.head()

In [None]:
#Checking for shape of data
data.shape

In [None]:
#viewing columns of data
data.columns

In [None]:
#Viewing the summary descriptive statistics
data.describe()

In [None]:
#Checking data information
data.info()

In [None]:
#Checking the total number of missing values per column
data.isnull().sum()

In [None]:
#Checking for duplicate values
data.duplicated().sum()

In [None]:
#Value counts for churn column
data['Churn'].value_counts()

In [None]:
#Visualising the target variable(Churn)
ax = sns.countplot(x="Churn", data=data)

In [None]:
#Visualising AccountWeeks column
sns.displot(data=data, x="AccountWeeks")

In [None]:
#Value count for Contract renewal
data['ContractRenewal'].value_counts()

In [None]:
#Value counts for DataPlan column
data['DataPlan'].value_counts()

In [None]:
#Visualising DataUsage column
sns.displot(data=data, x="DataUsage")

In [None]:
#Visualising CustServCalls column
sns.displot(data=data, x="CustServCalls", kde=True)

In [None]:
#Value counts for DayMins
pd.qcut(data['DayMins'],5).value_counts()

In [None]:
#Value counts for DayCalls
pd.qcut(data['DayCalls'],5).value_counts()

In [None]:
#Value count for MonthlyCharge
pd.qcut(data['MonthlyCharge'],5).value_counts()

In [None]:
#Visualising OverageFee column
sns.displot(data=data, x="OverageFee", kind="kde")

In [None]:
#Visualising RoamMins column
sns.displot(data=data, x="RoamMins")

In [None]:
#Visualisation for Churn w.r.t AccountWeeks
sns.displot(data=data, x="AccountWeeks", hue="Churn", multiple="stack")

In [None]:
#Pairwise correlation between the columns, drawn as an annotated heatmap on a 20x12 inch figure
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, linewidths=0.2)
fig = plt.gcf()
fig.set_size_inches(20, 12)
plt.show()

In [None]:
#Applying Z-score to find unusual data point values such as outliers.
#Only columns with more than two distinct values are scored: for a two-valued
#column (e.g. the Churn target, presumably 0/1) a z-score only measures how
#rare the minority value is, so thresholding it could discard a whole class
#rather than unusual measurements.
from scipy import stats
import numpy as np
multi_valued = data.loc[:, data.nunique() > 2]
z = np.abs(stats.zscore(multi_valued))
z

In [None]:
#Counting the rows flagged as outliers, i.e. rows where at least one value of z is above 3
flagged_rows = np.where(z > 3)[0]
outliers = list(set(flagged_rows))
len(outliers)

In [None]:
#Removing outliers: keep only the rows in which every value of z is below 3
within_limits = (z < 3).all(axis=1)
Data = data[within_limits]
Data

In [None]:
#Helper that fits a model, then prints its training/testing accuracy and the classification report
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import classification_report, confusion_matrix

def model_testing(md):
    """Fit ``md`` on the training split and report how it performs.

    Prints the accuracy on the training and testing splits, followed by the
    classification report for the testing split. Reads ``X_train``,
    ``y_train``, ``X_test``, ``y_test`` and ``data`` from the notebook's
    global scope.

    Parameters
    ----------
    md : estimator exposing ``fit``, ``score`` and ``predict``.

    Returns
    -------
    The predictions of the fitted model for every row of ``data`` with the
    ``Churn`` column dropped.
    """
    # Fit on the calling thread so that any exception raised by fit()
    # propagates to the caller instead of being lost in an unchecked future.
    md.fit(X_train, y_train)
    print("Training Accuracy: ", md.score(X_train, y_train), '\n')
    print("Testing Accuracy: ", md.score(X_test, y_test), '\n')
    print(classification_report(y_test, md.predict(X_test)))
    # NOTE(review): this predicts on the unfiltered ``data`` frame, which
    # also contains the training rows and the rows removed as outliers.
    # Confirm that this is the intended return value.
    return md.predict(data.drop(columns=['Churn']))

# MACHINE LEARNING MODEL

# Applying Various Classification Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

#Separating the attribute (feature) columns from the Churn label
feature_columns = [
    'AccountWeeks', 'ContractRenewal', 'DataPlan', 'DataUsage', 'CustServCalls',
    'DayMins', 'DayCalls', 'MonthlyCharge', 'OverageFee', 'RoamMins',
]
X = Data[feature_columns]
y = Data['Churn']

In [None]:
#Importing the train/test splitting, cross-validation and standardisation utilities
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#Holding out 20% of the rows as the testing set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
#Creating and fitting one object per model.
#A fixed random_state is passed to every estimator that accepts one, so that
#re-running the notebook reproduces the same scores.
#n_jobs is not passed to LogisticRegression: it has no effect with the
#liblinear solver and only triggers a warning.
log = LogisticRegression(solver="liblinear", C=10, random_state=42).fit(X_train, y_train)
dct = tree.DecisionTreeClassifier(max_depth=4, random_state=42).fit(X_train, y_train)
svc = SVC(C=10, probability=True, random_state=42).fit(X_train, y_train)
rfc = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
knn = KNeighborsClassifier().fit(X_train, y_train)
gbc = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
#Grid search over the C parameter of the logistic regression with 5-fold cross-validation
gsc = GridSearchCV(estimator=log, param_grid={"C": [0.1, 0.01, 0.001, 10, 1]}, cv=5).fit(X_train, y_train)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_test.shape

In [None]:
#Shape of training label
y_train.shape

In [None]:
def trainer(X_train, y_train,):
    """Run ``model_testing`` on every classifier in turn.

    For each of ``log``, ``dct``, ``svc``, ``rfc``, ``knn`` and ``gbc`` the
    value returned by ``model_testing`` is printed, followed by the
    classifier itself. Both parameters are accepted but not used in the body.
    """
    for model in (log, dct, svc, rfc, knn, gbc):
        returned = model_testing(model)
        print(returned)
        print(model)

trainer(X_train, y_train)

# Logistic Regression

In [None]:
#Predicting the probabilities of training attributes
log.predict_proba(X_train)

In [None]:
#Confusion matrix
confusion_matrix(y_train, log.predict(X_train))

In [None]:
#Visualising the confusion matrix of the logistic regression on the training set
cm = confusion_matrix(y_train, log.predict(X_train))
plt.figure(figsize=(5, 3))
#fmt="d" prints the counts as whole numbers; the default ".2g" format would
#abbreviate larger counts in scientific notation (e.g. 2.1e+03)
sns.heatmap(cm, annot=True, fmt="d")
#confusion_matrix puts the true labels on the rows and the predictions on the columns
plt.xlabel("Predicted label")
plt.ylabel("True label")

In [None]:
#ROC curve of the logistic regression on the testing set. The area under the
#curve (AUC) summarises it in one number: the dashed diagonal corresponds to
#an AUC of 0.5, and values closer to 1.0 mean better separation of the classes.
from sklearn.metrics import roc_curve, auc

#Column 1 of predict_proba: probability of the second class in log.classes_
#(presumably Churn = 1)
y_pred_prop = log.predict_proba(X_test)[:, 1]

fpr_log, tpr_log, _ = roc_curve(y_test, y_pred_prop)
roc_auc_log = auc(fpr_log, tpr_log)
curve_label = f'ROC curve (area = {roc_auc_log:0.2f})'

sns.set_style("white")
plt.figure(figsize=(5, 3))
plt.plot(fpr_log, tpr_log, color='darkorange', label=curve_label)
#Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], linestyle='--', color='navy')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18, labelpad=10)
plt.ylabel('True Positive Rate', fontsize=18)
roc_title = plt.title('Receiver Operating Characteristic', fontsize=22)
roc_title.set_position([.5, 1.02])
plt.legend(loc="lower right", fontsize=13)
plt.show()

In [None]:
#Print best parameter after tuning 
gsc.best_params_

In [None]:
#Print best score after tuning
gsc.best_score_

In [None]:
#Score of test data after tuning
gsc.score(X_test, y_test)

# Decision Tree

In [None]:
dct.predict_proba(X_train)

In [None]:
confusion_matrix(y_train, dct.predict(X_train))

In [None]:
#Visualising the confusion matrix of the decision tree on the training set
c_matrix = confusion_matrix(y_train, dct.predict(X_train))
plt.figure(figsize=(5,3))
#fmt="d" prints the counts as whole numbers; the default ".2g" format would
#abbreviate larger counts in scientific notation (e.g. 2.1e+03)
sns.heatmap(c_matrix, annot=True, fmt="d")
#confusion_matrix puts the true labels on the rows and the predictions on the columns
plt.xlabel("Predicted label")
plt.ylabel("True label")

# Support Vector

In [None]:
svc.predict_proba(X_train)

In [None]:
confusion_matrix(y_train, svc.predict(X_train))

In [None]:
#Visualising the confusion matrix of the support vector classifier on the training set
con_matrix = confusion_matrix(y_train, svc.predict(X_train))
plt.figure(figsize=(5, 3))
#fmt="d" prints the counts as whole numbers; the default ".2g" format would
#abbreviate larger counts in scientific notation (e.g. 2.1e+03)
sns.heatmap(con_matrix, annot=True, fmt="d")
#confusion_matrix puts the true labels on the rows and the predictions on the columns
plt.xlabel("Predicted label")
plt.ylabel("True label")

# Random Forest

In [None]:
rfc.predict_proba(X_train)

In [None]:
confusion_matrix(y_train, rfc.predict(X_train))

In [None]:
#Visualising the confusion matrix of the random forest on the training set
con_matrix = confusion_matrix(y_train, rfc.predict(X_train))
plt.figure(figsize=(5, 3))
#fmt="d" prints the counts as whole numbers; the default ".2g" format would
#abbreviate larger counts in scientific notation (e.g. 2.1e+03)
sns.heatmap(con_matrix, annot=True, fmt="d")
#confusion_matrix puts the true labels on the rows and the predictions on the columns
plt.xlabel("Predicted label")
plt.ylabel("True label")

# KNearest Neighbours

In [None]:
knn.predict_proba(X_train)

In [None]:
confusion_matrix(y_train, knn.predict(X_train))

In [None]:
#Visualising the confusion matrix of the k-nearest neighbours classifier on the training set
con_matrix = confusion_matrix(y_train, knn.predict(X_train))
plt.figure(figsize=(5, 3))
#fmt="d" prints the counts as whole numbers; the default ".2g" format would
#abbreviate larger counts in scientific notation (e.g. 2.1e+03)
sns.heatmap(con_matrix, annot=True, fmt="d")
#confusion_matrix puts the true labels on the rows and the predictions on the columns
plt.xlabel("Predicted label")
plt.ylabel("True label")

# Gradient Boosting

In [None]:
gbc.predict_proba(X_train)

In [None]:
confusion_matrix(y_train, gbc.predict(X_train))

In [None]:
#Visualising the confusion matrix of the gradient boosting classifier on the training set
con_matrix = confusion_matrix(y_train, gbc.predict(X_train))
plt.figure(figsize=(5,3))
#fmt="d" prints the counts as whole numbers; the default ".2g" format would
#abbreviate larger counts in scientific notation (e.g. 2.1e+03)
sns.heatmap(con_matrix, annot=True, fmt="d")
#confusion_matrix puts the true labels on the rows and the predictions on the columns
plt.xlabel("Predicted label")
plt.ylabel("True label")

# Evaluation

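There appears to be no severe overfitting or underfitting, as both the training and the testing data performed well in all models. Note, however, that a training accuracy of 1.0 against a testing accuracy of about 0.95 for Random Forest indicates some degree of overfitting.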

The best performing model is Random Forest with:

Training Accuracy: 1.0

Testing Accuracy: 0.952054794520548

Hyperparameter tuning could improve the model further.