# Customer Churn in Telecom Companies
![power_bi_report.JPG](attachment:power_bi_report.JPG)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Notebook-wide plot style and pandas display limits.
plt.style.use('fivethirtyeight')
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)
# Use the full option key: an abbreviated key such as 'display.float' only resolves
# while it matches exactly one registered pandas option.
pd.set_option('display.float_format', '{:.4f}'.format)

In [None]:
# Load the Telco customer-churn CSV from the Kaggle input directory and preview it.
csv_path = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
data = pd.read_csv(csv_path)
data.head()

# 1. Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics from pandas' default `describe()`.
data.describe()

In [None]:
# Column names, dtypes and non-null counts.
data.info()

## `customerID`

In [None]:
# Number of distinct customer IDs.
data.customerID.nunique()

The `customerID` is unique for each customer, so it carries no predictive signal and will not be useful to our model.

## `Churn`: Target variable

In [None]:
# Row count for each value of the target column.
data.Churn.value_counts()

In [None]:
# Bar chart of the target's class counts.
sns.countplot(x='Churn', data=data)

## `gender`, `SeniorCitizen`, `Partner`, `Dependents`, `PhoneService`, `PaperlessBilling`
- `gender`: Whether the customer is a male or a female
- `SeniorCitizen`: Whether the customer is a senior citizen or not (1, 0)
- `Partner`: Whether the customer has a partner or not (Yes, No)
- `Dependents`: Whether the customer has dependents or not (Yes, No)
- `PhoneService`: Whether the customer has a phone service or not (Yes, No)
- `PaperlessBilling`: Whether the customer has paperless billing or not (Yes, No)

In [None]:
# One count plot per demographic/billing attribute, split by churn, on a 3x2 grid.
plt.figure(figsize=(15, 15))

panel_columns = ['gender', 'SeniorCitizen', 'Partner',
                 'Dependents', 'PhoneService', 'PaperlessBilling']
for position, column_name in enumerate(panel_columns, start=1):
    plt.subplot(3, 2, position)
    sns.countplot(x=column_name, data=data, hue='Churn')

It seems that the gender column doesn't have a big effect on the churn rate.
- Churn: 50.73% Males, 49.26% Females
- Not Churn: 50.24% Males, 49.75% Females

Senior citizens are more likely to churn.

## `tenure`, `MonthlyCharges`, `TotalCharges`
- `tenure`: The number of months the customer has stayed with the company
- `MonthlyCharges`: The amount charged to the customer monthly
- `TotalCharges`: The total amount charged to the customer

In [None]:
# Frequency of each distinct tenure value.
data.tenure.value_counts()

In [None]:
# Largest tenure value present in the data.
data.tenure.max()

In [None]:
# Single-space entries stand in for missing totals: turn them into NaN, then cast the
# whole column to float in one chained expression.
data['TotalCharges'] = data['TotalCharges'].replace(' ', np.nan).astype(float)

In [None]:
# Check the dtype of TotalCharges.
data.TotalCharges.dtype

In [None]:
# Overlaid histograms (stayed vs. churned) for the numeric columns on a 3x2 grid.
plt.figure(figsize=(14, 14))

stayed = data[data.Churn == 'No']
churned = data[data.Churn == 'Yes']

# Each panel: (how to pick the series from a subset, number of bins, x-axis label).
# The second panel plots the distribution of per-tenure-value row counts.
panels = [
    (lambda subset: subset.tenure, 35, 'Number of months with company'),
    (lambda subset: subset.tenure.value_counts(), 50, None),
    (lambda subset: subset.MonthlyCharges, 35, 'Monthly Payment'),
    (lambda subset: subset.TotalCharges, 35, 'Total Payment'),
]

for position, (pick, bin_count, x_label) in enumerate(panels, start=1):
    plt.subplot(3, 2, position)
    pick(stayed).hist(bins=bin_count, alpha=0.6, label='Churn=No')
    pick(churned).hist(bins=bin_count, alpha=0.6, label='Churn=Yes')
    if x_label is not None:
        plt.xlabel(x_label)
    plt.legend()

## `OnlineSecurity`, `OnlineBackup`, `InternetService`, `MultipleLines`, `DeviceProtection`, `TechSupport`

- `OnlineSecurity`: Whether the customer has online security or not (Yes, No, No internet service)
- `OnlineBackup`: Whether the customer has online backup or not (Yes, No, No internet service)
- `InternetService`: Customer's internet service provider (DSL, Fiber optic, No)
- `MultipleLines`: Whether the customer has Multiple Lines or not (Yes, No, No phone service)
- `DeviceProtection`: Whether the customer has device protection or not (Yes, No, No internet service)
- `TechSupport`: Whether the customer has tech support or not (Yes, No, No internet service)

In [None]:
# One count plot per service attribute, split by churn, on a 3x2 grid.
plt.figure(figsize=(15, 15))

panel_columns = ['OnlineBackup', 'OnlineSecurity', 'InternetService',
                 'MultipleLines', 'DeviceProtection', 'TechSupport']
for position, column_name in enumerate(panel_columns, start=1):
    plt.subplot(3, 2, position)
    sns.countplot(x=column_name, data=data, hue='Churn')

## `StreamingMovies`, `StreamingTV`, `PaymentMethod`, `Contract`
- `StreamingMovies`: Whether the customer has streaming movies or not (Yes, No, No internet service)
- `StreamingTV`: Whether the customer has streaming TV or not (Yes, No, No internet service)
- `PaymentMethod`: The customer's payment method (Electronic check, Mailed check, Bank Transfer (automatic), Credit card (automatic))
- `Contract`: The contract term of the customer (Month-to-month, One year, Two year)

In [None]:
# Count plots split by churn; the panels with long category names get rotated labels.
plt.figure(figsize=(15, 18))

# (column, rotate tick labels?) for each panel, in grid order.
panels = [('StreamingMovies', False), ('StreamingTV', False),
          ('PaymentMethod', True), ('Contract', True)]
for position, (column_name, rotate_labels) in enumerate(panels, start=1):
    plt.subplot(3, 2, position)
    g = sns.countplot(x=column_name, data=data, hue='Churn')
    if rotate_labels:
        g.set_xticklabels(g.get_xticklabels(), rotation=45)

# 2. Feature Engineering

## Missing Values

In [None]:
# Missing-value count per column.
data.isna().sum()

In [None]:
# Impute missing TotalCharges with the column median.
data['TotalCharges'] = data.TotalCharges.fillna(value=data.TotalCharges.median())

In [None]:
# Median of TotalCharges (if cells run in order, this is evaluated after the imputation).
data.TotalCharges.median()

## `OnlineSecurity`, `OnlineBackup`, `InternetService`, `DeviceProtection`, `TechSupport`, `StreamingTV`, `StreamingMovies`

In [None]:
# In the internet add-on columns, treat "No internet service" the same as "No".
columns = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
           'TechSupport', 'StreamingTV', 'StreamingMovies']
data[columns] = data[columns].replace({'No internet service': 'No'})

In [None]:
# Preview the first rows.
data.head()

## Binary Columns

In [None]:
# Collect the names of all columns that hold exactly two distinct values.
unique_counts = data.nunique()
binary_columns = [name for name, count in unique_counts.items() if count == 2]
len(binary_columns)

In [None]:
# Encode the Yes/No columns as 1/0 in one pass, then encode gender as Male=1, Female=0.
columns = ['Partner', 'Dependents', 'PhoneService', 'OnlineSecurity', 'OnlineBackup',
           'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
           'PaperlessBilling', 'Churn']
yes_no_codes = {'Yes': 1, 'No': 0}
data[columns] = data[columns].apply(lambda series: series.map(yes_no_codes))

data['gender'] = data['gender'].map({'Male': 1, 'Female': 0})

In [None]:
# Print the distinct values of every column listed in `binary_columns`.
for col in binary_columns:
    print(data[col].unique())

## `customerID`
We are going to drop the customer ID column.

In [None]:
# Remove the ID column from the frame in place, keeping the removed values around.
customerID = data.pop('customerID')

## `tenure`

In [None]:
def tenure_lab(period):
    """Map a tenure in months to an ordinal bucket label.

    Buckets: ``<= 12`` -> 1, ``(12, 24]`` -> 2, ``(24, 48]`` -> 3,
    ``(48, 60]`` -> 4, ``> 60`` -> 5.

    A value that compares false against every bound (e.g. NaN) matches no
    bucket and yields ``None``.
    """
    # NOTE(review): months 25-36 and 37-48 share label 3, so the labels only run
    # from 1 to 5. Confirm the merged 24-48 bucket is intended rather than a
    # missed increment.
    if period <= 12:
        return 1
    if 12 < period <= 24:
        return 2
    if 24 < period <= 48:
        return 3
    if 48 < period <= 60:
        return 4
    if period > 60:
        return 5
    
# Overwrite the tenure column with the bucket label returned by tenure_lab for each value.
data['tenure'] = data.tenure.apply(tenure_lab)

In [None]:
# Row count per tenure value.
data.tenure.value_counts()

## `MultipleLines`, `InternetService`, `Contract`, `PaymentMethod`
We are going to transform these columns, together with the bucketed `tenure`, into dummy variables.

In [None]:
# One-hot encode the listed columns into a new frame. `drop_first=True` omits one
# level per column, and the indicator columns are stored as uint8.
dummy_col = ['MultipleLines', 'InternetService', 'Contract', 'PaymentMethod', 'tenure']
data_1 = pd.get_dummies(data, columns=dummy_col, drop_first=True, dtype='uint8')

In [None]:
# Preview the encoded frame.
data_1.head()

## Correlation Analysis

In [None]:
# Heatmap of the pairwise correlations between the columns of data_1 (cells not annotated).
plt.figure(figsize=(20, 18))
sns.heatmap(data_1.corr(), annot=False)

In [None]:
# Correlation of every encoded feature with the target. The target is taken from the
# same frame as the features, so the two are guaranteed to share one index.
churn_corr = data_1.drop('Churn', axis=1).corrwith(data_1['Churn'])
churn_corr.plot(kind='barh', figsize=(10, 7))

## Checking for duplicate rows and columns

In [None]:
print(data_1.shape)

# Remove duplicate features (keep the first occurrence). Selecting columns with a
# boolean mask, instead of transposing the frame twice, keeps each column's own dtype
# rather than coercing every column to one shared dtype.
duplicate_feature_mask = data_1.T.duplicated().to_numpy()
data_1 = data_1.loc[:, ~duplicate_feature_mask]

# Remove duplicate rows (keep the first occurrence).
data_1 = data_1.drop_duplicates()

print(data_1.shape)

# 3. Data Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Separate the target from the predictors.
y = data_1['Churn']
X = data_1.drop(columns='Churn')

# Hold out 30% of the rows for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Min-max scaling: fit on the training split only, then apply the same transform to
# the training split, the test split and the full feature matrix.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
X_std = scaler.transform(X)

# 4. Model Building
## Base line model

We have an imbalanced problem here. If we predict that all our customers will not churn, we will have an accuracy of `73.84%`.

In [None]:
# Share of non-churners (label 0) in the test split, i.e. the accuracy of always
# predicting "stay". A boolean mean avoids integer-key lookup on `value_counts()` and
# dividing by the `shape` tuple, and gives 0.0 instead of a KeyError if the class is absent.
(y_test == 0).mean()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

def _print_split_report(title, y_true, y_pred):
    """Print the confusion matrix, accuracy and classification report for one split."""
    clf_report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True))
    print(f"{title}: \n===============================")
    print(f"CONFUSION MATRIX:\n{confusion_matrix(y_true, y_pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(y_true, y_pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")


def evaluate(model, X_train, X_test, y_train, y_test):
    """Print classification metrics of a fitted model on the train and test splits.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, X_test : feature matrices passed to ``model.predict``.
    y_train, y_test : true labels matching ``X_train`` / ``X_test``.

    Returns
    -------
    None. The training report is printed first, then the testing report.
    """
    _print_split_report("TRAINING RESULTS", y_train, model.predict(X_train))
    _print_split_report("TESTING RESULTS", y_test, model.predict(X_test))

## 1. Logistic Regression

In [None]:
# L1-penalised logistic regression (liblinear solver), fitted on the scaled training
# features and then reported on both splits.
lr_clf = LogisticRegression(solver='liblinear', penalty='l1')
lr_clf.fit(X_train_std, y_train)

evaluate(lr_clf, X_train_std, X_test_std, y_train, y_test)

### Cross Validation Score for Logistic Regression

In [None]:
# 3-fold cross-validated F1 over the full scaled feature matrix.
# NOTE(review): X_std comes from a scaler fitted on the training split only, not per fold.
scores = cross_val_score(lr_clf, X_std, y, cv=3, scoring='f1')
print(scores)
print(f"Logistic Regression f1_score: {scores.mean() * 100:.2f}% +/- ({scores.std() * 100:.2f})")

In [None]:
# Confusion matrix of the logistic regression on the test split, labelled Stay/Churn.
disp = ConfusionMatrixDisplay.from_estimator(
    lr_clf, X_test_std, y_test, 
    cmap='Blues', values_format='d', 
    display_labels=['Stay', 'Churn']
)

In [None]:
# ROC curve of the logistic regression on the test split.
disp = RocCurveDisplay.from_estimator(
    lr_clf, X_test_std, y_test
)

## 2. Random Forest Classifier

In [None]:
# Class shares in the training split. Boolean means avoid integer-key lookup on
# `value_counts()` and dividing by the `shape` tuple, and give 0.0 instead of a
# KeyError if a class is absent.
stay = (y_train == 0).mean()
churn = (y_train == 1).mean()

print(f"Staying Rate: {stay * 100:.2f}%")
print(f"Churning Rate: {churn * 100 :.2f}%")

In [None]:
# Baseline random forest with 1000 trees, fitted on the unscaled training features.
# NOTE(review): no random_state is passed, so results can differ between runs.
rand_forest = RandomForestClassifier(n_estimators=1000)
rand_forest.fit(X_train, y_train)

evaluate(rand_forest, X_train, X_test, y_train, y_test)

In [None]:
# Base estimator for the search; its n_estimators is overridden by the grid below.
rf_clf = RandomForestClassifier(n_estimators=100)

# Hyperparameter grid, searched exhaustively.
param_grid = {
    'n_estimators':[550, 600, 650],
    'max_depth':[3, 5, 7, 10, 15, None], 
    'min_samples_split':[2, 3, 10], 
    'min_samples_leaf':[1, 3, 5, 7, 10], 
    'criterion':["gini", "entropy"]
}

# 3-fold grid search scored on F1, using all available cores.
rf_grid_cv = GridSearchCV(rf_clf, param_grid, scoring="f1", n_jobs=-1, verbose=1, cv=3)
rf_grid_cv.fit(X_train, y_train)

best_params = rf_grid_cv.best_params_
print(f"Best parameters: {best_params}")

# Fit a fresh forest with the selected parameters and report train/test metrics.
rf_clf = RandomForestClassifier(**best_params)
rf_clf.fit(X_train, y_train)
evaluate(rf_clf, X_train, X_test, y_train, y_test)

### Cross Validation Score for Random Forest Classifier

In [None]:
# 3-fold cross-validated F1 of the tuned random forest on the full (unscaled) data.
scores = cross_val_score(rf_clf, X, y, cv=3, scoring='f1')
print(scores)
print(f"Random Forest F1_score: {scores.mean() * 100:.2f}% +/- ({scores.std() * 100:.2f})")

In [None]:
# Confusion matrix of the tuned random forest on the test split, labelled Stay/Churn.
disp = ConfusionMatrixDisplay.from_estimator(
    rf_clf, X_test, y_test, 
    cmap='Blues', values_format='d', 
    display_labels=['Stay', 'Churn']
)

In [None]:
# Test-split ROC curves of logistic regression and random forest on shared axes.
# Each model receives the feature matrix it was trained on (scaled vs. unscaled).
disp = RocCurveDisplay.from_estimator(lr_clf, X_test_std, y_test)
RocCurveDisplay.from_estimator(rf_clf, X_test, y_test, ax=disp.ax_)

## 3. Support Vector Machine

In [None]:
# Baseline RBF-kernel SVM, fitted on the scaled training features.
svm_clf = SVC(kernel='rbf')
svm_clf.fit(X_train_std, y_train)

evaluate(svm_clf, X_train_std, X_test_std, y_train, y_test)

In [None]:
# Search grid for the RBF-kernel SVM.
param_grid = {
    'C':[0.001, 0.01, 0.1, 1, 10, 100],
    'gamma':[1, 0.1, 0.01, 0.001],
    'kernel':['rbf']
}

# 5-fold grid search. Score on F1, like the other grid searches and every
# cross-validation cell in this notebook, instead of GridSearchCV's default
# (the estimator's own score), so model selection targets the churn class.
grid_cv = GridSearchCV(SVC(), param_grid, scoring='f1', verbose=1, cv=5, n_jobs=-1)
grid_cv.fit(X_train_std, y_train)

best_params = grid_cv.best_params_
print(f"Best parameters: {best_params}")

# Fit a fresh SVM with the selected parameters and report train/test metrics.
svm_clf = SVC(**best_params)
svm_clf.fit(X_train_std, y_train)
evaluate(svm_clf, X_train_std, X_test_std, y_train, y_test)

### Cross Validation Score for Support Vector Machine

In [None]:
# 3-fold cross-validated F1 of the tuned SVM over the full scaled feature matrix.
# NOTE(review): X_std comes from a scaler fitted on the training split only, not per fold.
scores = cross_val_score(svm_clf, X_std, y, cv=3, scoring='f1')
print(scores)
print(f"Support Vector Machine f1_score: {scores.mean() * 100:.2f}% +/- ({scores.std() * 100:.2f})")

In [None]:
# Confusion matrix of the tuned SVM on the test split, labelled Stay/Churn.
disp = ConfusionMatrixDisplay.from_estimator(
    svm_clf, X_test_std, y_test, 
    cmap='Blues', values_format='d',
    display_labels=['Stay', 'Churn']
)

In [None]:
# Test-split ROC curves of logistic regression, random forest and SVM on shared axes.
# Each model receives the feature matrix it was trained on (scaled vs. unscaled).
disp = RocCurveDisplay.from_estimator(lr_clf, X_test_std, y_test)
RocCurveDisplay.from_estimator(rf_clf, X_test, y_test, ax=disp.ax_)
RocCurveDisplay.from_estimator(svm_clf, X_test_std, y_test, ax=disp.ax_)

## 4. XGBoost Classifier

In [None]:
# Baseline XGBoost classifier with default parameters, fitted on the unscaled training features.
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
evaluate(xgb_clf, X_train, X_test, y_train, y_test)

In [None]:
# Hyperparameter grid, searched exhaustively. base_score has a single candidate value,
# so it is held fixed.
hyperparameter_grid = {
    'n_estimators': [75, 90, 100, 125], 
    'learning_rate' : [0.09, 0.1, 0.2], 
    'base_score' : [0.15],
    'colsample_bytree':[0.85, 0.95, 1],
    'colsample_bylevel':[0.85, 0.95, 1],
    'colsample_bynode':[0.85, 0.95, 1]
}

# 3-fold grid search scored on F1, using all available cores.
xgb_clf = XGBClassifier()
xgb_cv = GridSearchCV(
    estimator=xgb_clf, 
    param_grid=hyperparameter_grid, 
    cv=3, 
    scoring='f1', 
    n_jobs =-1, 
    verbose=1
)

xgb_cv.fit(X_train, y_train)

best_params = xgb_cv.best_params_
print(f"Best parameters: {best_params}")

# Fit a fresh classifier with the selected parameters and report train/test metrics.
xgb_clf = XGBClassifier(**best_params)
xgb_clf.fit(X_train, y_train)
evaluate(xgb_clf, X_train, X_test, y_train, y_test)

### Cross Validation Score For XGBoost

In [None]:
# 3-fold cross-validated F1 of the tuned XGBoost model on the full (unscaled) data.
scores = cross_val_score(xgb_clf, X, y, cv=3, scoring='f1')
print(scores)
print(f"XGBoost F1_score: {scores.mean() * 100:.2f}% +/- ({scores.std() * 100:.2f})")

In [None]:
# Confusion matrix of the tuned XGBoost model on the test split, labelled Stay/Churn.
disp = ConfusionMatrixDisplay.from_estimator(
    xgb_clf, X_test, y_test, 
    cmap='Blues', values_format='d', 
    display_labels=['Stay', 'Churn']
)

In [None]:
# Test-split ROC curves of all four models on shared axes.
# Each model receives the feature matrix it was trained on (scaled vs. unscaled).
disp = RocCurveDisplay.from_estimator(lr_clf, X_test_std, y_test)
RocCurveDisplay.from_estimator(rf_clf, X_test, y_test, ax=disp.ax_)
RocCurveDisplay.from_estimator(svm_clf, X_test_std, y_test, ax=disp.ax_)
RocCurveDisplay.from_estimator(xgb_clf, X_test, y_test, ax=disp.ax_)

## Feature Importance

In [None]:
def feature_imp(df, model):
    """Return the model's feature importances as a table, most important first.

    Pairs the column labels of ``df`` with ``model.feature_importances_``
    positionally and returns a DataFrame with the columns ``feature`` and
    ``importance``, sorted by ``importance`` in descending order.
    """
    importance_table = pd.DataFrame(
        {"feature": df.columns, "importance": model.feature_importances_}
    )
    return importance_table.sort_values(by="importance", ascending=False)

In [None]:
# Horizontal bar chart of the XGBoost feature importances, indexed by feature name.
df = feature_imp(data_1.drop('Churn', axis=1), xgb_clf)
df.set_index('feature', inplace=True)
df.plot(kind='barh', figsize=(10, 6))
plt.title('Feature Importance according to XGBoost')

In [None]:
# Horizontal bar chart of the random forest feature importances, indexed by feature name.
df = feature_imp(data_1.drop('Churn', axis=1), rf_clf)
df.set_index('feature', inplace=True)
df.plot(kind='barh', figsize=(10, 6))
plt.title('Feature Importance according to Random Forest')

In [None]:
# Recursive feature elimination down to a single feature, which yields a full ranking.
# Fit on the min-max scaled features, as used for `lr_clf` everywhere else: RFE ranks
# by the size of the logistic regression coefficients, and those are only comparable
# across features that share a common scale.
rfe = RFE(lr_clf, n_features_to_select=1)
rfe.fit(X_train_std, y_train)

In [None]:
# Map each elimination rank to the matching feature name, in column order.
rfe_ranking = {
    rank: feature_name
    for rank, feature_name in zip(rfe.ranking_.tolist(), X_train.columns.tolist())
}
print(rfe_ranking)