# Telecom Churn

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

# Data Preparation

In [None]:
# Load Train set
df = pd.read_csv("/kaggle/input/traincsv/train.csv")
df.head()

In [None]:
df.shape

In [None]:
# Load Test set
df_test = pd.read_csv("/kaggle/input/testcsv/test.csv")
df_test.head()

In [None]:
df_test.shape

### 1. Check data types

In [None]:
# Train set
df.info(verbose=1)

In [None]:
# Test set
df_test.info(verbose=1)

### 2. Drop redundant columns

In [None]:
df.describe()

`id` is redundant. <br>
`circle_id` seems to have a single value for all rows. <br>
`loc_og_t2o_mou`, `std_og_t2o_mou` and `loc_ic_t2o_mou` seem to contain only zeros. <br>

In [None]:
# 'id' is always dropped; every column holding a single distinct value is appended to it
drop_cols = ['id'] + [name for name in df if df[name].nunique() == 1]
print(drop_cols)

In [None]:
# Train set
# dropping the columns
df.drop(drop_cols, axis=1, inplace=True)
df.describe()

In [None]:
# Test set
# dropping the columns
df_result = pd.DataFrame(df_test['id'])
df_test.drop(drop_cols, axis=1, inplace=True)
df_test.describe()

### 3. Renaming columns

In [None]:
# Train set
df = df.rename({"jun_vbc_3g": "vbc_3g_6", "jul_vbc_3g": "vbc_3g_7", "aug_vbc_3g":"vbc_3g_8"}, axis=1)
df.info(verbose=1)

In [None]:
# Test set
df_test = df_test.rename({"jun_vbc_3g": "vbc_3g_6", "jul_vbc_3g": "vbc_3g_7", "aug_vbc_3g":"vbc_3g_8"}, axis=1)
df_test.info(verbose=1)

### 4. Missing Value Treatment

##### Column Missing Values

In [None]:
# Function to get missing value percentage
def missing_value_percentage(df, thresh=0):
    """Return the columns of ``df`` whose missing-value percentage exceeds ``thresh``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    thresh : number, default 0
        Lower bound (exclusive), in percent; only columns strictly above it are kept.

    Returns
    -------
    pandas.Series
        Missing-value percentage per retained column, sorted from highest to lowest.

    Side effects
    ------------
    Prints the number of retained columns.
    """
    total_rows = len(df)
    pct_missing = df.isnull().sum() * 100 / total_rows
    flagged = pct_missing[pct_missing > thresh].sort_values(ascending=False)
    print(len(flagged))
    return flagged

In [None]:
# Train set
print(missing_value_percentage(df, 50))  

In [None]:
# Test set
missing_value_percentage(df_test, 50)

In [None]:
# List of recharge column
recharge_columns =  df.columns[df.columns.str.contains('rech_amt|rech_data')]
print(recharge_columns)

In [None]:
df[recharge_columns].describe()

In [None]:
# create a list of numeric recharge columns where we will impute missing values with zeroes since there no recharge
zero_impute = ['total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8',
        'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8',
        'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8']

In [None]:
# Train set
# a missing recharge figure is filled with 0 for every column listed in zero_impute
df[zero_impute] = df[zero_impute].fillna(0)

In [None]:
# now, check if imputed correctly
print("Missing value percentage:\n")
print(df[zero_impute].isnull().sum()*100/len(df))

# summary
print("\n\nSummary statistics\n")
print(df[zero_impute].describe(include='all'))

In [None]:
# Test set
# a missing recharge figure is filled with 0 for every column listed in zero_impute
df_test[zero_impute] = df_test[zero_impute].fillna(0)

In [None]:
# now, check if imputed correctly
print("Missing value percentage:\n")
print(df_test[zero_impute].isnull().sum()*100/len(df_test))

# summary
print("\n\nSummary statistics\n")
print(df_test[zero_impute].describe(include='all'))

In [None]:
# Get date columns
date_columns =  df.columns[df.columns.str.contains('date')]
print(date_columns)

In [None]:
# Train set, drop date columns
df.drop(columns=date_columns, axis=1, inplace=True)
df.shape

In [None]:
# Test set, drop date columns
df_test.drop(columns=date_columns, axis=1, inplace=True)
df_test.shape

In [None]:
# Train set
missing_value_percentage(df, 50)

In [None]:
# Test Set
missing_value_percentage(df_test, 50)

In [None]:
# get the list of categorical variables with 0/1 values
cat_cols =  ['fb_user_6',
 'fb_user_7',
 'fb_user_8',
 'night_pck_user_6',
 'night_pck_user_7',
 'night_pck_user_8']

churn_col = ["churn_probability"]


In [None]:
# Train set
# missing values in the 0/1 categorical columns become their own category, -1
df[cat_cols] = df[cat_cols].fillna(-1)

In [None]:
# Test set
# missing values in the 0/1 categorical columns become their own category, -1
df_test[cat_cols] = df_test[cat_cols].fillna(-1)

In [None]:
missing_value_percentage(df, 50)

In [None]:
missing_value_percentage(df_test, 50)

In [None]:
#Train set
# dropping columns with high missing value percentage > 50%
df.drop(missing_value_percentage(df, 50).keys(), axis=1, inplace=True)
df.shape

In [None]:
#Test set
# dropping columns with high missing value percentage > 50%
df_test.drop(missing_value_percentage(df_test,50).keys(), axis=1, inplace=True)
df_test.shape

In [None]:
# Train set
print(missing_value_percentage(df))

In [None]:
# Test set
print(missing_value_percentage(df_test))

##### Row Missing Values

In [None]:
# Train set
# Count the rows having more than 50% missing values
df_missing_rows_50 = df[(df.isnull().sum(axis=1)) > (len(df.columns)//2)]
df_missing_rows_50.shape

In [None]:
# Deleting the rows having more than 50% missing values
df = df.drop(df_missing_rows_50.index)
df.shape

In [None]:
missing_value_percentage(df)

##### Impute Missing Values

In [None]:
# Get numeric columns
num_cols = [column for column in df.columns if column not in cat_cols + churn_col]
num_cols

In [None]:
# Train set
# impute missing values with the column mean.
# The result is assigned back to the frame: `df[col].fillna(..., inplace=True)` works on a
# temporary object (chained assignment) and leaves `df` untouched under pandas copy-on-write.
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())
missing_value_percentage(df)
df.info(verbose=1)

In [None]:
# Test set
# impute missing values with the TRAIN-set column means, so the test set is prepared with
# statistics learned from training data only (the same rule the scaler follows later on).
# The result is assigned back to the frame: `df_test[col].fillna(..., inplace=True)` works on
# a temporary object (chained assignment) and leaves `df_test` untouched under copy-on-write.
df_test[num_cols] = df_test[num_cols].fillna(df[num_cols].mean())
missing_value_percentage(df_test)
df_test.info(verbose=1)

### 5. High Value Customers

In [None]:
# Creating column avg_rech_amt_6_7 by averaging the total recharge amount of month 6 and 7.
df['avg_rech_amt_6_7'] = (df['total_rech_amt_6'] + df['total_rech_amt_7'])/2

In [None]:
# finding 70th percentile of new column
df_high_thresh = df['avg_rech_amt_6_7'].quantile(0.7)
df_high_thresh

In [None]:
# filtering customers more than equal to df_high
df = df[df['avg_rech_amt_6_7'] >= df_high_thresh]
df.head()

In [None]:
df.drop(["avg_rech_amt_6_7"], axis=1, inplace=True)
df.shape

### 6. Outlier Treatment

In [None]:
# Train set
# Looking at quantiles from 0.90 to 1. 
df.quantile(np.arange(0.9,1.01,0.01)).style.bar()

In [None]:
# Removing outliers: a row is kept only if, for EVERY numeric column, its value lies strictly
# inside the fences [P10 - 1.5 * (P90 - P10), P90 + 1.5 * (P90 - P10)].
# The per-column conditions are accumulated in one mask; rebuilding `data` from `df` inside
# the loop would keep nothing but the filter of the last column.
keep_rows = pd.Series(True, index=df.index)
for col in num_cols:
    q_s = df[col].quantile(0.10)
    q_e = df[col].quantile(0.90)
    iqr = q_e - q_s
    if iqr == 0:
        # Both fences collapse onto a single value, so the strict comparison would reject
        # every row; a column without spread gives no basis for calling a value an outlier.
        continue
    range_s = q_s - 1.5 * iqr
    range_e = q_e + 1.5 * iqr
    keep_rows &= (df[col] > range_s) & (df[col] < range_e)

# Independent copy, so columns added to `data` later never write into `df`
data = df.loc[keep_rows].copy()

data.shape

# EDA

### 1. Univariate Analysis 

In [None]:
 sns.distplot(data.arpu_6)

In [None]:
sns.distplot(data.loc_og_t2t_mou_6)

In [None]:
sns.distplot(data.onnet_mou_8)

In [None]:
# Average recharge amount over the two later months (7 and 8)
data['avg_rech_amt'] = (data['total_rech_amt_7'] + data['total_rech_amt_8'])/2
# Change relative to the month-6 recharge amount, so a recharge amount is compared with a
# recharge amount (arpu_6 is average revenue per user, a different quantity)
data['diff_rech_amt'] = data['avg_rech_amt'] - data['total_rech_amt_6']
# Checking whether the recharge amount has decreased in future month
data['reduced_rech_amt'] = np.where(data['diff_rech_amt'] < 0, 1, 0)

In [None]:
# Pass the column through the x keyword: a positional argument is interpreted as `data`
# by recent seaborn releases, which does not produce the intended per-class counts.
sns.countplot(x=data['reduced_rech_amt'])

### 2. Bivariate Analysis

In [None]:
sns.scatterplot(x=data['total_og_mou_6'],y=data['total_og_mou_8'],hue=data['churn_probability'])

In [None]:
sns.scatterplot(x=data['total_og_mou_6'],y=data['total_og_mou_8'],hue=data['churn_probability'])

The customers with lower total_og_mou in 6th and 8th months are more likely to Churn compared to the ones with higher total_og_mou.

In [None]:
# Creating churn dataframe
data_churn = data[data['churn_probability'] == 1]
# Creating not churn dataframe
data_non_churn = data[data['churn_probability'] == 0]

In [None]:
# Distribution plot: density of the average recharge amount, churners vs non-churners.
# kdeplot draws the same curve as the deprecated distplot(hist=False).
ax = sns.kdeplot(data_churn['avg_rech_amt'], label='churn')
ax = sns.kdeplot(data_non_churn['avg_rech_amt'], label='not churn', ax=ax)
ax.set(xlabel='Avg Recharge amount')
# The labels are only displayed once the legend is drawn
ax.legend()

We can see from the above plot, that the churn rate is more for the customers, whose recharge amount is low.

In [None]:
data.drop(["avg_rech_amt", "reduced_rech_amt", "diff_rech_amt"], axis=1, inplace=True)
data.shape

#  Class Imbalance

In [None]:
100*data['churn_probability'].value_counts()/len(data["churn_probability"])

High class imbalance observed. Needs to be handled.

# Model Building

### Test-Train Split

In [None]:
# Import library
from sklearn.model_selection import train_test_split

In [None]:
# Putting feature variables into X
X = data.drop(['churn_probability'], axis=1)

In [None]:
# Putting target variable to_probability y
y = data['churn_probability']

In [None]:
# Splitting data into train and test set 80:20.
# stratify=y keeps the churn ratio identical in both parts; with a heavily imbalanced target
# a purely random split can leave the hold-out set with too few churners to evaluate on.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42, stratify=y)

### Class Imbalance

In [None]:
# Create samples using SMOTE
# Imporing SMOTE
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

In [None]:
# Instantiate SMOTE
sm = SMOTE(random_state=42)

In [None]:
# Fitting SMOTE to the train set only; the hold-out split keeps its original class ratio.
# A sampler is built right here instead of going through the `sm` variable, because `sm` is
# rebound to statsmodels.api further down the notebook and re-running this cell afterwards
# would fail.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

### Standardization

In [None]:
# Standardization package import
from sklearn.preprocessing import StandardScaler

In [None]:
# Instantiate Scaler
scaler = StandardScaler()

In [None]:
# Numeric feature columns of X_train, kept in X_train's own column order
cols_to_scale = [name for name in X_train.columns if name in num_cols]
cols_to_scale

In [None]:
# Fit the data into scaler and transform
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_train.head()

In [None]:
# Scaling the test set
# Transform 
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])
X_test.head()

In [None]:
# Scaling the Test set
# Transform the test set
df_test[cols_to_scale] = scaler.transform(df_test[cols_to_scale])
df_test.head()

### PCA

In [None]:
# Fit train set on PCA
from sklearn.decomposition import PCA 
pca = PCA(random_state = 42) 
pca.fit(X_train) 
pca.components_

In [None]:
# Cumuliative varinace of the PCs
variance_cumu = np.cumsum(pca.explained_variance_ratio_)
print(variance_cumu)

In [None]:
#Scree Plot
fig = plt.figure(figsize = (10,6))
plt.plot(variance_cumu)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Variance')
plt.axhline(0.95,color='r')
plt.text(10,0.96,'0.95')

#### ~60 components explain 95% variance

#### PCA with 60 components

In [None]:
# Importing incremental PCA
from sklearn.decomposition import IncrementalPCA

In [None]:
# Instantiate PCA with 60 components
pca_final = IncrementalPCA(n_components=60)

In [None]:
# Fit and transform the X_train
X_train_pca = pca_final.fit_transform(X_train)

In [None]:
# Applying transformation on test set
X_test_pca = pca_final.transform(X_test)
# Select the submission features by name, in the exact order PCA was fitted on. The train and
# test frames had their high-missing columns dropped independently, so their layouts are not
# guaranteed to match; a missing column now fails loudly instead of being projected wrongly.
df_test_pca = pca_final.transform(df_test[X_train.columns])

### Logistic Regression with PCA

In [None]:
# Function for Performance Metrics
import math
def mod_metrics(matrix):
    """Print accuracy, sensitivity, specificity, precision and F1 for a 2x2 confusion matrix.

    Parameters
    ----------
    matrix : 2x2 array-like
        Read as ``[[TN, FP], [FN, TP]]``.

    Returns
    -------
    None
        Every metric is printed, rounded to 3 decimals.

    Notes
    -----
    A ratio whose denominator is zero (for instance precision when nothing was predicted
    positive) is reported as 0.0 instead of raising ``ZeroDivisionError`` or printing NaN.
    F1 is derived from the raw counts, so it does not inherit the rounding of precision and
    recall.
    """
    TN = matrix[0][0]
    TP = matrix[1][1]
    FP = matrix[0][1]
    FN = matrix[1][0]

    def _ratio(numerator, denominator):
        # Undefined ratios (empty denominator) are reported as 0.0
        if not denominator:
            return 0.0
        return round(float(numerator) / float(denominator), 3)

    accuracy = _ratio(TP + TN, TP + TN + FP + FN)
    print('Accuracy:' ,accuracy )
    sensitivity = _ratio(TP, FN + TP)
    print('Sensitivity/Recall:', sensitivity)
    specificity = _ratio(TN, TN + FP)
    print('Specificity: ', specificity)
    precision = _ratio(TP, TP + FP)
    print('Precision:', precision)
    # F1 = 2TP / (2TP + FP + FN), algebraically equal to the harmonic mean of precision and recall
    print('F1-score :', _ratio(2 * TP, 2 * TP + FP + FN))

In [None]:
# Importing scikit logistic regression module
from sklearn.linear_model import LogisticRegression
# Impoting metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
# Importing libraries for cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
lr_pca = LogisticRegression(random_state=100, class_weight='balanced')
lr_pca.fit(X_train_pca,y_train) 

In [None]:
# y_train predictions
y_train_pred_lr_pca = lr_pca.predict(X_train_pca)
y_train_pred_lr_pca[:5]

In [None]:
# Test Prediction
y_test_pred_lr_pca = lr_pca.predict(X_test_pca)
y_test_pred_lr_pca[:5]

In [None]:
train_matrix = confusion_matrix(y_train, y_train_pred_lr_pca)
test_matrix = confusion_matrix(y_test, y_test_pred_lr_pca)

print('Train Set:')
mod_metrics(train_matrix)

print('\nTest Set:')
mod_metrics(test_matrix)

#### HyperParameter Tuning

In [None]:
# Creating KFold object with 5 splits
folds = KFold(n_splits=5, shuffle=True, random_state=4)

# Specify params
params = {"C": [0.01, 0.1, 1, 10, 100, 1000]}

lr_pca = LogisticRegression(random_state=100, class_weight='balanced')

# Specifing score as recall as we are more focused on acheiving the higher sensitivity than the accuracy
model_cv = GridSearchCV(lr_pca,
                        param_grid = params, 
                        scoring= 'recall', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True) 

# Fit the model
model_cv.fit(X_train_pca, y_train)

In [None]:
# results of grid search CV
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results

In [None]:
# Optimum Hyperparameters
# The grid search was configured with scoring='recall', so best_score_ is the mean
# cross-validated recall, not a ROC-AUC value.
print('Best recall score :', model_cv.best_score_)
print('Best Parameters :', model_cv.best_params_)

In [None]:
# plot of C versus train and validation scores

plt.figure(figsize=(8, 6))
plt.plot(cv_results['param_C'], cv_results['mean_test_score'])
plt.plot(cv_results['param_C'], cv_results['mean_train_score'])
plt.xlabel('C')
plt.ylabel('sensitivity')
plt.legend(['test result', 'train result'], loc='upper left')
plt.xscale('log')

In [None]:
# Best score with best C
best_score = model_cv.best_score_
best_C = model_cv.best_params_['C']

print(" The highest test sensitivity is {0} at C = {1}".format(best_score, best_C))

Highest test sensitivity observed is 0.84

In [None]:
# Modelling using the best LR-PCA estimator 
lr_pca_best = model_cv.best_estimator_
lr_pca_best_fit = lr_pca_best.fit(X_train_pca, y_train)

# Prediction on Train set
y_train_pred_lr_pca_best = lr_pca_best_fit.predict(X_train_pca)

In [None]:
# Prediction on test set
y_test_pred_lr_pca_best = lr_pca_best_fit.predict(X_test_pca)

In [None]:
## Model Performance after Hyper Parameter Tuning

train_matrix = confusion_matrix(y_train, y_train_pred_lr_pca_best)
test_matrix = confusion_matrix(y_test, y_test_pred_lr_pca_best)

print('Train Set:')
mod_metrics(train_matrix)

print('\nTest Set:')
mod_metrics(test_matrix)

#### Model summary

#### Train set
Accuracy = 0.81 <br>
Sensitivity = 0.84 <br>
Specificity = 0.81 <br>
#### Test set 
Accuracy = 0.83 <br>
Sensitivity = 0.80 <br>
Specificity = 0.80 <br> <br>
Overall, the model generalizes well: its performance on the test set is close to what it achieved on the train set.

### Random Forest with PCA

In [None]:
# Importing random forest classifier
from sklearn.ensemble import RandomForestClassifier

#### Hyperparameter Tuning

In [None]:
# Search space for the random forest.
# Note: range(5,10,5) yields the single value 5, so max_depth is effectively fixed.
param_grid = {
    'max_depth': range(5,10,5),
    'min_samples_leaf': range(50, 150, 50),
    'min_samples_split': range(50, 150, 50),
    'n_estimators': [100,200,300], 
    'max_features': [10, 20]
}
# Create a base model; the fixed seed makes the forest, and therefore the winning
# parameter set, reproducible from one run of the notebook to the next
rf = RandomForestClassifier(random_state=42)

# Instantiate the grid search model.
# No `scoring` is passed, so candidates are ranked by the classifier's default score (accuracy).
grid_search = GridSearchCV(estimator = rf, 
                           param_grid = param_grid, 
                           cv = 3,
                           n_jobs = -1,
                           verbose = 1, 
                           return_train_score=True)

# Fit the model
grid_search.fit(X_train_pca, y_train)

In [None]:
# Optimum Hyperparameters
# The grid search was created without a `scoring` argument, so best_score_ is the mean
# cross-validated accuracy (the classifier's default score), not a ROC-AUC value.
print('Best CV accuracy score:', grid_search.best_score_)
print('Best Parameters:', grid_search.best_params_)

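The mean cross-validated accuracy of the best estimator is 0.91 (the grid search was run without a `scoring` argument, so this is the classifier's default accuracy score rather than ROC-AUC).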

In [None]:
# Modelling using the best PCA-RandomForest Estimator 
pca_rf_best = grid_search.best_estimator_
pca_rf_best_fit = pca_rf_best.fit(X_train_pca, y_train)

# Prediction on Train set
y_train_pred_pca_rf_best = pca_rf_best_fit.predict(X_train_pca)

In [None]:
# Prediction on test set
y_test_pred_pca_rf_best = pca_rf_best_fit.predict(X_test_pca)

In [None]:
## Model Performance - Hyper Parameter Tuned
train_matrix = confusion_matrix(y_train, y_train_pred_pca_rf_best)
test_matrix = confusion_matrix(y_test, y_test_pred_pca_rf_best)

print('Train Set:')
mod_metrics(train_matrix)

print('\nTest Set:')
mod_metrics(test_matrix)

#### Model summary

#### Train set
Accuracy = 0.91 <br>
Sensitivity = 0.11 <br>
Specificity = 0.99 <br>
#### Test set
Accuracy = 0.91 <br>
Sensitivity = 0.6 <br>
Specificity = 0.99 <br> <br>
We can see that the sensitivity has decreased.

### PCA + XGBoost

In [None]:
!pip install xgboost
import xgboost as xgb
# Ratio of classes 
class_0 = y[y == 0].count()
class_1 = y[y == 1].count()

pca_xgb = xgb.XGBClassifier(random_state=42, scale_pos_weight= class_0/class_1 ,
                                    tree_method='hist', 
                                   objective='binary:logistic')  # scale_pos_weight takes care of class imbalance
pca_xgb.fit(X_train_pca, y_train)

#### Hyper parameter Tuning

In [None]:
parameters = {
              'learning_rate': [0.1, 0.2, 0.3],
              'gamma' : [10,20,50],
              'max_depth': [2,3,4],
              'min_child_weight': [25,50],
              'n_estimators': [150,200,500]}
pca_xgb_grid = GridSearchCV(estimator=pca_xgb , param_grid=parameters,scoring='roc_auc', cv=folds, n_jobs=-1, verbose=1)
pca_xgb_grid.fit(X_train_pca, y_train)
pca_xgb_grid

In [None]:
# Optimum Hyperparameters
print('ROC-AUC score:', pca_xgb_grid.best_score_)
print('Best Parameters:', pca_xgb_grid.best_params_)

In [None]:
# Modelling using the best PCA-XGBoost Estimator 
pca_xgb_best = pca_xgb_grid.best_estimator_
pca_xgb_best_fit = pca_xgb_best.fit(X_train_pca, y_train)

# Prediction on Train set
y_train_pred_pca_xgb = pca_xgb_best_fit.predict(X_train_pca)

In [None]:
# Prediction on test set
y_test_pred_pca_xgb = pca_xgb_best_fit.predict(X_test_pca)

In [None]:
## PCA - XGBOOST [Hyper parameter tuned] Model Performance

train_matrix = confusion_matrix(y_train, y_train_pred_pca_xgb)
test_matrix = confusion_matrix(y_test, y_test_pred_pca_xgb)

print('Train Set:')
mod_metrics(train_matrix)

print('\nTest Set:')
mod_metrics(test_matrix)

#### Model summary
Train set: <br>
Accuracy = 0.86 <br>
Sensitivity = 0.89 <br>
Specificity = 0.86 <br>

Test set: <br>
Accuracy = 0.84 <br>
Sensitivity = 0.77 <br>
Specificity = 0.85 <br>

Sensitivity and Accuracy observed are good with XGBoost.

### Conclusion with PCA

After trying several models, we can see that the classic Logistic Regression performs well for achieving the best sensitivity. For both the models the sensitivity was approx. 80%. We also have a good accuracy of approx. 80-85%.

### Feature Selection

In [None]:
##### Importing stats model
import statsmodels.api as sm

In [None]:
# Instantiate the model
# Adding the constant to X_train
log_no_pca = sm.GLM(y_train,(sm.add_constant(X_train)), family=sm.families.Binomial())

In [None]:
# Fit the model
log_no_pca = log_no_pca.fit().summary()
log_no_pca

Many features have higher p-values and hence became insignificant in the model.

We will first eliminate some features using Recursive Feature Elimination (RFE), and once we have reached a small set of variables, we will use manual feature elimination based on the p-values and VIFs.

### Feature Selection using RFE

In [None]:
# Importing logistic regression from sklearn
from sklearn.linear_model import LogisticRegression
# Intantiate the logistic regression
logreg = LogisticRegression()

#### RFE 15 variables

In [None]:
# Importing RFE
from sklearn.feature_selection import RFE

# Intantiate RFE with 15 columns
rfe = RFE(logreg, n_features_to_select=15)

# Fit the rfe model with train set
rfe = rfe.fit(X_train, y_train)

In [None]:
# RFE selected columns
rfe_cols = X_train.columns[rfe.support_]
print(rfe_cols)

#### Model -1 : RFE columns

In [None]:
# Adding constant to X_train
X_train_sm_1 = sm.add_constant(X_train[rfe_cols])

#Instantiate the model
log_no_pca_1 = sm.GLM(y_train, X_train_sm_1, family=sm.families.Binomial())

# Fit the model
log_no_pca_1 = log_no_pca_1.fit()

log_no_pca_1.summary()

#### VIF

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_train[rfe_cols].columns
vif['VIF'] = [variance_inflation_factor(X_train[rfe_cols].values, i) for i in range(X_train[rfe_cols].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

Removing column loc_ic_t2m_mou_8, loc_ic_t2t_mou_8 which is insignificant as it has the highest p-value 0.319, 0.132.

In [None]:
# Removing total_rech_data_8 and sachet_2g_8 from the RFE selection.
# NOTE(review): the markdown note above this cell names loc_ic_t2m_mou_8 and
# loc_ic_t2t_mou_8 instead; confirm which columns were meant to be removed.
cols_to_remove = ['total_rech_data_8', 'sachet_2g_8']
# Filtering instead of list.remove(): no ValueError when RFE did not select one of them
log_cols = [name for name in rfe_cols if name not in cols_to_remove]
print(log_cols)

#### Model 2

In [None]:
# Adding constant to X_train
X_train_sm_2 = sm.add_constant(X_train[log_cols])

#Instantiate the model
log_no_pca_2 = sm.GLM(y_train, X_train_sm_2, family=sm.families.Binomial())

# Fit the model
log_no_pca_2 = log_no_pca_2.fit()

log_no_pca_2.summary()

In [None]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_train[log_cols].columns
vif['VIF'] = [variance_inflation_factor(X_train[log_cols].values, i) for i in range(X_train[log_cols].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

As we can see from the model summary, the p-values of all the variables are significant, so Model 2 is our final model.

### Model Performance

##### Train Set

In [None]:
# Getting the predicted value on the train set
y_train_pred_no_pca = log_no_pca_2.predict(X_train_sm_2)
y_train_pred_no_pca.head()

In [None]:
y_train_pred_final = pd.DataFrame({'churn':y_train.values, 'churn_prob':y_train_pred_no_pca.values})
y_train_pred_final.head()

In [None]:
# Creating a column with name "predicted", which is the predicted value for 0.5 cutoff 
y_train_pred_final['predicted'] = y_train_pred_final['churn_prob'].map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
# Confusion metrics
no_pca_train_matrix = confusion_matrix(y_train_pred_final['churn'], y_train_pred_final['predicted'])
print('Train Set:')
mod_metrics(no_pca_train_matrix)

##### Test Set

In [None]:
# Taking a copy of the test set
X_test_log = X_test.copy()
# Taking only the columns, which are selected in the train 
X_test_log = X_test_log[log_cols]

In [None]:
# Adding constant on the test set
X_test_sm = sm.add_constant(X_test_log)

In [None]:
# Predict on the test set
y_test_pred_no_pca = log_no_pca_2.predict(X_test_sm)

In [None]:
y_test_pred_no_pca.head()

In [None]:
y_test_pred_final = pd.DataFrame({'churn':y_test.values, 'churn_prob':y_test_pred_no_pca.values})
y_test_pred_final.head()

In [None]:
# Creating a column with name "predicted", which is the predicted value for 0.5 cutoff 
y_test_pred_final['predicted'] = y_test_pred_final['churn_prob'].map(lambda x: 1 if x > 0.5 else 0)
y_test_pred_final.head()

In [None]:
# Confusion metrics
no_pca_test_matrix = confusion_matrix(y_test_pred_final['churn'], y_test_pred_final['predicted'])
print('Test Set:')
mod_metrics(no_pca_test_matrix)

An accuracy of 0.92 has been achieved and model shows good performance.

# Prediction for the test set - submission.csv

In [None]:
# Prediction on test set usnig Logistic Regression + PCA 
df_test_pred_lr_pca_best = lr_pca_best_fit.predict(df_test_pca)

In [None]:
# Create the pandas DataFrame and writing to csv
# The prediction array is assigned directly, i.e. by position. Wrapping it in pd.Series
# would align on index labels and leave NaN wherever df_result's index is not 0..n-1.
df_result['churn_probability'] = df_test_pred_lr_pca_best
df_result.to_csv('submission.csv', index=False)
df_result.head()

# Conclusion

#### Top predictors
Following are the top variables selected in the logistic regression model:
onnet_mou_7		
onnet_mou_8	
loc_og_t2t_mou_7		
std_og_t2t_mou_7	
std_og_t2m_mou_8		
total_og_mou_8		
loc_ic_mou_7		
loc_ic_mou_8		
std_ic_mou_8		
total_ic_mou_8		

Models with high sensitivity are the best for predicting churn, so the PCA + Logistic Regression model is used to predict churn. It has a cross-validated sensitivity (recall) of 0.84 and a test-set sensitivity of 0.80.