In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.ticker as mtick 
import matplotlib.pyplot as plt

# Prediction libraries - Logistic regression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Read telecomdata.csv from the working directory; index_col=0 makes the first CSV column the row index.
data = pd.read_csv('telecomdata.csv',index_col=0)

In [3]:
# Coerce TotalCharges to a numeric dtype; entries that cannot be parsed become NaN.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Missing churn reasons are relabelled 'Not Churn' so that the NaN drop below
# does not remove rows merely because these two columns are empty.
for reason_col in ('ChurnReasonCategory', 'ChurnReason'):
    data[reason_col] = data[reason_col].fillna('Not Churn')

# Remove every row that still has a missing value in any column.
data.dropna(inplace=True)

# Logistic Regression

In [4]:
# Columns holding information that only exists once churn is known; exclude them from the model inputs.
post_churn_cols = ['ChurnReason', 'ChurnReasonCategory', 'ChurnScore']
data = data.drop(columns=post_churn_cols)

Only predict on customers with a Satisfaction Score of 3.

In [5]:
# Keep only the rows whose SatisfactionScore equals 3, then drop the column,
# which is constant after the filter.
score_is_three = data['SatisfactionScore'] == 3
data = data.loc[score_is_three].drop(columns=['SatisfactionScore'])

Categorical variables

In [6]:
# Columns that are converted to integer codes (this list includes the target, ChurnValue).
non_numeric_features = [
    'Gender', 'SeniorCitizen', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod', 'ChurnValue', 'Under30', 'Married',
    'ReferredaFriend', 'Offer', 'InternetType', 'DeviceProtectionPlan',
    'PremiumTechSupport', 'StreamingMusic', 'UnlimitedData',
]

# fit_transform refits the encoder on every call, so one instance can be reused;
# each column is mapped independently to codes 0 .. n_classes-1.
label_encoder = LabelEncoder()
for column_name in non_numeric_features:
    data[column_name] = label_encoder.fit_transform(data[column_name])

In [7]:
# Discard the first three columns by position (per the original note: customer IDs, city, zipcode).
# NOTE(review): this is positional, so it relies on the current column order - confirm against the CSV.
data = data.drop(columns=data.columns[:3])

In [8]:
# Print a summary of the prepared frame: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2662 entries, 2 to 7039
Data columns (total 40 columns):
Gender                           2662 non-null int64
SeniorCitizen                    2662 non-null int64
Dependents                       2662 non-null int64
TenureMonths                     2662 non-null int64
PhoneService                     2662 non-null int64
MultipleLines                    2662 non-null int64
InternetService                  2662 non-null int64
OnlineSecurity                   2662 non-null int64
OnlineBackup                     2662 non-null int64
DeviceProtection                 2662 non-null int64
TechSupport                      2662 non-null int64
StreamingTV                      2662 non-null int64
StreamingMovies                  2662 non-null int64
Contract                         2662 non-null int64
PaperlessBilling                 2662 non-null int64
PaymentMethod                    2662 non-null int64
MonthlyCharges                   2662 non-nul

In [9]:
# Columns that are expanded into one indicator column per level.
# All of them were integer-coded by the LabelEncoder loop in an earlier cell.
cat_features = ['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
                'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
                'PaperlessBilling', 'PaymentMethod', 'Offer', 'InternetType']

encoded_features = []

for feature in cat_features:
    # OneHotEncoder needs a 2-D input, hence the reshape to a single-column matrix;
    # the sparse result is densified so it can be wrapped in a DataFrame.
    encoded_feat = OneHotEncoder().fit_transform(data[feature].values.reshape(-1, 1)).toarray()
    # Name the indicator columns <feature>_1 .. <feature>_k, with k taken from the
    # encoder output width so the labels always match the matrix that was produced.
    n_levels = encoded_feat.shape[1]
    cols = ['{}_{}'.format(feature, level) for level in range(1, n_levels + 1)]
    # Reuse the row index of `data` so the concat below aligns row by row.
    encoded_df = pd.DataFrame(encoded_feat, columns=cols, index=data.index)
    encoded_features.append(encoded_df)

# Append all indicator columns at once (one concat instead of growing the frame in the loop).
data = pd.concat([data, *encoded_features], axis=1)

print('Number of encoded feautes:', len(encoded_features))

# Snapshot that still contains both the integer-coded columns and their indicators.
data2 = data.copy()
# Drop the source columns that were one-hot encoded; derived from cat_features
# so the two lists cannot drift apart.
drop_cols = list(cat_features)
data.drop(columns=drop_cols, inplace=True)

Number of encoded feautes: 14


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the On

In [10]:
# Target vector and feature matrix as plain NumPy arrays.
y = data['ChurnValue'].values
X = data.drop(columns=['ChurnValue']).values

# Hold out 25% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=22
)

for split_label, split_array in (('X_train', x_train), ('X_test', x_test)):
    print('{} shape: {}'.format(split_label, split_array.shape))

X_train shape: (1996, 70)
X_test shape: (666, 70)


In [11]:
%%time

# Provides train/test indices to split data in train/test sets.
skf = StratifiedKFold(n_splits=4)
val_auc_scores = []

for train_index, valid_index in skf.split(x_train, y_train):
    x_pseudo_train, x_pseudo_valid = x_train[train_index], x_train[valid_index]
    y_pseudo_train, y_pseudo_valid = y_train[train_index], y_train[valid_index]
    # Standardize features by removing the mean and scaling to unit variance
    ss = StandardScaler()
    # Fit to data, then transform it.
    x_pseudo_train_scaled = ss.fit_transform(x_pseudo_train)
    # Perform standardization by centering and scaling
    x_pseudo_valid_scaled = ss.transform(x_pseudo_valid)
    # Logistic Regression
    lr = LogisticRegression() # Using default parameters
    # Fit the model according to the given training data
    lr.fit(x_pseudo_train_scaled, y_pseudo_train)
    # Predict logarithm of probability estimates.
    y_pred_valid_probs = lr.predict_proba(x_pseudo_valid_scaled)[:, 1]
    # Compute Receiver operating characteristic (ROC)
    val_fpr, val_tpr, val_thresholds = roc_curve(y_pseudo_valid, y_pred_valid_probs)
    # Compute Area Under the Curve (AUC) using the trapezoidal rule
    val_auc_score = auc(val_fpr, val_tpr)
    val_auc_scores.append(val_auc_score)

CPU times: user 488 ms, sys: 9.34 ms, total: 497 ms
Wall time: 164 ms




In [12]:
# Standardize features by removing the mean and scaling to unit variance
ss = StandardScaler()
# Fit to data, then transform it.
x_train_scaled = ss.fit_transform(x_train)
# Perform standardization by centering and scaling
x_test_scaled = ss.transform(x_test)

# Applying logistic regression classifier
lr = LogisticRegression()        # Using default parameters
lr.fit(x_train_scaled, y_train)  # Training the model with X_train, y_train

# Generate Confusion Matrix
y_pred = lr.predict(x_test_scaled)
y_pred = pd.Series(y_pred)
y_test = pd.Series(y_test)
pd.crosstab(y_pred, y_test, rownames=['Predicted'], colnames=['True'], margins=True)



True,0,1,All
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,536,44,580
1,23,63,86
All,559,107,666


In [13]:
# Per-class precision, recall, F1 and support on the test split
# (y_test holds the true labels, y_pred the model's predictions).
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       559
           1       0.73      0.59      0.65       107

    accuracy                           0.90       666
   macro avg       0.83      0.77      0.80       666
weighted avg       0.89      0.90      0.89       666

