### Customer Churn study
#### Using logistic regression to try and predict if a customer will switch telecom providers
#### There are 21 predictor variables that are used to try and make predictions

In [None]:
# Imports
import pandas as pd
import numpy as np

In [None]:
# Load the three source CSV files, one dataframe per file
churn_data, customer_data, internet_data = (
    pd.read_csv(csv_path)
    for csv_path in ("churn_data.csv", "customer_data.csv", "internet_data.csv")
)

In [None]:
# Inner join of the churn and customer tables on their shared customerID key
# (the pandas counterpart of a SQL join)
df_1 = churn_data.merge(customer_data, on="customerID", how="inner")

In [None]:
# Print the column dtypes and non-null counts of the first merged dataframe
df_1.info()

In [None]:
# Summary statistics for the first merged dataframe
df_1.describe()

In [None]:
# Display the first merged dataframe
df_1

In [None]:
# Join the internet-service table onto df_1 so every predictor sits in one dataframe
telecom_data = df_1.merge(internet_data, on="customerID", how="inner")
telecom_data

In [None]:
# Examine the dataframe structure by viewing its first 20 rows
telecom_data.head(20)

In [None]:
# Summary statistics for the fully merged dataframe
telecom_data.describe()

In [None]:
# Print the column dtypes and non-null counts of the fully merged dataframe
telecom_data.info()

#### Data Preparation

In [None]:
# Recode the binary Yes/No columns as 1/0
# (any value other than 'Yes' or 'No' becomes NaN under Series.map)
yes_no_columns = ['PhoneService', 'PaperlessBilling', 'Churn', 'Partner', 'Dependents']
for column in yes_no_columns:
    telecom_data[column] = telecom_data[column].map({'Yes': 1, 'No': 0})

In [None]:
# One-hot encode Contract, leaving out the first level as the baseline
contract_column = telecom_data['Contract']
cont = pd.get_dummies(contract_column, drop_first=True, prefix='Contract')
# Append the indicator columns to the main dataframe
telecom_data = pd.concat(objs=[telecom_data, cont], axis='columns')

In [None]:
# Check the dataframe again now that the Contract dummy columns are appended
telecom_data

In [None]:
# One-hot encode PaymentMethod, leaving out the first level as the baseline
payment_column = telecom_data['PaymentMethod']
pm = pd.get_dummies(payment_column, drop_first=True, prefix='PaymentMethod')
# Append the indicator columns to the main dataframe
telecom_data = pd.concat(objs=[telecom_data, pm], axis='columns')

In [None]:
# Check the dataframe again now that the PaymentMethod dummy columns are appended
telecom_data

In [None]:
# Create a dummy variable for gender, dropping the first level as the baseline
gend = pd.get_dummies(telecom_data['gender'], prefix='gender', drop_first=True)
# Add the results to the main dataframe
telecom_data = pd.concat([telecom_data, gend], axis=1)

# Create dummy variables for MultipleLines and use the
# 'No phone service' level as the dropped baseline.
# The axis is given by keyword (columns=...) because pandas 2.0 no longer
# accepts it as a positional argument to DataFrame.drop.
ml1 = pd.get_dummies(telecom_data['MultipleLines'], prefix='MultipleLines').drop(
    columns=['MultipleLines_No phone service']
)
# Add the results to the main dataframe
telecom_data = pd.concat([telecom_data, ml1], axis=1)

# Create a dummy variable for InternetService, dropping the first level
iser = pd.get_dummies(telecom_data['InternetService'], prefix='InternetService', drop_first=True)
# Add the results to the main dataframe
telecom_data = pd.concat([telecom_data, iser], axis=1)

In [None]:
# Display the dataframe after the dummy columns above were appended
telecom_data

In [None]:
# For each internet add-on column below, build dummy variables and drop the
# 'No internet service' level as the baseline
# (presumably redundant with the InternetService dummies -- confirm).
# The drop axis is given by keyword (columns=...) because pandas 2.0 no longer
# accepts it positionally, and no intermediate named `os` is created so the
# standard-library module name is not shadowed.

# Dummy variables for OnlineSecurity
os1 = pd.get_dummies(telecom_data['OnlineSecurity'], prefix='OnlineSecurity').drop(
    columns=['OnlineSecurity_No internet service']
)
# Add the results to the main dataframe
telecom_data = pd.concat([telecom_data, os1], axis=1)

# Dummy variables for OnlineBackup
ob1 = pd.get_dummies(telecom_data['OnlineBackup'], prefix='OnlineBackup').drop(
    columns=['OnlineBackup_No internet service']
)
# Add the results to the main dataframe
telecom_data = pd.concat([telecom_data, ob1], axis=1)

# Dummy variables for DeviceProtection
dp1 = pd.get_dummies(telecom_data['DeviceProtection'], prefix='DeviceProtection').drop(
    columns=['DeviceProtection_No internet service']
)
# Add the DeviceProtection dummies themselves; concatenating the
# InternetService dummies here would duplicate those columns and leave
# DeviceProtection out of the model.
telecom_data = pd.concat([telecom_data, dp1], axis=1)

In [None]:
# Display the dataframe after the dummy columns above were appended
telecom_data

In [None]:
# For each internet add-on column below, build dummy variables and drop the
# 'No internet service' level as the baseline.
# The drop axis is given by keyword (columns=...) because pandas 2.0 no longer
# accepts it positionally. No intermediate named `sm` is created, since this
# notebook later uses `sm` as the statsmodels alias.

# Dummy variables for TechSupport
ts1 = pd.get_dummies(telecom_data['TechSupport'], prefix='TechSupport').drop(
    columns=['TechSupport_No internet service']
)
# Add the results to the main dataframe
telecom_data = pd.concat([telecom_data, ts1], axis=1)

# Dummy variables for StreamingTV
st1 = pd.get_dummies(telecom_data['StreamingTV'], prefix='StreamingTV').drop(
    columns=['StreamingTV_No internet service']
)
# Add the results to the main dataframe
telecom_data = pd.concat([telecom_data, st1], axis=1)

# Dummy variables for StreamingMovies
sm1 = pd.get_dummies(telecom_data['StreamingMovies'], prefix='StreamingMovies').drop(
    columns=['StreamingMovies_No internet service']
)
# Add the results to the main dataframe
telecom_data = pd.concat([telecom_data, sm1], axis=1)

In [None]:
# Display the dataframe after the dummy columns above were appended
telecom_data

In [None]:
# The original categorical columns are no longer needed once their dummies exist.
# columns=... is used because pandas 2.0 no longer accepts the axis positionally.
telecom_data = telecom_data.drop(
    columns=[
        'Contract',
        'PaymentMethod',
        'gender',
        'MultipleLines',
        'InternetService',
        'OnlineSecurity',
        'OnlineBackup',
        'DeviceProtection',
        'TechSupport',
        'StreamingTV',
        'StreamingMovies',
    ]
)

In [None]:
# This field needs to be converted from string to float.
# errors='coerce' turns unparseable entries into NaN; the result must be
# assigned back, otherwise the column keeps its original string dtype.
telecom_data['TotalCharges'] = pd.to_numeric(telecom_data['TotalCharges'], errors='coerce')

In [None]:
# Re-check the column dtypes and non-null counts
telecom_data.info()

#### Check for outliers

In [None]:
# Collect the columns to be inspected for outliers
outlier_check_columns = ['tenure', 'MonthlyCharges', 'SeniorCitizen', 'TotalCharges']
num_telecom = telecom_data[outlier_check_columns]

In [None]:
# Describe the selected columns at the quartiles plus the 90th/95th/99th percentiles
percentile_points = [.25, .5, .75, .90, .95, .99]
num_telecom.describe(percentiles=percentile_points)

#### Check for missing values and handle them

In [None]:
# Count the missing (null) values in each column
telecom_data.isnull().sum()

In [None]:
# Percentage of missing values per column, rounded to two decimals
missing_fraction = telecom_data.isnull().sum() / len(telecom_data.index)
(100 * missing_fraction).round(2)

In [None]:
# Remove the rows whose TotalCharges value is missing.
# subset= restricts the check to that column (how='all' would only drop rows
# in which every column is NaN), and the result is assigned back because
# dropna returns a new dataframe rather than modifying in place.
telecom_data = telecom_data.dropna(subset=['TotalCharges'])

In [None]:
# Re-compute the per-column missing-value percentage after the removal step
missing_fraction = telecom_data.isnull().sum() / len(telecom_data.index)
(100 * missing_fraction).round(2)

#### Standardise Features

In [None]:
# Pick out the three features that will be standardised
df = telecom_data.loc[:, ['tenure', 'MonthlyCharges', 'TotalCharges']]

In [None]:
# Z-score each column: subtract its mean, then divide by its standard deviation
normalized_df = df.sub(df.mean()).div(df.std())
normalized_df

In [None]:
# Drop the unscaled versions of the standardised features.
# columns=... is used because pandas 2.0 no longer accepts the axis positionally.
telecom_data = telecom_data.drop(columns=['tenure', 'MonthlyCharges', 'TotalCharges'])

In [None]:
# Put the standardised features back alongside the remaining columns
telecom_data = pd.concat(objs=[telecom_data, normalized_df], axis='columns')

In [None]:
# Display the dataframe with the standardised features attached
telecom_data

#### Check the churn rate

In [None]:
# Share of rows with Churn == 1, expressed as a percentage
churn_flags = telecom_data['Churn']
churn_rate = 100 * (sum(churn_flags) / len(churn_flags.index))
churn_rate

#### The churn rate above is almost 27%

### Building Models
#### Start by splitting data into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix X: every column except the target and the customer identifier
X = telecom_data.drop(columns=['Churn', 'customerID'])

# Response variable y: the churn indicator
y = telecom_data['Churn']

In [None]:
# Peek at the first rows of the response variable
y.head()

In [None]:
# 70/30 train/test split; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    train_size=0.7,
    random_state=100,
)

#### Run first training model

In [None]:
import statsmodels.api as sm

In [None]:
# Logistic regression, fitted as a GLM with a binomial family.
# The design matrix is cast to float explicitly: pandas >= 2.0 returns boolean
# dummy columns from get_dummies, and mixing those with numeric columns would
# otherwise produce an object-dtype array.
exog_train = sm.add_constant(np.asarray(X_train, dtype=float))
logistic_mod1 = sm.GLM(np.asarray(y_train), exog_train, family=sm.families.Binomial())
logistic_mod1.fit().summary()