## Predict the Customer Churn for a Telco Company: <br> Process Data for ML Models

In [1]:
# Import python data science libraries

%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn library
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Switch the working directory to the folder holding the unprocessed data,
# so that later relative file names are resolved against it.
unprocessed_dir = '/home/mike/Documents/mkp_code/Institute of Data Course/telco-customer-churn-project/data/unprocessed'
os.chdir(unprocessed_dir)

In [3]:
# Load the raw Telco churn export; the bare file name is resolved against
# the current working directory.
raw_csv_name = 'Telco-Customer-Churn.csv'
customer_data = pd.read_csv(raw_csv_name)

In [4]:
# Drop customerID - this column is not needed.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: executing it a second time is a no-op rather than a
# KeyError for the already-removed column.
customer_data = customer_data.drop(columns='customerID', errors='ignore')

In [5]:
# Churn - target column: encode 'Yes' as 1 and 'No' as 0.
# The result is assigned back to the frame rather than calling
# replace(..., inplace=True) on the column selection: that chained in-place
# form acts on a temporary object and, with pandas Copy-on-Write, leaves
# customer_data unchanged.
# Any value other than 'Yes'/'No' is left as-is by replace().
# infer_objects() then settles the column on an integer dtype when every
# value was mapped, independent of whether replace() downcasts by itself.
customer_data['Churn'] = (
    customer_data['Churn']
    .replace({'Yes': 1, 'No': 0})
    .infer_objects()
)

### Change the Total Charges from an object to a numeric type
* Code adapted from IBM sample notebook, telco-customer-churn-on-icp4d.ipynb
https://github.com/IBM/telco-customer-churn-on-icp4d/blob/master/notebooks/Telco-customer-churn-ICP4D.ipynb


In [6]:
# Positional index of the TotalCharges column
totalCharges = customer_data.columns.get_loc("TotalCharges")

# Convert the TotalCharges text to numeric values; errors='coerce' turns any
# entry that cannot be parsed as a number into NaN instead of raising.
new_col = pd.to_numeric(customer_data.iloc[:, totalCharges], errors='coerce')

# Update customer_data by column label so the column is *replaced* and takes
# the numeric dtype of new_col. Writing through `.iloc[:, i] = ...` sets the
# values into the existing object-dtype column on recent pandas versions,
# which leaves the dtype as object; the column would then be selected as
# categorical by select_dtypes(include='object') later in the notebook.
customer_data["TotalCharges"] = new_col

In [7]:
# Fill the TotalCharges missing values with the column mean.
# The filled Series is assigned back to the frame: calling
# fillna(..., inplace=True) on `customer_data.TotalCharges` operates on a
# temporary selection and, with pandas Copy-on-Write, does not update
# customer_data.
total_charges_mean = customer_data['TotalCharges'].mean()
customer_data['TotalCharges'] = customer_data['TotalCharges'].fillna(total_charges_mean)

In [8]:
# Create feature category for customer tenure
def tenure_lab(customer_data):
    """Return the tenure-band label for a single record.

    Parameters
    ----------
    customer_data : mapping-like
        One record (for example a DataFrame row) that can be indexed with
        the key ``"tenure"`` to obtain a scalar value.

    Returns
    -------
    str or None
        ``"Tenure_0-12"`` for tenure <= 12, ``"Tenure_12-24"`` for
        12 < tenure <= 24, ``"Tenure_24-48"`` for 24 < tenure <= 48,
        ``"Tenure_48-60"`` for 48 < tenure <= 60 and ``"Tenure_gt_60"``
        for tenure > 60. ``None`` when no comparison holds (e.g. NaN).
    """
    months = customer_data["tenure"]

    # Inclusive upper bounds in ascending order: the first bound that is not
    # exceeded decides the band, so each lower bound is implied by the
    # previous miss.
    banded = (
        (12, "Tenure_0-12"),
        (24, "Tenure_12-24"),
        (48, "Tenure_24-48"),
        (60, "Tenure_48-60"),
    )
    for upper_bound, label in banded:
        if months <= upper_bound:
            return label

    if months > 60:
        return "Tenure_gt_60"

    # Reached only when every comparison was False (e.g. a NaN tenure).
    return None

In [9]:
# Add the tenure group to the dataset: axis=1 passes one row at a time to
# tenure_lab, and the returned labels become the new column.
customer_data["tenure_group"] = customer_data.apply(tenure_lab, axis=1)

In [10]:
# Column groups used by the scaling and encoding steps.
numeric_cols = ['MonthlyCharges', 'TotalCharges', 'tenure']
target_col = ['Churn']

# Categorical features: every object-dtype column except the target.
categorical_cols = [
    name
    for name in customer_data.select_dtypes(include='object').columns
    if name not in target_col
]

In [11]:
# Standardise the numeric columns before the data is fitted to a model.
# NOTE(review): the scaler is fitted on all rows of customer_data; if a
# train/test split is made later, confirm that fitting on the full set is
# intended.
numeric_values = customer_data[numeric_cols]
customer_data[numeric_cols] = StandardScaler().fit_transform(numeric_values)

In [12]:
# Use LabelEncoder instead of dummy categories: each categorical column is
# replaced by integer codes from its own freshly fitted encoder (the fitted
# encoders are not kept).
# NOTE(review): integer codes can be read as ordered by some models -
# confirm this is acceptable for the models that consume this file.
for column_name in categorical_cols:
    column_values = customer_data[column_name]
    customer_data[column_name] = LabelEncoder().fit_transform(column_values)

#### Save the data for use with Machine Learning Models

In [13]:
# Write the processed frame to the "processed" data folder.
# to_csv is called without an `index` argument, so the DataFrame index is
# written as the first (unnamed) column.
# NOTE(review): confirm downstream readers expect that extra column.
processed_csv_path = '/home/mike/Documents/mkp_code/Institute of Data Course/telco-customer-churn-project/data/processed/Telco-Customer-Churn-Processed.csv'
customer_data.to_csv(processed_csv_path)