## Imports, data loading and basic cleaning

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter

In [2]:
# Read the raw customer table into `df`.
# NOTE(review): this is an absolute, machine-specific Windows path — the notebook
# will not run elsewhere without editing it; consider a configurable data directory.
raw_csv_path = r"C:\Users\LENOVO\Desktop\Explainify\data\raw\customer_data.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Remove fully identical rows, keeping the first occurrence and the original index.
# customerID is still a column at this point, so two rows only count as
# duplicates when their IDs match as well.
df = df.drop_duplicates(keep="first", ignore_index=False)

In [4]:
# The customer identifier is not used as a feature; remove it from the frame.
df = df.drop("customerID", axis="columns")

In [5]:
# Convert TotalCharges to a numeric dtype; errors="coerce" turns any value that
# cannot be parsed into NaN instead of raising.
total_charges_numeric = pd.to_numeric(df["TotalCharges"], errors="coerce")
df = df.assign(TotalCharges=total_charges_numeric)

## Handling Categorical Data

In [6]:
# List the columns that remain after customerID has been dropped.
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [7]:
# Columns handled as categorical by the one-hot encoding step.
# SeniorCitizen holds 0/1 values but is listed here so it is encoded like the rest.
cat_features = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]

In [8]:
# Preview the first rows of the cleaned frame before imputation and encoding.
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [9]:
# Replace missing TotalCharges values (e.g. the NaNs produced by the
# errors="coerce" conversion) with the column median.
# NOTE(review): the median is computed on the full frame, before the train/test
# split further down, so test rows contribute to the fill value. Consider
# imputing after the split using the training median only.
col = "TotalCharges"
median = df.loc[:, col].median()
df = df.fillna(value={col: median})

In [10]:
# One-hot encode every column named in cat_features. drop_first=False keeps an
# indicator column for each level of each feature.
df = pd.get_dummies(data=df, columns=cat_features, drop_first=False)
n_rows, n_cols = df.shape
print("New shape after dummies:", (n_rows, n_cols))

New shape after dummies: (7043, 47)


In [11]:
# Preview the first rows of the one-hot encoded frame.
df.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,29.85,29.85,No,True,False,True,False,False,True,...,False,True,False,False,False,True,False,False,True,False
1,34,56.95,1889.5,No,False,True,True,False,True,False,...,False,False,True,False,True,False,False,False,False,True
2,2,53.85,108.15,Yes,False,True,True,False,True,False,...,False,True,False,False,False,True,False,False,False,True
3,45,42.3,1840.75,No,False,True,True,False,True,False,...,False,False,True,False,True,False,True,False,False,False
4,2,70.7,151.65,Yes,True,False,True,False,True,False,...,False,True,False,False,False,True,False,False,True,False


In [12]:
# List the column names produced by the encoding step.
df.columns

Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'Churn', 'gender_Female',
       'gender_Male', 'SeniorCitizen_0', 'SeniorCitizen_1', 'Partner_No',
       'Partner_Yes', 'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',

## Train test split

In [13]:
# Separate the Churn label (y) from the predictor columns (x).
y = df["Churn"]
x = df.drop(columns=["Churn"])

In [14]:
# Hold out 25% of the rows for testing. stratify=y keeps the Churn class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

## Handling Numerical Data

In [15]:
# Continuous columns that get standardised (everything else is one-hot encoded).
num_features = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
]

In [16]:
# Count the missing values left in each numeric column of the full frame.
df.loc[:, num_features].isnull().sum()

tenure            0
MonthlyCharges    0
TotalCharges      0
dtype: int64

In [None]:
# Standardise the numeric columns using statistics learned from the training
# split only, so the test rows have no influence on the scaling parameters.
scaler = StandardScaler().fit(x_train[num_features])

# train_test_split returns row subsets of `x`. Take explicit copies before
# writing columns so the assignments below operate on independent frames
# rather than on something pandas may still treat as a slice of `x`
# (the usual cause of SettingWithCopyWarning at this step).
x_train = x_train.copy()
x_test = x_test.copy()

# Apply the same fitted transform to both splits.
x_train[num_features] = scaler.transform(x_train[num_features])
x_test[num_features] = scaler.transform(x_test[num_features])

## Exporting the test split and the original (pre-SMOTE) training split

In [18]:
# Re-attach the labels to the test features as a trailing "Churn" column.
# .values drops the index so the labels are attached by position.
test_set = x_test.assign(Churn=y_test.values)

In [19]:
# Write the test split to disk; index=False leaves the row index out of the file.
test_csv_path = r"C:\Users\LENOVO\Desktop\Explainify\data\processed\test.csv"
test_set.to_csv(test_csv_path, index=False)

In [20]:
# Re-attach the labels to the (not yet oversampled) training features as a
# trailing "Churn" column. .values attaches them by position.
train_set = x_train.assign(Churn=y_train.values)

In [21]:
# Write the pre-SMOTE training split to disk without the row index.
original_train_csv_path = r"C:\Users\LENOVO\Desktop\Explainify\data\processed\original_train.csv"
train_set.to_csv(original_train_csv_path, index=False)

## SMOTE

In [22]:
# Class balance of the training labels before oversampling.
counts_before = Counter(y_train)
print("Before SMOTE:", counts_before)

Before SMOTE: Counter({'No': 3880, 'Yes': 1402})


In [23]:
# Oversample the training split only; the test split is left as it is.
# NOTE(review): x_train contains the one-hot indicator columns as well as the
# numeric ones. Plain SMOTE synthesises rows by interpolating between
# neighbours across all features, so generated rows may not be valid one-hot
# vectors. Consider SMOTENC, or resampling before encoding — confirm against
# the exported smote_train.csv.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(x_train, y_train)
x_train_resampled, y_train_resampled = resampled

In [24]:
# Class balance of the training labels after oversampling.
counts_after = Counter(y_train_resampled)
print("After SMOTE:", counts_after)

After SMOTE: Counter({'No': 3880, 'Yes': 3880})


## Exporting the training set

In [25]:
# Combine the resampled features and labels into one frame, with "Churn" as the
# last column. Note that this rebinds `train_set`, which earlier held the
# pre-SMOTE training split.
train_set = x_train_resampled.assign(Churn=y_train_resampled.values)

In [26]:
# Write the SMOTE-resampled training split to disk without the row index.
smote_train_csv_path = r"C:\Users\LENOVO\Desktop\Explainify\data\processed\smote_train.csv"
train_set.to_csv(smote_train_csv_path, index=False)