<a href="https://colab.research.google.com/github/olinyoder2534/TensorflowPractice/blob/main/CustomerChurn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [107]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf

In [108]:
data = pd.read_csv('/CustomerChurn.csv')

Data Exploration

In [109]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [84]:
data.shape

(7043, 21)

In [110]:
data.isna().sum().sum()

0

In [111]:
data.replace('', pd.NA, inplace=True)
blank_values = data.isna().sum()
print(blank_values)

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [87]:
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [88]:
print('Number of genders in data set: {}'.format(data['gender'].nunique()))
print('Number of females: {}'.format(data[data['gender'] == 'Female'].shape[0]))
print('Number of males: {}'.format(data[data['gender'] == 'Male'].shape[0]))

Number of genders in data set: 2
Number of females: 3488
Number of males: 3555


In [None]:
sns.histplot(data=data, x='gender', hue='Churn', multiple='stack')
plt.title('Histogram of Churn by Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()

sns.histplot(data=data, x='Partner', hue='Churn', multiple='stack')
plt.title('Histogram of Churn by Partner Status')
plt.xlabel('Partner')
plt.ylabel('Count')
plt.show()

sns.boxplot(data=data, x='Churn', y='tenure')
plt.title('Histogram of Churn by Partner Status')
plt.xlabel('Churn')
plt.ylabel('Tenure')
plt.show()

Data Cleaning

In [112]:
#drop ID column
data.drop('customerID', axis = 'columns', inplace = True)

In [None]:
data.shape

(7043, 20)

In [90]:
#convert total charges variable to numeric
#data['TotalCharges'] = pd.to_numeric(data['TotalCharges'])
data.iloc[488]
  #In row 488, TotalCharges in blank but was missed earlier

gender                                 Female
SeniorCitizen                               0
Partner                                   Yes
Dependents                                Yes
tenure                                      0
PhoneService                               No
MultipleLines                No phone service
InternetService                           DSL
OnlineSecurity                            Yes
OnlineBackup                               No
DeviceProtection                          Yes
TechSupport                               Yes
StreamingTV                               Yes
StreamingMovies                            No
Contract                             Two year
PaperlessBilling                          Yes
PaymentMethod       Bank transfer (automatic)
MonthlyCharges                          52.55
TotalCharges                                 
Churn                                      No
Name: 488, dtype: object

In [145]:
data.reset_index(drop=True, inplace=True)
#for some reason, blank rows are not showing up on my first test for them
blank_rows = [488, 753, 936, 1082, 1340, 3331, 3826, 4380, 5218, 6670, 6754]
data2 = data.drop(index=blank_rows, axis=0)

In [147]:
#linear regression model using L1 regularization. Will be used to impute the missing TotalCharges value
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

X = data2.drop(columns=['TotalCharges'])
y = data2['TotalCharges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')

numerical_columns = ['tenure', 'MonthlyCharges']
categorical_columns = [col for col in data.columns if col not in numerical_columns]
categorical_columns.remove('TotalCharges')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

lasso_model = Pipeline([
    ('preprocessor', preprocessor),
    ('lasso', Lasso(alpha=0.1))
])

lasso_model.fit(X_train, y_train)

In [150]:
blank_rows = [488, 753, 936, 1082, 1340, 3331, 3826, 4380, 5218, 6670, 6754]
data_subset = data.iloc[blank_rows]

In [None]:
data.iloc[1340]
data_subset

In [152]:
TotalChargesImputed = lasso_model.predict(data_subset)
TotalChargesImputed

array([    9.18579532, -1250.91275536,  1297.74610256,  -960.74999898,
         410.35643335, -1241.56158942,  -974.17250441, -1255.70663392,
       -1062.37258539,  1007.49353286,   287.50196381])

In [155]:
#predictions are negative?!?! Something is wrong, especially given the smallest total charge is 18.8, just going to use data_subset moving forward
data2['TotalCharges'].min()

18.8

In [None]:
data2.shape
data.shape

(7042, 20)