# Data Preprocessing:

### 1. Importing the libraries:

In [None]:
# One-time setup: imbalanced-learn provides the `imblearn` package that is
# imported further down for SMOTEN. Uncomment to install it into the kernel's
# environment:
# %pip install imbalanced-learn

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

### 2. Load the Data:

In [2]:
# Load the telecom customer churn CSV (path is relative to the notebook's
# working directory).
churn_csv_path = './telecom-customer-churn-by-maven-analytics/telecom_customer_churn.csv'
churndata = pd.read_csv(churn_csv_path)

# Preview the first five records transposed, so each column shows up as a row.
first_rows = churndata.head()
first_rows.T

Unnamed: 0,0,1,2,3,4
Customer ID,0002-ORFBO,0003-MKNFE,0004-TLHLJ,0011-IGKFF,0013-EXCHZ
Gender,Female,Male,Male,Male,Female
Age,37,46,50,78,75
Married,Yes,No,No,Yes,Yes
Number of Dependents,0,0,0,0,0
City,Frazier Park,Glendale,Costa Mesa,Martinez,Camarillo
Zip Code,93225,91206,92627,94553,93010
Latitude,34.827662,34.162515,33.645672,38.014457,34.227846
Longitude,-118.999073,-118.203869,-117.922613,-122.115432,-119.079903
Number of Referrals,2,0,0,1,3


The "Customer Status" column is the response variable of interest in this analysis. The reasons for churn are recorded in the "Churn Category" and "Churn Reason" columns.

In [3]:
# Print a structural summary of the frame: number of entries, and for every
# column its non-null count and dtype.
churndata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Customer ID                        7043 non-null   object 
 1   Gender                             7043 non-null   object 
 2   Age                                7043 non-null   int64  
 3   Married                            7043 non-null   object 
 4   Number of Dependents               7043 non-null   int64  
 5   City                               7043 non-null   object 
 6   Zip Code                           7043 non-null   int64  
 7   Latitude                           7043 non-null   float64
 8   Longitude                          7043 non-null   float64
 9   Number of Referrals                7043 non-null   int64  
 10  Tenure in Months                   7043 non-null   int64  
 11  Offer                              7043 non-null   objec

### 3. Train and Test Data Split:

In [4]:
# Approximate row counts a 70% / 30% split of the dataset would produce.
n_rows = len(churndata)
n_rows * .7, n_rows * .3

(4930.099999999999, 2112.9)

In [5]:
# Response variable for the analysis.
target_col = "Customer Status"

# Columns excluded from the feature matrix: the customer identifier, the
# location columns, the target itself, and the two columns that record the
# reason for churn.
excluded_cols = [
    "Customer ID",
    "Zip Code",
    "Latitude",
    "Longitude",
    target_col,
    "Churn Category",
    "Churn Reason",
]

X = churndata.drop(columns=excluded_cols)
y = churndata[target_col]

In [6]:
from imblearn.over_sampling import SMOTEN

# Attempt to oversample (X, y) with SMOTEN, using a fixed seed for repeatability.
# NOTE(review): this resamples the full dataset, before any train/test split —
# confirm whether resampling should instead be limited to the training portion.
sampler = SMOTEN(random_state=0)
try:
    Xsm, ysm = sampler.fit_resample(X, y)
except ValueError as err:
    # On the raw data this fails with "Input contains NaN": X still holds
    # missing values. Report the error instead of raising so the notebook keeps
    # running top to bottom; Xsm and ysm stay undefined until the missing
    # values are handled and this cell is re-run.
    print(f"ValueError: {err}")

ValueError: Input contains NaN

In [8]:
# Compute the null mask once and derive every statistic from it.
null_mask = churndata.isnull()

# Per-column number and percentage of missing values.
missing1 = pd.concat([null_mask.sum(), 100 * null_mask.mean()], axis=1)
missing1.columns = ['count', '%']

# Per-column flag: True when the column has at least one missing value.
missing2 = null_mask.any()

# Side-by-side view: flag, count, percentage.
pd.concat([missing2, missing1], axis=1)

Unnamed: 0,0,count,%
Customer ID,False,0,0.0
Gender,False,0,0.0
Age,False,0,0.0
Married,False,0,0.0
Number of Dependents,False,0,0.0
City,False,0,0.0
Zip Code,False,0,0.0
Latitude,False,0,0.0
Longitude,False,0,0.0
Number of Referrals,False,0,0.0


In [17]:
# Inspect the contiguous block of columns running from
# "Avg Monthly Long Distance Charges" through "Unlimited Data" (label slices in
# .loc include both endpoints).
service_cols = churndata.loc[:, "Avg Monthly Long Distance Charges":"Unlimited Data"]
service_cols

Unnamed: 0,Avg Monthly Long Distance Charges,Multiple Lines,Internet Service,Internet Type,Avg Monthly GB Download,Online Security,Online Backup,Device Protection Plan,Premium Tech Support,Streaming TV,Streaming Movies,Streaming Music,Unlimited Data
0,42.39,No,Yes,Cable,16.0,No,Yes,No,Yes,Yes,No,No,Yes
1,10.69,Yes,Yes,Cable,10.0,No,No,No,No,No,Yes,Yes,No
2,33.65,No,Yes,Fiber Optic,30.0,No,No,Yes,No,No,No,No,Yes
3,27.82,No,Yes,Fiber Optic,4.0,No,Yes,Yes,No,Yes,Yes,No,Yes
4,7.38,No,Yes,Fiber Optic,11.0,No,No,No,Yes,Yes,No,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,46.68,No,Yes,DSL,59.0,Yes,No,No,Yes,No,No,Yes,Yes
7039,16.20,Yes,Yes,Fiber Optic,17.0,No,No,No,No,No,Yes,Yes,Yes
7040,18.62,No,Yes,DSL,51.0,No,Yes,No,No,No,No,No,Yes
7041,2.12,No,Yes,Cable,58.0,Yes,No,Yes,Yes,No,Yes,Yes,Yes


In [22]:
# Rows that have a missing value in any column from "Phone Service" through
# "Multiple Lines". `axis` is passed by keyword because the positional form
# `.any(1)` is deprecated in pandas 1.5 and rejected by pandas >= 2.0.
phone_cols_missing = churndata.loc[:, "Phone Service":"Multiple Lines"].isnull().any(axis=1)
churndata[phone_cols_missing]

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
10,0017-DINOC,Male,47,No,0,Rancho Santa Fe,92091,32.993560,-117.207121,0,...,Credit Card,45.20,2460.55,0.0,0,0.0,2460.55,Stayed,,
14,0019-GFNTW,Female,39,No,0,Los Olivos,93441,34.704340,-120.026090,0,...,Bank Withdrawal,45.05,2560.10,0.0,0,0.0,2560.10,Stayed,,
16,0020-JDNXP,Female,52,Yes,1,Point Reyes Station,94956,38.060264,-122.830646,0,...,Credit Card,61.25,1993.20,0.0,0,0.0,1993.20,Stayed,,
19,0023-HGHWL,Male,67,No,0,Morgan Hill,95037,37.161544,-121.649371,0,...,Bank Withdrawal,25.10,25.10,0.0,0,0.0,25.10,Churned,Competitor,Competitor made better offer
25,0032-PGELS,Female,37,Yes,1,Palomar Mountain,92060,33.309852,-116.823091,1,...,Bank Withdrawal,30.50,30.50,0.0,0,0.0,30.50,Churned,Attitude,Attitude of service provider
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6979,9906-NHHVC,Female,75,No,0,Macdoel,96058,41.769709,-121.920630,0,...,Bank Withdrawal,53.50,3517.90,0.0,0,0.0,3517.90,Stayed,,
6980,9907-SWKKF,Female,69,No,0,Twain,95984,40.022184,-121.062384,0,...,Credit Card,25.05,25.05,0.0,0,0.0,25.05,Churned,Competitor,Competitor had better devices
6996,9928-BZVLZ,Female,58,No,0,Sunset Beach,90742,33.719221,-118.073596,0,...,Credit Card,49.85,552.10,0.0,0,0.0,552.10,Stayed,,
7016,9955-QOPOY,Male,23,Yes,0,Sunnyvale,94089,37.421633,-122.009613,7,...,Credit Card,60.80,4263.40,0.0,0,0.0,4263.40,Stayed,,
