In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import Counter
# Show every column when a DataFrame is displayed (None removes the column cap).
# Uses the public pd.set_option entry point rather than the incidental
# `pd.pandas` attribute, which is not part of the documented pandas API.
pd.set_option('display.max_columns', None)

In [13]:
# Load the Telco customer-churn CSV and report its (rows, columns) shape.
RAW_DATA_CSV = "../data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
train_data = pd.read_csv(RAW_DATA_CSV)
print(train_data.shape)

(7043, 21)


In [14]:
# EDA found no zero-variance features.
# EDA found no missing values.

In [15]:
# Encode the target column: 'Yes' -> 1, 'No' -> 0.
# Series.map turns every value that is missing from the dict into NaN, so the
# identity entries (1 -> 1, 0 -> 0) keep an already-encoded column intact when
# this cell is executed a second time.
train_data["Churn"] = train_data['Churn'].map({'Yes':1, 'No':0, 1:1, 0:0})

In [16]:
# Object-dtype columns are treated as categorical; customerID is an identifier
# and is deliberately left out.
categorical_features = [
    column
    for column in train_data.columns
    if column != "customerID" and train_data[column].dtype == 'O'
]

In [17]:
# Target-ordered label encoding of the categorical columns: within each
# feature, categories are ranked by their mean Churn value (lowest mean -> 0)
# and the column is replaced by those integer ranks.
for feature in categorical_features:
    churn_rate_by_category = train_data.groupby([feature])['Churn'].mean().sort_values()
    labels_ordered = {
        category: rank
        for rank, category in enumerate(churn_rate_by_category.index)
    }
    train_data[feature] = train_data[feature].map(labels_ordered)
    # NOTE: if a separate test set is encoded with `labels_ordered`, categories
    # that never occur in the training data need their own code first (for
    # example the next unused integer, len(labels_ordered)); otherwise .map()
    # leaves them as NaN.

In [18]:
# Preview the first rows to check the result of the encoding cells above.
train_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,0,0,1,1,0,0,1,2,1,2,2,2,2,2,1,3,29.85,29.85,0
1,5575-GNVDE,0,0,1,1,34,1,1,1,1,2,1,2,2,2,1,0,2,56.95,1889.5,0
2,3668-QPYBK,0,0,1,1,2,1,1,1,1,1,2,2,2,2,2,1,2,53.85,108.15,1
3,7795-CFOCW,0,0,1,1,45,0,0,1,1,2,1,1,2,2,1,0,1,42.3,1840.75,0
4,9237-HQITU,1,0,1,1,2,1,1,2,2,2,2,2,2,2,2,1,3,70.7,151.65,1


In [19]:
def coorilation_detection(data, features = None, threshold = 0.75):
    """Group columns whose pairwise correlation is at least `threshold`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to analyse. Non-numeric columns (e.g. string identifiers) may be
        present; they are left out of the correlation matrix.
    features : iterable of str, optional
        Column names to compare with each other. When omitted, every column
        whose dtype is int32, int64, float32 or float64 is used.
    threshold : float, default 0.75
        Minimum absolute correlation (as computed by ``DataFrame.corr``) for
        two columns to be reported as correlated.

    Returns
    -------
    dict[str, list[str]]
        Maps a column to the columns correlated with it. A column that is
        already a key is never listed as a value, so each correlated pair is
        reported once, under whichever column was visited first.

    Raises
    ------
    KeyError
        If a name in `features` is not a numeric/boolean column of `data`.
    """
    # Build the matrix from numeric/boolean columns only: DataFrame.corr()
    # raises on string columns in pandas >= 2.0 instead of skipping them.
    corrmat = data.select_dtypes(include=['number', 'bool']).corr()

    # `is not None` (rather than `!= None`) so that an Index or ndarray of names
    # is accepted; list() lets a one-shot iterable feed both nested loops.
    if features is not None:
        columns = list(features)
    else:
        columns = [i for i in data.columns
                   if data[i].dtype in ['int32', 'int64', 'float32', 'float64']]

    coorilated_variables = {}
    for variable in columns:
        for variable1 in columns:
            # Skip self-pairs, and partners that already head a group of
            # their own.
            if variable == variable1 or variable1 in coorilated_variables:
                continue
            # Row `variable1`, column `variable` of the correlation matrix.
            if abs(corrmat.at[variable1, variable]) >= threshold:
                coorilated_variables.setdefault(variable, []).append(variable1)
    return coorilated_variables

In [33]:
# Candidate columns for the correlation check: everything except the target
# ('Churn') and any column that is still object-dtype.
lst = [
    column
    for column in train_data.columns
    if not (column == 'Churn' or train_data[column].dtypes == 'O')
]
coorilated_data = coorilation_detection(train_data, features=lst)

{'tenure': ['TotalCharges'],
 'InternetService': ['OnlineSecurity', 'TechSupport', 'MonthlyCharges'],
 'OnlineSecurity': ['OnlineBackup', 'TechSupport'],
 'OnlineBackup': ['TechSupport'],
 'DeviceProtection': ['TechSupport', 'StreamingTV', 'StreamingMovies'],
 'StreamingTV': ['StreamingMovies']}

In [24]:
# Print the full column -> correlated-columns mapping, then display its keys;
# the keys are the columns removed by the drop cell that follows.
print(coorilated_data)
coorilated_data.keys()

{'tenure': ['TotalCharges'], 'InternetService': ['OnlineSecurity', 'TechSupport', 'MonthlyCharges'], 'OnlineSecurity': ['OnlineBackup', 'TechSupport'], 'OnlineBackup': ['TechSupport'], 'DeviceProtection': ['TechSupport', 'StreamingTV', 'StreamingMovies'], 'StreamingTV': ['StreamingMovies']}


dict_keys(['tenure', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'StreamingTV'])

In [11]:
# Drop every column that appears as a key in coorilated_data.
# Reassigning (instead of inplace=True) and passing errors="ignore" lets this
# cell be re-run without a KeyError once those columns are already gone.
train_data = train_data.drop(columns=list(coorilated_data), errors="ignore")

In [12]:
# Check the (rows, columns) shape after dropping the correlated columns.
train_data.shape

(7043, 15)

In [37]:
def detect_outliers(df, features, iqr_multiplier=1, min_features=2):
    """Return index labels of rows that are IQR outliers in several features.

    For each column in `features`, a value is an outlier when it lies below
    ``Q1 - iqr_multiplier * IQR`` or above ``Q3 + iqr_multiplier * IQR``.
    A row is reported when it is an outlier in at least `min_features` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scan.
    features : iterable of str
        Numeric columns of `df` to test.
    iqr_multiplier : float, default 1
        Width of the fence in IQR units. The default of 1 is tighter than the
        conventional 1.5; for roughly normal data it places the fences near
        two standard deviations from the mean.
    min_features : int, default 2
        Minimum number of columns in which a row must be an outlier.

    Returns
    -------
    list
        Index labels of the flagged rows, in order of first detection.
    """
    outlier_counts = Counter()

    for c in features:
        # nanpercentile ignores missing values; plain np.percentile returns NaN
        # as soon as the column holds one NaN, which would silently flag
        # nothing for that column.
        q1, q3 = np.nanpercentile(df[c], [25, 75])
        outlier_step = (q3 - q1) * iqr_multiplier

        # Comparisons with NaN are False, so missing values are never flagged.
        is_outlier = (df[c] < q1 - outlier_step) | (df[c] > q3 + outlier_step)
        outlier_counts.update(df.index[is_outlier.to_numpy()])

    return [label for label, hits in outlier_counts.items() if hits >= min_features]

In [38]:
# Columns to scan for outliers: everything that is neither the target, the
# customer identifier, nor one of the label-encoded categorical features.
# NOTE(review): this keeps any integer-coded column that was never in
# categorical_features (SeniorCitizen looks like a 0/1 flag in the preview).
# For such a column the IQR can be 0, so every minority value counts as an
# outlier -- confirm that is intended.
excluded = {'Churn', 'customerID', *categorical_features}
lst = [column for column in train_data.columns if column not in excluded]
print(len(detect_outliers(train_data, lst)))

60


In [39]:
# Remove the rows flagged as outliers in more than one feature and renumber
# the remaining rows from 0.
outlier_rows = detect_outliers(train_data, lst)
train_data = train_data.drop(index=outlier_rows).reset_index(drop=True)

In [43]:
# Check the (rows, columns) shape after removing the flagged outlier rows.
train_data.shape

(6983, 15)

In [44]:
# Persist the processed frame; index=False keeps the row index out of the CSV.
train_data.to_csv('../data/train_processed_data.csv',index=False)