## Importing Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
## Read the churn CSV into a DataFrame and display it.
churn_csv_path = '/content/Churn_Modelling - Churn_Modelling.csv'
df = pd.read_csv(churn_csv_path)
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


## Checking Missing Values

In [3]:
# Count the null entries in each column.
df.isnull().sum()

Unnamed: 0,0
RowNumber,0
CustomerId,0
Surname,0
CreditScore,0
Geography,0
Gender,0
Age,0
Tenure,0
Balance,0
NumOfProducts,0


In [5]:
## Dropping RowNumber, CustomerId and Surname (per-row identifiers/names in the preview)
## along with the categorical Geography column.
## Gender is also non-numeric at this point; it is kept and encoded in a later cell.
## NOTE: this cell is not re-runnable as written -- a second run raises KeyError
## because the listed columns have already been removed from df.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Geography'])
df

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...
9995,771,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Male,42,3,75075.31,2,1,0,92888.52,1


In [6]:
## Encode Gender as 0/1 so that the remaining columns are all numeric.
## Values not present in the mapping become NaN, so re-running this cell on the
## already-encoded column would turn every entry into NaN.
gender_codes = {'Male' : 1, 'Female' : 0}
df['Gender'] = df['Gender'].map(gender_codes)
df

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,42,2,0.00,1,1,1,101348.88,1
1,608,0,41,1,83807.86,1,0,1,112542.58,0
2,502,0,42,8,159660.80,3,1,0,113931.57,1
3,699,0,39,1,0.00,2,0,0,93826.63,0
4,850,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0
9997,709,0,36,7,0.00,1,0,1,42085.58,1
9998,772,1,42,3,75075.31,2,1,0,92888.52,1


In [8]:
# Separate the 'Exited' label (y) from the remaining predictor columns (x).
target_column = 'Exited'
y = df[target_column]
x = df.drop(columns=[target_column])

In [7]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [10]:
# Summary statistics of the unscaled training features, rounded to 2 decimals.
x_train.describe().round(2)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,651.65,0.55,38.9,5.0,76102.14,1.53,0.7,0.51,100431.29
std,96.37,0.5,10.52,2.88,62461.01,0.58,0.46,0.5,57518.92
min,350.0,0.0,18.0,0.0,0.0,1.0,0.0,0.0,90.07
25%,585.0,0.0,32.0,3.0,0.0,1.0,0.0,0.0,51364.12
50%,653.0,1.0,37.0,5.0,96447.52,1.0,1.0,1.0,100487.72
75%,719.0,1.0,44.0,7.0,127611.33,2.0,1.0,1.0,149595.84
max,850.0,1.0,92.0,10.0,250898.09,4.0,1.0,1.0,199970.74


In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Scaler instance used by the fit_transform calls below; the same object is
# reused (and refit) later in the notebook for the Attrition data.
sc = StandardScaler()

In [13]:
# Learn the scaling statistics from the training features, then apply them
# to those same features.
sc.fit(x_train)
x_train_sc = sc.transform(x_train)

In [14]:
# Wrap the scaled values back into a DataFrame with the original column names.
# The original row labels are carried over as well: train_test_split shuffles the
# rows by default, so without index=x_train.index the new frame would get a fresh
# 0..n-1 index and could no longer be aligned by label with y_train.
# The CSV export below uses index=False, so the written file is unaffected.
x_train_new = pd.DataFrame(x_train_sc, columns = x_train.columns, index = x_train.index)

In [16]:
# Summary statistics of the scaled training features, rounded to 2 decimals.
x_train_new.describe().round(2)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-3.13,-1.09,-1.99,-1.74,-1.22,-0.92,-1.54,-1.03,-1.74
25%,-0.69,-1.09,-0.66,-0.7,-1.22,-0.92,-1.54,-1.03,-0.85
50%,0.01,0.91,-0.18,-0.0,0.33,-0.92,0.65,0.97,0.0
75%,0.7,0.91,0.49,0.69,0.82,0.81,0.65,0.97,0.85
max,2.06,0.91,5.05,1.73,2.8,4.26,0.65,0.97,1.73


In [17]:
# Preview the first three rows of the scaled training features.
x_train_new.head(3)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,0.3565,0.913248,-0.655786,0.34568,-1.218471,0.808436,0.649203,0.974817,1.36767
1,-0.203898,0.913248,0.294938,-0.348369,0.696838,0.808436,0.649203,0.974817,1.661254
2,-0.961472,0.913248,-1.416365,-0.695393,0.618629,-0.916688,0.649203,-1.025834,-0.252807


In [21]:
# Write the scaled training features to CSV without the index column.
# Only the feature columns are written; the 'Exited' label is not part of x_train_new.
# NOTE(review): the filename is spelled 'Noramalized' (probably meant 'Normalized');
# left unchanged because other code may read this exact name -- confirm before renaming.
x_train_new.to_csv('Noramalized.csv', index=False)

## Attrition Data

In [22]:
## Read the attrition CSV into a DataFrame and display it.
## NOTE: this rebinds `df`, so the churn DataFrame loaded earlier is no longer
## reachable under that name.
attrition_csv_path = '/content/Attrition - Attrition.csv'
df = pd.read_csv(attrition_csv_path)
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


## Checking Missing Values

In [23]:
# Count the null entries in each column.
df.isnull().sum()

Unnamed: 0,0
Age,0
Attrition,0
BusinessTravel,0
DailyRate,0
Department,0
DistanceFromHome,0
Education,0
EducationField,0
EmployeeCount,0
EmployeeNumber,0


In [24]:
## Dropping EmployeeCount, EmployeeNumber and StandardHours (EmployeeCount and StandardHours
## show a single value in the preview; EmployeeNumber looks like an ID -- TODO confirm)
## together with the categorical columns BusinessTravel, Department, EducationField and JobRole.
## The other non-numeric columns (Attrition, Gender, MaritalStatus, Over18, OverTime)
## are kept and encoded in the next cell.
## NOTE: this cell is not re-runnable as written -- a second run raises KeyError
## because the listed columns have already been removed from df.
df = df.drop(columns=['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'BusinessTravel', 'Department', 'EducationField', 'JobRole'])
df

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,1102,1,2,2,Female,94,3,2,...,3,1,0,8,0,1,6,4,0,5
1,49,No,279,8,1,3,Male,61,2,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,1373,2,2,4,Male,92,2,1,...,3,2,0,7,3,3,0,0,0,0
3,33,No,1392,3,4,4,Female,56,3,1,...,3,3,0,8,3,3,8,7,3,0
4,27,No,591,2,1,1,Male,40,3,1,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,884,23,2,3,Male,41,4,2,...,3,3,1,17,3,3,5,2,0,3
1466,39,No,613,6,1,4,Male,42,2,3,...,3,1,1,9,5,3,7,7,1,7
1467,27,No,155,4,3,2,Male,87,4,2,...,4,2,1,6,0,3,6,2,0,3
1468,49,No,1023,2,3,4,Male,63,2,2,...,3,4,0,17,3,2,9,6,0,8


In [25]:
## Replace the remaining text columns with integer codes.
## Each entry is column name -> {original value: code}; values missing from a
## mapping become NaN, so re-running this cell on already-encoded data would
## blank the columns out.
column_codes = {
    'Gender': {'Male' : 1, 'Female' : 0},
    'Attrition': {'Yes' : 1, 'No' : 0},
    'MaritalStatus': {'Single' : 0, 'Married' : 1, 'Divorced' : 2},
    'Over18': {'Y' : 1, 'N' : 0},
    'OverTime': {'Yes' : 1, 'No' : 0},
}
for column_name, codes in column_codes.items():
    df[column_name] = df[column_name].map(codes)

In [26]:
# List the column names that remain after the drop and encoding cells above.
df.columns

Index(['Age', 'Attrition', 'DailyRate', 'DistanceFromHome', 'Education',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [27]:
# Separate the 'Attrition' label (y) from the remaining predictor columns (x).
target_column = 'Attrition'
y = df[target_column]
x = df.drop(columns=[target_column])

In [28]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
split_parts = train_test_split(x, y, test_size=0.2, random_state=56)
x_train, x_test, y_train, y_test = split_parts

In [29]:
# Summary statistics of the unscaled training features, rounded to 2 decimals.
x_train.describe().round(2)

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,...,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0
mean,36.98,795.2,9.32,2.93,2.71,0.62,66.0,2.72,2.06,2.7,...,3.15,2.71,0.8,11.21,2.81,2.75,6.87,4.13,2.12,4.07
std,9.19,401.32,8.22,1.02,1.09,0.49,20.38,0.71,1.11,1.1,...,0.36,1.08,0.85,7.81,1.3,0.7,6.05,3.61,3.14,3.57
min,18.0,102.0,1.0,1.0,1.0,0.0,30.0,1.0,1.0,1.0,...,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.75,2.0,2.0,2.0,0.0,48.0,2.0,1.0,2.0,...,3.0,2.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,791.0,7.0,3.0,3.0,1.0,66.0,3.0,2.0,3.0,...,3.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1146.0,14.0,4.0,4.0,1.0,84.0,3.0,3.0,4.0,...,3.0,4.0,1.0,15.0,3.0,3.0,9.0,7.0,2.0,7.0
max,60.0,1499.0,29.0,5.0,4.0,1.0,100.0,4.0,5.0,4.0,...,4.0,4.0,3.0,40.0,6.0,4.0,36.0,18.0,15.0,17.0


In [30]:
# Refit the existing scaler `sc` on the attrition training features and apply it.
# Fitting again replaces the statistics `sc` learned from the churn data.
sc.fit(x_train)
x_train_sc = sc.transform(x_train)

In [31]:
# Wrap the scaled values back into a DataFrame with the original column names.
# The original row labels are carried over as well: train_test_split shuffles the
# rows by default, so without index=x_train.index the new frame would get a fresh
# 0..n-1 index and could no longer be aligned by label with y_train.
# The CSV export below uses index=False, so the written file is unaffected.
x_train_new = pd.DataFrame(x_train_sc, columns = x_train.columns, index = x_train.index)

In [32]:
# Summary statistics of the scaled training features, rounded to 2 decimals.
x_train_new.describe().round(2)

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,...,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0
mean,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,...,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.07,-1.73,-1.01,-1.89,-1.57,-1.27,-1.77,-2.41,-0.96,-1.55,...,-0.42,-1.58,-0.94,-1.44,-2.17,-2.51,-1.14,-1.15,-0.68,-1.14
25%,-0.76,-0.82,-0.89,-0.91,-0.65,-1.27,-0.88,-1.01,-0.96,-0.64,...,-0.42,-0.65,-0.94,-0.67,-0.63,-1.07,-0.64,-0.59,-0.68,-0.58
50%,-0.11,-0.01,-0.28,0.07,0.26,0.79,-0.0,0.39,-0.05,0.27,...,-0.42,0.27,0.24,-0.16,0.14,0.36,-0.31,-0.31,-0.36,-0.3
75%,0.66,0.87,0.57,1.04,1.18,0.79,0.88,0.39,0.85,1.18,...,-0.42,1.19,0.24,0.49,0.14,0.36,0.35,0.79,-0.04,0.82
max,2.51,1.75,2.4,2.02,1.18,0.79,1.67,1.79,2.66,1.18,...,2.39,1.19,2.6,3.69,2.45,1.8,4.82,3.84,4.1,3.62


In [33]:
# Write the scaled attrition training features to CSV without the index column.
# Only the feature columns are written; the 'Attrition' label is not part of x_train_new.
x_train_new.to_csv('Attrition_Normalized.csv', index=False)