In [1]:
import pandas as pd

In [2]:
# Load the employee-attrition dataset into a DataFrame.
# NOTE(review): this is an absolute "/content/..." path, while the "Quick Shortcut" section
# further down reads the same file name from "../Data/" — confirm which location is correct.
df = pd.read_csv("/content/EmployeeAttrition.csv")

In [3]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [5]:
df.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears   

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

# Define X & Y

- **Y**: the target (dependent variable), i.e. the `Attrition` column.

- **X**: all other columns, which are the features (independent variables).

In [7]:
# Separate the target from the features.
Y = df['Attrition']                  # target: the Attrition column
X = df.drop(columns=['Attrition'])   # features: every remaining column

In [8]:
X.shape, Y.shape

((1470, 34), (1470,))

# Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 30% of the rows for testing. stratify=Y is passed so the split is stratified on
# the target, and random_state fixes the seed.
split_parts = train_test_split(X, Y, stratify=Y, test_size=0.3, random_state=7)
X_train, X_test, y_train, y_test = split_parts

In [11]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((1029, 34), (441, 34), (1029,), (441,))

# Pre process training data

## Find categorical columns and convert them to numbers

In [12]:
# Keep only the object-dtype (categorical) columns of the full frame.
df_obj = df.select_dtypes(include=['object'])

In [13]:
df_obj.shape

(1470, 9)

In [14]:
# One-hot encode every object column: each category value becomes its own indicator column.
# NOTE: df_obj still contains the target, so 'Attrition_No' / 'Attrition_Yes' are produced too.
df_ohe = pd.get_dummies(df_obj)

In [15]:
df_ohe.shape

(1470, 31)

In [16]:
df_ohe.columns

Index(['Attrition_No', 'Attrition_Yes', 'BusinessTravel_Non-Travel',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'Department_Human Resources', 'Department_Research & Development',
       'Department_Sales', 'EducationField_Human Resources',
       'EducationField_Life Sciences', 'EducationField_Marketing',
       'EducationField_Medical', 'EducationField_Other',
       'EducationField_Technical Degree', 'Gender_Female', 'Gender_Male',
       'JobRole_Healthcare Representative', 'JobRole_Human Resources',
       'JobRole_Laboratory Technician', 'JobRole_Manager',
       'JobRole_Manufacturing Director', 'JobRole_Research Director',
       'JobRole_Research Scientist', 'JobRole_Sales Executive',
       'JobRole_Sales Representative', 'MaritalStatus_Divorced',
       'MaritalStatus_Married', 'MaritalStatus_Single', 'Over18_Y',
       'OverTime_No', 'OverTime_Yes'],
      dtype='object')

## One hot encoding on X-train

In [17]:
# One-hot encode the object columns of the training features; the numeric columns are kept
# as they are and appear first in the result.
# NOTE(review): the indicator columns come from the categories present in X_train only —
# presumably X_test must later be encoded and reindexed to X_train_ohe.columns; confirm.
X_train_ohe = pd.get_dummies(X_train)

In [18]:
X_train_ohe.shape

(1029, 55)

In [20]:
X_train_ohe.columns

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently',
       'BusinessTravel_Travel_Rarely', 'Department_Human Resources',
       'Department_Research & Development', 'Department_Sales',
       'EducationField_Human Resources', 'EducationField_Life Sciences',
       'EducationField_Marketing', 'EducationField_Medical',
       'EducationField_Other', 'EducationField_Technical Degree',
       'Gender_Female', 'Gender_Male', 'JobR

In [21]:
X_train_ohe.dtypes

Age                                  int64
DailyRate                            int64
DistanceFromHome                     int64
Education                            int64
EmployeeCount                        int64
EmployeeNumber                       int64
EnvironmentSatisfaction              int64
HourlyRate                           int64
JobInvolvement                       int64
JobLevel                             int64
JobSatisfaction                      int64
MonthlyIncome                        int64
MonthlyRate                          int64
NumCompaniesWorked                   int64
PercentSalaryHike                    int64
PerformanceRating                    int64
RelationshipSatisfaction             int64
StandardHours                        int64
StockOptionLevel                     int64
TotalWorkingYears                    int64
TrainingTimesLastYear                int64
WorkLifeBalance                      int64
YearsAtCompany                       int64
YearsInCurr

# Check for missing values

In [22]:
# Count missing values per column (isnull() is an alias of isna()).
X_train_ohe.isna().sum()

Age                                  0
DailyRate                            0
DistanceFromHome                     0
Education                            0
EmployeeCount                        0
EmployeeNumber                       0
EnvironmentSatisfaction              0
HourlyRate                           0
JobInvolvement                       0
JobLevel                             0
JobSatisfaction                      0
MonthlyIncome                        0
MonthlyRate                          0
NumCompaniesWorked                   0
PercentSalaryHike                    0
PerformanceRating                    0
RelationshipSatisfaction             0
StandardHours                        0
StockOptionLevel                     0
TotalWorkingYears                    0
TrainingTimesLastYear                0
WorkLifeBalance                      0
YearsAtCompany                       0
YearsInCurrentRole                   0
YearsSinceLastPromotion              0
YearsWithCurrManager     

# Normalize / Standardise continuous columns

- Assumption: the numeric / continuous columns are all columns that are not categorical (in this dataset, categorical means `object` dtype).

In [23]:
# Numeric features = every training column that is not object dtype.
X_train_numeric = X_train.select_dtypes(exclude=['object'])

In [24]:
X_train_numeric.shape

(1029, 26)

In [26]:
# Column labels of the numeric features; reused below to select the columns that get scaled
# and to drop them from the one-hot encoded frame.
num_columns = X_train_numeric.columns

## Apply standardization

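Why the numeric columns need standardising: they are measured on very different scales, so they are rescaled to a common scale. Three common scalers: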

- standard scaler

new_val = (val - mean) / std_dev

This is the z-score formula: after scaling, each column has mean 0 and std_dev 1.

- minmax scaler

new_val = (val - min) / (max - min)

Here all values are converted to lie in the 0-1 range.

- robust scaler

new_val = (val - median) / IQR

not sensitive to outliers

In [28]:
from sklearn.preprocessing import StandardScaler, RobustScaler

In [29]:
X_train.loc[:,num_columns].shape

(1029, 26)

In [30]:
# Learn the per-column scaling statistics from the training rows only.
# NOTE(review): EmployeeCount and StandardHours show std 0 in the describe() output further
# down, which suggests they are constant in the training split — consider dropping them.
std = StandardScaler()
std.fit(X_train[num_columns])

StandardScaler()

In [31]:
# Apply the fitted scaler to the numeric training columns.
X_train_std = std.transform(X_train[num_columns])

In [52]:
# Wrap the scaled values in a DataFrame, restoring the column labels and the original
# row index of X_train so it can be concatenated with the one-hot columns later.
X_train_std = pd.DataFrame(
    data=X_train_std,
    index=X_train.index,
    columns=num_columns,
)

In [53]:
X_train_std.shape

(1029, 26)

In [35]:
X_train_std.columns

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [36]:
X_train_std.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,...,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0
mean,2.175131e-16,-1.381035e-16,-8.631472e-17,-8.631472e-18,0.0,1.260195e-16,-9.494619e-17,-2.727545e-16,2.589441e-16,-2.071553e-17,...,-6.9051770000000005e-18,0.0,-4.8336240000000005e-17,8.631472e-17,2.9347e-17,-2.658493e-16,2.9347e-17,8.113583000000001e-17,7.250436000000001e-17,-8.286213e-17
std,1.000486,1.000486,1.000486,1.000486,0.0,1.000486,1.000486,1.000486,1.000486,1.000486,...,1.000486,0.0,1.000486,1.000486,1.000486,1.000486,1.000486,1.000486,1.000486,1.000486
min,-2.046045,-1.717471,-0.9974329,-1.859407,0.0,-1.705394,-1.588194,-1.778655,-2.400335,-0.9534237,...,-1.606938,0.0,-0.9586409,-1.423599,-2.145011,-2.43078,-1.153944,-1.195568,-0.676714,-1.171414
25%,-0.7387614,-0.8246966,-0.871422,-0.8916125,0.0,-0.8783988,-0.6695591,-0.8946491,-1.004097,-0.9534237,...,-0.6821398,0.0,-0.9586409,-0.6564437,-0.6138169,-1.035759,-0.6515147,-0.6233634,-0.676714,-0.5941225
50%,-0.1940599,-0.01106027,-0.2413677,0.07618208,0.0,-0.003148452,0.249076,-0.0106432,0.3921407,-0.03738917,...,0.2426584,0.0,0.1964428,-0.1450066,0.1517802,0.359262,-0.3165616,-0.3372613,-0.3604839,-0.3054766
75%,0.6774626,0.9014984,0.5146974,1.043977,0.0,0.8920696,1.167711,0.8733627,0.3921407,0.8786454,...,1.167457,0.0,0.1964428,0.4942898,0.1517802,0.359262,0.3533446,0.8071472,0.2719763,0.849107
max,2.529448,1.734919,2.530871,2.011771,0.0,1.73404,1.167711,1.659146,1.788379,2.710714,...,1.167457,0.0,2.50661,3.690772,2.448571,1.754283,5.545118,3.382066,4.066737,3.735566


# Create the final training data

Join the standardised (StandardScaler-transformed) numeric columns and the one-hot encoded columns together.

In [54]:
# Keep only the one-hot indicator columns by removing the numeric ones.
X_train_cat = X_train_ohe.drop(columns=num_columns)

In [55]:
X_train_cat.shape

(1029, 29)

In [56]:
# Place the indicator columns and the scaled numeric columns side by side (aligned on index).
X_train_final = pd.concat([X_train_cat, X_train_std], axis="columns")

In [57]:
X_train_final.shape

(1029, 55)

In [None]:
# Preview only the first rows; displaying the whole 1029 x 55 frame floods the notebook output.
X_train_final.head()

In [32]:
# Scale the test features with the scaler that was fitted on the training data (transform
# only — the scaler is not re-fitted here), then restore the column labels and the row index
# the same way it was done for X_train_std, so the result stays aligned with X_test.
X_test_std = pd.DataFrame(
    std.transform(X_test.loc[:, num_columns]),
    columns=num_columns,
    index=X_test.index,
)

# Quick shortcut: pre-processing

In [2]:
import pandas as pd

In [3]:
# Reload the employee-attrition dataset for the shortcut walkthrough.
# NOTE(review): this uses the relative path "../Data/", whereas the first section of the
# notebook reads the same file name from "/content/" — confirm which location is correct.
df = pd.read_csv("../Data/EmployeeAttrition.csv")

In [82]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [83]:
df.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears   

In [84]:
# Shortcut: one-hot encode the whole frame in a single call.
# NOTE: the target column is encoded as well, so the result contains
# 'Attrition_No' / 'Attrition_Yes' alongside the feature columns.
df_ohe = pd.get_dummies(df)

In [85]:
df_ohe.shape

(1470, 57)

In [86]:
# Replace any missing value with the median of its column.
column_medians = df_ohe.median()
df_ohe = df_ohe.fillna(column_medians)

In [87]:
df_ohe.shape

(1470, 57)

In [88]:
df_ohe.columns

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition_No',
       'Attrition_Yes', 'BusinessTravel_Non-Travel',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'Department_Human Resources', 'Department_Research & Development',
       'Department_Sales', 'EducationField_Human Resources',
       'EducationField_Life Sciences', 'EducationField_Marketing',
       'EducationField_Medical', 'EducationField_Other',
       'EducationField_Technical Degree', 'Gen

# Working of Standard Scaler

In [66]:
import numpy as np

In [67]:
# Two toy features on very different scales, used to show what standardisation does.
a1, a2 = (
    np.array([23, 67, 33, 11, 7, 99, 24, 57, 78]),
    np.array([1e-3, 1e-1, 3e-1, 1e-5, 4e-1, 8.9e-1, 5e-4, 4e-2, 0.0]),
)

In [68]:
a2.min(), a1.min()

(0.0, 7)

In [69]:
a1.mean(),a2.mean()

(44.333333333333336, 0.19239)

In [70]:
a1.std(),a2.std()

(30.364452901377952, 0.28295583503044747)

In [71]:
# Manual z-score: subtract the mean, then divide by the standard deviation.
a1_mean, a1_std = a1.mean(), a1.std()
a1_new = (a1 - a1_mean) / a1_std

In [72]:
a1_new

array([-0.70257592,  0.74648691, -0.37324346, -1.09777487, -1.22950785,
        1.80035079, -0.66964267,  0.41715445,  1.10875262])

In [73]:
a1_new.mean() , a1_new.std()

(-9.868649107779169e-17, 1.0)

In [75]:
a2.mean(), a2.std()

(0.19239, 0.28295583503044747)

In [79]:
a2

array([1.0e-03, 1.0e-01, 3.0e-01, 1.0e-05, 4.0e-01, 8.9e-01, 5.0e-04,
       4.0e-02, 0.0e+00])

In [76]:
# Same manual z-score for the small-scale array.
a2_mean, a2_std = a2.mean(), a2.std()
a2_new = (a2 - a2_mean) / a2_std

In [77]:
a2_new

array([-0.67639531, -0.32651739,  0.3803067 , -0.67989409,  0.73371874,
        2.46543776, -0.67816237, -0.53856461, -0.67992943])

In [78]:
a2_new.mean(), a2_new.std()

(2.4671622769447922e-17, 0.9999999999999999)

# Missing Value Practice

In [89]:
import pandas as pd

In [90]:
# First attempt: read the file with pandas' default missing-value handling.
# The per-column missing counts shown right after this are all 0; the same file, re-read
# later in this section with na_values="?", does report missing values.
df = pd.read_csv("/content/risk_factors_cervical_cancer.csv")

In [91]:
df.isna().sum()

Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0


In [92]:
# Re-read the file, telling pandas to treat the "?" placeholder as a missing value.
df = pd.read_csv(
    "/content/risk_factors_cervical_cancer.csv",
    na_values=["?"],
)

In [93]:
df.isna().sum()

Age                                     0
Number of sexual partners              26
First sexual intercourse                7
Num of pregnancies                     56
Smokes                                 13
Smokes (years)                         13
Smokes (packs/year)                    13
Hormonal Contraceptives               108
Hormonal Contraceptives (years)       108
IUD                                   117
IUD (years)                           117
STDs                                  105
STDs (number)                         105
STDs:condylomatosis                   105
STDs:cervical condylomatosis          105
STDs:vaginal condylomatosis           105
STDs:vulvo-perineal condylomatosis    105
STDs:syphilis                         105
STDs:pelvic inflammatory disease      105
STDs:genital herpes                   105
STDs:molluscum contagiosum            105
STDs:AIDS                             105
STDs:HIV                              105
STDs:Hepatitis B                  

In [94]:
# Impute every missing value with the median of its own column.
# numeric_only=True keeps the median well-defined even if a non-numeric column is present
# (pandas >= 2.0 raises instead of silently skipping such columns); a non-numeric column
# would simply be left unfilled.
# Re-binding the name instead of using inplace=True avoids mutating the frame in place.
df = df.fillna(df.median(numeric_only=True))

In [95]:
df.dtypes

Age                                     int64
Number of sexual partners             float64
First sexual intercourse              float64
Num of pregnancies                    float64
Smokes                                float64
Smokes (years)                        float64
Smokes (packs/year)                   float64
Hormonal Contraceptives               float64
Hormonal Contraceptives (years)       float64
IUD                                   float64
IUD (years)                           float64
STDs                                  float64
STDs (number)                         float64
STDs:condylomatosis                   float64
STDs:cervical condylomatosis          float64
STDs:vaginal condylomatosis           float64
STDs:vulvo-perineal condylomatosis    float64
STDs:syphilis                         float64
STDs:pelvic inflammatory disease      float64
STDs:genital herpes                   float64
STDs:molluscum contagiosum            float64
STDs:AIDS                         