In [1]:
import pandas as pd
import numpy as np
import re
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

### Reading the data

In [2]:
df = pd.read_csv("../data/data.csv")

### Overall descriptive statistics of your dataset

In [3]:
df.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [4]:
df.shape

(252001, 13)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252001 entries, 0 to 252000
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Id                 252001 non-null  int64 
 1   Income             252001 non-null  int64 
 2   Age                252001 non-null  int64 
 3   Experience         252001 non-null  int64 
 4   Married/Single     252001 non-null  object
 5   House_Ownership    252001 non-null  object
 6   Car_Ownership      252001 non-null  object
 7   Profession         252001 non-null  object
 8   CITY               252001 non-null  object
 9   STATE              252001 non-null  object
 10  CURRENT_JOB_YRS    252001 non-null  int64 
 11  CURRENT_HOUSE_YRS  252001 non-null  int64 
 12  Risk_Flag          252001 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 25.0+ MB


lowering all columns names

In [6]:
df.rename(
    columns={
        "CITY": "City",
        "STATE": "State",
        "CURRENT_JOB_YRS": "Current_Job_Years",
        "CURRENT_HOUSE_YRS": "Current_House_Years",
    },
    inplace=True,
)

In [7]:
df.columns = [column_name.lower() for column_name in df.columns]

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252001 entries, 0 to 252000
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   id                   252001 non-null  int64 
 1   income               252001 non-null  int64 
 2   age                  252001 non-null  int64 
 3   experience           252001 non-null  int64 
 4   married/single       252001 non-null  object
 5   house_ownership      252001 non-null  object
 6   car_ownership        252001 non-null  object
 7   profession           252001 non-null  object
 8   city                 252001 non-null  object
 9   state                252001 non-null  object
 10  current_job_years    252001 non-null  int64 
 11  current_house_years  252001 non-null  int64 
 12  risk_flag            252001 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 25.0+ MB


Data description of all features present in the file

In [9]:
data_info = pd.read_csv("../data/Data_Dictionary.csv")
data_info

Unnamed: 0,Column_Name,Data_Description,Data_Class,Data_Type,Domain
0,Id,Customer Identification Number,Categorical,int,Positive integers
1,Income,Income of the Customer,Continuous,int,Positive integers
2,Age,Age of the Customer,Continuous,int,Positive integers
3,Experience,Total Professional Experience in Years,Continuous,int,Positive integers
4,Married/Single,Marital Status of the Customer,Categorical,string,"[single,married]"
5,House_Ownership,Home Ownership Status of the Customer,Categorical,string,"3 String values [rented,owned,norent_noown]"
6,Car_Ownership,Car Ownership Status of the Customer,Categorical,string,"[yes,no]"
7,Profession,Occupation of the Customer,Categorical,string,51 String values
8,City,City of Residence,Categorical,string,317 String values
9,State,State of Residence,Categorical,string,29 String values


In [10]:
# Segragating numerical features.
numerical_vars = data_info[(data_info['Data_Type']=='int') & (data_info['Column_Name'] != 'Risk_Flag')]['Column_Name']
numerical_vars.reset_index(drop=True, inplace=True)
numerical_vars = [i.lower() for i in list(numerical_vars)]

In [11]:
# Segragating string variables, all of which are categorical
string_vars = data_info[data_info['Data_Type']=='string']['Column_Name']
string_vars.reset_index(drop=True, inplace=True)
string_vars = [i.lower() for i in list(string_vars)]

In [12]:
def clean_text(text):
    text = text.encode('ascii', errors='ignore').decode('utf8')
    text = re.sub(r"[()\[\]]" , "" , text)
    text = re.sub(r"[0-9]" , "" , text)   
    text = re.sub(r"[_]" , " " , text)
    text = text.lower()
    return (text)

In [13]:
for str_column_name in string_vars:
    df[str_column_name] = df.loc[:,str_column_name].apply(clean_text)

In [14]:
df

Unnamed: 0,id,income,age,experience,married/single,house_ownership,car_ownership,profession,city,state,current_job_years,current_house_years,risk_flag
0,1,1303834,23,3,single,rented,no,mechanical engineer,rewa,madhya pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,software developer,parbhani,maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,technical writer,alappuzha,kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,software developer,bhubaneswar,odisha,2,12,1
4,5,5768871,47,11,single,rented,no,civil servant,tiruchirappalli,tamil nadu,3,14,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
251996,251997,2843572,26,10,single,rented,no,army officer,rewa,madhya pradesh,6,11,0
251997,251998,4522448,46,7,single,rented,no,design engineer,kalyan-dombivli,maharashtra,7,12,0
251998,251999,6507128,45,0,single,rented,no,graphic designer,pondicherry,puducherry,0,10,0
251999,252000,9070230,70,17,single,rented,no,statistician,avadi,tamil nadu,7,11,0


### Label Encoding
**Performed encoding on categorical variables**

In [16]:
lencoder = {}
for str_column_name in string_vars:
    label_encoder = preprocessing.LabelEncoder()
    df[str_column_name] = label_encoder.fit_transform(df[str_column_name])
    lencoder[str_column_name] = label_encoder

In [17]:
df

Unnamed: 0,id,income,age,experience,married/single,house_ownership,car_ownership,profession,city,state,current_job_yrs,current_house_yrs,risk_flag
0,1,1303834,23,3,1,2,0,33,251,14,3,13,0
1,2,7574516,40,10,1,2,0,43,227,15,9,13,0
2,3,3991815,66,4,0,2,0,47,8,12,4,10,0
3,4,6256451,41,2,1,2,1,43,53,18,2,12,1
4,5,5768871,47,11,1,2,0,11,296,23,3,14,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
251996,251997,2843572,26,10,1,2,0,3,251,14,6,11,0
251997,251998,4522448,46,7,1,2,0,17,143,15,7,12,0
251998,251999,6507128,45,0,1,2,0,27,233,19,0,10,0
251999,252000,9070230,70,17,1,2,0,44,25,23,7,11,0


### One-Hot Encoding:
**As we are aware that Scikit learn models work only on numeric data, we have encoded the necessary categorical features to numeric format using 'One-Hot' Encoding technique. This technique will convert our textual data to an array of 0s and 1s.**

In [15]:
oheencoder = {}
col_names = []
df_ohe = pd.DataFrame()
dfs=[]
for str_column_name in string_vars:
    enc = preprocessing.OneHotEncoder()
    df_temp = pd.DataFrame(enc.fit_transform(df.loc[:,[str_column_name]]).toarray())
    col_names = [str(str_column_name)+'_'+str(x) for x in list(enc.categories_[0])]
    df_temp.columns = col_names
    oheencoder[str_column_name] = enc
    dfs.append(df_temp)
df_ohe = pd.concat(dfs,axis=1)

In [16]:
# now we will combine the numberical and one hot encoded vectors
df_processed = pd.concat([df[numerical_vars], df_ohe], axis = 1)
df_processed = df_processed.drop(columns=['id'])
df_processed

Unnamed: 0,income,age,experience,current_job_years,current_house_years,married/single_married,married/single_single,house_ownership_norent noown,house_ownership_owned,house_ownership_rented,...,state_puducherry,state_punjab,state_rajasthan,state_sikkim,state_tamil nadu,state_telangana,state_tripura,state_uttar pradesh,state_uttarakhand,state_west bengal
0,1303834,23,3,3,13,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7574516,40,10,9,13,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3991815,66,4,4,10,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6256451,41,2,2,12,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5768871,47,11,3,14,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251996,2843572,26,10,6,11,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
251997,4522448,46,7,7,12,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
251998,6507128,45,0,0,10,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
251999,9070230,70,17,7,11,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### Train - Test Split

In [17]:
X = df_processed.values
y = df.risk_flag.values

In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Scaling

#### Standard Scaling

In [19]:
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

#### Min Max Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler
sc_x = MinMaxScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)