# Import Libraries

In [125]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder

# Import Data

In [126]:
# training data
train_raw = pd.read_csv('Dataset/Raw Data/train_data_raw.csv')

# testing data
test_raw = pd.read_csv('Dataset/Raw Data/test_data_raw.csv')

# View the Dataset

## Training Data

In [127]:
train_raw.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


**move the target variable 'y' to the end of the dataset**

In [128]:
train_raw = train_raw.reindex(columns = [col for col in train_raw.columns if col != 'y'] + ['y'])

train_raw.describe()

Unnamed: 0,ID,X10,X11,X12,X13,X14,X15,X16,X17,X18,...,X376,X377,X378,X379,X380,X382,X383,X384,X385,y
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4205.960798,0.013305,0.0,0.075077,0.057971,0.42813,0.000475,0.002613,0.007603,0.00784,...,0.057258,0.314802,0.02067,0.009503,0.008078,0.007603,0.001663,0.000475,0.001426,100.669318
std,2437.608688,0.11459,0.0,0.263547,0.233716,0.494867,0.021796,0.051061,0.086872,0.088208,...,0.232363,0.464492,0.142294,0.097033,0.089524,0.086872,0.040752,0.021796,0.037734,12.679381
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72.11
25%,2095.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.82
50%,4220.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.15
75%,6314.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109.01
max,8417.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,265.32


In [129]:
# select non-object dtypes
train_raw_select_dtype = train_raw.select_dtypes(include=['int', 'float'])

# select all columns except last target 'y' variable
train_raw_ind_var = train_raw_select_dtype.loc[:, train_raw_select_dtype.columns != 'y']

# check the minimum values of the columns of the dataset
train_raw_ind_var_min = [train_raw_ind_var.min(axis = 0)]

# check to see if all values in the list are same
train_raw_ind_var_min.count(train_raw_ind_var_min[0]) == len(train_raw_ind_var_min)

True

In [130]:
# select non-object dtypes
train_raw_select_dtype = train_raw.select_dtypes(include=['int', 'float'])

# select all columns except last target 'y' variable
train_raw_ind_var = train_raw_select_dtype.loc[:, train_raw_select_dtype.columns != 'y']

# check the minimum values of the columns of the dataset
train_raw_ind_var_max = [train_raw_ind_var.max(axis = 0)]

# check to see if all values in the list are same
train_raw_ind_var_max.count(train_raw_ind_var_max[0]) == len(train_raw_ind_var_max)

True

**It seems that all the non-object dtype columns have the 0 as the minimu value and 1 as the maximum value. Hence no normalization needs to be performed on the training set**

In [131]:
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to y
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


**The 'train_raw' datset has 4209 rows and 378 columns. Next let us check if NA vlues or zeros are present in the dataset**

In [132]:
train_raw.isnull().sum().sum()

0

**Check for outliers**

since all the columns have values between 0 and 1 and the categorical values have been converted into one hot encoders, there does not seem to be any outliers.

**Check for number of unique vales in each column**

In [133]:
train_raw.nunique(axis=0)

ID      4209
X0        47
X1        27
X2        44
X3         7
        ... 
X382       2
X383       2
X384       2
X385       2
y       2545
Length: 378, dtype: int64

**It seems that there are 8 columns which have dtype as 'object'. Let's observe the columns which have dtype as object**

In [134]:
object_dtype_cols = [column for column, is_type in (train_raw.dtypes=="object").items() if is_type]

object_dtype_cols

['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

**Let's convert the categorical values into one hot encoders**

In [135]:
# get one hot encoding of 'object_dtype_cols'
one_hot = pd.get_dummies(train_raw[['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']])

# Drop columns 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8' as it is now encoded
train_raw_dummies = train_raw.drop(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'],axis = 1)

# Join the encoded data
train_raw_encoded = train_raw_dummies.join(one_hot)

# view the encoded dataset
train_raw_encoded.head()

Unnamed: 0,ID,X10,X11,X12,X13,X14,X15,X16,X17,X18,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,6,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,7,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Export the training dataset

In [136]:
train_raw_encoded.to_csv('Dataset/Pre Processed Data/train_raw_encoded.csv')

## Testing Data

In [137]:
test_raw.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [138]:
test_raw.describe()

Unnamed: 0,ID,X10,X11,X12,X13,X14,X15,X16,X17,X18,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4211.039202,0.019007,0.000238,0.074364,0.06106,0.427893,0.000713,0.002613,0.008791,0.010216,...,0.325968,0.049656,0.311951,0.019244,0.011879,0.008078,0.008791,0.000475,0.000713,0.001663
std,2423.078926,0.136565,0.015414,0.262394,0.239468,0.494832,0.026691,0.051061,0.093357,0.10057,...,0.468791,0.217258,0.463345,0.137399,0.108356,0.089524,0.093357,0.021796,0.026691,0.040752
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4202.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6310.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8416.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [139]:
test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 377 entries, ID to X385
dtypes: int64(369), object(8)
memory usage: 12.1+ MB


In [140]:
# select non-object dtypes
train_raw_select_dtype = train_raw.select_dtypes(include=['int'])

# check the minimum values of the columns of the dataset
train_raw_select_dtype_min = [train_raw_select_dtype.min(axis = 0)]

# check to see if all values in the list are same
train_raw_select_dtype_min.count(train_raw_select_dtype_min[0]) == len(train_raw_select_dtype_min)

True

In [141]:
# select non-object dtypes
train_raw_select_dtype = train_raw.select_dtypes(include=['int'])

# check the minimum values of the columns of the dataset
train_raw_select_dtype_max = [train_raw_select_dtype.max(axis = 0)]

# check to see if all values in the list are same
train_raw_select_dtype_max.count(train_raw_select_dtype_max[0]) == len(train_raw_select_dtype_max)

True

**It seems that all the non-object dtype columns have the 0 as the minimum value and 1 as the maximum value. Hence no normalization needs to be performed on the training set**

**Next let us check if NA vlues or zeros are present in the dataset**

In [142]:
train_raw.isnull().sum().sum()

0

**Check for outliers**

since all the columns have values between 0 and 1 and the categorical values have been converted into one hot encoders, there does not seem to be any outliers.

In [143]:
object_dtype_cols = [column for column, is_type in (test_raw.dtypes=="object").items() if is_type]

object_dtype_cols

['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

**It seems that there are 8 columns which have dtype as 'object'. Let's observe the columns which have dtype as object**

**Let's convert the categorical values into one hot encoders**

In [144]:
# get one hot encoding of 'object_dtype_cols'
one_hot = pd.get_dummies(test_raw[['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']])

# Drop columns 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8' as it is now encoded
test_raw_dummies = test_raw.drop(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'],axis = 1)

# Join the encoded data
test_raw_encoded = test_raw_dummies.join(one_hot)

# view the encoded dataset
test_raw_encoded.head()

Unnamed: 0,ID,X10,X11,X12,X13,X14,X15,X16,X17,X18,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Export the training dataset**

In [145]:
test_raw_encoded.to_csv('Dataset/Pre Processed Data/test_raw_encoded.csv')