# Import Libraries

In [201]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder

# Import Data

In [202]:
# training data
train_raw = pd.read_csv('Dataset/Raw Data/train_data_raw.csv')

# testing data
test_raw = pd.read_csv('Dataset/Raw Data/test_data_raw.csv')

# View the Dataset

## View The Data

In [203]:
train_raw.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [204]:
test_raw.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


# Drop the columns not required

In [205]:
# drop the column 'ID'
train_raw = train_raw.drop(['ID'], axis = 1)
test_raw = test_raw.drop(['ID'], axis = 1)

In [206]:
train_raw.head()

Unnamed: 0,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,130.81,k,v,at,a,d,u,j,o,0,...,0,0,1,0,0,0,0,0,0,0
1,88.53,k,t,av,e,d,y,l,o,0,...,1,0,0,0,0,0,0,0,0,0
2,76.26,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,0,1,0,0,0
3,80.62,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,0
4,78.02,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,0


In [207]:
test_raw.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,az,v,n,f,d,t,a,w,0,0,...,0,0,0,1,0,0,0,0,0,0
1,t,b,ai,a,d,b,g,y,0,0,...,0,0,1,0,0,0,0,0,0,0
2,az,v,as,f,d,a,j,j,0,0,...,0,0,0,1,0,0,0,0,0,0
3,az,l,n,f,d,z,l,n,0,0,...,0,0,0,1,0,0,0,0,0,0
4,w,s,as,c,d,y,i,m,0,0,...,1,0,0,0,0,0,0,0,0,0


# Separate Independent and Dependent variables

In [208]:
# rearrange columns to make column 'y' as the last column
train_raw = train_raw.reindex(columns = [col for col in train_raw.columns if col != 'y'] + ['y'])

# select independent variables
train_raw_x = train_raw.iloc[:, :-1]

# select dependent variable
train_raw_y = train_raw.iloc[:, -1:]

In [209]:
# view independent variable
train_raw_x.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,k,v,at,a,d,u,j,o,0,0,...,0,0,1,0,0,0,0,0,0,0
1,k,t,av,e,d,y,l,o,0,0,...,1,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,x,0,0,...,0,0,0,0,0,0,1,0,0,0
3,az,t,n,f,d,x,l,e,0,0,...,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,n,0,0,...,0,0,0,0,0,0,0,0,0,0


In [210]:
# view dependent variable
train_raw_y.head()

Unnamed: 0,y
0,130.81
1,88.53
2,76.26
3,80.62
4,78.02


In [211]:
# view test dataset
test_raw.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,az,v,n,f,d,t,a,w,0,0,...,0,0,0,1,0,0,0,0,0,0
1,t,b,ai,a,d,b,g,y,0,0,...,0,0,1,0,0,0,0,0,0,0
2,az,v,as,f,d,a,j,j,0,0,...,0,0,0,1,0,0,0,0,0,0
3,az,l,n,f,d,z,l,n,0,0,...,0,0,0,1,0,0,0,0,0,0
4,w,s,as,c,d,y,i,m,0,0,...,1,0,0,0,0,0,0,0,0,0


**Check to see if the minimum and the maximum values are same or different for the independent variables**

In [212]:
# select non-object dtypes
train_raw_x_select_dtype = train_raw_x.select_dtypes(include=['int', 'float'])

# check the minimum values of the columns of the dataset
train_raw_x_min = [train_raw_x_select_dtype.min(axis = 0)]

# check to see if all values in the list are same
train_raw_x_min.count(train_raw_x_min[0]) == len(train_raw_x_min)

True

In [213]:
# select non-object dtypes
train_raw_x_select_dtype = train_raw_x.select_dtypes(include=['int', 'float'])

# check the minimum values of the columns of the dataset
train_raw_x_max = [train_raw_x_select_dtype.max(axis = 0)]

# check to see if all values in the list are same
train_raw_x_max.count(train_raw_x_max[0]) == len(train_raw_x_max)

True

**It seems that all the non-object dtype columns have the 0 as the minimum value and 1 as the maximum value. Hence no normalization needs to be performed on the training set**

In [214]:
train_raw_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 376 entries, X0 to X385
dtypes: int64(368), object(8)
memory usage: 12.1+ MB


In [215]:
test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 376 entries, X0 to X385
dtypes: int64(368), object(8)
memory usage: 12.1+ MB


**Next let us check if NA vlues or zeros are present in the dataset**

In [216]:
train_raw_x.isnull().sum().sum()

0

In [217]:
test_raw.isnull().sum().sum()

0

**Check for outliers**

since all the columns have values between 0 and 1 and the categorical values have been converted into one hot encoders, there does not seem to be any outliers.

**Check for number of unique vales in each column**

In [218]:
train_raw_x.nunique(axis=0)

X0      47
X1      27
X2      44
X3       7
X4       4
        ..
X380     2
X382     2
X383     2
X384     2
X385     2
Length: 376, dtype: int64

In [219]:
test_raw.nunique(axis=0)

X0      49
X1      27
X2      45
X3       7
X4       4
        ..
X380     2
X382     2
X383     2
X384     2
X385     2
Length: 376, dtype: int64

# Let's observe the columns which have dtype as object

In [220]:
train_raw_x_object_dtype_cols = [column for column, is_type in (train_raw_x.dtypes=="object").items() if is_type]

train_raw_x_object_dtype_cols

['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

In [221]:
test_raw_object_dtype_cols = [column for column, is_type in (test_raw.dtypes=="object").items() if is_type]

test_raw_object_dtype_cols

['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

**Let's convert the categorical values into one hot encoders**

In [222]:
# get one hot encoding of 'object_dtype_cols'
one_hot = pd.get_dummies(test_raw[['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']])

# Drop columns 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8' as it is now encoded
train_raw_dummies = train_raw.drop(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'],axis = 1)
test_raw_dummies = test_raw.drop(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'],axis = 1)

# Join the encoded data
train_raw_pre = train_raw_dummies.join(one_hot)
test_raw_pre = test_raw_dummies.join(one_hot)

In [223]:
train_raw_pre.head()

Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [224]:
test_raw_pre.head()

Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Add the y variable**

In [225]:
train_raw_pre = train_raw_x_encoded.assign(y = train_raw_y)

# view the merged dataset
train_raw_pre.head()

Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y,y
0,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,130.81
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,88.53
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,76.26
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,80.62
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,78.02


### Export the training dataset

In [226]:
# export training set
train_raw_pre.to_csv('Dataset/Pre Processed Data/train_raw_pre.csv')

# export testing set
test_raw_pre.to_csv('Dataset/Pre Processed Data/test_raw_pre.csv')