### Main steps of this notebook:
- reading data
- dropping unimportant features
- checking null values and creating features from them
- downcasting some features for efficiency
- encoding some features using one-hot-encoding


In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [2]:
# Load the training and validation sets; both files are semicolon-delimited
# and are read in one pass (low_memory=False) so column dtypes are inferred
# from the whole file.
df, df_va = (
    pd.read_csv(csv_path, sep=';', low_memory=False)
    for csv_path in ('training.csv', 'validation.csv')
)

In [3]:
# Show (rows, columns) for the training frame, then for the validation frame.
print(df.shape, df_va.shape, sep='\n')

(3700, 19)
(200, 19)


##### Merge the validation data with the training data so the same processing is applied to both, then separate them again when we finish

In [4]:
# Stack the validation rows under the training rows so that every
# preprocessing step below is applied identically to both sets.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement. ignore_index=True gives the merged
# frame a unique 0..n-1 index instead of repeating the validation labels.
df = pd.concat([df, df_va], ignore_index=True)

In [5]:
# Sanity check: the merged frame should hold training rows + validation rows.
df.shape

(3900, 19)

In [6]:
# List the column names of the merged frame.
df.columns

Index(['variable1', 'variable2', 'variable3', 'variable4', 'variable5',
       'variable6', 'variable7', 'variable8', 'variable9', 'variable10',
       'variable11', 'variable12', 'variable13', 'variable14', 'variable15',
       'variable17', 'variable18', 'variable19', 'classLabel'],
      dtype='object')

#### After analysing the data, I found:
- In the training data, variable19 is identical to the target variable, so we have to drop it; otherwise the model will learn to just predict from this variable and perform badly on the validation data.
- 'variable6', 'variable7' and 'variable12' are not important: models get the same performance without them, so we don't need them.

In [7]:
# Columns excluded from modelling (see the analysis notes above); they are
# removed from the merged frame in a single call.
to_drop = ['variable6', 'variable7', 'variable12', 'variable19']
df = df.drop(columns=to_drop)

#### take a look at our remaining variables

In [8]:
# Preview the first rows after dropping the columns listed in `to_drop`.
df.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable8,variable9,variable10,variable11,variable13,variable14,variable15,variable17,variable18,classLabel
0,a,1792,0.00054,u,g,175,f,t,1,g,80.0,5,800000.0,t,no.
1,b,1692,0.00335,y,p,29,f,f,0,s,200.0,0,2000000.0,,no.
2,b,3125,1125.0,u,g,0,f,t,1,g,96.0,19,960000.0,t,no.
3,a,4817,1335.0,u,g,335,f,f,0,g,0.0,120,0.0,,no.
4,b,3233,35.0,u,g,5,f,f,0,g,232.0,0,2320000.0,f,no.


#### how many null values we have in each column

In [9]:
# Per-column count of missing values.
df.isna().sum()

variable1       42
variable2       42
variable3        0
variable4       66
variable5       66
variable8        0
variable9        0
variable10       0
variable11       0
variable13       0
variable14     103
variable15       0
variable17     103
variable18    2256
classLabel       0
dtype: int64

##### Because some columns have many null values, let's add a null-indicator feature for each of those columns, since that helps the model learn

In [10]:
# For every column that contains missing data, add a boolean `<column>_na`
# flag marking the rows where the value was null.
for col in ('variable1', 'variable2', 'variable4', 'variable5',
            'variable14', 'variable17', 'variable18'):
    df[f'{col}_na'] = df[col].isna()

#### These variables look like float/int numbers to me but are represented as objects, so convert them to float/int; I tried both approaches and this one gives the better result

In [11]:
def _to_numeric_column(series, decimal_comma=False, fill_missing=None, downcast=None):
    """Parse a column holding numbers stored as text into a numeric Series.

    Replaces the four successive redefinitions of a `rep` helper, each of
    which shadowed the previous one, with a single parameterised function.

    Parameters
    ----------
    series : pd.Series
        Column to convert; its values are stringified before parsing.
    decimal_comma : bool, default False
        Replace ',' with '.' before parsing, for values written with a
        decimal comma.
    fill_missing : scalar, optional
        Value substituted for entries that are missing after parsing.
        ``None`` leaves them as NaN.
    downcast : str, optional
        Forwarded to ``pd.to_numeric(..., downcast=...)`` once missing
        values have been filled.

    Returns
    -------
    pd.Series
        The parsed numeric column.

    Raises
    ------
    ValueError
        If a value cannot be parsed as a number (raised by ``pd.to_numeric``).
    """
    text = series.astype(str)
    if decimal_comma:
        # Literal (non-regex) replacement, applied to the whole column at once.
        text = text.str.replace(',', '.', regex=False)
    # Missing values were stringified to 'nan', which to_numeric parses back
    # to NaN; they are then filled numerically instead of by text substitution.
    numeric = pd.to_numeric(text)
    if fill_missing is not None:
        numeric = numeric.fillna(fill_missing)
    if downcast is not None:
        numeric = pd.to_numeric(numeric, downcast=downcast)
    return numeric


# variable8: decimal-comma text -> float32.
df['variable8'] = _to_numeric_column(df['variable8'], decimal_comma=True).astype(np.float32)

# variable17: missing values become 0, then the smallest fitting integer dtype is used.
df['variable17'] = _to_numeric_column(df['variable17'], fill_missing=0, downcast='integer')

# variable2: decimal-comma text, missing values become 0.
df['variable2'] = _to_numeric_column(df['variable2'], decimal_comma=True, fill_missing=0)

# variable3: decimal-comma text.
df['variable3'] = _to_numeric_column(df['variable3'], decimal_comma=True)

#### Check the type of each column and the number of unique values in each column to decide whether to use one-hot encoding or label encoding on it

In [12]:
# Confirm the column dtypes after the numeric conversions above.
df.dtypes

variable1         object
variable2        float64
variable3        float64
variable4         object
variable5         object
variable8        float32
variable9         object
variable10        object
variable11         int64
variable13        object
variable14       float64
variable15         int64
variable17         int32
variable18        object
classLabel        object
variable1_na        bool
variable2_na        bool
variable4_na        bool
variable5_na        bool
variable14_na       bool
variable17_na       bool
variable18_na       bool
dtype: object

In [13]:
# Count the distinct values per column to choose an encoding for each one.
df.nunique()

variable1          2
variable2        350
variable3        215
variable4          3
variable5          3
variable8        132
variable9          2
variable10         2
variable11        23
variable13         3
variable14       170
variable15       240
variable17       170
variable18         2
classLabel         2
variable1_na       2
variable2_na       2
variable4_na       2
variable5_na       2
variable14_na      2
variable17_na      2
variable18_na      2
dtype: int64

#### Since these variables have only 2 unique values, a single 0/1 column is enough for each of them; the other low-cardinality features (<= 3 unique values) are one-hot encoded further below, because label encoding may not capture some relations

In [14]:
# Two-valued columns (including the target) that are encoded as 0/1 integer codes.
to_binary = ['variable9', 'variable10', 'classLabel']

In [15]:
# Show the distinct values of each binary column, one array per line.
print(*(df[col].unique() for col in to_binary), sep='\n')

['f' 't']
['t' 'f']
['no.' 'yes.']


In [16]:
# Encode binary features with fixed 0/1 codes.
# pd.factorize numbers categories in order of first appearance, so its codes
# silently change if the rows are reordered, and it maps missing values to -1,
# which wraps to 255 when cast to uint8. Explicit tables pin the encoding; the
# codes below are the ones first-appearance order yields on this data (see the
# unique() listing printed in the previous cell).
binary_codes = {
    'variable9': {'f': 0, 't': 1},
    'variable10': {'t': 0, 'f': 1},
    'classLabel': {'no.': 0, 'yes.': 1},
}
for col in to_binary:
    encoded = df[col].map(binary_codes[col])
    # Values outside the table (or nulls) map to NaN; fail loudly instead of
    # writing a corrupted code into the feature or the target.
    if encoded.isna().any():
        raise ValueError(f"unexpected or missing value in binary column {col!r}")
    df[col] = encoded.astype(np.uint8)

In [17]:
# Check the column dtypes again after the binary encoding.
df.dtypes

variable1         object
variable2        float64
variable3        float64
variable4         object
variable5         object
variable8        float32
variable9          uint8
variable10         uint8
variable11         int64
variable13        object
variable14       float64
variable15         int64
variable17         int32
variable18        object
classLabel         uint8
variable1_na        bool
variable2_na        bool
variable4_na        bool
variable5_na        bool
variable14_na       bool
variable17_na       bool
variable18_na       bool
dtype: object

In [18]:
# Downcast three numeric columns to narrower dtypes in a single astype call.
# NOTE(review): float16 only represents integers exactly up to 2048 and
# overflows above 65504 -- confirm variable14's value range fits.
df = df.astype({
    'variable11': np.uint8,
    'variable14': np.float16,
    'variable15': np.uint32,
})

#### use one-hot-encoding on those features

In [19]:
# Categorical columns to expand into one indicator column per category.
one_hot = ['variable1', 'variable4', 'variable5', 'variable13', 'variable18']

In [20]:
# Announce each column being encoded, then expand all of them in one call:
# get_dummies drops the source columns and appends their
# `<column>_<category>` indicator columns at the end, in list order.
print(*one_hot, sep='\n')
df = pd.get_dummies(df, columns=one_hot)

variable1
variable4
variable5
variable13
variable18


In [21]:
# Replace every remaining missing value with the sentinel -999.
df = df.fillna(-999)

#### check that our features are ready for training and split them again to train/validation and save them

In [22]:
# Final dtype check: every remaining column should be numeric or boolean.
df.dtypes

variable2        float64
variable3        float64
variable8        float32
variable9          uint8
variable10         uint8
variable11         uint8
variable14       float16
variable15        uint32
variable17         int32
classLabel         uint8
variable1_na        bool
variable2_na        bool
variable4_na        bool
variable5_na        bool
variable14_na       bool
variable17_na       bool
variable18_na       bool
variable1_a        uint8
variable1_b        uint8
variable4_l        uint8
variable4_u        uint8
variable4_y        uint8
variable5_g        uint8
variable5_gg       uint8
variable5_p        uint8
variable13_g       uint8
variable13_p       uint8
variable13_s       uint8
variable18_f       uint8
variable18_t       uint8
dtype: object

In [23]:
# Split the processed frame back into its training and validation parts.
# The validation rows were appended last, so they form the tail of `df`. The
# boundary is derived from the row counts instead of a hard-coded 3700, which
# would silently mis-split if either input file changed size.
# NOTE: relies on no rows having been added or dropped since the merge, and on
# `df_va` still having the validation set's row count at this point.
n_train = len(df) - len(df_va)
df_tr = df.iloc[:n_train]
df_va = df.iloc[n_train:]

In [24]:
# Show the (rows, columns) of both splits on one line.
print(*(part.shape for part in (df_tr, df_va)))

(3700, 30) (200, 30)


In [25]:
# Persist both processed splits as CSV files, without writing the index column.
df_tr.to_csv(path_or_buf='df_tr.csv', index=False)
df_va.to_csv(path_or_buf='df_va.csv', index=False)