# Kaggle Challenge : E-mail classification 

**Team name : *Maxime et les garçons, à table***

**Alexis Carpier, Maxime Seince, Victor Perroux, Théau Pihouée**

# Importing libraries 

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier

# Reading files 

In [2]:
train_df = pd.read_csv('train_ml.csv', index_col=0)
test_df = pd.read_csv('test_ml.csv', index_col=0)

In [3]:
train_df.head(3)

Unnamed: 0,date,org,tld,ccs,bcced,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body,updates,personal,promotions,forums,purchases,travel,spam,social
0,"Mon, 15 Oct 2018 08:03:09 +0000 (UTC)",researchgatemail,net,0,0,multipart/alternative,4,28,0,1,47.0,25556,0,1,0,0,0,0,0,1
1,"Thu, 17 Apr 2014 09:12:33 -0700 (PDT)",no-ip,com,0,0,multipart/alternative,6,32,0,0,46.0,19930,1,1,0,0,0,0,0,0
2,"Thu, 27 Oct 2016 01:36:28 +0000",mail,goodreads.com,0,0,multipart/mixed,0,0,0,0,21.0,4,0,1,0,0,0,0,0,1


In [4]:
test_df.head(3)

Unnamed: 0,date,org,tld,ccs,bcced,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body
0,"Thu, 13 Jul 2017 08:55:57 +0000",twitter,com,0,0,multipart/alternative,7,56,0,0,67.0,36243
1,"Sun, 30 Sep 2018 14:42:12 +0000",mailer,netflix.com,0,0,multipart/alternative,5,33,0,0,27.0,27015
2,"Mon, 13 Feb 2017 10:47:00 +0530",iiitd,ac.in,0,0,text/plain,0,2,1,0,22.0,788


In [5]:
print("train_df shape:",train_df.shape)
print("test_df shape:",test_df.shape)

train_df shape: (39671, 20)
test_df shape: (17002, 12)


# Feature engineering 

During this process, we will apply the changes both to the training and testing data. 

In [6]:
# Filling missing values with zeros to make the manipulation of the data esaier. 
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

## Processing of the column 'date'

### Creating a column 'day'

In [7]:
# We create a new column
train_df['day']=np.nan
test_df['day']=np.nan
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [8]:
# For the training set : 
for i in range (len(train_df)):
    if 'Mon' in train_df.date[i]:
        train_df.day[i]=1
    elif 'Tue' in train_df.date[i]:
        train_df.day[i]=2
    elif 'Wed' in train_df.date[i]:
        train_df.day[i]=3
    elif 'Thu' in train_df.date[i]:
        train_df.day[i]=4
    elif 'Fri' in train_df.date[i]:
        train_df.day[i]=5
    elif 'Sat' in train_df.date[i]:
        train_df.day[i]=6
    elif 'Sun' in train_df.date[i]:
        train_df.day[i]=7   
    else :
        train_df.day[i]=0

# For the testing set : 
for i in range (len(test_df)):
    if 'Mon' in test_df.date[i]:
        test_df.day[i]=1
    elif 'Tue' in test_df.date[i]:
        test_df.day[i]=2
    elif 'Wed' in test_df.date[i]:
        test_df.day[i]=3
    elif 'Thu' in test_df.date[i]:
        test_df.day[i]=4
    elif 'Fri' in test_df.date[i]:
        test_df.day[i]=5
    elif 'Sat' in test_df.date[i]:
        test_df.day[i]=6
    elif 'Sun' in test_df.date[i]:
        test_df.day[i]=7   
    else :
        test_df.day[i]=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a

### Creating a column 'month'

In [9]:
#We create a new column
train_df['month']=np.nan
test_df['month']=np.nan
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [10]:
# For the training set : 
for i in range (len(train_df)):
    if 'Jan' in train_df.date[i]:
        train_df.month[i]=1
    elif 'Feb' in train_df.date[i]:
        train_df.month[i]=2
    elif 'Mar' in train_df.date[i]:
        train_df.month[i]=3
    elif 'Apr' in train_df.date[i]:
        train_df.month[i]=4
    elif 'May' in train_df.date[i]:
        train_df.month[i]=5
    elif 'Jun' in train_df.date[i]:
        train_df.month[i]=6
    elif 'Jul' in train_df.date[i]:
        train_df.month[i]=7   
    elif 'Aug' in train_df.date[i]:
        train_df.month[i]=8  
    elif 'Sep' in train_df.date[i]:
        train_df.month[i]=9  
    elif 'Oct' in train_df.date[i]:
        train_df.month[i]=10  
    elif 'Nov' in train_df.date[i]:
        train_df.month[i]=11
    elif 'Dec' in train_df.date[i]:
        train_df.month[i]=12   
    else :
        train_df.month[i]=0
        

# For the testing set : 
for i in range (len(test_df)):
    if 'Jan' in test_df.date[i]:
        test_df.month[i]=1
    elif 'Feb' in test_df.date[i]:
        test_df.month[i]=2
    elif 'Mar' in test_df.date[i]:
        test_df.month[i]=3
    elif 'Apr' in test_df.date[i]:
        test_df.month[i]=4
    elif 'May' in test_df.date[i]:
        test_df.month[i]=5
    elif 'Jun' in test_df.date[i]:
        test_df.month[i]=6
    elif 'Jul' in test_df.date[i]:
        test_df.month[i]=7   
    elif 'Aug' in test_df.date[i]:
        test_df.month[i]=8  
    elif 'Sep' in test_df.date[i]:
        test_df.month[i]=9  
    elif 'Oct' in test_df.date[i]:
        test_df.month[i]=10  
    elif 'Nov' in test_df.date[i]:
        test_df.month[i]=11
    elif 'Dec' in test_df.date[i]:
        test_df.month[i]=12   
    else :
        test_df.month[i]=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the d

### Creating a column 'year'

In [11]:
# We create a new column
train_df['year']=np.nan
test_df['year']=np.nan
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [12]:
# The mails were all sent between 2013 and 2020
# For the training set : 
for i in range (len(train_df)):
    if '2013' in train_df.date[i]:
        train_df.year[i]=1
    elif '2014' in train_df.date[i]:
        train_df.year[i]=2
    elif '2015' in train_df.date[i]:
        train_df.year[i]=3
    elif '2016' in train_df.date[i]:
        train_df.year[i]=4
    elif '2017' in train_df.date[i]:
        train_df.year[i]=5
    elif '2018' in train_df.date[i]:
        train_df.year[i]=6
    elif '2019' in train_df.date[i]:
        train_df.year[i]=7   
    elif '2020' in train_df.date[i]:
        train_df.year[i]=8  
    else :
        train_df.year[i]=0

# For the testing set : 
for i in range (len(test_df)):
    if '2013' in test_df.date[i]:
        test_df.year[i]=1
    elif '2014' in test_df.date[i]:
        test_df.year[i]=2
    elif '2015' in test_df.date[i]:
        test_df.year[i]=3
    elif '2016' in test_df.date[i]:
        test_df.year[i]=4
    elif '2017' in test_df.date[i]:
        test_df.year[i]=5
    elif '2018' in test_df.date[i]:
        test_df.year[i]=6
    elif '2019' in test_df.date[i]:
        test_df.year[i]=7   
    elif '2020' in test_df.date[i]:
        test_df.year[i]=8     
    else :
        test_df.year[i]=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFram

### Adding the hour of the day

We convert the hour of each day in seconds and store the values in the already existing column "date". 

In [13]:
dates = train_df['date'].to_list()
for i, date in enumerate(dates) :
    for j, caracter in enumerate(date) :
        if caracter == ':' :
            print(train_df['date'][i][j-2:j])
            train_df['date'][i] = int(train_df['date'][i][j-2:j])*3600 + int(train_df['date'][i][j+1:j+3])*60 + int(train_df['date'][i][j+4:j+5])
            break

08
09
01
08
10
12
14
19
12
09
14
13
15
03
02
02
06
14
14
20
12
16
22
18
18
12
12
23
12
14
05
05
14
09
10
05
13
12
17
21
08
23
13
12
15
11
21
14
16
20
23
14
10
03
20
22
17
02
15
14
08
13
01
09
22
03
12
01
16
12
10
17
11
00
17
17
11
07
07
14
11
03
14
06
17
13
16
08
13
09
08
14
11
05
00
15
23
17
22
10
17
06
00
18
04
11
17
13
19
02
22
01
10
12
17
04
11
21
14
10
13
16
11
10
10
12
18
09
22
00
08
19
06
17
21
07
21
17
18
10
21
17
18
11
05
17
23
07
19
06
09
07
04
04
14
12
11
12
16
10
15
03
13
08
14
01
03
17
19
08
14
12
15
16
10
19
22
19
11
15
09
19
15
01
11
07
12
20
22
07
19
18
18
11
16
00
18
12
08
10
00
03
08
12
12
17
14
10
15
11
05
15
18
09
20
08
18
03
14
14
16
21
18
21
03
04
16
15
11
08
02
04
00
09
03
07
20
11
20
00
14
04
10
18
16
10
10
06
11
20
08
17
18
17
07
20
11
11
11
18
23
04
13
11
23
21
11
16
08
17
13
01
21
13
12
22
23
21
15
09
02
14
12
15
01
10
04
04
07
16
00
00
23
20
15
13
06
09
08
07
14
15
14
00
15
09
14
22
01
05
11
18
16
13
18
02
20
08
20
19
03
16
12
16
02
15
11
18
00
17
05
16
09
2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  



07
01
12
15
15
15
16
14
11
13
17
18
17
07
05
05
17
08
00
08
01
00
15
09
21
19
10
14
07
18
00
09
14
06
12
12
13
04
23
17
10
09
21
18
09
22
02
07
12
09
18
06
13
01
21
22
17
08
09
11
12
15
17
10
07
22
15
08
12
16
18
09
09
13
02
13
21
04
17
05
06
13
05
00
16
16
12
11
15
14
10
15
10
02
23
05
20
22
14
06
15
22
19
11
18
12
04
19
06
18
00
09
11
08
18
01
07
12
05
16
16
04
18
14
23
04
02
01
15
00
23
13
22
01
10
10
18
23
05
09
05
00
09
06
10
18
18
13
12
15
11
21
12
11
07
04
17
10
12
16
03
06
02
02
16
09
07
19
23
22
12
09
04
12
10
04
18
06
07
15
02
07
06
06
17
11
06
11
15
12
07
09
14
13
12
04
17
08
13
09
06
23
03
14
14
13
22
04
17
22
13
05
00
14
02
10
03
05
04
09
04
12
15
17
10
14
14
13
10
15
05
09
18
19
09
10
13
18
08
08
09
19
02
07
12
09
12
05
03
09
09
20
16
08
00
04
07
14
08
01
23
14
07
13
03
05
00
14
10
15
07
08
16
11
09
12
14
16
13
07
07
06
19
18
08
12
08
17
16
04
19
05
23
12
18
12
08
11
08
05
07
05
12
12
17
04
13
07
03
21
14
11
08
15
20
09
16
23
16
00
06
15
11
09
09
10
14
18
06
19
20
22
12


In [14]:
dates = test_df['date'].to_list()
for i, date in enumerate(dates) :
    for j, caracter in enumerate(date) :
        if caracter == ':' :
            print(test_df['date'][i][j-2:j])
            test_df['date'][i] = int(test_df['date'][i][j-2:j])*3600 + int(test_df['date'][i][j+1:j+3])*60 + int(test_df['date'][i][j+4:j+5])
            break

08
14
10
09
01
03
23
15
23
10
08
11
15
08
00
16
03
11
11
13
17
01
16
07
15
18
13
04
18
05
09
23
15
12
14
12
13
11
09
12
20
18
15
18
10
00
03
09
13
19
08
14
04
15
10
02
03
10
00
06
03
04
17
03
23
15
12
12
15
11
00
19
05
01
21
06
14
14
06
16
16
13
21
15
01
09
14
04
08
10
14
12
14
00
18
10
18
06
01
03
05
10
14
00
21
20
12
22
07
05
11
16
22
18
06
15
06
13
15
15
14
23
14
11
16
19
18
14
16
09
01
09
15
03
12
17
05
15
11
05
12
13
16
13
16
11
16
00
09
08
06
14
17
05
15
00
07
08
04
08
18
05
01
09
08
09
03
21
19
07
00
11
12
09
23
00
18
11
09
12
15
10
00
10
11
02
19
12
18
16
15
02
09
12
08
00
22
15
15
14
12
15
21
08
03
16
13
03
09
06
09
22
22
13
15
03
06
15
10
13
20
10
08
03
09
22
08
18
07
11
01
09
08
13
17
08
20
07
17
21
07
15
18
19
11
04
09
08
14
11
03
11
09
14
10
17
01
14
17
01
14
03
01
13
21
06
20
12
16
04
18
09
21
14
13
09
20
09
06
08
09
05
09
14
05
16
12
09
00
16
01
10
01
18
07
09
18
12
15
04
05
16
03
17
18
16
06
07
10
09
12
21
23
09
17
08
15
07
08
12
09
09
11
07
17
19
00
06
05
19
05
21
17
1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  



04
17
04
09
03
13
06
02
05
00
11
08
15
05
11
09
09
18
12
05
01
12
14
21
01
11
10
22
22
13
00
07
19
07
21
12
14
04
20
19
17
16
07
11
01
20
11
10
12
14
09
17
11
11
03
13
06
17
13
12
09
18
04
00
16
11
21
09
13
23
15
16
08
09
08
03
23
10
10
11
19
19
14
10
01
10
06
11
10
08
04
20
13
05
16
11
09
05
12
17
01
12
19
13
23
05
16
14
12
09
21
10
23
22
10
09
16
16
23
12
13
16
12
12
03
11
17
09
15
10
21
16
10
14
22
23
09
01
02
08
11
13
05
00
11
06
14
02
05
11
08
12
17
14
19
04
19
17
00
16
16
14
22
00
04
12
00
18
22
19
10
01
08
05
10
06
05
11
05
08
21
10
16
06
16
11
17
09
16
08
10
04
17
04
10
23
05
17
23
12
15
05
18
00
18
22
16
16
17
14
17
00
13
14
17
16
08
07
06
11
07
13
17
21
17
12
08
15
19
18
15
08
07
00
10
00
12
19
06
06
08
22
12
19
04
13
10
23
15
07
11
22
07
21
12
21
13
11
07
00
03
09
11
10
11
04
17
20
13
00
01
00
16
15
04
03
14
17
12
05
23
14
00
10
23
04
10
18
02
11
16
22
15
04
07
02
16
07
09
18
15
06
18
16
07
15
16
08
06
13
05
13
10
11
00
16
16
17
02
01
23
14
08
10
11
23
19
21
01
13
19
07
11


## Encoding 'mail_type', 'org', and 'tld'

Let's see what labels are contained in mail_type: 

In [15]:
train_df['mail_type'].value_counts()

multipart/alternative    28194
text/html                 5588
multipart/mixed           3384
text/plain                1397
multipart/related          902
0                           98
text/html                   40
multipart/signed            23
Multipart/Mixed             21
multipart/report            12
Text/Html                    6
text/calendar                2
Multipart/Alternative        2
multipart/IDM                1
text/HTML                    1
Name: mail_type, dtype: int64

We can see that some mail types are the same, but are yet considered different because of the way they are written. For example : text/html; text/html; Text/Html; text/HTML represent the same type. To solve this issue, we will capitalize all labels from this column. 

We will apply the same process for tld and org for the same reason : 

In [16]:
# TRAINING 

## We capitalize the column org
for i in range (len(train_df)):
    org_label = train_df.iloc[i,1]
    if org_label != 0:    
        train_df.iloc[i,1] = org_label.upper()

## We capitalize the column tld
for i in range (len(train_df)):
    tld_label = train_df.iloc[i,2]
    if tld_label != 0:    
        train_df.iloc[i,2] = tld_label.upper()

## We capitalize the column mail_type
for i in range (len(train_df)):
    mail_label = train_df.iloc[i,5]
    if mail_label != 0:    
        train_df.iloc[i,5]=mail_label.upper()

In [17]:
# TESTING 

## We capitalize the column org
for i in range (len(test_df)):
    org_label = test_df.iloc[i,1]
    if org_label != 0:    
        test_df.iloc[i,1] = org_label.upper()

## We capitalize the column tld
for i in range (len(test_df)):
    tld_label = test_df.iloc[i,2]
    if tld_label != 0:    
        test_df.iloc[i,2] = tld_label.upper()

## We capitalize the column mail_type
for i in range (len(test_df)):
    mail_label = test_df.iloc[i,5]
    if mail_label != 0:    
        test_df.iloc[i,5]=mail_label.upper()

We encode the three columns using the get_dummies method. We then drop the categorical columns (the old ones). 

In [18]:
# Encoding org
df_dummies_train = pd.get_dummies(train_df['org'], prefix='org')
train_df = pd.concat([train_df, df_dummies_train],axis=1)
train_df.drop(['org'],axis=1, inplace=True)

df_dummies_test = pd.get_dummies(test_df['org'], prefix='org')
test_df = pd.concat([test_df, df_dummies_test],axis=1)
test_df.drop(['org'],axis=1, inplace=True)

# Encoding tld
df_dummies_train = pd.get_dummies(train_df['tld'], prefix='tld')
train_df = pd.concat([train_df, df_dummies_train],axis=1)
train_df.drop(['tld'],axis=1, inplace=True)

df_dummies_test = pd.get_dummies(test_df['tld'], prefix='tld')
test_df = pd.concat([test_df, df_dummies_test],axis=1)
test_df.drop(['tld'],axis=1, inplace=True)

# Encoding mail_type
train_feat = pd.get_dummies(train_df['mail_type'], prefix='mail_type')
train_df=pd.concat([train_df,train_feat],axis=1)
train_df.drop(['mail_type'],axis=1, inplace=True)

test_feat = pd.get_dummies(test_df['mail_type'], prefix='mail_type')
test_df=pd.concat([test_df,test_feat],axis=1)
test_df.drop(['mail_type'],axis=1, inplace=True)

## Spliting the data 

We separate the explanatory variables (train_x) from the target (train_y). 

In [19]:
train_y = train_df[['updates', 'personal', 'promotions', 'forums', 'purchases', 'travel', 'spam', 'social']]

train_x = train_df.drop(['updates', 'personal', 'promotions', 'forums', 'purchases', 'travel', 'spam', 'social'], axis=1)
train_x = train_x.fillna(value='None') #Filling missing values with 'None' will cause the random forest to ignore these values.

test_x = test_df

## Further selection of features : Reduction of the number of features

### We only keep those who are both in train_df and test_df (at the intersection)

For the moment, the training and testing datasets have a diffrerent number of features, because we encoded all the labels from the columns 'org' and 'tld' for both datasets. 

In [20]:
print('The number of features in the training set is ',len(train_x.columns))
print('The number of features in the testing set is ',len(test_x.columns))

The number of features in the training set is  1252
The number of features in the testing set is  970


In order to apply a machine learning algorithm, such as a random forest classifier, both datasets must have the same number of features. Thus we will only keep the intersection of features (those who are present in both the training set and the testing set). 

In [21]:
for i in test_x.columns:
    if i not in train_x.columns:
        test_x.drop([i],axis=1,inplace=True)
for i in train_x.columns:
    if i not in test_x.columns:
        train_x.drop([i],axis=1,inplace=True)

In [22]:
print('The number of features in the training set is : ',len(train_x.columns))
print('The number of features in the testing set is : ',len(test_x.columns))

The number of features in the training set is :  828
The number of features in the testing set is :  828


### Visualizing the most important features and removing the less important features

We train a random forest classifier to then apply the feature_importance method, to have information about the relevance of all the features that are used. 

In [23]:
clf = RandomForestClassifier(random_state=32, max_depth=40, n_estimators=400, min_samples_split=3)
clf.fit(train_x, train_y)

RandomForestClassifier(max_depth=40, min_samples_split=3, n_estimators=400,
                       random_state=32)

Applying the feature_importance method and sorting the features in descending order of importance: 

In [24]:
importances = pd.DataFrame({'feature':train_x.columns,'importance':np.round(clf.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
importances.reset_index(inplace=True)
importances.head(10)

Unnamed: 0,feature,importance
0,year,0.12
1,chars_in_body,0.075
2,urls,0.073
3,images,0.061
4,chars_in_subject,0.058
5,date,0.042
6,org_IIITD,0.042
7,tld_AC.IN,0.039
8,ccs,0.035
9,month,0.034


We can now see that the year is the most important feature. The number of characters in the body and in the subject, the number of urls, images, are also important features. 

We will now build a list of the most irrelevent features based on the condition (importance < 0.002). 

In [25]:
irrelevent_features = []
for k in range(importances.shape[0]):
    if importances.iloc[k,1] < 0.002:
        irrelevent_features.append(importances.iloc[k,0])
len(irrelevent_features)

774

As these features are not important according to our decision tree, we will get rid of them to focus only on the features that matter. 

In [26]:
train_x.drop(irrelevent_features, axis=1, inplace=True)
test_x.drop(irrelevent_features, axis=1, inplace=True)
print("Number of features of train_x: ", train_x.shape[1])
print("Number of features of test_x: ", test_x.shape[1])

Number of features of train_x:  54
Number of features of test_x:  54


Thus, the final number of features that we use is 54. 

# Model tuning and prediction 

## Model tuning: finding the best hyperparameters 

We will use a grid search to find the best hyperparameters among values that were chosen after running random forests with various hyperparamaters and seeing which one were performing better. 

In [27]:
rfc = RandomForestClassifier(random_state=42)

We will tune three hyperparameters : the number of estimators (trees), the minimum number of samples used to make a split, and the maximum depth of each tree. 

In [28]:
param_grid = { 
    'n_estimators': [200,300,400],
    'min_samples_split': [2,3,4],
    'max_depth': [25,30,40]
}

In [29]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(train_x, train_y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [25, 30, 40],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [200, 300, 400]})

We can now see what are the optimal hyperparameters: 

In [30]:
CV_rfc.best_params_

{'max_depth': 40, 'min_samples_split': 2, 'n_estimators': 400}

## Random forest Classifier 

In [31]:
clf = RandomForestClassifier(random_state=42, max_depth=40, n_estimators=400, min_samples_split=2) 
prediction = clf.fit(train_x, train_y).predict_proba(test_x)

Because the prediction is not in the right format, we need to reshape it before submitting our results. 

In [32]:
list_preds_label=[]
A=prediction
for i in range (len(prediction[0])):
#On crée le tableau de position i
    L=[]
    for j in A:
        L.append(j[i][1])
    list_preds_label.append(L)

We can now save the prediction file and submit it to Kaggle. 

In [33]:
pred_df = pd.DataFrame(list_preds_label, columns=['updates', 'personal', 'promotions',
                        'forums', 'purchases', 'travel',
                        'spam', 'social'])
pred_df.to_csv('random_forest_sample_submission.csv', index=True, index_label='Id')