In [2]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()

data = pd.read_csv('Train.csv')

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['text_len'] = data['text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['text'])

X_features = pd.concat([data['text_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_features.head()

Unnamed: 0,text_len,punct%,0,1,2,3,4,5,6,7,...,716,717,718,719,720,721,722,723,724,725
0,31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24,4.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,19,5.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,42,2.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# # Explore RandomForestClassifier Attributes and Hyperparameters

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
#cross validation
from sklearn.model_selection import KFold, cross_val_score



In [8]:
rf = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits=5)
cross_val_score(rf, X_features, data['label'], cv=k_fold, scoring=('neg_log_loss'), n_jobs=-1)

array([-0.38562944, -0.47569035, -0.73065558, -0.88361884, -0.6136559 ])

# # Explore RandomForestClassifier through Holdout Set

In [9]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)

In [11]:
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

In [12]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:10]

[(0.15800255162770488, 20),
 (0.06765632197219927, 618),
 (0.05757212773063001, 'punct%'),
 (0.04965645252718851, 74),
 (0.049437655155164906, 253),
 (0.0366466890994257, 701),
 (0.03340001167078495, 633),
 (0.032912984712066724, 'text_len'),
 (0.025141719635624976, 21),
 (0.02098783964033881, 159)]

In [13]:
y_pred = rf_model.predict(X_test)

precision, recall, fscore, support = score(y_test, y_pred)

In [15]:
import numpy as np
print('Precision: {} / Recall: {} / Accuracy: {}'.format(np.round(precision, 3), np.round(recall, 3), np.round((y_pred==y_test).sum() / len(y_pred), 3)))

Precision: [0.95  0.787 0.857 1.   ] / Recall: [0.679 1.    0.667 0.231] / Accuracy: 0.823


In [17]:
#Calculate Log Loss
#y_pred_log_loss = rf_model.predict(X_test)
#the_log_loss = log_loss(y_test, y_pred)
#log loss works with float, int values

#f1 score is not the best metric if you are interested in the results of all the classes

# # Explore Random Forest model with grid-search

In [18]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred)
    print('Est : {} / Depth: {} ---- Precision: {}/ Recall:{} / Accuracy:{}'.format(n_est, depth, np.round(precision, 3), np.round(recall, 3), np.round((y_pred==y_test).sum() / len(y_pred), 3)))

In [19]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)

Est : 10 / Depth: 10 ---- Precision: [0.875 0.712 1.    1.   ]/ Recall:[0.5   1.    0.111 0.231] / Accuracy:0.742


  _warn_prf(average, modifier, msg_start, len(result))


Est : 10 / Depth: 20 ---- Precision: [1.    0.747 0.833 0.   ]/ Recall:[0.679 1.    0.556 0.   ] / Accuracy:0.79
Est : 10 / Depth: 30 ---- Precision: [0.905 0.802 0.75  1.   ]/ Recall:[0.679 0.986 0.667 0.308] / Accuracy:0.823
Est : 10 / Depth: None ---- Precision: [0.792 0.812 0.857 0.625]/ Recall:[0.679 0.932 0.667 0.385] / Accuracy:0.798
Est : 50 / Depth: 10 ---- Precision: [1.    0.705 1.    1.   ]/ Recall:[0.393 1.    0.556 0.231] / Accuracy:0.75
Est : 50 / Depth: 20 ---- Precision: [1.    0.779 1.    1.   ]/ Recall:[0.714 1.    0.667 0.231] / Accuracy:0.831
Est : 50 / Depth: 30 ---- Precision: [0.905 0.793 0.857 1.   ]/ Recall:[0.679 0.986 0.667 0.308] / Accuracy:0.823
Est : 50 / Depth: None ---- Precision: [0.905 0.8   0.857 0.667]/ Recall:[0.679 0.973 0.667 0.308] / Accuracy:0.815
Est : 100 / Depth: 10 ---- Precision: [1.    0.725 1.    1.   ]/ Recall:[0.643 1.    0.222 0.154] / Accuracy:0.774
Est : 100 / Depth: 20 ---- Precision: [0.95 0.76 1.   1.  ]/ Recall:[0.679 0.986 0.55

# # Evaluate Reandom Forest with grid search CV

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['text'])
X_count_feat = pd.concat([data['text_len'], data['punct%'], pd.DataFrame(X_count.toarray())], axis=1)
X_count_feat

Unnamed: 0,text_len,punct%,0,1,2,3,4,5,6,7,...,716,717,718,719,720,721,722,723,724,725
0,31,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,24,4.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,47,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,19,5.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,42,2.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,30,3.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
612,25,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
613,20,5.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
614,13,0.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Exploring paramter settings using GridSearchCV



In [29]:
from sklearn.model_selection import GridSearchCV

In [33]:
param = {'n_estimators': [10, 150, 300], 
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_features, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,2.932633,0.209115,0.388497,0.144186,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.887097,0.878049,0.813008,0.829268,0.837398,0.848964,0.028686,1
11,2.598106,0.212656,0.210724,0.043105,,300,"{'max_depth': None, 'n_estimators': 300}",0.879032,0.878049,0.813008,0.829268,0.837398,0.847351,0.026652,2
10,1.946061,0.152804,0.223847,0.054851,,150,"{'max_depth': None, 'n_estimators': 150}",0.870968,0.878049,0.813008,0.821138,0.845528,0.845738,0.025911,3
4,1.760731,0.261524,0.29621,0.029504,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.879032,0.878049,0.813008,0.813008,0.845528,0.845725,0.029309,4
5,3.125444,0.366913,0.301281,0.059903,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.870968,0.869919,0.804878,0.829268,0.837398,0.842486,0.025214,5


In [31]:
rf_count = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300], 
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf_count, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_feat, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.920938,0.012719,0.048271,0.006451,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.895161,0.878049,0.804878,0.829268,0.845528,0.850577,0.03259,1
11,1.718996,0.257955,0.04867,0.013356,,300,"{'max_depth': None, 'n_estimators': 300}",0.887097,0.878049,0.813008,0.821138,0.845528,0.848964,0.029594,2
5,1.770268,0.016832,0.080983,0.003302,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.879032,0.878049,0.813008,0.821138,0.837398,0.845725,0.027923,3
7,0.912361,0.01227,0.045278,0.002239,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.879032,0.869919,0.813008,0.821138,0.845528,0.845725,0.02596,3
8,1.972726,0.179111,0.08238,0.016249,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.870968,0.878049,0.813008,0.821138,0.837398,0.844112,0.026128,5
