In [1]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-1.1.1-py3-none-win_amd64.whl (54.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.1.1


In [15]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import string
import xgboost as xgb

stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()

data = pd.read_csv('Train.csv')

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['text_len'] = data['text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['text'])

X_features = pd.concat([data['text_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)



count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['text'])

X_features_count = pd.concat([data['text_len'], data['punct%'], pd.DataFrame(X_count.toarray())], axis=1)

X_features_count

Unnamed: 0,text_len,punct%,0,1,2,3,4,5,6,7,...,716,717,718,719,720,721,722,723,724,725
0,31,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,24,4.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,47,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,19,5.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,42,2.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,30,3.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
612,25,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
613,20,5.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
614,13,0.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)

### Build Grid Search

In [11]:
import numpy as np

def train_GB(est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred)
    print('Est : {} / Depth: {} ---- Precision: {}/ Recall:{} / Accuracy:{}'.format(n_est, max_depth, np.round(precision, 3), np.round(recall, 3), np.round((y_pred==y_test).sum() / len(y_pred), 3)))

In [12]:
for n_est in [50, 100, 150]:
    for max_depth in [3, 7, 1, 15]:
        for lr in [0.01, 0.1, 1]:
            train_GB(n_est, max_depth, lr)

Est : 50 / Depth: 3 ---- Precision: [1.    0.684 1.    1.   ]/ Recall:[0.581 1.    0.545 0.133] / Accuracy:0.75
Est : 50 / Depth: 3 ---- Precision: [1.    0.744 0.889 0.571]/ Recall:[0.71  0.955 0.727 0.267] / Accuracy:0.79
Est : 50 / Depth: 3 ---- Precision: [1.    0.747 0.8   0.3  ]/ Recall:[0.677 0.925 0.727 0.2  ] / Accuracy:0.758
Est : 50 / Depth: 7 ---- Precision: [1.    0.691 1.    1.   ]/ Recall:[0.581 1.    0.636 0.133] / Accuracy:0.758
Est : 50 / Depth: 7 ---- Precision: [1.    0.747 1.    0.571]/ Recall:[0.71  0.97  0.727 0.267] / Accuracy:0.798
Est : 50 / Depth: 7 ---- Precision: [1.    0.729 0.8   0.375]/ Recall:[0.677 0.925 0.727 0.2  ] / Accuracy:0.758


  _warn_prf(average, modifier, msg_start, len(result))


Est : 50 / Depth: 1 ---- Precision: [1.    0.609 0.    0.   ]/ Recall:[0.452 1.    0.    0.   ] / Accuracy:0.653
Est : 50 / Depth: 1 ---- Precision: [1.   0.72 1.   1.  ]/ Recall:[0.645 1.    0.727 0.2  ] / Accuracy:0.79
Est : 50 / Depth: 1 ---- Precision: [1.    0.747 0.8   0.3  ]/ Recall:[0.677 0.925 0.727 0.2  ] / Accuracy:0.758
Est : 50 / Depth: 15 ---- Precision: [1.    0.699 0.875 0.75 ]/ Recall:[0.613 0.97  0.636 0.2  ] / Accuracy:0.758
Est : 50 / Depth: 15 ---- Precision: [1.    0.778 0.818 0.5  ]/ Recall:[0.71  0.94  0.818 0.333] / Accuracy:0.798
Est : 50 / Depth: 15 ---- Precision: [1.    0.756 1.    0.429]/ Recall:[0.71  0.97  0.818 0.2  ] / Accuracy:0.798
Est : 100 / Depth: 3 ---- Precision: [1.    0.713 1.    1.   ]/ Recall:[0.645 1.    0.727 0.133] / Accuracy:0.782
Est : 100 / Depth: 3 ---- Precision: [1.    0.75  0.889 0.444]/ Recall:[0.71  0.94  0.727 0.267] / Accuracy:0.782
Est : 100 / Depth: 3 ---- Precision: [1.    0.738 0.8   0.333]/ Recall:[0.677 0.925 0.727 0.2  ]

## Using GridSearchCV

In [17]:
from sklearn.model_selection import GridSearchCV
gb_cv = GradientBoostingClassifier()

params = {
    'n_estimators': [100, 150],
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}

gs = GridSearchCV(gb_cv, params, n_jobs=-1)
cv_fit = gs.fit(X_features, data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,17.868442,1.151098,0.034009,0.006479,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.846774,0.861789,0.796748,0.788618,0.837398,0.826265,0.028618,1
2,24.352044,0.122321,0.0358,0.012411,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.83871,0.853659,0.804878,0.780488,0.837398,0.823026,0.02657,2
4,31.345704,2.429637,0.0325,0.007637,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.830645,0.853659,0.804878,0.780488,0.829268,0.819788,0.024987,3
3,38.315841,0.910543,0.031282,0.002491,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.830645,0.845528,0.804878,0.772358,0.837398,0.818162,0.026643,4
5,39.672165,0.817392,0.022448,0.001416,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.830645,0.845528,0.813008,0.772358,0.829268,0.818162,0.025111,5


In [19]:
gs_count = GridSearchCV(gb_cv, params, n_jobs=-1)
cv_fit = gs.fit(X_features_count, data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,26.677105,1.201984,0.026191,0.002744,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.862903,0.861789,0.804878,0.796748,0.853659,0.835995,0.029017,1
2,21.40759,0.14903,0.02858,0.002656,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.862903,0.853659,0.796748,0.788618,0.861789,0.832743,0.032965,2
3,31.890856,0.700864,0.030111,0.002118,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.862903,0.853659,0.796748,0.780488,0.853659,0.829491,0.033935,3
5,39.054384,0.415899,0.028676,0.00565,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.862903,0.853659,0.804878,0.772358,0.853659,0.829491,0.035084,3
0,14.49032,0.333992,0.023926,0.004052,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.822581,0.853659,0.821138,0.788618,0.853659,0.827931,0.024266,5
