In [36]:
import pandas as pd
import nltk
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn import pipeline
from sklearn import metrics
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

Reading and Understanding data

In [10]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("sample_submission.csv")

In [11]:
pprint(vars(train))

{'_data': BlockManager
Items: Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')
Axis 1: RangeIndex(start=0, stop=7613, step=1)
IntBlock: slice(0, 8, 4), 2 x 7613, dtype: int64
ObjectBlock: slice(1, 4, 1), 3 x 7613, dtype: object,
 '_is_copy': None,
 '_item_cache': {}}


In [12]:
#train.head()
train["text"][1000]

'#OVOFest Drake straight body bagging Meek on that OVO stage. #ZIPHIMUP!'

In [13]:
sample.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


### Try k-nearest neighbour on the data

In [14]:
trainer,valid = train_test_split(train,
                                                  random_state=42, 
                                                  test_size=0.2, shuffle=True)

In [15]:
trainer.shape

(6090, 5)

<h2>Use tf-idf on the text</h2>

In [16]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

In [17]:
tfv.fit(list(trainer.text.values) + list(valid.text.values))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=3, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=1, stop_words='english', strip_accents='unicode',
                sublinear_tf=1, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=1, vocabulary=None)

In [18]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [19]:
xtrain_tfv =  tfv.transform(trainer.text.values) 
xvalid_tfv = tfv.transform(valid.text.values)
pprint(vars(xtrain_tfv))

{'_shape': (6090, 9229),
 'data': array([0.33437892, 0.25069802, 0.06115396, ..., 0.31132424, 0.33215319,
       0.26934142]),
 'indices': array([8605, 8604, 7909, ..., 2824,  525,  524], dtype=int32),
 'indptr': array([    0,    15,    21, ..., 76789, 76810, 76825], dtype=int32),
 'maxprint': 50}


<h2>Use Logistic Regression and linear svc on data</h2>

In [108]:
#clf = KNeighborsClassifier(n_neighbors=5)
#help(LogisticRegression)
clf = LogisticRegression(C=2)
clf.fit(xtrain_tfv, trainer.target)
print("Number of features used:", np.sum(clf.coef_ != 0))

Number of features used: 9193


In [109]:
#predictions = clf.predict_proba(xvalid_tfv)
train_predictions =  clf.predict(xtrain_tfv)
predictions =  clf.predict(xvalid_tfv)
print("Test set predictions:", predictions)

Test set predictions: [0 0 0 ... 1 1 1]


In [110]:
print ("f1_score on training: %0.3f " % f1_score(trainer.target.values, train_predictions))
print ("f1_score on test: %0.3f " % f1_score(valid.target.values, predictions))

f1_score on training: 0.870 
f1_score on test: 0.744 


In [61]:
print("Training set accuracy: {:.2f}".format(clf.score(xtrain_tfv, trainer.target)))
print("Test set accuracy: {:.2f}".format(clf.score(xvalid_tfv, valid.target)))

Training set accuracy: 0.93
Test set accuracy: 0.78


In [59]:
train_predictions = clf.predict(xtrain_tfv)
print ("f1_score on training: %0.3f " % f1_score(trainer.target.values, train_predictions))

f1_score on training: 0.916 


In [60]:
#print ("logloss: %0.3f " % multiclass_logloss(valid.target.values, predictions))
print ("f1_score on test: %0.3f " % f1_score(valid.target.values, predictions))
#f1 score should be made as close to 1 as possible

f1_score on test: 0.727 


In [28]:
valid.target.values.shape[0]

1523

In [74]:
#help(LinearSVC)
sv_clf = LinearSVC(C=0.38)
sv_clf.fit(xtrain_tfv, trainer.target)

LinearSVC(C=0.38, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [75]:
sv_predictions = sv_clf.predict(xvalid_tfv)
sv_train_predictions = sv_clf.predict(xtrain_tfv)


In [76]:
print ("f1_score on training: %0.3f " % f1_score(trainer.target.values, sv_train_predictions))
print ("f1_score on test: %0.3f " % f1_score(valid.target.values, sv_predictions))

f1_score on training: 0.896 
f1_score on test: 0.741 


<h2>Use Naive bayes on the data</h2>

In [55]:
nb_clf = MultinomialNB(alpha=1)
nb_clf.fit(xtrain_tfv, trainer.target)


MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [56]:
nb_predictions = nb_clf.predict(xvalid_tfv)
nb_train_predictions = nb_clf.predict(xtrain_tfv)

In [57]:
print ("f1_score on training: %0.3f " % f1_score(trainer.target.values, nb_train_predictions))
print ("f1_score on test: %0.3f " % f1_score(valid.target.values, nb_predictions))

f1_score on training: 0.816 
f1_score on test: 0.726 


<h1> Using Count vectorizer instead of td-idf below<h1>

In [34]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

In [35]:
ctv.fit(list(trainer.text.values) + list(valid.text.values))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 3), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
                vocabulary=None)

In [36]:
xtrain_ctv = ctv.transform(trainer.text.values)
xvalid_ctv = ctv.transform(valid.text.values)

In [121]:
clf_ctv = LogisticRegression(C=0.2)
clf_ctv.fit(xtrain_ctv, trainer.target)
print("Number of features used:", np.sum(clf_ctv.coef_ != 0))

Number of features used: 108907


In [122]:
ctv_train_predictions = clf_ctv.predict(xtrain_ctv)
ctv_predictions = clf_ctv.predict(xvalid_ctv)


In [123]:
print ("f1_score on train: %0.3f " % f1_score(trainer.target.values, ctv_train_predictions))
print ("f1_score on test: %0.3f " % f1_score(valid.target.values, ctv_predictions))

f1_score on train: 0.719 
f1_score on test: 0.619 


In [30]:
dt_clf = DecisionTreeClassifier(max_depth = 75)
dt_clf.fit(xtrain_tfv, trainer.target)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=75, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [31]:
dt_train_predictions = dt_clf.predict(xtrain_tfv)
dt_predictions = dt_clf.predict(xvalid_tfv)

In [32]:
print ("f1_score on train: %0.3f " % f1_score(trainer.target.values, dt_train_predictions))
print ("f1_score on test: %0.3f " % f1_score(valid.target.values, dt_predictions))

f1_score on train: 0.868 
f1_score on test: 0.650 


In [19]:
#help(RandomForestClassifier)
forest = RandomForestClassifier()
forest.fit(xtrain_tfv, trainer.target.values)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
rfc_train_predictions = forest.predict(xtrain_tfv)
rfc_predictions = forest.predict(xvalid_tfv)

In [23]:
print ("f1_score on train: %0.3f " % f1_score(trainer.target.values, rfc_train_predictions))
print ("f1_score on test: %0.3f " % f1_score(valid.target.values, rfc_predictions))

f1_score on train: 0.983 
f1_score on test: 0.700 


|Classifier|Data Transformer|Parameters|f1_Score_training|f1_Score_test|
| --- | --- | --- | --- | --- |
|LogisticRegression|tf-idf|C=2|0.870|0.744 
|LinearSVC|tf-idf|C=0.38|0.896|0.741| 
|LinearSVC|tf-idf|C=0.1|0.836|0.734| 
|MultinomialNB|tf-idf|default(alpha=1)|0.816|0.726|
|LogisticRegression|countVectorizer|liblinear/l1|0.857|0.738|
|DecisionTreeClassifier|tf-idf|Default|0.983|0.656|
|RandomForestClassifier|tf-idf|Default|0.983|0.700|
|GradientBoostingClassifier|tf-idf|lr = 1|0.87|0.700|
|xgboost|tf-idf|lr=0.3,nestimators=200,max_depth=5|0.856|0.718|
|SVC|tf-idf|default|0.942|0.739|
|MLP|tf-idf|max_iter=30,lbfgs,random_state=42|0.869|0.736|

In [35]:
gbc_clf = GradientBoostingClassifier(learning_rate = 1)
gbc_clf.fit(xtrain_tfv, trainer.target.values)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [38]:
gbc_train_predictions = gbc_clf.predict(xtrain_tfv)
gbc_predictions = gbc_clf.predict(xvalid_tfv)

In [40]:
print ("f1_score on train: %0.3f " % f1_score(trainer.target.values, gbc_train_predictions))
print ("f1_score on test: %0.3f " % f1_score(valid.target.values, gbc_predictions))

f1_score on train: 0.877 
f1_score on test: 0.690 


In [35]:
#help(xgb)
xgb_clf = xgb.XGBClassifier(max_depth = 5, n_estimators = 200, learning_rate = 0.1, nthread = 8)

In [36]:
xgb_clf.fit(xtrain_tfv, trainer.target.values)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
              nthread=8, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [37]:
xgb_train_predictions = xgb_clf.predict(xtrain_tfv)
xgb_predictions = xgb_clf.predict(xvalid_tfv)

In [38]:
print ("f1_score on train: %0.3f " % f1_score(trainer.target.values, xgb_train_predictions))
print ("f1_score on test: %0.3f " % f1_score(valid.target.values, xgb_predictions))

f1_score on train: 0.774 
f1_score on test: 0.701 


In [43]:
#help(SVC)
svc_clf = SVC()
svc_clf.fit(xtrain_tfv, trainer.target.values)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [44]:
svc_train_predictions = svc_clf.predict(xtrain_tfv)
svc_predictions = svc_clf.predict(xvalid_tfv)

In [45]:
print ("f1_score on train: %0.3f " % f1_score(trainer.target.values, svc_train_predictions))
print ("f1_score on test: %0.3f " % f1_score(valid.target.values, svc_predictions))

f1_score on train: 0.942 
f1_score on test: 0.739 


In [28]:
f1_scorer = metrics.make_scorer(f1_score, greater_is_better=True, needs_proba=False)
svd = TruncatedSVD()
scl = preprocessing.StandardScaler()
lr_model = LogisticRegression()
clf = pipeline.Pipeline([('svd', svd),
                        ('scl', scl),
                        ('lr', lr_model)])

In [22]:
param_grid = {'svd__n_components' : [120,180],
             'lr__C':[0.1, 1.0, 10],
             'lr__penalty': ['l1','l2']}

In [29]:
model = GridSearchCV(estimator = clf, param_grid = param_grid, scoring = f1_scorer, verbose = 10,n_jobs=-1,iid=True,
                    refit = True, cv=2)
model.fit(xtrain_tfv, trainer.target.values)
print("Best score:%0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done  20 out of  24 | elapsed:   14.7s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   17.0s finished


Best score:0.689
Best parameters set:
	lr__C: 10
	lr__penalty: 'l2'
	svd__n_components: 180


# #Applying Multilayer Perceptron

In [33]:
#help(MLPClassifier)
mlp = MLPClassifier(max_iter=30, solver='lbfgs')
mlp.fit(xtrain_tfv, trainer.target.values)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=30,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [34]:
mlp_train_predictions = mlp.predict(xtrain_tfv)
mlp_predictions = mlp.predict(xvalid_tfv)

In [35]:
print ("f1_score on train: %0.3f " % f1_score(trainer.target.values, mlp_train_predictions))
print ("f1_score on test: %0.3f " % f1_score(valid.target.values, mlp_predictions))

f1_score on train: 0.918 
f1_score on test: 0.714 


In [6]:
word_embeddings = {}
f = open('glove.6B.300d.txt', encoding='utf8')
for line in tqdm(f):
    values = line.split()  
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()
print('Found %s word vectors.' % len(word_embeddings))



0it [00:00, ?it/s][A[A

366it [00:00, 3417.97it/s][A[A

728it [00:00, 3466.34it/s][A[A

1096it [00:00, 3512.04it/s][A[A

1503it [00:00, 3662.46it/s][A[A

1906it [00:00, 3764.74it/s][A[A

2304it [00:00, 3809.27it/s][A[A

2711it [00:00, 3880.64it/s][A[A

3087it [00:00, 3839.85it/s][A[A

3483it [00:00, 3842.89it/s][A[A

3861it [00:01, 3812.82it/s][A[A

4234it [00:01, 3554.79it/s][A[A

4587it [00:01, 3234.93it/s][A[A

4915it [00:01, 3136.48it/s][A[A

5232it [00:01, 3080.39it/s][A[A

5620it [00:01, 3247.65it/s][A[A

5995it [00:01, 3376.77it/s][A[A

6410it [00:01, 3572.89it/s][A[A

6774it [00:01, 3524.73it/s][A[A

7131it [00:02, 3109.78it/s][A[A

7476it [00:02, 3197.83it/s][A[A

7807it [00:02, 3223.43it/s][A[A

8136it [00:02, 3138.49it/s][A[A

8455it [00:02, 3147.44it/s][A[A

8779it [00:02, 3172.59it/s][A[A

9119it [00:02, 3232.96it/s][A[A

9507it [00:02, 3276.17it/s][A[A

9840it [00:02, 3282.66it/s][A[A

10178it [00:02, 3307.83it/s]

85464it [00:24, 3203.46it/s][A[A

85865it [00:24, 3407.66it/s][A[A

86212it [00:24, 3397.89it/s][A[A

86598it [00:24, 3512.88it/s][A[A

86954it [00:24, 3337.04it/s][A[A

87293it [00:24, 3195.73it/s][A[A

87629it [00:24, 3239.98it/s][A[A

88002it [00:25, 3364.75it/s][A[A

88343it [00:25, 3250.61it/s][A[A

88673it [00:25, 3260.02it/s][A[A

89053it [00:25, 3370.84it/s][A[A

89423it [00:25, 3408.03it/s][A[A

89788it [00:25, 3473.86it/s][A[A

90195it [00:25, 3585.15it/s][A[A

90556it [00:25, 3573.91it/s][A[A

90952it [00:25, 3680.02it/s][A[A

91349it [00:25, 3760.11it/s][A[A

91739it [00:26, 3770.15it/s][A[A

92138it [00:26, 3823.02it/s][A[A

92527it [00:26, 3825.36it/s][A[A

92917it [00:26, 3794.57it/s][A[A

93326it [00:26, 3870.19it/s][A[A

93721it [00:26, 3820.25it/s][A[A

94104it [00:26, 3621.93it/s][A[A

94469it [00:26, 3555.11it/s][A[A

94844it [00:26, 3603.23it/s][A[A

95223it [00:26, 3654.17it/s][A[A

95666it [00:27, 3694.16it/s]

170323it [00:47, 4163.39it/s][A[A

170745it [00:48, 4167.59it/s][A[A

171166it [00:48, 4034.50it/s][A[A

171573it [00:48, 3908.13it/s][A[A

171972it [00:48, 3855.23it/s][A[A

172369it [00:48, 3888.15it/s][A[A

172760it [00:48, 3884.13it/s][A[A

173158it [00:48, 3911.74it/s][A[A

173566it [00:48, 3959.86it/s][A[A

173977it [00:48, 3992.82it/s][A[A

174377it [00:48, 3981.28it/s][A[A

174776it [00:49, 3790.14it/s][A[A

175263it [00:49, 3928.52it/s][A[A

175659it [00:49, 3914.69it/s][A[A

176053it [00:49, 3910.52it/s][A[A

176446it [00:49, 3742.82it/s][A[A

176920it [00:49, 3852.12it/s][A[A

177308it [00:49, 3650.55it/s][A[A

177723it [00:49, 3781.99it/s][A[A

178188it [00:49, 3870.71it/s][A[A

178587it [00:50, 3895.39it/s][A[A

178979it [00:50, 3861.62it/s][A[A

179367it [00:50, 3797.20it/s][A[A

179749it [00:50, 3647.69it/s][A[A

180117it [00:50, 3039.56it/s][A[A

180530it [00:50, 3243.59it/s][A[A

180932it [00:50, 3413.64it/s][A[A

1

248518it [01:12, 3307.87it/s][A[A

248906it [01:12, 3452.46it/s][A[A

249258it [01:12, 3429.83it/s][A[A

249606it [01:12, 3373.92it/s][A[A

249947it [01:12, 3166.61it/s][A[A

250269it [01:13, 3105.14it/s][A[A

250584it [01:13, 2651.88it/s][A[A

250864it [01:13, 2620.46it/s][A[A

251136it [01:13, 2621.79it/s][A[A

251406it [01:13, 2598.91it/s][A[A

251768it [01:13, 2768.41it/s][A[A

252121it [01:13, 2959.89it/s][A[A

252516it [01:13, 3193.21it/s][A[A

252874it [01:13, 3298.76it/s][A[A

253213it [01:14, 3165.27it/s][A[A

253537it [01:14, 3183.62it/s][A[A

253955it [01:14, 3304.54it/s][A[A

254324it [01:14, 3402.44it/s][A[A

254784it [01:14, 3634.02it/s][A[A

255185it [01:14, 3738.61it/s][A[A

255566it [01:14, 3759.13it/s][A[A

256002it [01:14, 3839.41it/s][A[A

256396it [01:14, 3858.98it/s][A[A

256803it [01:14, 3918.76it/s][A[A

257209it [01:15, 3949.30it/s][A[A

257606it [01:15, 3904.43it/s][A[A

257998it [01:15, 3901.39it/s][A[A

2

334020it [01:35, 3754.41it/s][A[A

334397it [01:35, 3602.48it/s][A[A

334760it [01:35, 3610.37it/s][A[A

335134it [01:35, 3638.24it/s][A[A

335568it [01:36, 3733.20it/s][A[A

335980it [01:36, 3830.28it/s][A[A

336395it [01:36, 3872.92it/s][A[A

336784it [01:36, 3855.78it/s][A[A

337171it [01:36, 3854.09it/s][A[A

337580it [01:36, 3915.83it/s][A[A

338071it [01:36, 4167.19it/s][A[A

338506it [01:36, 4062.47it/s][A[A

338917it [01:36, 4048.20it/s][A[A

339369it [01:37, 4020.21it/s][A[A

339777it [01:37, 3981.23it/s][A[A

340188it [01:37, 3938.57it/s][A[A

340584it [01:37, 3888.72it/s][A[A

340974it [01:37, 3861.27it/s][A[A

341368it [01:37, 3878.41it/s][A[A

341792it [01:37, 3847.11it/s][A[A

342178it [01:37, 3739.41it/s][A[A

342553it [01:37, 3689.65it/s][A[A

342959it [01:37, 3775.63it/s][A[A

343358it [01:38, 3831.76it/s][A[A

343745it [01:38, 3836.97it/s][A[A

344130it [01:38, 3827.62it/s][A[A

344538it [01:38, 3844.82it/s][A[A

3

Found 400000 word vectors.





In [37]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(word_embeddings[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if (type(v) != np.ndarray):
        return np.zeros(300)
    return v/np.sqrt((v ** 2).sum())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manokuma\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [38]:
xtrain_glove = [sent2vec(x) for x in tqdm(trainer.text.values)]
xvalid_glove = [sent2vec(x) for x in tqdm(valid.text.values)]
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)









  0%|                                                                                         | 0/6090 [00:00<?, ?it/s][A[A[A[A[A[A[A[A







  3%|█▉                                                                           | 158/6090 [00:00<00:03, 1569.49it/s][A[A[A[A[A[A[A[A







  5%|███▌                                                                         | 277/6090 [00:00<00:04, 1432.23it/s][A[A[A[A[A[A[A[A







  7%|█████▌                                                                       | 436/6090 [00:00<00:03, 1472.79it/s][A[A[A[A[A[A[A[A







 10%|███████▌                                                                     | 595/6090 [00:00<00:03, 1505.56it/s][A[A[A[A[A[A[A[A







 12%|█████████▍                                                                   | 748/6090 [00:00<00:03, 1493.42it/s][A[A[A[A[A[A[A[A







 15%|███████████▎                                                                 | 89

In [39]:
xgb_clf_glove = xgb.XGBClassifier(nthread=10, silent = False)
xgb_clf_glove.fit(xtrain_glove, trainer.target.values)
xgb_glove_predictions = xgb_clf_glove.predict(xvalid_glove)

In [41]:
#print ("f1_score on train: %0.3f " % f1_score(trainer.target.values, xgb_train_predictions))
print ("f1_score on test: %0.3f " % f1_score(valid.target.values, xgb_glove_predictions))

f1_score on test: 0.760 
