In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter='\t', quoting=3)
test = pd.read_csv("data/testData.tsv", header=0, delimiter='\t', quoting=3)

print(train.shape)
print(test.shape)

(25000, 3)
(25000, 2)


In [3]:
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
train["sentiment"].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [5]:
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [6]:
from KaggleWord2VecUtility import KaggleWord2VecUtility
from bs4 import BeautifulSoup

def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    return review_text

In [7]:
#%time train['review_clean'] = KaggleWord2VecUtility.apply_by_multiprocessing(train['review'], review_to_words, workers=4)
%time train['review_clean'] = KaggleWord2VecUtility.apply_by_multiprocessing(train['review'], KaggleWord2VecUtility.review_to_join_words, workers=4)

CPU times: user 84.4 ms, sys: 130 ms, total: 214 ms
Wall time: 36.7 s


In [8]:
#%time test['review_clean'] = KaggleWord2VecUtility.apply_by_multiprocessing(test['review'], review_to_words, workers=4)
%time test['review_clean'] = KaggleWord2VecUtility.apply_by_multiprocessing(test['review'], KaggleWord2VecUtility.review_to_join_words, workers=4)

CPU times: user 85.8 ms, sys: 182 ms, total: 268 ms
Wall time: 37.4 s


In [9]:
train['review_clean'][:10]

0    with all this stuff go down at the moment with...
1    the classic war of the world by timothi hine i...
2    the film start with a manag nichola bell give ...
3    it must be assum that those who prais this fil...
4    superbl trashi and wondrous unpretenti s explo...
5    i dont know whi peopl think this is such a bad...
6    this movi could have been veri good but come u...
7    i watch this video at a friend s hous i m glad...
8    a friend of mine bought this film for and even...
9    this movi is full of refer like mad max ii the...
Name: review_clean, dtype: object

In [10]:
test['review_clean'][:10]

0    natur in a film who s main theme are of mortal...
1    this movi is a disast within a disast film it ...
2    all in all this is a movi for kid we saw it to...
3    afraid of the dark left me with the impress th...
4    a veri accur depict of small time mob life fil...
5    as valuabl as king tut s tomb ok mayb not that...
6    this has to be one of the biggest misfir ever ...
7    this is one of those movi i watch and wonder w...
8    the worst movi i ve seen in year and i ve seen...
9    five medic student kevin bacon david labraccio...
Name: review_clean, dtype: object

In [34]:
X_train = train['review_clean']
X_test = test['review_clean']

## TF-IDF

In [12]:
# import nltk
# nltk.download('words')

In [12]:
from sklearn.pipeline import Pipeline
from nltk.corpus import words

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

char_vectorizer = Pipeline([('tfidf_char', TfidfVectorizer(
    analyzer='char', max_features=10000, ngram_range=(1, 9))) ])
char_vectorizer

Pipeline(memory=None,
     steps=[('tfidf_char', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 9), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))])

In [35]:
X_all = X_train + X_test

In [36]:
#char_vectorizer.fit(X_train)
char_vectorizer.fit(X_all)

Pipeline(memory=None,
     steps=[('tfidf_char', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 9), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))])

In [37]:
%time X_train_char = char_vectorizer.transform(X_train)
X_train_char

CPU times: user 2min 22s, sys: 1.81 s, total: 2min 24s
Wall time: 2min 26s


<25000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 50386859 stored elements in Compressed Sparse Row format>

In [38]:
%time X_test_char = char_vectorizer.transform(X_test)
X_test_char

CPU times: user 2min 18s, sys: 1.87 s, total: 2min 20s
Wall time: 2min 21s


<25000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 49635754 stored elements in Compressed Sparse Row format>

In [39]:
word_vectorizer = Pipeline([('tfidf_char', TfidfVectorizer(
    analyzer='word', 
    max_features=30000, 
    ngram_range=(1, 2)))])
word_vectorizer

Pipeline(memory=None,
     steps=[('tfidf_char', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))])

In [40]:
#%time word_vectorizer.fit(X_train)
%time word_vectorizer.fit(X_all)

CPU times: user 28.8 s, sys: 1.27 s, total: 30 s
Wall time: 30.5 s


Pipeline(memory=None,
     steps=[('tfidf_char', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))])

In [41]:
%time X_train_word = word_vectorizer.transform(X_train)
X_train_word

CPU times: user 8.73 s, sys: 144 ms, total: 8.87 s
Wall time: 8.93 s


<25000x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 5864293 stored elements in Compressed Sparse Row format>

In [42]:
%time X_test_word = word_vectorizer.transform(X_test)
X_test_word

CPU times: user 8.79 s, sys: 178 ms, total: 8.97 s
Wall time: 9.07 s


<25000x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 5754188 stored elements in Compressed Sparse Row format>

In [43]:
from scipy.sparse import hstack

X_train = hstack([X_train_char, X_train_word])
X_train

<25000x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 56251152 stored elements in COOrdinate format>

In [44]:
X_test = hstack([X_test_char, X_test_word])
X_test

<25000x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 55389942 stored elements in COOrdinate format>

In [45]:
from scipy.sparse import csr_matrix

X_train = csr_matrix(X_train)
X_train

<25000x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 56251152 stored elements in Compressed Sparse Row format>

In [46]:
from scipy.sparse import csr_matrix

X_test = csr_matrix(X_test)
X_test

<25000x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 55389942 stored elements in Compressed Sparse Row format>

In [47]:
y_train = train["sentiment"]

print(y_train.shape)
y_train.head()

(25000,)


0    1
1    1
2    0
3    0
4    1
Name: sentiment, dtype: int64

## RandomForest

In [54]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=2018)
forest

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=2018, verbose=0,
            warm_start=False)

In [55]:
%time forest = forest.fit(X_train.toarray(), train['sentiment'])

CPU times: user 5min 50s, sys: 21.3 s, total: 6min 11s
Wall time: 2min 10s


In [34]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

k_fold = KFold(n_splits=5, shuffle=True, random_state=2018)

%time score = np.mean(cross_val_score(forest, X_train, train["sentiment"], cv=k_fold, scoring="roc_auc", n_jobs=-1))

CPU times: user 13.3 s, sys: 7.24 s, total: 20.6 s
Wall time: 6min


In [35]:
'{:,.5f}'.format(score)

'0.89080'

In [36]:
%time result = forest.predict(X_test.toarray())

CPU times: user 7.1 s, sys: 12.1 s, total: 19.2 s
Wall time: 22.1 s


In [37]:
result[:10]

array([1, 0, 1, 0, 1, 0, 0, 0, 0, 0])

In [38]:
output = pd.DataFrame(data={'id':test['id'], 'sentiment':result})

In [39]:
output.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",0
4,"""12128_7""",1


In [40]:
output['sentiment'].value_counts()

0    12594
1    12406
Name: sentiment, dtype: int64

In [41]:
output_sentiment = output['sentiment'].value_counts()
print(output_sentiment[0] - output_sentiment[1])
output_sentiment

188


0    12594
1    12406
Name: sentiment, dtype: int64

In [43]:
output.to_csv("submissions/tutorial_5_tfidf_rf_{0:.5f}.csv".format(score), index=False, quoting=3)

## XGBoost

In [48]:
import xgboost as xgb

In [49]:
params = {
    'booster': 'gblinear',
    'objective': 'multi:softmax',
    'eval_metric': 'merror',
    'lambda': 2.0,
    'alpha': 1.0,
    'lambda_bias': 6.0,
    'num_class': 5,
    'nthread': 8,
    'n_jobs': -1,
    'silent': 1,
}
xgb.XGBClassifier(params)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth={'booster': 'gblinear', 'objective': 'multi:softmax', 'eval_metric': 'merror', 'lambda': 2.0, 'alpha': 1.0, 'lambda_bias': 6.0, 'num_class': 5, 'nthread': 8, 'n_jobs': -1, 'silent': 1},
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [50]:
dtrain = xgb.DMatrix(X_train, label=y_train)

%time booster = xgb.train(params, dtrain, num_boost_round=90)

CPU times: user 3min 11s, sys: 440 ms, total: 3min 12s
Wall time: 3min 12s


In [51]:
dtest = xgb.DMatrix(X_test.toarray())

predictions = booster.predict(dtest)

print(predictions.shape)
predictions[0:10]

(25000,)


array([ 1.,  0.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.], dtype=float32)

jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000

In [52]:
output = pd.DataFrame(data={'id':test['id'], 'sentiment':predictions})
output.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1.0
1,"""8348_2""",0.0
2,"""5828_4""",1.0
3,"""7186_2""",0.0
4,"""12128_7""",1.0


In [53]:
from datetime import datetime

current_time = datetime.now()
current_time = current_time.strftime("%Y%m%d_%H%M%S")

output.to_csv("submissions/tfidf_xgboost_char_word_{time}.csv".format(time=current_time), index=False, quoting=3)

## Submit - kaggle

In [33]:
!kaggle competitions submit -c word2vec-nlp-tutorial -f ./submissions/tutorial_5_tfidf_xgboost_char_word.csv -m 'tfidf xgboost'

Successfully submitted to Bag of Words Meets Bags of Popcorn

In [34]:
!kaggle competitions submissions -c word2vec-nlp-tutorial

fileName                                date                 description    status    publicScore  privateScore  
--------------------------------------  -------------------  -------------  --------  -----------  ------------  
tutorial_5_tfidf_xgboost_char_word.csv  2018-02-17 12:30:20  tfidf xgboost  complete  0.87588      None          
Word2Vec_AverageVectors_0.90431.csv     2018-02-16 09:36:30  None           complete  0.81872      None          
tutorial_1_BOW_0.92761.csv              2018-02-13 15:00:51  None           complete  0.85360      None          


In [35]:
271/578 

0.4688581314878893