In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter='\t', quoting=3)
test = pd.read_csv("data/testData.tsv", header=0, delimiter='\t', quoting=3)

print(train.shape)
print(test.shape)

(25000, 3)
(25000, 2)


In [3]:
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
train["sentiment"].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [5]:
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [6]:
from KaggleWord2VecUtility import KaggleWord2VecUtility
from bs4 import BeautifulSoup

def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    return review_text

In [7]:
%time train['review_clean'] = KaggleWord2VecUtility.apply_by_multiprocessing(train['review'], review_to_words, workers=4)

CPU times: user 78.9 ms, sys: 148 ms, total: 227 ms
Wall time: 1.86 s


In [8]:
%time test['review_clean'] = KaggleWord2VecUtility.apply_by_multiprocessing(test['review'], review_to_words, workers=4)

CPU times: user 82.5 ms, sys: 181 ms, total: 264 ms
Wall time: 1.83 s


In [9]:
train['review_clean'][:10]

0    "With all this stuff going down at the moment ...
1    "\"The Classic War of the Worlds\" by Timothy ...
2    "The film starts with a manager (Nicholas Bell...
3    "It must be assumed that those who praised thi...
4    "Superbly trashy and wondrously unpretentious ...
5    "I dont know why people think this is such a b...
6    "This movie could have been very good, but com...
7    "I watched this video at a friend's house. I'm...
8    "A friend of mine bought this film for £1, and...
9    "This movie is full of references. Like \"Mad ...
Name: review_clean, dtype: object

In [10]:
test['review_clean'][:10]

0    "Naturally in a film who's main themes are of ...
1    "This movie is a disaster within a disaster fi...
2    "All in all, this is a movie for kids. We saw ...
3    "Afraid of the Dark left me with the impressio...
4    "A very accurate depiction of small time mob l...
5    "...as valuable as King Tut's tomb! (OK, maybe...
6    "This has to be one of the biggest misfires ev...
7    "This is one of those movies I watched, and wo...
8    "The worst movie i've seen in years (and i've ...
9    "Five medical students (Kevin Bacon, David Lab...
Name: review_clean, dtype: object

In [11]:
X_train = train['review_clean']
X_test = test['review_clean']

## TF-IDF

In [12]:
# import nltk
# nltk.download('words')

In [13]:
from sklearn.pipeline import Pipeline
from nltk.corpus import words

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

char_vectorizer = Pipeline([('tfidf_char', TfidfVectorizer(
    analyzer='char', max_features=10000, ngram_range=(1, 9))) ])
char_vectorizer

Pipeline(memory=None,
     steps=[('tfidf_char', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 9), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))])

In [15]:
char_vectorizer.fit(X_train)

Pipeline(memory=None,
     steps=[('tfidf_char', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 9), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))])

In [16]:
%time X_train_char = char_vectorizer.transform(X_train)
X_train_char

CPU times: user 2min 42s, sys: 1.87 s, total: 2min 44s
Wall time: 2min 45s


<25000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 52373311 stored elements in Compressed Sparse Row format>

In [17]:
%time X_test_char = char_vectorizer.transform(X_test)
X_test_char

CPU times: user 2min 41s, sys: 2.05 s, total: 2min 43s
Wall time: 2min 43s


<25000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 51556696 stored elements in Compressed Sparse Row format>

In [18]:
word_vectorizer = Pipeline([('tfidf_char', TfidfVectorizer(
    analyzer='word', 
    max_features=30000, 
    ngram_range=(1, 2)))])
word_vectorizer

Pipeline(memory=None,
     steps=[('tfidf_char', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))])

In [19]:
%time word_vectorizer.fit(X_train)

CPU times: user 15.1 s, sys: 410 ms, total: 15.5 s
Wall time: 15.5 s


Pipeline(memory=None,
     steps=[('tfidf_char', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))])

In [20]:
%time X_train_word = word_vectorizer.transform(X_train)
X_train_word

CPU times: user 8.01 s, sys: 111 ms, total: 8.12 s
Wall time: 8.13 s


<25000x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 5608076 stored elements in Compressed Sparse Row format>

In [21]:
%time X_test_word = word_vectorizer.transform(X_test)
X_test_word

CPU times: user 7.88 s, sys: 95.8 ms, total: 7.97 s
Wall time: 7.97 s


<25000x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 5457604 stored elements in Compressed Sparse Row format>

In [22]:
from scipy.sparse import hstack

X_train = hstack([X_train_char, X_train_word])
X_train

<25000x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 57981387 stored elements in COOrdinate format>

In [23]:
X_test = hstack([X_test_char, X_test_word])
X_test

<25000x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 57014300 stored elements in COOrdinate format>

In [24]:
y_train = train["sentiment"]

print(y_train.shape)
y_train.head()

(25000,)


0    1
1    1
2    0
3    0
4    1
Name: sentiment, dtype: int64

In [20]:
# from sklearn.ensemble import RandomForestClassifier

# forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=2018)
# forest

In [21]:
# %time forest = forest.fit(X_train.toarray(), train['sentiment'])

In [22]:
# from sklearn.model_selection import KFold
# from sklearn.model_selection import cross_val_score

# k_fold = KFold(n_splits=5, shuffle=True, random_state=2018)

# %time score = np.mean(cross_val_score(forest, X_train_tfidf_vector, train["sentiment"], cv=k_fold, scoring="roc_auc", n_jobs=-1))

In [23]:
# '{:,.5f}'.format(score)

In [24]:
# %time result = forest.predict(X_test.toarray())

In [25]:
# result[:10]

In [26]:
# output = pd.DataFrame(data={'id':test['id'], 'sentiment':result})

In [27]:
# output.head()

In [28]:
# output['sentiment'].value_counts()

In [29]:
# output_sentiment = output['sentiment'].value_counts()
# print(output_sentiment[0] - output_sentiment[1])
# output_sentiment

In [30]:
# output.to_csv("data/tutorial_5_tfidf_{0:.5f}.csv".format(score), index=False, quoting=3)

## XGBoost

In [25]:
import xgboost as xgb



In [26]:
params = {
    'booster': 'gblinear',
    'objective': 'multi:softmax',
    'eval_metric': 'merror',
    'lambda': 2.0,
    'alpha': 1.0,
    'lambda_bias': 6.0,
    'num_class': 5,
    'nthread': 8,
    'n_jobs': -1,
    'silent': 1,
}
xgb.XGBClassifier(params)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth={'booster': 'gblinear', 'objective': 'multi:softmax', 'eval_metric': 'merror', 'lambda': 2.0, 'alpha': 1.0, 'lambda_bias': 6.0, 'num_class': 5, 'nthread': 8, 'n_jobs': -1, 'silent': 1},
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [27]:
dtrain = xgb.DMatrix(X_train, label=y_train)

%time booster = xgb.train(params, dtrain, num_boost_round=90)

CPU times: user 3min 16s, sys: 314 ms, total: 3min 16s
Wall time: 3min 16s


In [28]:
dtest = xgb.DMatrix(X_test.toarray())

predictions = booster.predict(dtest)

print(predictions.shape)
predictions[0:10]

(25000,)


array([ 1.,  0.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.], dtype=float32)

jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000

In [30]:
output = pd.DataFrame(data={'id':test['id'], 'sentiment':predictions})
output.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1.0
1,"""8348_2""",0.0
2,"""5828_4""",1.0
3,"""7186_2""",0.0
4,"""12128_7""",1.0


In [31]:
output.to_csv("submissions/tutorial_5_tfidf_xgboost_char_word.csv", index=False, quoting=3)

In [33]:
!kaggle competitions submit -c word2vec-nlp-tutorial -f ./submissions/tutorial_5_tfidf_xgboost_char_word.csv -m 'tfidf xgboost'

Successfully submitted to Bag of Words Meets Bags of Popcorn

In [34]:
!kaggle competitions submissions -c word2vec-nlp-tutorial

fileName                                date                 description    status    publicScore  privateScore  
--------------------------------------  -------------------  -------------  --------  -----------  ------------  
tutorial_5_tfidf_xgboost_char_word.csv  2018-02-17 12:30:20  tfidf xgboost  complete  0.87588      None          
Word2Vec_AverageVectors_0.90431.csv     2018-02-16 09:36:30  None           complete  0.81872      None          
tutorial_1_BOW_0.92761.csv              2018-02-13 15:00:51  None           complete  0.85360      None          


In [35]:
271/578 

0.4688581314878893