https://www.kaggle.com/c/word2vec-nlp-tutorial/discussion/11261

In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter='\t', quoting=3)
test = pd.read_csv("data/testData.tsv", header=0, delimiter='\t', quoting=3)

print(train.shape)
print(test.shape)

(25000, 3)
(25000, 2)


In [3]:
y_train = train["sentiment"]

In [4]:
from KaggleWord2VecUtility import KaggleWord2VecUtility

# traindata = []
# for i in range( 0, len(train["review"])):
#     traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
# testdata = []
# for i in range(0,len(test["review"])):
#     testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))

In [5]:
%time train['review_clean'] = KaggleWord2VecUtility.apply_by_multiprocessing(train['review'], KaggleWord2VecUtility.review_to_join_words, workers=4)

CPU times: user 82.8 ms, sys: 128 ms, total: 211 ms
Wall time: 36.9 s


In [6]:
%time test['review_clean'] = KaggleWord2VecUtility.apply_by_multiprocessing(test['review'], KaggleWord2VecUtility.review_to_join_words, workers=4)

CPU times: user 87.5 ms, sys: 174 ms, total: 262 ms
Wall time: 36.3 s


In [49]:
X_train = train['review_clean']
X_test = test['review_clean']

In [50]:
X_all = X_train + X_test

In [51]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer(min_df=3,  max_features=None, 
#                              strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
#                              ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,
#                              stop_words='english')

# vectorizer.fit(X_all)
# X_train = vectorizer.transform(X_train)
# X_test = vectorizer.transform(X_test)

In [52]:
from sklearn.pipeline import Pipeline

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

# char_vectorizer = Pipeline([('tfidf_char', TfidfVectorizer(
#     analyzer='char', max_features=10000, ngram_range=(1, 9))) ])

char_vectorizer = TfidfVectorizer(
                    sublinear_tf=True,
                    strip_accents='unicode',
                    analyzer='char',
                    ngram_range=(1, 4),
                    max_features=20000)

%time char_vectorizer.fit(X_all)
%time X_train_char = char_vectorizer.transform(X_train)
%time X_test_char = char_vectorizer.transform(X_test)

CPU times: user 1min 29s, sys: 1.85 s, total: 1min 30s
Wall time: 1min 31s
CPU times: user 54.1 s, sys: 1.12 s, total: 55.2 s
Wall time: 55.7 s
CPU times: user 51.4 s, sys: 738 ms, total: 52.1 s
Wall time: 52.2 s


In [54]:
word_vectorizer = TfidfVectorizer(min_df=3,  max_features=None, 
                             strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
                             ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,
                             stop_words='english')

# word_vectorizer = TfidfVectorizer(
#     sublinear_tf=True,
#     strip_accents='unicode',
#     analyzer='word',
#     token_pattern=r'\w{1,}',
#     ngram_range=(1, 1),
#     max_features=20000)
    

%time word_vectorizer.fit(X_all)
%time X_train_word = word_vectorizer.transform(X_train)
%time X_test_word = word_vectorizer.transform(X_test)

CPU times: user 6.48 s, sys: 99.8 ms, total: 6.58 s
Wall time: 6.58 s
CPU times: user 3.58 s, sys: 55.6 ms, total: 3.64 s
Wall time: 3.64 s
CPU times: user 3.56 s, sys: 60.2 ms, total: 3.62 s
Wall time: 3.62 s


In [55]:
from scipy.sparse import hstack

X_train = hstack([X_train_char, X_train_word])
X_test = hstack([X_test_char, X_test_word])

In [56]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(penalty='l2', dual=True, tol=0.0001, 
                         C=1, fit_intercept=True, intercept_scaling=1.0, 
                         class_weight=None, random_state=None)

In [57]:
from sklearn import cross_validation
score = np.mean(cross_validation.cross_val_score(model, X_train, y_train, cv=20, scoring='roc_auc'))
print ("20 Fold CV Score: ", score)

20 Fold CV Score:  0.961094656


In [58]:
model.fit(X_train, y_train)
#result = model.predict(X_test)
#result = model.predict_proba(X_test) 
result = model.predict_proba(X_test)[:,1]

In [59]:
result

array([ 0.99203488,  0.03665503,  0.52460254, ...,  0.13763411,
        0.94295881,  0.62267839])

In [60]:
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

In [61]:
from datetime import datetime

current_time = datetime.now()
current_time = current_time.strftime("%Y%m%d_%H%M%S")

description = "Bag_of_Words_model"

output.to_csv("submissions/{description}_{time}_{score:.5f}.csv".format(description=description, score=score, time=current_time), index=False, quoting=3)

score / kaggle
* baseline : 0.957149184 / 0.95358
* 0.957798912 / 0.88460
* 0.958188416 / 0.88516 - predict
* 0.958188416 / 0.95405 - fit(X_all), predict_proba
* 0.958357888 / 0.95440 - n't -> not
* 0.958798464 / 0.95647 - char/word tf-idf
* 0.961166976 / 0.95859 - char/word tf-idf, parameter tunning