In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter='\t', quoting=3)
test = pd.read_csv("data/testData.tsv", header=0, delimiter='\t', quoting=3)

print(train.shape)
print(test.shape)

(25000, 3)
(25000, 2)


In [4]:
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [5]:
train["sentiment"].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [6]:
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [7]:
from KaggleWord2VecUtility import KaggleWord2VecUtility
from bs4 import BeautifulSoup

def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    return review_text

In [8]:
%time train['review_clean'] = KaggleWord2VecUtility.apply_by_multiprocessing(train['review'], review_to_words, workers=4)

CPU times: user 88.2 ms, sys: 192 ms, total: 280 ms
Wall time: 2.08 s


In [9]:
%time test['review_clean'] = KaggleWord2VecUtility.apply_by_multiprocessing(test['review'], review_to_words, workers=4)

CPU times: user 84.3 ms, sys: 230 ms, total: 314 ms
Wall time: 2.25 s


In [11]:
train['review_clean'][:10]

0    "With all this stuff going down at the moment ...
1    "\"The Classic War of the Worlds\" by Timothy ...
2    "The film starts with a manager (Nicholas Bell...
3    "It must be assumed that those who praised thi...
4    "Superbly trashy and wondrously unpretentious ...
5    "I dont know why people think this is such a b...
6    "This movie could have been very good, but com...
7    "I watched this video at a friend's house. I'm...
8    "A friend of mine bought this film for £1, and...
9    "This movie is full of references. Like \"Mad ...
Name: review_clean, dtype: object

In [12]:
test['review_clean'][:10]

0    "Naturally in a film who's main themes are of ...
1    "This movie is a disaster within a disaster fi...
2    "All in all, this is a movie for kids. We saw ...
3    "Afraid of the Dark left me with the impressio...
4    "A very accurate depiction of small time mob l...
5    "...as valuable as King Tut's tomb! (OK, maybe...
6    "This has to be one of the biggest misfires ev...
7    "This is one of those movies I watched, and wo...
8    "The worst movie i've seen in years (and i've ...
9    "Five medical students (Kevin Bacon, David Lab...
Name: review_clean, dtype: object

In [13]:
X_train = train['review_clean']
X_test = test['review_clean']

## TF-IDF

In [18]:
# import nltk
# nltk.download('words')

[nltk_data] Downloading package words to /Users/leesu/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [28]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from nltk.corpus import words

vectorizer = CountVectorizer(analyzer = 'word',
                           lowercase = True,
                           tokenizer = None,
                           preprocessor = None,
                           stop_words = 'english', 
                           min_df = 2,
                           ngram_range = (1,3),
                           vocabulary = set(words.words()),
                           max_features = 90000
                          )
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=90000, min_df=2,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None,
        vocabulary={'intermontane', 'proquaestor', 'sonnetic', 'gypsyfy', 'respecting', 'Antigonon', 'Ida', 'decemplicate', 'dag', 'perseverate', 'strepitant', 'cogwood', 'Noctiluca', 'Zend', 'Cercolabes', 'weald', 'atopy', 'varicellation', 'interlinguistic', 'ewry', 'hagioscopic', 'stoning', 'wintriness', ..., 'dermohumeral', 'mise', 'mammiform', 'reticularly', 'hawked', 'recruiter', 'hyraceum', 'tonalite'})

In [29]:
pipeline = Pipeline([
    ('vect', vectorizer),
    ('tfidf', TfidfTransformer(smooth_idf = False)),
])  
pipeline

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=90000, min_df=2,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
       ...('tfidf', TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False,
         use_idf=True))])

In [30]:
%time X_train_tfidf_vector = pipeline.fit_transform(X_train)

CPU times: user 7.55 s, sys: 117 ms, total: 7.66 s
Wall time: 7.76 s


  idf = np.log(float(n_samples) / df) + 1.0


In [31]:
vocab = vectorizer.get_feature_names()
print(len(vocab))
vocab[:10]

235892


['A',
 'Aani',
 'Aaron',
 'Aaronic',
 'Aaronical',
 'Aaronite',
 'Aaronitic',
 'Aaru',
 'Ab',
 'Ababdeh']

In [32]:
%time X_test_tfidf_vector = pipeline.fit_transform(X_test)

CPU times: user 7 s, sys: 116 ms, total: 7.12 s
Wall time: 7.15 s


  idf = np.log(float(n_samples) / df) + 1.0


In [33]:
import numpy as np
dist = np.sum(X_train_tfidf_vector, axis=0)
    
for tag, count in zip(vocab, dist):
    print(count, tag)
    
pd.DataFrame(dist, columns=vocab)

[[ 0.  0.  0. ...,  0.  0.  0.]] A


Unnamed: 0,A,Aani,Aaron,Aaronic,Aaronical,Aaronite,Aaronitic,Aaru,Ab,Ababdeh,...,zymotechnical,zymotechnics,zymotechny,zymotic,zymotically,zymotize,zymotoxic,zymurgy,zythem,zythum
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
X_train_tfidf_vector

<25000x235892 sparse matrix of type '<class 'numpy.float64'>'
	with 1612861 stored elements in Compressed Sparse Row format>

In [35]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=2018)
forest

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=2018, verbose=0,
            warm_start=False)

In [36]:
%time forest = forest.fit(X_train_tfidf_vector, train['sentiment'])

CPU times: user 3min 8s, sys: 908 ms, total: 3min 9s
Wall time: 1min 3s


In [41]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

k_fold = KFold(n_splits=5, shuffle=True, random_state=2018)

%time score = np.mean(cross_val_score(forest, X_train_tfidf_vector, train["sentiment"], cv=k_fold, scoring="roc_auc", n_jobs=-1))

CPU times: user 367 ms, sys: 77.1 ms, total: 444 ms
Wall time: 3min 33s


In [42]:
'{:,.5f}'.format(score)

'0.92068'

In [43]:
%time result = forest.predict(X_test_tfidf_vector)

CPU times: user 2.51 s, sys: 175 ms, total: 2.69 s
Wall time: 974 ms


In [44]:
result[:10]

array([1, 0, 0, 0, 1, 1, 0, 1, 0, 1])

In [48]:
output = pd.DataFrame(data={'id':test['id'], 'sentiment':result})

In [49]:
output.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",0
3,"""7186_2""",0
4,"""12128_7""",1


In [50]:
output['sentiment'].value_counts()

1    12688
0    12312
Name: sentiment, dtype: int64

In [51]:
output_sentiment = output['sentiment'].value_counts()
print(output_sentiment[0] - output_sentiment[1])
output_sentiment

-376


1    12688
0    12312
Name: sentiment, dtype: int64

In [47]:
output.to_csv("data/tutorial_4_tfidf_{0:.5f}.csv".format(score), index=False, quoting=3)