In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory that holds the input TSV files. The trailing slash matters because
# file names are appended to this value by plain string concatenation.
data_path = './data/'

In [3]:
# Both files are tab-separated. quoting=3 is csv.QUOTE_NONE, so double quotes
# inside a review are kept as ordinary characters rather than field quoting.
tsv_read_options = dict(sep='\t', quoting=3)
train_data = pd.read_csv(data_path + 'labeledTrainData.tsv', **tsv_read_options)
test_data = pd.read_csv(data_path + 'testData.tsv', **tsv_read_options)

In [4]:
from bs4 import BeautifulSoup
import re
def sentenceToWords(sentence):
    """Turn a raw text fragment into a list of lowercase alphabetic words.

    HTML markup is stripped first, every character outside ``a-z``/``A-Z`` is
    replaced by a space, and the remainder is lower-cased and split on
    whitespace.

    Parameters
    ----------
    sentence : str
        Raw text, possibly containing HTML tags.

    Returns
    -------
    list of str
        The words in their original order; empty when no letters remain.
    """
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns about it), so the extracted
    # text could differ from one machine to the next.
    text = BeautifulSoup(sentence, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return letters_only.lower().split()

# Store every cleaned review as a single space-joined string of words so that
# later cells can feed it to a vectorizer or split it again.
for frame in (train_data, test_data):
    frame['words'] = [' '.join(sentenceToWords(text)) for text in frame['review']]

# Bag-of-words

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Number of rows in the labelled training frame.
train_size = len(train_data)

In [6]:
# Word-level bag-of-words limited to 5000 features. The vocabulary is learned
# from the training reviews only and then re-used unchanged for the test set.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)
train_word_strings = train_data['words'].values
test_word_strings = test_data['words'].values
train_features = vectorizer.fit_transform(train_word_strings)
test_features = vectorizer.transform(test_word_strings)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
import sklearn.model_selection
# Fixed seed shared by both estimators so that re-running the notebook yields
# the same fitted models and the same scores.
SEED = 42

# Keyword arguments used to build RandomForestClassifier.
RF_params = {
    'n_estimators': 100,
    'n_jobs': -1,
    'random_state': SEED,
}
# Keyword arguments used to build SGDClassifier.
SGD_params = {
    'max_iter': 10000,
    'tol': 1e-4,
    'n_jobs': -1,
    # Logistic loss: later cells call predict_proba on this model.
    # NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
    # confirm against the installed version before changing it.
    'loss': 'log',
    'random_state': SEED,
}
def cv(x, y, model, params=None, splits=5, random_state=None):
    """Cross-validate a freshly built estimator on shuffled stratified folds.

    Parameters
    ----------
    x
        Feature matrix, passed straight to ``cross_validate``.
    y
        Target labels, passed straight to ``cross_validate``.
    model : callable
        Estimator class (or factory); it is called as ``model(**params)``.
    params : dict, optional
        Keyword arguments for ``model``. ``None`` (the default) means no
        arguments. A ``None`` default replaces the former shared mutable
        ``{}`` default.
    splits : int
        Number of folds.
    random_state : int or None
        Seed for the fold shuffling. ``None`` keeps the earlier behaviour of
        drawing different folds on every call.

    Returns
    -------
    dict
        The result of ``sklearn.model_selection.cross_validate`` for the
        ``accuracy``, ``f1`` and ``roc_auc`` scorers, train scores included.
    """
    estimator_kwargs = {} if params is None else params
    clf = model(**estimator_kwargs)
    kfold = sklearn.model_selection.StratifiedKFold(
        splits, shuffle=True, random_state=random_state
    )
    scoring = {
        'accuracy': 'accuracy',
        # NOTE(review): micro-averaged F1 equals accuracy for single-label
        # classification, so this row repeats the accuracy row.
        'f1': 'f1_micro',
        'roc_auc': 'roc_auc',
    }
    return sklearn.model_selection.cross_validate(
        clf, x, y, cv=kfold, scoring=scoring, return_train_score=True
    )

def run_cross_validation(x, y, model, params):
    """Run 5-fold cross-validation through ``cv`` and print every result row.

    For each entry of the mapping returned by ``cv`` the per-fold values are
    printed first, followed by their mean. Nothing is returned.
    """
    results = cv(x, y, model, params=params, splits=5)
    for name in results:
        fold_values = results[name]
        print('%s: %s' % (name, fold_values))
        print('Average %s: %f' % (name, fold_values.mean()))
        
# Evaluate both classifiers on the same bag-of-words features, one after the
# other, printing a banner before each report.
for banner, estimator, estimator_params in (
    ('Random Forest: ', RandomForestClassifier, RF_params),
    ('\n\nSGD:', SGDClassifier, SGD_params),
):
    print(banner)
    run_cross_validation(train_features, train_data['sentiment'].values, estimator, estimator_params)

Random Forest: 
fit_time: [7.64325404 7.30175304 7.48292899 7.76622891 7.38657403]
Average fit_time: 7.516148
score_time: [0.36505389 0.36059999 0.357687   0.35433722 0.35551095]
Average score_time: 0.358638
test_accuracy: [0.8474 0.8424 0.8378 0.8368 0.8406]
Average test_accuracy: 0.841000
train_accuracy: [1. 1. 1. 1. 1.]
Average train_accuracy: 1.000000
test_f1: [0.8474 0.8424 0.8378 0.8368 0.8406]
Average test_f1: 0.841000
train_f1: [1. 1. 1. 1. 1.]
Average train_f1: 1.000000
test_roc_auc: [0.923602   0.91951944 0.91685456 0.9169128  0.91790816]
Average test_roc_auc: 0.918959
train_roc_auc: [1. 1. 1. 1. 1.]
Average train_roc_auc: 1.000000


SGD:
fit_time: [1.00066376 1.24992776 1.27040219 1.001261   1.08128214]
Average fit_time: 1.120707
score_time: [0.01230001 0.00980926 0.0099709  0.00990391 0.01044583]
Average score_time: 0.010486
test_accuracy: [0.8612 0.8624 0.8594 0.8546 0.8638]
Average test_accuracy: 0.860280
train_accuracy: [0.9636  0.9528  0.94555 0.95185 0.944  ]
Average t

In [8]:
# Refit on the whole training set, then keep the second column of
# predict_proba as the score written into the test frame.
model = SGDClassifier(**SGD_params).fit(train_features, train_data['sentiment'].values)
result = model.predict_proba(test_features)[:, 1]
test_data['sentiment'] = result

In [9]:
# Kaggle ROC-AUC score: 0.92870
# Only the id and the predicted score go into the submission file.
submission = test_data[['id', 'sentiment']]
submission.to_csv('res/BOW.csv', index=False, quoting=3)


# Word2Vec

In [10]:
from nltk.corpus import brown
from gensim.models import Word2Vec
from nltk.corpus import stopwords

In [11]:
import nltk.data
# Load the English Punkt tokenizer resource through NLTK; its tokenize()
# method is what splits each review into sentences below.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def paragraphToSentences(paragraph):
    """Split a paragraph into sentences and turn each into a word list.

    The paragraph is stripped of surrounding whitespace, split with the
    module-level ``tokenizer``, and every non-empty sentence is passed through
    ``sentenceToWords``.

    Returns
    -------
    list of list of str
        One word list per sentence, in sentence order.
    """
    word_lists = []
    for raw_sentence in tokenizer.tokenize(paragraph.strip()):
        if not raw_sentence:
            continue
        word_lists.append(sentenceToWords(raw_sentence))
    return word_lists

# Gather the tokenized sentences of every review, training set first and test
# set second, into one flat list.
allSentences = []
for banner, frame in (
    ('Processing Training Set', train_data),
    ('Processing Test Set', test_data),
):
    print(banner)
    for review in frame['review']:
        allSentences.extend(paragraphToSentences(review))

Processing Training Set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Processing Test Set


  ' Beautiful Soup.' % markup)


In [12]:
from gensim.models import word2vec
# Train word embeddings on every sentence gathered from both data sets.
word2vec_model = word2vec.Word2Vec(
    allSentences,
    workers=12,     # training threads
    size=300,       # embedding dimensionality
    min_count=40,   # drop words seen fewer than 40 times
    window=10,      # context window size
    sample=1e-3,    # down-sampling threshold for frequent words
)

In [13]:
# Sanity check: list the words the model considers most similar to "man".
word2vec_model.wv.most_similar("man")

[('woman', 0.6817048788070679),
 ('lady', 0.5643702149391174),
 ('soldier', 0.549045205116272),
 ('businessman', 0.5414022207260132),
 ('boy', 0.5407766699790955),
 ('guy', 0.5258374214172363),
 ('priest', 0.5237928032875061),
 ('lad', 0.5100698471069336),
 ('monk', 0.5043931007385254),
 ('men', 0.49725764989852905)]

In [14]:
# Sanity check: ask the model which of the four words fits least with the rest.
word2vec_model.wv.doesnt_match("france england germany berlin".split())

'berlin'

In [15]:
# Shape of the learned vector table; the second axis is the embedding size
# that the feature-building functions below read.
word2vec_model.wv.vectors.shape

(12857, 300)

In [16]:
def makeFeatureVec(words):
    """Average the Word2Vec vectors of the in-vocabulary words in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document. Tokens unknown to ``word2vec_model`` are
        skipped.

    Returns
    -------
    numpy.ndarray
        A float32 vector with one entry per embedding dimension. When none of
        the tokens is in the vocabulary (including empty input) an all-zero
        vector is returned; dividing by a zero count would otherwise produce
        NaNs that the downstream classifier cannot handle.
    """
    keyed_vectors = word2vec_model.wv
    featureVec = np.zeros(shape=(keyed_vectors.vectors.shape[1],), dtype="float32")
    count = 0
    for word in words:
        # Membership is tested on the keyed vectors directly, which avoids
        # rebuilding a set of the whole vocabulary on every call.
        if word in keyed_vectors:
            count += 1
            featureVec += keyed_vectors[word]
    if count == 0:
        return featureVec
    return featureVec / count

In [17]:
def getAvgFeatureVec(sentences):
    """Build one averaged word-vector row per document.

    Parameters
    ----------
    sentences : sized iterable of str
        Documents given as whitespace-separated word strings.

    Returns
    -------
    numpy.ndarray
        A float32 array with one row per document and one column per
        embedding dimension. The running position is printed every 5000
        documents as a progress indicator.
    """
    dimension = word2vec_model.wv.vectors.shape[1]
    averaged = np.empty(shape=(len(sentences), dimension), dtype="float32")
    for row, sentence in enumerate(sentences):
        if row % 5000 == 0:
            print(row)
        averaged[row] = makeFeatureVec(sentence.split())
    return averaged

In [18]:
# NOTE: this rebinds train_features/test_features, replacing the bag-of-words
# matrices from the earlier section with the averaged Word2Vec vectors.
train_features, test_features = (
    getAvgFeatureVec(frame['words']) for frame in (train_data, test_data)
)

0
5000
10000
15000
20000
0
5000
10000
15000
20000


In [None]:
# Cross-validate the SGD classifier again, this time on the Word2Vec features.
print('SGD:')
sentiment_labels = train_data['sentiment'].values
run_cross_validation(train_features, sentiment_labels, SGDClassifier, SGD_params)

In [None]:
# Refit on the whole training set using the Word2Vec features and keep the
# second column of predict_proba as the score for each test review.
model = SGDClassifier(**SGD_params).fit(train_features, train_data['sentiment'].values)
result = model.predict_proba(test_features)[:, 1]
test_data['sentiment'] = result
# Kaggle ROC-AUC score: 0.93756
submission = test_data[['id', 'sentiment']]
submission.to_csv('res/Word2Vec.csv', index=False, quoting=3)