In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
data_path = './data/'

In [3]:
train_data = pd.read_csv(data_path + 'labeledTrainData.tsv', delimiter='\t', quoting=3)
test_data = pd.read_csv(data_path + 'testData.tsv', delimiter='\t', quoting=3)

In [4]:
from bs4 import BeautifulSoup
import re
def sentenceToWords(sentence):
    """Turn one raw review (or sentence) into a list of lowercase alphabetic tokens.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lowercase the text, and split it on whitespace.

    Args:
        sentence: Text that may contain HTML markup.

    Returns:
        list[str]: Lowercase tokens made only of the letters a-z; empty when
        the input contains no letters.
    """
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed, so the extracted text can differ between
    # machines and a "no parser was explicitly specified" warning is raised.
    sentence = BeautifulSoup(sentence, "html.parser").get_text()
    # Digits and punctuation become spaces so they act as token separators.
    sentence = re.sub("[^a-zA-Z]", " ", sentence)
    words = sentence.lower().split()
    return words

# Store each cleaned review as one space-joined string in a new 'words' column,
# for the training and the test frame alike.
for frame in (train_data, test_data):
    frame['words'] = [
        ' '.join(sentenceToWords(text))
        for text in frame['review']
    ]

# Bag-of-words

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Number of labelled training reviews (row count of the training frame).
train_size = len(train_data.index)

In [6]:
# Bag-of-words featurizer: word-level token counts with the vocabulary capped
# at 5000 entries. No custom tokenizer, preprocessor or stop-word list is used
# because the 'words' column has already been cleaned.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# The vocabulary is learned from the training reviews only; the test reviews
# are projected onto that same vocabulary.
train_features = vectorizer.fit_transform(
    train_data['words'].values
)
test_features = vectorizer.transform(
    test_data['words'].values
)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
import sklearn.model_selection
# Keyword arguments for RandomForestClassifier (n_jobs=-1 -> use all cores).
RF_params = dict(
    n_estimators=100,
    n_jobs=-1,
)

# Keyword arguments for SGDClassifier. The logistic loss is what makes
# predict_proba available on the fitted model.
# NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
# confirm against the installed version before upgrading.
SGD_params = dict(
    max_iter=10000,
    tol=1e-4,
    n_jobs=-1,
    loss='log',
)
def cv(x, y, model, params=None, splits=5, random_state=None):
    """Run shuffled, stratified k-fold cross-validation for one estimator class.

    Args:
        x: Feature matrix handed to ``cross_validate``.
        y: Target labels aligned with ``x``.
        model: Estimator class (not an instance); it is instantiated here.
        params: Optional dict of keyword arguments for ``model``. ``None``
            (the default) means "no arguments".
        splits: Number of folds.
        random_state: Optional seed for the fold shuffling. The default
            ``None`` keeps the previous behaviour (a different split on every
            call); pass an int to make the folds reproducible.

    Returns:
        The dict produced by ``sklearn.model_selection.cross_validate``, with
        timing entries plus train/test entries for each scorer below.
    """
    # A ``None`` default avoids sharing one mutable dict across calls.
    clf = model(**(params or {}))
    kfold = sklearn.model_selection.StratifiedKFold(
        splits, shuffle=True, random_state=random_state
    )
    # NOTE(review): micro-averaged F1 coincides with accuracy for single-label
    # problems (the recorded outputs show identical numbers); consider 'f1' if
    # a distinct metric is wanted.
    scoring = {
        'accuracy': 'accuracy',
        'f1': 'f1_micro',
        'roc_auc': 'roc_auc'
    }
    cv_score = sklearn.model_selection.cross_validate(
        clf, x, y, cv=kfold, scoring=scoring, return_train_score=True
    )
    return cv_score

def run_cross_validation(x, y, model, params):
    """Cross-validate ``model`` with 5 folds and print every metric.

    For each entry returned by ``cv`` this prints the per-fold values followed
    by their mean.

    Args:
        x: Feature matrix.
        y: Target labels.
        model: Estimator class to evaluate.
        params: Keyword arguments used to build the estimator.
    """
    metrics = cv(x, y, model, params=params, splits=5)
    for name in metrics:
        per_fold = metrics[name]
        print('%s: %s' % (name, per_fold))
        print('Average %s: %f' % (name, per_fold.mean()))
        
# Evaluate both classifiers on the same bag-of-words features and labels.
for banner, estimator, options in (
    ('Random Forest: ', RandomForestClassifier, RF_params),
    ('\n\nSGD:', SGDClassifier, SGD_params),
):
    print(banner)
    run_cross_validation(train_features, train_data['sentiment'].values, estimator, options)

Random Forest: 
fit_time: [8.46212125 7.00664186 7.18373871 6.92363095 7.52549791]
Average fit_time: 7.420326
score_time: [0.48268986 0.35405421 0.35841203 0.35830808 0.35904217]
Average score_time: 0.382501
test_accuracy: [0.8372 0.8326 0.841  0.8434 0.8514]
Average test_accuracy: 0.841120
train_accuracy: [1. 1. 1. 1. 1.]
Average train_accuracy: 1.000000
test_f1: [0.8372 0.8326 0.841  0.8434 0.8514]
Average test_f1: 0.841120
train_f1: [1. 1. 1. 1. 1.]
Average train_f1: 1.000000
test_roc_auc: [0.91861128 0.91471744 0.91926568 0.91591616 0.92468208]
Average test_roc_auc: 0.918639
train_roc_auc: [1. 1. 1. 1. 1.]
Average train_roc_auc: 1.000000


SGD:
fit_time: [1.07016206 1.03188992 1.03655696 0.93682003 1.07132268]
Average fit_time: 1.029350
score_time: [0.01092505 0.01448131 0.01440215 0.01158786 0.00974488]
Average score_time: 0.012228
test_accuracy: [0.8524 0.8652 0.8648 0.8624 0.8426]
Average test_accuracy: 0.857480
train_accuracy: [0.9557  0.94155 0.944   0.96125 0.92035]
Average t

In [8]:
# Refit the SGD classifier on all labelled reviews (bag-of-words features).
model = SGDClassifier(**SGD_params).fit(
    train_features,
    train_data['sentiment'].values,
)
# Keep only column 1 of the probability matrix as the submission score.
result = model.predict_proba(test_features)[:, 1]
test_data['sentiment'] = result

In [9]:
# Kaggle ROC-AUC score: 0.92601
# Create the output folder first so the export does not fail with a missing
# directory on a fresh checkout.
os.makedirs('res', exist_ok=True)
# quoting=3 (csv.QUOTE_NONE) writes the fields exactly as stored.
test_data[['id', 'sentiment']].to_csv('res/BOW.csv', index=False, quoting=3)


# Word2Vec

In [10]:
from nltk.corpus import brown
from gensim.models import Word2Vec
from nltk.corpus import stopwords

In [11]:
import nltk.data
# Sentence splitter used by paragraphToSentences below.
# NOTE(review): this presumably needs the NLTK 'punkt' resource to be
# downloaded beforehand - confirm on a fresh environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def paragraphToSentences(paragraph):
    """Split a review into sentences and tokenize each one.

    Args:
        paragraph: Raw review text.

    Returns:
        list[list[str]]: One token list (from ``sentenceToWords``) per
        non-empty sentence found by the module-level ``tokenizer``.
    """
    tokenized = []
    for chunk in tokenizer.tokenize(paragraph.strip()):
        if not chunk:
            # Skip empty sentence strings.
            continue
        tokenized.append(sentenceToWords(chunk))
    return tokenized

# Collect tokenized sentences from both the training and the test reviews;
# this combined list is the corpus the Word2Vec model is trained on.
allSentences = []
for banner, frame in (
    ('Processing Training Set', train_data),
    ('Processing Test Set', test_data),
):
    print(banner)
    for review in frame['review']:
        allSentences += paragraphToSentences(review)

Processing Training Set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Processing Test Set


  ' Beautiful Soup.' % markup)


In [12]:
from gensim.models import word2vec
# Train word embeddings on every tokenized sentence collected above.
word2vec_model = word2vec.Word2Vec(
    allSentences,
    workers=12,      # worker threads used for training
    size=300,        # embedding dimensionality
    min_count=40,    # minimum word-frequency threshold for the vocabulary
    window=10,       # context window size
    sample=1e-3,     # down-sampling threshold for frequent words
)

In [13]:
# Sanity check: list the vocabulary words most similar to "man" together with
# their similarity scores.
word2vec_model.wv.most_similar("man")

[('woman', 0.6721714735031128),
 ('lady', 0.574566662311554),
 ('soldier', 0.5583536028862),
 ('priest', 0.5465677976608276),
 ('guy', 0.543327271938324),
 ('businessman', 0.5413163304328918),
 ('lad', 0.5302108526229858),
 ('boy', 0.5284541845321655),
 ('doctor', 0.519687831401825),
 ('men', 0.5051175355911255)]

In [14]:
# Sanity check: ask the model which of the four words is the odd one out.
word2vec_model.wv.doesnt_match("france england germany berlin".split())

'berlin'

In [15]:
# Shape of the embedding matrix: (vocabulary size, vector dimensionality).
word2vec_model.wv.vectors.shape

(12857, 300)

In [16]:
def makeFeatureVec(words):
    """Average the Word2Vec vectors of the in-vocabulary words of one review.

    Args:
        words: Iterable of tokens.

    Returns:
        numpy.ndarray: float32 vector with one entry per embedding dimension.
        It is the mean of the vectors of all tokens known to
        ``word2vec_model``; when no token is known (or ``words`` is empty) the
        all-zero vector is returned instead of dividing by zero, which would
        yield NaN features.
    """
    keyed_vectors = word2vec_model.wv
    featureVec = np.zeros(shape=(keyed_vectors.vectors.shape[1],), dtype="float32")
    count = 0
    for word in words:
        # Membership is tested directly on the keyed vectors, so the
        # vocabulary set is no longer rebuilt on every call.
        if word in keyed_vectors:
            count += 1
            featureVec += keyed_vectors[word]
    if count == 0:
        return featureVec
    return featureVec / count

In [17]:
def getAvgFeatureVec(sentences):
    """Build the averaged-embedding feature matrix for a collection of reviews.

    Args:
        sentences: Sized iterable of space-separated token strings.

    Returns:
        numpy.ndarray: float32 matrix with one row per review, each row being
        the result of ``makeFeatureVec`` on that review's tokens.
    """
    dim = word2vec_model.wv.vectors.shape[1]
    res = np.empty(shape=(len(sentences), dim), dtype="float32")
    for row, text in enumerate(sentences):
        # Progress indicator: print the row number every 5000 reviews.
        if row % 5000 == 0:
            print(row)
        res[row] = makeFeatureVec(text.split())
    return res

In [18]:
# Replace the bag-of-words matrices with averaged Word2Vec features
# (training frame first, then test frame).
train_features, test_features = (
    getAvgFeatureVec(frame['words'])
    for frame in (train_data, test_data)
)

0
5000
10000
15000
20000
0
5000
10000
15000
20000


In [19]:
# 5-fold evaluation of the SGD classifier on the averaged Word2Vec features.
print('SGD:')
run_cross_validation(
    x=train_features,
    y=train_data['sentiment'].values,
    model=SGDClassifier,
    params=SGD_params,
)

SGD:
fit_time: [0.66455102 1.311234   0.85900378 1.12196302 0.83773685]
Average fit_time: 0.958898
score_time: [0.02380586 0.01853895 0.018049   0.01880908 0.01573992]
Average score_time: 0.018989
test_accuracy: [0.8688 0.8578 0.853  0.8636 0.8568]
Average test_accuracy: 0.860000
train_accuracy: [0.8648  0.86805 0.8608  0.8677  0.86745]
Average train_accuracy: 0.865760
test_f1: [0.8688 0.8578 0.853  0.8636 0.8568]
Average test_f1: 0.860000
train_f1: [0.8648  0.86805 0.8608  0.8677  0.86745]
Average train_f1: 0.865760
test_roc_auc: [0.938292   0.93376928 0.9333984  0.93520064 0.93183568]
Average test_roc_auc: 0.934499
train_roc_auc: [0.93747448 0.93898884 0.93860895 0.93819742 0.93918329]
Average train_roc_auc: 0.938491


In [20]:
# Refit the SGD classifier on all labelled reviews (Word2Vec features) and
# score the test reviews.
model = SGDClassifier(**SGD_params)
model.fit(train_features, train_data['sentiment'].values)
# Keep only column 1 of the probability matrix as the submission score.
result = model.predict_proba(test_features)[:, 1]
test_data['sentiment'] = result
# Kaggle ROC-AUC score: 0.93751
# Create the output folder first so the export does not fail with a missing
# directory on a fresh checkout.
os.makedirs('res', exist_ok=True)
test_data[['id', 'sentiment']].to_csv('res/Word2Vec.csv', index=False, quoting=3)