# 0. Dependencies

- [nltk](https://www.nltk.org)
- [sklearn](http://scikit-learn.org/stable/)
- [word2vec](https://github.com/danielfrg/word2vec)
- [fastText](https://fasttext.cc)

# 1. Data Preprocessing with Sklearn

In [1]:
# Load packages

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load data
def read_lines(fname):
    """Return the content of `fname` as a list of lines.

    The whole file is read, leading/trailing whitespace (including the final
    newline) is stripped, and the rest is split on newline characters. The
    `with` block closes the file handle as soon as the content is in memory.
    """
    with open(fname) as fin:
        return fin.read().strip().split("\n")


trn_texts = read_lines("trn-reviews.txt")
# list(map(...)) yields a real list on both Python 2 and 3, so len() works
trn_labels = list(map(str, read_lines("trn-labels.txt")))
assert len(trn_texts) == len(trn_labels), "training texts/labels differ in length"
print("Training data ...")
print("{} {}".format(len(trn_texts), len(trn_labels)))

dev_texts = read_lines("dev-reviews.txt")
dev_labels = list(map(str, read_lines("dev-labels.txt")))
assert len(dev_texts) == len(dev_labels), "development texts/labels differ in length"
print("Development data ...")
print("{} {}".format(len(dev_texts), len(dev_labels)))

tst_texts = read_lines("tst-reviews.txt")
tst_labels = list(map(str, read_lines("tst-labels.txt")))
assert len(tst_texts) == len(tst_labels), "test texts/labels differ in length"
print("Test data ...")
print("{} {}".format(len(tst_texts), len(tst_labels)))

Training data ...
40000 40000
Development data ...
5000 5000
Test data ...
5000 5000


## Preprocessing the training data with different choices

In [169]:
choice = 3

# choice -> (description printed for the reader, CountVectorizer keyword arguments).
# The vocabulary sizes in the trailing comments are the values recorded in the
# original notebook for each setting.
VECTORIZER_SETUPS = {
    1: ("Preprocessing without any feature selection",
        dict(lowercase=False)),                                  # vocab size 77166
    2: ("Lowercasing all the tokens",
        dict(lowercase=True)),                                   # vocab size 60610
    3: ("Lowercasing and filtering out low-frequency words",
        dict(lowercase=True, min_df=2)),                         # vocab size 31218
    4: ("Lowercasing and filtering out low-frequency words, uni- and bi-gram",
        dict(lowercase=True, min_df=2, ngram_range=(1,2))),      # vocab size 323167
    5: ("Uni- and bi-gram",
        dict(ngram_range=(1,2))),                                # vocab 1048596
    6: ("Lowercasing and filtering out high-frequency words",
        dict(lowercase=True, max_df=0.5)),                       # vocab size 60610
}

# Fail loudly on an unknown choice instead of leaving `vectorizer` undefined
# (or silently reusing one left over from an earlier run of this cell).
if choice not in VECTORIZER_SETUPS:
    raise ValueError("Unknown preprocessing choice: {!r} (expected one of {})".format(
        choice, sorted(VECTORIZER_SETUPS)))

description, vectorizer_kwargs = VECTORIZER_SETUPS[choice]
print(description)
vectorizer = CountVectorizer(**vectorizer_kwargs)

# Learn the vocabulary on the training texts and build the document-term matrix
trn_data = vectorizer.fit_transform(trn_texts)
print(trn_data.shape)

Lowercasing and filtering out low-frequency words
(40000, 31218)


## Preprocessing the dev and test data

In [170]:
# Map the held-out splits onto the already-fitted vectorizer
# (transform only; the vectorizer is not re-fitted here).
dev_data, tst_data = map(vectorizer.transform, (dev_texts, tst_texts))
print(dev_data.shape)
print(tst_data.shape)

(5000, 31218)
(5000, 31218)


# 2. Classification

In [166]:
from sklearn.linear_model import LogisticRegression as LR

In [167]:
# Build the logistic-regression classifier and fit it on the training matrix.
# The fit call stays last so the fitted estimator is displayed as the cell output.
classifier = LR(solver="lbfgs", C=10000)
classifier.fit(trn_data, trn_labels)

LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [168]:
# Report mean accuracy on the training split, then on the development split
trn_accuracy = classifier.score(trn_data, trn_labels)
print("Training accuracy = " + str(trn_accuracy))
dev_accuracy = classifier.score(dev_data, dev_labels)
print("Dev accuracy = " + str(dev_accuracy))

Training accuracy = 0.913325
Dev accuracy = 0.5754


Setup: Choice 1, C=10000, solver='lbfgs'

- Trn acc = 0.8492
- Dev acc = 0.6056

Setup: Choice 2, C=10000, solver='lbfgs'

- Trn acc = 0.8213
- Dev acc = 0.6142

Setup: Choice 3, C=10000, solver='lbfgs'

- Trn acc = 0.8099
- Dev acc = 0.6174

Setup: Choice 4, C=10000, solver='lbfgs'

- Trn acc = 0.9978
- Dev acc = 0.6202

Setup: Choice 5, C=10000, solver='lbfgs'

- Trn acc = 0.9999
- Dev acc = 0.624

# 3. Feature Analysis

- Setup: choice = 3

In [89]:
# get vocab

# token -> index mapping taken from the fitted vectorizer
vocab = vectorizer.vocabulary_
# Inverse mapping (index -> token), used to turn weight positions back into words.
# .items() works on both Python 2 and 3, and the generator expression does not
# leak loop variables into the notebook namespace.
ivocab = dict((index, token) for (token, index) in vocab.items())

In [90]:
# get classification weights
# The cells below index this as weights[class_row][vocab[token]], i.e. the first
# axis selects a class and the second a vocabulary index.

weights = classifier.coef_

In [157]:
# weight that class row 0 assigns to the token 'delicious'
delicious_index = vocab['delicious']
print(weights[0][delicious_index])

-3.60375083477


In [132]:
# get salient features for each class

from numpy import argsort

def get_top_features(weight, vocab, topn=10):
    """Print the `topn` highest-weighted features, one token per line.

    Parameters
    ----------
    weight : sequence of numbers
        Weight of every feature; position i holds the weight of feature i.
    vocab : mapping
        Maps a feature index to its token (index -> word). The calls in this
        notebook pass `ivocab` here.
    topn : int, optional
        Maximum number of features to print (default 10). When fewer features
        exist, all of them are printed.

    Returns
    -------
    None
        The tokens are printed in order of decreasing weight.
    """
    # argsort is ascending, so reverse it to rank the largest weights first;
    # slicing (rather than indexing 0..topn-1) tolerates topn > len(weight)
    top_indices = argsort(weight)[::-1][:max(topn, 0)]
    for index in top_indices:
        # use the mapping that was passed in rather than a notebook-level global
        print(vocab[int(index)])

In [134]:
# top features with user rating 5
# NOTE(review): row 4 is presumably the class for rating 5 (assumes five sorted
# classes "1".."5") -- verify against classifier.classes_
get_top_features(weights[4], ivocab)

exceptional
incredible
phenomenal
body
regret
worried
skeptical
hesitate
happier
mike


In [139]:
# top features with user rating 1
# NOTE(review): row 0 is presumably the class for rating 1 (assumes sorted
# classes starting at "1") -- verify against classifier.classes_
get_top_features(weights[0], ivocab)

worst
joke
disgusted
unprofessional
garbage
disgusting
luck
pathetic
apologies
horrible


# 3. Word2vec demo

In [98]:
import word2vec

# more examples in http://nbviewer.jupyter.org/github/danielfrg/word2vec/blob/master/examples/word2vec.ipynb

In [111]:
# Preprocessing
from nltk.tokenize import wordpunct_tokenize

def preprocessing_for_word2vec(fname, outfname='word2vec-input.txt'):
    """Write a tokenized, lower-cased, single-line copy of `fname`.

    Newlines in the input are replaced by spaces, the text is split with
    wordpunct_tokenize, and the tokens are re-joined with single spaces and
    lower-cased before being written out.

    Parameters
    ----------
    fname : str
        Path of the raw text file to read.
    outfname : str, optional
        Path of the file to write (default 'word2vec-input.txt').
    """
    # read inside a with-block so the input handle is closed right away
    with open(fname) as fin:
        text = fin.read().replace("\n"," ")
    text = " ".join(wordpunct_tokenize(text)).lower()
    with open(outfname, 'w') as fout:
        fout.write(text)

# NOTE(review): the function is called for its side effect (it writes the tokenized
# corpus to disk); word2vec_data is presumably not needed afterwards -- verify
word2vec_data = preprocessing_for_word2vec("trn-reviews.txt")

In [113]:
# Train on the preprocessed corpus; the result file is loaded in the next section.
# NOTE(review): this calls doc2vec although the section only queries word
# similarities -- word2vec.word2vec may have been intended; confirm before re-running.
word2vec.doc2vec("word2vec-input.txt", "word2vec-output.bin", verbose=True)

Starting training using file word2vec-input.txt
Vocab size: 19026
Words in train file: 5531279
Alpha: 0.000002  Progress: 100.14%  Words/thread/sec: 530.87k  

## Word Similarity Prediction

In [116]:
# load the model file produced by the training cell
w2v_model = word2vec.load('word2vec-output.bin')


In [118]:
# Nearest neighbours of 'yummy'; `indexes` are positions into w2v_model.vocab
# (`metrics` presumably holds the matching cosine scores -- TODO confirm)
indexes, metrics = w2v_model.cosine('yummy')
w2v_model.vocab[indexes]

array([u'delicious', u'tasty', u'delish', u'yum', u'incredible', u'superb',
       u'phenomenal', u'fantastic', u'disappoint', u'awesome'], 
      dtype='<U78')

In [161]:
# Nearest neighbours of 'horrible'; `indexes` are positions into w2v_model.vocab
# (`metrics` presumably holds the matching cosine scores -- TODO confirm)
indexes, metrics = w2v_model.cosine('horrible')
w2v_model.vocab[indexes]

array([u'terrible', u'poor', u'awful', u'customer', u'exceptional', u'bad',
       u'astonished', u'pleasant', u'happier', u'zero'], 
      dtype='<U78')

# 4. fastText

In [121]:
# reformat the data for fastText

from nltk.tokenize import wordpunct_tokenize

def reformat(infname, labelfname, outfname):
    """Write (label, text) pairs as one "__label__<label> <tokens>" line each.

    Parameters
    ----------
    infname : str
        Path of the file holding one text per line.
    labelfname : str
        Path of the file holding one label per line, aligned with `infname`.
    outfname : str
        Path of the file to write.

    Raises
    ------
    ValueError
        If the two input files do not contain the same number of lines.
    """
    # strip before splitting so a trailing newline does not produce an empty
    # final record (which would be written as a bare "__label__ " line)
    with open(infname) as fin:
        texts = fin.read().strip().split("\n")
    with open(labelfname) as fin:
        labels = fin.read().strip().split("\n")
    # zip() would silently drop the surplus lines of the longer file
    if len(texts) != len(labels):
        raise ValueError("{} has {} lines but {} has {}".format(
            infname, len(texts), labelfname, len(labels)))
    # the with-block flushes and closes the output before anything else reads it
    with open(outfname, 'w') as fout:
        for (text, label) in zip(texts, labels):
            fout.write("__label__{} {}\n".format(label, " ".join(wordpunct_tokenize(text))))

In [123]:
# Convert each split (train, dev, test) to the fastText input format
for review_file, label_file, fasttext_file in [
    ("trn-reviews.txt", "trn-labels.txt", "trn-data-fasttext.txt"),
    ("dev-reviews.txt", "dev-labels.txt", "dev-data-fasttext.txt"),
    ("tst-reviews.txt", "tst-labels.txt", "tst-data-fasttext.txt"),
]:
    reformat(review_file, label_file, fasttext_file)

## Run fastText


    ./fasttext supervised -input ../trn-data-fasttext.txt -output model 
    ./fasttext test model.bin ../trn-data-fasttext.txt
    ./fasttext test model.bin ../dev-data-fasttext.txt


- Trn acc = 0.654
- Dev acc = 0.651
