### Read/Write functions

In [1]:
def read_tsv(tar, fname):
    """Read a two-column, tab-separated member of an open tar archive.

    Every non-blank line is expected to hold a label, a tab, then the text.
    The line is split on the first tab only, so a tab inside the text no
    longer raises "too many values to unpack". Blank lines are skipped.

    Args:
        tar: an open ``tarfile.TarFile``.
        fname: name of the archive member to read.

    Returns:
        ``(data, labels)``: two parallel lists of strings.

    Raises:
        KeyError: if ``fname`` is not a member of the archive.
        ValueError: if a non-blank line contains no tab.
    """
    member = tar.getmember(fname)
    print(member.name)
    data = []
    labels = []
    # The context manager closes the extracted file object even on error.
    with tar.extractfile(member) as tf:
        for raw in tf:
            line = raw.decode("utf-8").strip()
            if not line:
                continue
            label, text = line.split("\t", 1)
            labels.append(label)
            data.append(text)
    return data, labels

In [2]:
def read_files(tarfname):
    """Read the training and development data from the sentiment tar file.

    The returned object contains various fields that store sentiment data, such as:

    train_data,dev_data: list of documents (one raw text string per document)
    train_labels,dev_labels: the true string label for each document (same length as data)

    The data is also preprocessed for use with scikit-learn, as:

    count_vect: CountVectorizer fitted on train_data (for reapplication on new data)
    trainX,devX: bag-of-words matrices, i.e. documents processed through the vectorizer
    le: LabelEncoder fitted on train_labels, i.e. a mapper from string labels to ints
    target_labels: the encoder's classes_ (same order as used in le)
    trainy,devy: array of int labels, one for each document
    """
    import tarfile
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn import preprocessing

    class Data: pass
    sentiment = Data()

    # The archive is only needed while reading; `with` closes it even if a
    # member is missing or a line is malformed.
    with tarfile.open(tarfname, "r:gz") as tar:
        # Members may live under a directory prefix, so match by substring and
        # fall back to the bare file names.
        trainname = "train.tsv"
        devname = "dev.tsv"
        for member in tar.getmembers():
            if 'train.tsv' in member.name:
                trainname = member.name
            elif 'dev.tsv' in member.name:
                devname = member.name

        print("-- train data")
        sentiment.train_data, sentiment.train_labels = read_tsv(tar, trainname)
        print(len(sentiment.train_data))

        print("-- dev data")
        sentiment.dev_data, sentiment.dev_labels = read_tsv(tar, devname)
        print(len(sentiment.dev_data))

    print("-- transforming data and labels")
    # Vocabulary is learned from the training split only; dev is transformed.
    sentiment.count_vect = CountVectorizer()
    sentiment.trainX = sentiment.count_vect.fit_transform(sentiment.train_data)
    sentiment.devX = sentiment.count_vect.transform(sentiment.dev_data)
    # Labels are likewise encoded with an encoder fitted on the training labels.
    sentiment.le = preprocessing.LabelEncoder()
    sentiment.le.fit(sentiment.train_labels)
    sentiment.target_labels = sentiment.le.classes_
    sentiment.trainy = sentiment.le.transform(sentiment.train_labels)
    sentiment.devy = sentiment.le.transform(sentiment.dev_labels)
    return sentiment

In [3]:
def read_unlabeled(tarfname, sentiment):
    """Reads the unlabeled data.

    The returned object contains two fields that represent the unlabeled data.

    data: list of documents, one stripped text string per line of the file
    X: bag-of-words matrix for the documents, produced by sentiment.count_vect

    Args:
        tarfname: path to the gzip-compressed tar archive.
        sentiment: object exposing a fitted ``count_vect`` with ``transform``.
    """
    import tarfile

    class Data: pass
    unlabeled = Data()
    unlabeled.data = []

    # `with` guarantees both the archive and the member are closed on error.
    with tarfile.open(tarfname, "r:gz") as tar:
        # The member may sit under a directory prefix; match by substring.
        unlabeledname = "unlabeled.tsv"
        for member in tar.getmembers():
            if 'unlabeled.tsv' in member.name:
                unlabeledname = member.name

        print(unlabeledname)
        with tar.extractfile(unlabeledname) as tf:
            for line in tf:
                unlabeled.data.append(line.decode("utf-8").strip())

    unlabeled.X = sentiment.count_vect.transform(unlabeled.data)
    print(unlabeled.X.shape)
    return unlabeled

In [4]:
def write_pred_kaggle_file(unlabeled, cls, outfname, sentiment):
    """Writes the predictions in Kaggle format.

    Given the unlabeled object, classifier, output filename, and the sentiment
    object, this function predicts on ``unlabeled.X`` and writes one
    ``ID,LABEL`` row per prediction to ``outfname`` (IDs start at 1). The
    sentiment object is required to map the predicted ints back to the
    original label names via ``sentiment.le``.
    """
    yp = cls.predict(unlabeled.X)
    labels = sentiment.le.inverse_transform(yp)
    # `with` closes the file even if a write fails part-way through.
    with open(outfname, 'w') as f:
        f.write("ID,LABEL\n")
        for row_id, label in enumerate(labels, start=1):
            # str() keeps this working if the encoder returns non-str labels.
            f.write("%d,%s\n" % (row_id, str(label)))

In [5]:
def write_basic_kaggle_file(tsvfile, outfname):
    """Writes the output Kaggle file of the naive baseline.

    This baseline predicts POSITIVE for all the instances: one ``ID,POSITIVE``
    row is written for every non-blank line of ``tsvfile`` (IDs start at 1).
    The line content is not parsed, so tabs inside a review cannot break it.
    """
    # Both files are managed by `with`, so neither leaks on error.
    with open(tsvfile, 'r') as tf, open(outfname, 'w') as f:
        f.write("ID,LABEL\n")
        i = 0
        for line in tf:
            if not line.strip():
                continue  # a blank line is not an instance
            i += 1
            f.write("%d,POSITIVE\n" % i)

### Supervised TC

In [6]:
import numpy as np
import string
import math
import re
from collections import defaultdict
from nltk.corpus import stopwords 
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression

In [7]:
# Load the labeled train/dev splits and the unlabeled pool from one archive.
print("Reading data")
tarfname = "data/sentiment.tar.gz"
sentiment = read_files(tarfname)
print("\nReading unlabeled data")
# The unlabeled text is vectorized with the CountVectorizer stored on `sentiment`.
unlabeled = read_unlabeled(tarfname, sentiment)

Reading data
-- train data
sentiment/train.tsv
4582
-- dev data
sentiment/dev.tsv
458
-- transforming data and labels

Reading unlabeled data
sentiment/unlabeled.tsv
(91524, 9882)


In [8]:
def train_classifier(X, y, c):
    """Fit and return a logistic-regression classifier.

    The model is not built with scikit-learn defaults: it uses inverse
    regularization strength ``C=c``, ``random_state=0``, the ``lbfgs`` solver
    and ``max_iter=10000``.

    Args:
        X: training feature matrix.
        y: training labels, one per row of ``X``.
        c: value passed as ``C`` to ``LogisticRegression``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    from sklearn.linear_model import LogisticRegression
    settings = dict(C=c, random_state=0, solver='lbfgs', max_iter=10000)
    model = LogisticRegression(**settings)
    model.fit(X, y)
    return model

In [9]:
def evaluate(X, yt, cls, name='data'):
    """Print the accuracy of ``cls`` on a labeled set.

    Args:
        X: feature matrix to predict on.
        yt: true labels for the rows of ``X``.
        cls: fitted classifier exposing ``predict``.
        name: label for the data split, used only in the printed message.

    Returns:
        None; the accuracy is printed, not returned.
    """
    from sklearn import metrics
    accuracy = metrics.accuracy_score(yt, cls.predict(X))
    print("  Accuracy on %s  is: %s" % (name, accuracy))

#### Unigram BAG

In [312]:
# Baseline: logistic regression (C=0.6) on raw CountVectorizer counts.
cls = train_classifier(sentiment.trainX, sentiment.trainy, 0.6)
print("\nEvaluating")
evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

print("Writing predictions to a file")
# Recompute the count features: other cells rebind unlabeled.X to different
# feature spaces, so it cannot be assumed to still hold count vectors here.
unlabeled.X = sentiment.count_vect.transform(unlabeled.data)
write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)


Evaluating
  Accuracy on train  is: 0.9644260148406809
  Accuracy on dev  is: 0.7816593886462883
Writing predictions to a file


#### TF_IDF Unigram

In [321]:
def tf(document):
    """Return log-scaled unigram term frequencies for one document.

    The text is lower-cased and split on runs of non-word characters; each
    resulting token (including the empty string that ``re.split`` yields when
    the text starts or ends with a separator) maps to ``log(1 + count)``.

    Returns:
        A ``defaultdict`` from token to its log-scaled frequency.
    """
    counts = defaultdict(int)
    for token in re.split(r'\W+', document.lower()):
        counts[token] += 1
    # Overwrite raw counts in place; the key set does not change.
    for token, occurrences in counts.items():
        counts[token] = math.log(1 + occurrences)
    return counts

In [322]:
def tfidf_matrix(dataset):
    """Build a sparse TF-IDF matrix for ``dataset``.

    Relies on the notebook globals ``wordsId`` (token -> column), ``idf``
    (token -> IDF weight) and ``words`` (vocabulary list, fixes the width).
    Tokens absent from ``wordsId`` are ignored.

    Returns:
        A ``csr_matrix`` of shape ``(len(dataset), len(words))``.
    """
    rows, cols, values = [], [], []
    for doc_index, document in enumerate(dataset):
        for token, weight in tf(document).items():
            if token not in wordsId:
                continue
            rows.append(doc_index)
            cols.append(wordsId[token])
            values.append(weight * idf[token])
    return csr_matrix((values, (rows, cols)), shape=(len(dataset), len(words)))

In [323]:
# Build the unigram vocabulary and IDF table from the labeled training set.
# `number` is not referenced again in this cell.
number = set('0 1 2 3 4 5 6 7 8 9'.split())
wordsCount = defaultdict(int)     # token -> total occurrences over all documents
documentCount = defaultdict(int)  # token -> number of documents containing it

for d in sentiment.train_data:
    # The join over characters rebuilds the same string; only lower() matters.
    lower = ''.join([c for c in d.lower()])
    # re.split can yield '' when the text starts/ends with a non-word character,
    # so the empty string is counted as a token too.
    r = re.split(r'\W+', lower)
    for w in r:
        wordsCount[w] += 1

    for w in set(r):
        documentCount[w] += 1

# Column index of each token = its first-seen position.
words = [x for x in wordsCount]
wordsId = dict(zip(words, range(len(words))))

# idf(w) = log(N / df(w)); every w in `words` occurs in at least one document,
# so the denominator is never zero.
idf = defaultdict(float)
for w in words:
    idf[w] = math.log(len(sentiment.train_data) / documentCount[w])

In [324]:
# Unigram TF-IDF features; dev reuses the vocabulary and IDF built from train.
uni_trainX = tfidf_matrix(sentiment.train_data)
uni_devX = tfidf_matrix(sentiment.dev_data)

In [329]:
# Logistic regression (C=0.7) on unigram TF-IDF features.
cls = train_classifier(uni_trainX, sentiment.trainy, 0.7)
print("\nEvaluating")
evaluate(uni_trainX, sentiment.trainy, cls, 'train')
evaluate(uni_devX, sentiment.devy, cls, 'dev')


Evaluating
  Accuracy on train  is: 0.9997817546922741
  Accuracy on dev  is: 0.7969432314410481


In [585]:
print("Writing predictions to a file")
# Rebinds unlabeled.X to unigram TF-IDF features so it matches the current `cls`.
# NOTE(review): uses the one-argument tfidf_matrix; a later cell redefines
# tfidf_matrix with three parameters, so this depends on execution order.
unlabeled.X = tfidf_matrix(unlabeled.data)
write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)

Writing predictions to a file


#### TF_IDF bigram

In [293]:
def tf_bigram(document):
    """Return log-scaled bigram frequencies for one document.

    The text is lower-cased and split on runs of non-word characters; every
    pair of adjacent tokens, joined by a single space, maps to
    ``log(1 + count)``.

    Returns:
        A ``defaultdict`` from bigram string to its log-scaled frequency.
    """
    tokens = re.split(r'\W+', document.lower())
    counts = defaultdict(int)
    for left, right in zip(tokens, tokens[1:]):
        counts[left + ' ' + right] += 1
    # Overwrite raw counts in place; the key set does not change.
    for pair, occurrences in counts.items():
        counts[pair] = math.log(1 + occurrences)
    return counts

In [294]:
def tfidf_matrix_bigram(dataset):
    """Build a sparse bigram TF-IDF matrix for ``dataset``.

    Relies on the notebook globals ``bigramsId`` (bigram -> column),
    ``bigram_idf`` (bigram -> IDF weight) and ``bigrams`` (vocabulary list,
    fixes the width). Bigrams absent from ``bigramsId`` are ignored.

    Returns:
        A ``csr_matrix`` of shape ``(len(dataset), len(bigrams))``.
    """
    rows, cols, values = [], [], []
    for doc_index, document in enumerate(dataset):
        for pair, weight in tf_bigram(document).items():
            if pair not in bigramsId:
                continue
            rows.append(doc_index)
            cols.append(bigramsId[pair])
            values.append(weight * bigram_idf[pair])
    return csr_matrix((values, (rows, cols)), shape=(len(dataset), len(bigrams)))

In [295]:
# Build the bigram vocabulary and IDF table from the labeled training set.
bigramCount = defaultdict(int)          # bigram -> total occurrences
bigramDocumentCount = defaultdict(int)  # bigram -> number of documents containing it

for d in sentiment.train_data:
    # The join over characters rebuilds the same string; only lower() matters.
    lower = ''.join([c for c in d.lower()])
    r = re.split(r'\W+', lower)
    br = set()  # distinct bigrams of this document, for the document frequency
    for i in range(len(r) - 1):
        b = r[i] + ' ' + r[i + 1]
        br.add(b)
        bigramCount[b] += 1
    for b in br:
        bigramDocumentCount[b] += 1

# Column index of each bigram = its first-seen position.
bigrams = [b for b in bigramCount]
bigramsId = dict(zip(bigrams, range(len(bigrams))))

# idf(b) = log(N / df(b)); every listed bigram occurs in at least one document.
bigram_idf = defaultdict(float)
for b in bigrams:
    bigram_idf[b] = math.log(len(sentiment.train_data) / bigramDocumentCount[b])

In [296]:
# Bigram TF-IDF features; dev reuses the vocabulary and IDF built from train.
bi_trainX = tfidf_matrix_bigram(sentiment.train_data)
bi_devX = tfidf_matrix_bigram(sentiment.dev_data)

In [357]:
# Logistic regression on bigram TF-IDF features, with a much smaller C (0.006).
cls = train_classifier(bi_trainX, sentiment.trainy, 0.006)
print("\nEvaluating")
evaluate(bi_trainX, sentiment.trainy, cls, 'train')
evaluate(bi_devX, sentiment.devy, cls, 'dev')


Evaluating
  Accuracy on train  is: 0.9993452640768223
  Accuracy on dev  is: 0.7860262008733624


#### TF_IDF unigram + bigram

In [298]:
from scipy.sparse import hstack
# Concatenate column-wise: unigram columns first, then bigram columns.
ubi_trainX = hstack((uni_trainX, bi_trainX))
ubi_devX = hstack((uni_devX, bi_devX))

In [365]:
# Logistic regression (C=0.1) on the combined unigram + bigram TF-IDF features.
cls = train_classifier(ubi_trainX, sentiment.trainy, 0.1)
print("\nEvaluating")
evaluate(ubi_trainX, sentiment.trainy, cls, 'train')
evaluate(ubi_devX, sentiment.devy, cls, 'dev')


Evaluating
  Accuracy on train  is: 1.0
  Accuracy on dev  is: 0.8078602620087336


In [366]:
print("Writing predictions to a file")
# Same column layout as training: unigram block followed by bigram block.
unlabeled.X = hstack((tfidf_matrix(unlabeled.data), tfidf_matrix_bigram(unlabeled.data)))
write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)

Writing predictions to a file


### Semi-supervised TC

In [10]:
def tokenize(dataset):
    """Build a joint unigram + bigram vocabulary and IDF table.

    Each document is lower-cased and split on runs of non-word characters.
    Every token and every pair of adjacent tokens (joined by one space) is a
    term. Term ids follow first-seen order.

    Args:
        dataset: sequence of raw text strings.

    Returns:
        ``(termsId, idf)``: a dict from term to column index, and a
        ``defaultdict(float)`` from term to ``log(len(dataset) / df(term))``.
    """
    term_totals = defaultdict(int)
    doc_freq = defaultdict(int)

    for text in dataset:
        tokens = re.split(r'\W+', text.lower())
        last = len(tokens) - 1
        seen = set()
        for pos, word in enumerate(tokens):
            seen.add(word)
            term_totals[word] += 1
            if pos != last:
                pair = word + ' ' + tokens[pos + 1]
                seen.add(pair)
                term_totals[pair] += 1
        # Each distinct term counts once per document.
        for term in seen:
            doc_freq[term] += 1

    termsId = {term: position for position, term in enumerate(term_totals)}

    n_docs = len(dataset)
    idf = defaultdict(float)
    for term in termsId:
        idf[term] = math.log(n_docs / doc_freq[term])
    return (termsId, idf)

In [11]:
def tf(document):
    """Return log-scaled unigram and bigram frequencies for one document.

    The text is lower-cased and split on runs of non-word characters. Every
    token, and every pair of adjacent tokens joined by one space, maps to
    ``log(1 + count)``.

    Returns:
        A ``defaultdict`` from term to its log-scaled frequency.
    """
    tokens = re.split(r'\W+', document.lower())
    last = len(tokens) - 1
    counts = defaultdict(int)
    for pos, word in enumerate(tokens):
        counts[word] += 1
        if pos != last:
            counts[word + ' ' + tokens[pos + 1]] += 1

    # Overwrite raw counts in place; the key set does not change.
    for term, occurrences in counts.items():
        counts[term] = math.log(1 + occurrences)
    return counts

In [12]:
def tfidf_matrix(dataset, termsId, idf):
    """Build a sparse TF-IDF matrix for ``dataset`` from an explicit vocabulary.

    Args:
        dataset: sequence of raw text strings.
        termsId: dict from term to column index; terms not in it are ignored.
        idf: mapping from term to IDF weight.

    Returns:
        A ``csr_matrix`` of shape ``(len(dataset), len(termsId))``.
    """
    rows, cols, values = [], [], []
    for doc_index, document in enumerate(dataset):
        for term, weight in tf(document).items():
            if term not in termsId:
                continue
            rows.append(doc_index)
            cols.append(termsId[term])
            values.append(weight * idf[term])
    return csr_matrix((values, (rows, cols)), shape=(len(dataset), len(termsId)))

In [13]:
from copy import deepcopy
from sklearn import metrics

In [28]:
# Compare, term by term, the IDF computed on the labeled training set alone
# with the IDF currently held in `idf_table`.
# NOTE(review): no cell above this one (in file order) assigns `idf_table`; it
# is assigned inside the self-training loop further down (execution count 19),
# so this cell presumably relies on out-of-order execution — confirm.
# This also rebinds the global `wordsId`.
wordsId, idf_table_1 = tokenize(sentiment.train_data)
diff_list = []
for w in idf_table_1:
    diff = abs(idf_table[w] - idf_table_1[w])
    diff_list.append((diff, w))
# Ascending by difference, ties broken by term.
diff_list.sort()

In [29]:
# `differens` is not defined anywhere in the notebook (NameError on re-run);
# the list of (difference, term) pairs built above is named `diff_list`.
len(diff_list)

[(2.5657045367685942, '魚肉也算新鮮 美中不足是壽司飯煮的不是很透'),
 (2.5657045367685942, '魚肉也算新鮮'),
 (2.5657045367685942, '飯給吃完 要不然就要算單點價錢基於食物與服務我只能給它兩星級'),
 (2.5657045367685942, '飯給吃完'),
 (2.5657045367685942, '還是堅持要肉 飯給吃完'),
 (2.5657045367685942, '還是堅持要肉'),
 (2.5657045367685942, '要不然就要算單點價錢基於食物與服務我只能給它兩星級 5'),
 (2.5657045367685942, '要不然就要算單點價錢基於食物與服務我只能給它兩星級'),
 (2.5657045367685942, '美中不足是壽司飯煮的不是很透 反應給日本師傅與經理後'),
 (2.5657045367685942, '美中不足是壽司飯煮的不是很透'),
 (2.5657045367685942, '昨天到這家口碑十足的日式壽司店 魚肉也算新鮮'),
 (2.5657045367685942, '昨天到這家口碑十足的日式壽司店'),
 (2.5657045367685942, '年後 昨天到這家口碑十足的日式壽司店'),
 (2.5657045367685942, '年後'),
 (2.5657045367685942, '反應給日本師傅與經理後 還是堅持要肉'),
 (2.5657045367685942, '反應給日本師傅與經理後'),
 (2.5657045367685942, 'über 25'),
 (2.5657045367685942, 'über'),
 (2.5657045367685942, 'être reçus'),
 (2.5657045367685942, 'été pour'),
 (2.5657045367685942, 'étions assises'),
 (2.5657045367685942, 'étions'),
 (2.5657045367685942, 'était pas'),
 (2.5657045367685942, 'équipe distante'),
 (2.5657045367685942, '

#### most_confident_prediction (fixed cutoff) + stop when dev accuracy stops increasing

In [19]:
# Self-training with TF-IDF features: repeatedly train on the labeled set plus
# confidently pseudo-labeled documents from the unlabeled pool.
# NOTE(review): heapq is imported but not used in this cell.
import heapq

cur_train_data = deepcopy(sentiment.train_data)
cur_trainy = deepcopy(sentiment.trainy)
 
unlabel_data = deepcopy(unlabeled.data)
# NOTE(review): this testX is overwritten inside the loop before it is read.
testX = unlabeled.X
# NOTE(review): pre_acc is assigned but never compared with dev_acc; the loop
# stops only on the fixed round count below.
pre_acc = 0
# NOTE(review): both results of this call are rebound before being read in this cell.
wordsId, idf_table_1 = tokenize(cur_train_data)
index = 0
while True:
    index += 1
    
    # Vocabulary and IDF are rebuilt every round from the current training text.
    # NOTE(review): `idf_table` is also the name of a function defined later in
    # the notebook; whichever cell runs last wins.
    wordsId, idf_table = tokenize(cur_train_data)
    trainX = tfidf_matrix(cur_train_data, wordsId, idf_table)
    devX = tfidf_matrix(sentiment.dev_data, wordsId, idf_table)
    cls = train_classifier(trainX, cur_trainy, 0.1)
    
    dev_yp = cls.predict(devX)
    dev_acc = metrics.accuracy_score(sentiment.devy, dev_yp)
    
    print("loop", index, ": dev accuracy:", dev_acc)
    print("         train size:  ", len(cur_train_data))
    # Hard stop after 8 rounds; `cls`, `wordsId`, `idf_table` and `devX` are left
    # holding the round-8 state for the cells that follow.
    if index == 8:
        break
    pre_acc = dev_acc
        
    testX = tfidf_matrix(unlabel_data, wordsId, idf_table)
    test_yp = cls.predict(testX)
    # Confidence = absolute decision-function value.
    test_confidence = abs(cls.decision_function(testX))
    
    expand_data = []
    expand_y = []    
    
    # Keep every unlabeled document whose confidence exceeds the fixed cutoff 3.5.
    for i in range(len(test_confidence)):
        if test_confidence[i] > 3.5:
            expand_data.append(unlabeled.data[i])
            expand_y.append(test_yp[i])
    
    # Not cumulative: each round restarts from the original labeled set plus
    # this round's selection.
    cur_train_data = sentiment.train_data + expand_data
    cur_trainy = np.concatenate((sentiment.trainy, np.array(expand_y)))

loop 1 : dev accuracy: 0.8078602620087336
         train size:   4582
loop 2 : dev accuracy: 0.8078602620087336
         train size:   19602
loop 3 : dev accuracy: 0.8013100436681223
         train size:   34557
loop 4 : dev accuracy: 0.7882096069868996
         train size:   45763
loop 5 : dev accuracy: 0.7903930131004366
         train size:   52570
loop 6 : dev accuracy: 0.7838427947598253
         train size:   56310
loop 7 : dev accuracy: 0.7838427947598253
         train size:   58369
loop 8 : dev accuracy: 0.7751091703056768
         train size:   59611


In [21]:
# Collect the dev documents the current classifier gets wrong, paired with the
# (wrong) predicted integer label.
dev_yp = cls.predict(devX)
error_list = []
for i in range(len(sentiment.dev_data)):
    if dev_yp[i] != sentiment.devy[i]:
        error_list.append((sentiment.dev_data[i], dev_yp[i]))

In [22]:
# Display every misclassified dev document as (text, predicted label).
error_list

[('Seeeeeeeeeeehr wunderbares Geschaeft fuer Menschen mit der Liebe zum Espresso.Herr Koeberl verkauft nicht nur Espressomaschinen, nein, man kann auch einen herausragend leckeren Espresso (oder auch 2) bei ihm trinken.Er fuehrt',
  0),
 ('If you want your fancy flavored iced coffee, sure go here. But if you want a good version of their namesake- doughnuts, then go elsewhere. Specifically, go south to Fresh',
  1),
 ('We decided to try hash house a go go after reading several positive yelp reviews.We were a little put off when we got to the Imperial Palace location at approximately',
  0),
 ('Had an awsome late lunch at this bbq restaurant this afternoon.  I have a gluten alergy so it was a relief to be able to talk to the owner',
  0),
 ("You are guaranteed a headache because of all of the annoying, loud advertisements in your face while you're filling up your car. It's out of control.",
  1),
 ("This location is fairly large compared to other's I've been to.  I prefer this location o

In [48]:
print("Writing predictions to a file")
# Uses whatever `cls`, `wordsId` and `idf_table` currently hold; after the
# self-training loop above that is the last round's model and vocabulary.
unlabeled.X = tfidf_matrix(unlabeled.data, wordsId, idf_table)
write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)

Writing predictions to a file


##### not using unlabeled at first
k = 6: 0.8013 -> 0.7948
       4582 -> 10401
k = 5.75: 0.8013 -> 0.7948
          4582 -> 11572
k = 5.5: 0.8013 -> 0.8034 -> 0.8100 -> 0.7991
         4582 -> 12868 -> 17310 -> 21104
k = 5.25: 0.8013 -> 0.8056 -> 0.8079 -> 0.8057
          4582 -> 14412 -> 20496 -> 25721
k = 5: 0.8013 -> 0.8057 -> 0.8035
       4582 -> 16109 -> 24104
k = 4.5: 0.8013 -> 0.8035 -> 0.8035
         4582 -> 20219 -> 32489
k = 4: 0.8013 -> 0.8035 -> 0.8057 -> 0.7926
       4582 -> 25301 -> 41856 -> 52429
k = 3: 0.8013 -> 0.8035 -> 0.7991
       4582 -> 38345 -> 60895

### Word Embedding using word2vec and tfidf

In [383]:
def idf_table(dataset):
    """Compute IDF weights over already-tokenized documents.

    Args:
        dataset: sequence of documents, each an iterable of tokens.

    Returns:
        A ``defaultdict(float)`` from token to ``log(len(dataset) / df)``,
        where ``df`` is the number of documents containing the token.
    """
    doc_freq = defaultdict(int)
    for tokens in dataset:
        # set() so a token counts once per document.
        for token in set(tokens):
            doc_freq[token] += 1

    n_docs = len(dataset)
    weights = defaultdict(float)
    for token, df in doc_freq.items():
        weights[token] = math.log(n_docs / df)
    return weights

In [384]:
def tf_table(document):
    """Return log-scaled token frequencies for one tokenized document.

    Args:
        document: iterable of tokens.

    Returns:
        A ``defaultdict`` from token to ``log(1 + count)``.
    """
    counts = defaultdict(int)
    for token in document:
        counts[token] += 1
    # Overwrite raw counts in place; the key set does not change.
    for token, occurrences in counts.items():
        counts[token] = math.log(1 + occurrences)
    return counts

In [408]:
def w2v_feature_matrix(dataset, idf, wv2, dim=None):
    """Embed each tokenized document as the plain mean of its word vectors.

    Tokens missing from ``wv2`` are skipped; repeated tokens count once per
    occurrence. A document with no known token becomes an all-zero vector.

    Args:
        dataset: sequence of documents, each a sequence of tokens.
        idf: token -> IDF weight. Currently unused: the TF-IDF-weighted
            average was disabled in favour of the unweighted mean. Kept so
            existing calls keep working.
        wv2: mapping from token to its embedding vector.
        dim: embedding size. Defaults to the notebook-level ``length`` so
            existing three-argument calls behave as before.

    Returns:
        ``np.ndarray`` with one row of size ``dim`` per document.
    """
    if dim is None:
        dim = length  # notebook global holding the embedding size

    matrix = []
    for document in dataset:
        feature = np.zeros(dim)
        known = 0
        for w in document:
            if w in wv2:
                feature += wv2[w]
                known += 1
        # Avoid dividing by zero for documents without any known token.
        matrix.append(feature / known if known else feature)
    return np.array(matrix)

In [386]:
def tokenize(raw_data):
    """Lower-case each text and split it on runs of non-word characters.

    Args:
        raw_data: iterable of raw text strings.

    Returns:
        A list with one token list per input text. ``re.split`` may yield
        empty-string tokens at the edges; they are kept.
    """
    return [re.split(r'\W+', text.lower()) for text in raw_data]

In [400]:
import gensim
# Token lists for each split, produced by the list-returning tokenize.
# NOTE(review): an earlier cell defines a different `tokenize` that returns
# (termsId, idf); which one is bound here depends on execution order.
train_data = tokenize(sentiment.train_data)
dev_data = tokenize(sentiment.dev_data)
test_data = tokenize(unlabeled.data)
# Embedding size; passed to Word2Vec and read as a global by w2v_feature_matrix.
length = 100

In [401]:
# Embeddings are trained on labeled-train plus unlabeled text; dev documents
# are not part of the corpus.
corpus = train_data + test_data 
# NOTE(review): `size=` and `model.wv.index2word` look like the gensim 3.x API —
# confirm the installed version.
# No seed is passed and workers=10, so results are presumably not reproducible
# run to run — confirm.
model = gensim.models.Word2Vec(corpus, size=length, window=8, min_count=1, workers=10)
# NOTE(review): passing `corpus` to the constructor presumably already trains the
# model, which would make this extra training — confirm against the gensim docs.
model.train(corpus, total_examples=len(corpus), epochs=10)
# Plain dict from token to embedding vector, as consumed by w2v_feature_matrix.
wv2 = dict(zip(model.wv.index2word, model.wv.vectors))

In [409]:
# Rebinds the global `idf` (read by the one-argument tfidf_matrix) to IDF
# weights over the word2vec corpus. w2v_feature_matrix accepts it but its
# weighted-average code is disabled, so the features are unweighted means.
idf = idf_table(corpus)
train_X = w2v_feature_matrix(train_data, idf, wv2)
dev_X = w2v_feature_matrix(dev_data, idf, wv2)
cls = train_classifier(train_X, sentiment.trainy, 1)

print("\nEvaluating")
evaluate(train_X, sentiment.trainy, cls, 'train')
evaluate(dev_X, sentiment.devy, cls, 'dev')


Evaluating
  Accuracy on train  is: 0.8107813182016587
  Accuracy on dev  is: 0.8122270742358079


### self_training added

In [415]:
# Self-training with averaged word2vec features: repeatedly train on the labeled
# set plus confidently pseudo-labeled documents from the unlabeled pool.
cur_train_data = deepcopy(train_data)
cur_trainy = deepcopy(sentiment.trainy)
 
# NOTE(review): unlabel_data is not read in this cell, and this testX is
# overwritten inside the loop before it is read.
unlabel_data = deepcopy(unlabeled.data)
testX = unlabeled.X
# NOTE(review): pre_acc is assigned but never compared with dev_acc.
pre_acc = 0

index = 0
while True:
    index += 1
    trainX = w2v_feature_matrix(cur_train_data, idf, wv2)
    # devX (and testX below) take the same arguments every round, so they are
    # recomputed identically each time.
    devX = w2v_feature_matrix(dev_data, idf, wv2)
    cls = train_classifier(trainX, cur_trainy, 1)
    
    dev_yp = cls.predict(devX)
    dev_acc = metrics.accuracy_score(sentiment.devy, dev_yp)
    print("loop", index, ": dev accuracy:", dev_acc)
    print("         train size:  ", len(cur_train_data))
    # Hard stop after 4 rounds; `cls` is left holding the round-4 model.
    if index == 4:
        break
    pre_acc = dev_acc
        
    testX = w2v_feature_matrix(test_data, idf, wv2)
    test_yp = cls.predict(testX)
    # Confidence = absolute decision-function value.
    test_confidence = abs(cls.decision_function(testX))
    
    expand_data = []
    expand_y = []    
    
    # Keep every unlabeled document whose confidence exceeds the fixed cutoff 4.5.
    for i in range(len(test_confidence)):
        if test_confidence[i] > 4.5:
            expand_data.append(test_data[i])
            expand_y.append(test_yp[i])
    
    # Not cumulative: each round restarts from the original labeled set plus
    # this round's selection.
    cur_train_data = train_data + expand_data
    cur_trainy = np.concatenate((sentiment.trainy, np.array(expand_y)))

loop 1 : dev accuracy: 0.8122270742358079
         train size:   4582
loop 2 : dev accuracy: 0.8165938864628821
         train size:   10889
loop 3 : dev accuracy: 0.8144104803493449
         train size:   13092
loop 4 : dev accuracy: 0.8165938864628821
         train size:   14435


In [416]:
print("Writing predictions to a file")
# Rebinds unlabeled.X to averaged word2vec features so it matches the current `cls`.
unlabeled.X = w2v_feature_matrix(test_data, idf, wv2)
write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)

Writing predictions to a file
