### Read/Write functions

In [1]:
def read_tsv(tar, fname):
    member = tar.getmember(fname)
    print(member.name)
    tf = tar.extractfile(member)
    data = []
    labels = []
    for line in tf:
        line = line.decode("utf-8")
        (label,text) = line.strip().split("\t")
        labels.append(label)
        data.append(text)
    return data, labels

In [2]:
def read_files(tarfname):
    """Read the training and development data from the sentiment tar file.
    The returned object contains various fields that store sentiment data, such as:

    train_data,dev_data: array of documents (array of words)
    train_fnames,dev_fnames: list of filenames of the doccuments (same length as data)
    train_labels,dev_labels: the true string label for each document (same length as data)

    The data is also preprocessed for use with scikit-learn, as:

    count_vec: CountVectorizer used to process the data (for reapplication on new data)
    trainX,devX: array of vectors representing Bags of Words, i.e. documents processed through the vectorizer
    le: LabelEncoder, i.e. a mapper from string labels to ints (stored for reapplication)
    target_labels: List of labels (same order as used in le)
    trainy,devy: array of int labels, one for each document
    """
    import tarfile
    tar = tarfile.open(tarfname, "r:gz")
    trainname = "train.tsv"
    devname = "dev.tsv"
    for member in tar.getmembers():
        if 'train.tsv' in member.name:
            trainname = member.name
        elif 'dev.tsv' in member.name:
            devname = member.name
                   
    class Data: pass
    sentiment = Data()
    print("-- train data")
    sentiment.train_data, sentiment.train_labels = read_tsv(tar,trainname)
    print(len(sentiment.train_data))

    print("-- dev data")
    sentiment.dev_data, sentiment.dev_labels = read_tsv(tar, devname)
    print(len(sentiment.dev_data))
    print("-- transforming data and labels")
    from sklearn.feature_extraction.text import CountVectorizer
    sentiment.count_vect = CountVectorizer()
    sentiment.trainX = sentiment.count_vect.fit_transform(sentiment.train_data)
    sentiment.devX = sentiment.count_vect.transform(sentiment.dev_data)
    from sklearn import preprocessing
    sentiment.le = preprocessing.LabelEncoder()
    sentiment.le.fit(sentiment.train_labels)
    sentiment.target_labels = sentiment.le.classes_
    sentiment.trainy = sentiment.le.transform(sentiment.train_labels)
    sentiment.devy = sentiment.le.transform(sentiment.dev_labels)
    tar.close()
    return sentiment

In [3]:
def read_unlabeled(tarfname, sentiment):
    """Reads the unlabeled data.

    The returned object contains three fields that represent the unlabeled data.

    data: documents, represented as sequence of words
    fnames: list of filenames, one for each document
    X: bag of word vector for each document, using the sentiment.vectorizer
    """
    import tarfile
    tar = tarfile.open(tarfname, "r:gz")
    class Data: pass
    unlabeled = Data()
    unlabeled.data = []
    
    unlabeledname = "unlabeled.tsv"
    for member in tar.getmembers():
        if 'unlabeled.tsv' in member.name:
            unlabeledname = member.name
            
    print(unlabeledname)
    tf = tar.extractfile(unlabeledname)
    for line in tf:
        line = line.decode("utf-8")
        text = line.strip()
        unlabeled.data.append(text)
        
            
    unlabeled.X = sentiment.count_vect.transform(unlabeled.data)
    print(unlabeled.X.shape)
    tar.close()
    return unlabeled

### Supervised TC

In [4]:
import numpy as np
import string
import math
import re
from collections import defaultdict
from nltk.corpus import stopwords 
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression

In [5]:
print("Reading data")
tarfname = "data/sentiment.tar.gz"
sentiment = read_files(tarfname)
print("\nReading unlabeled data")
unlabeled = read_unlabeled(tarfname, sentiment)

Reading data
-- train data
sentiment/train.tsv
4582
-- dev data
sentiment/dev.tsv
458
-- transforming data and labels

Reading unlabeled data
sentiment/unlabeled.tsv
(91524, 9882)


In [6]:
def train_classifier(X, y, c):
    """Train a classifier using the given training data.

    Trains logistic regression on the input data with default parameters.
    """
    from sklearn.linear_model import LogisticRegression
    cls = LogisticRegression(C=c,random_state=0, solver='lbfgs', max_iter=10000)
    cls.fit(X, y)
    return cls

In [7]:
def evaluate(X, yt, cls, name='data'):
    """Evaluated a classifier on the given labeled data using accuracy."""
    from sklearn import metrics
    yp = cls.predict(X)
    acc = metrics.accuracy_score(yt, yp)
    print("  Accuracy on %s  is: %s" % (name, acc))

#### TF_IDF unigram + bigram

In [12]:
from copy import deepcopy
from sklearn import metrics

In [9]:
def tokenize(dataset):
    termsCount = defaultdict(int)
    documentCount = defaultdict(int)

    for d in dataset:
        lower = ''.join([c for c in d.lower()])
        r = re.split(r'\W+', lower)
        voc = set()
        for i in range(len(r)):
            w = r[i]
            voc.add(w)
            termsCount[w] += 1
            
            if i != len(r) - 1:
                b = r[i] + ' ' + r[i + 1]
                voc.add(b)
                termsCount[b] += 1   
            
        for t in voc:
            documentCount[t] += 1

    terms = [t for t in termsCount]
    termsId = dict(zip(terms, range(len(terms))))

    idf = defaultdict(float)
    for t in terms:
        idf[t] = math.log(len(dataset) / (documentCount[t]))
    return (termsId, idf)

In [10]:
def tf(document):
    tf_table = defaultdict(int)
    lower = ''.join([c for c in document.lower()])
    r = re.split(r'\W+', lower)
    for i in range(len(r)):
        w = r[i]
        tf_table[w] += 1
        
        if i != len(r) - 1:
            b = r[i] + ' ' + r[i + 1]
            tf_table[b] += 1 
        
    for t in tf_table:
        tf_table[t] = math.log(1 + tf_table[t])
    return tf_table

In [11]:
def tfidf_matrix(dataset, termsId, idf):
    row = []
    col = []
    data = []
    index = 0
    for document in dataset:
        tf_table = tf(document)
        for t in tf_table:
            if t in termsId:
                row.append(index)
                col.append(termsId[t])
                data.append(tf_table[t] * idf[t])
        index += 1
    return csr_matrix((data, (row, col)), shape=(len(dataset), len(termsId)))

In [13]:
wordsId, idf_table = tokenize(sentiment.train_data)
trainX = tfidf_matrix(sentiment.train_data, wordsId, idf_table)
devX = tfidf_matrix(sentiment.dev_data, wordsId, idf_table)
cls = train_classifier(trainX, sentiment.trainy, 0.1)

In [14]:
print("\nEvaluating")
evaluate(trainX, sentiment.trainy, cls, 'train')
evaluate(devX, sentiment.devy, cls, 'dev')


Evaluating
  Accuracy on train  is: 1.0
  Accuracy on dev  is: 0.8078602620087336


### Semi-supervised TC

#### most_confident_prediction( fix cutoff) + dev stop increasing

In [19]:
import heapq

cur_train_data = deepcopy(sentiment.train_data)
cur_trainy = deepcopy(sentiment.trainy)
 
unlabel_data = deepcopy(unlabeled.data)
testX = unlabeled.X
pre_acc = 0
wordsId, idf_table_1 = tokenize(cur_train_data)
index = 0
while True:
    index += 1
    
    wordsId, idf_table = tokenize(cur_train_data)
    trainX = tfidf_matrix(cur_train_data, wordsId, idf_table)
    devX = tfidf_matrix(sentiment.dev_data, wordsId, idf_table)
    cls = train_classifier(trainX, cur_trainy, 0.1)
    
    dev_yp = cls.predict(devX)
    dev_acc = metrics.accuracy_score(sentiment.devy, dev_yp)
    
    print("loop", index, ": dev accuracy:", dev_acc)
    print("         train size:  ", len(cur_train_data))
    if index == 8:
        break
    pre_acc = dev_acc
        
    testX = tfidf_matrix(unlabel_data, wordsId, idf_table)
    test_yp = cls.predict(testX)
    test_confidence = abs(cls.decision_function(testX))
    
    expand_data = []
    expand_y = []    
    
    for i in range(len(test_confidence)):
        if test_confidence[i] > 3.5:
            expand_data.append(unlabeled.data[i])
            expand_y.append(test_yp[i])
    
    cur_train_data = sentiment.train_data + expand_data
    cur_trainy = np.concatenate((sentiment.trainy, np.array(expand_y)))

loop 1 : dev accuracy: 0.8078602620087336
         train size:   4582
loop 2 : dev accuracy: 0.8078602620087336
         train size:   19602
loop 3 : dev accuracy: 0.8013100436681223
         train size:   34557
loop 4 : dev accuracy: 0.7882096069868996
         train size:   45763
loop 5 : dev accuracy: 0.7903930131004366
         train size:   52570
loop 6 : dev accuracy: 0.7838427947598253
         train size:   56310
loop 7 : dev accuracy: 0.7838427947598253
         train size:   58369
loop 8 : dev accuracy: 0.7751091703056768
         train size:   59611


In [21]:
dev_yp = cls.predict(devX)
error_list = []
for i in range(len(sentiment.dev_data)):
    if dev_yp[i] != sentiment.devy[i]:
        error_list.append((sentiment.dev_data[i], dev_yp[i]))