# A Vector Space Model, with scikit-learn

In [None]:
# This is code to build a vector space model, with SVMs on Andrew Mass' 
# distribution of movie review sentiment data.

## namedtuple

In [7]:
# While Python tuples is indexed numerically (like a list), a named tuple assigns names to fields and 
# is also indexed numerically. This makes it possible to access the fields in a named tuple using these names
# as if they were attributes of an object (via dotting into the namedtuple)
# See also here: https://docs.python.org/2/library/collections.html
from collections import namedtuple
Student = namedtuple("Student", ["name", "age", "gender", "course"])
#--------------------------------------------------------------------
# Note: You can also provide field names as a space-delimited string, rather than a list.
#Student = namedtuple("Student", "name age gender course")
#--------------------------------------------------------------------

angela=Student(name="Angela", age=45, gender="female", course="Python")
soha=Student(name="Soha", age=25, gender="female", course="Visual Arts")
print(angela.gender)
print(soha.course)


female
Visual Arts


In [8]:
# A namedtuple is also iterable like a tuple
for i in soha:
    print(i)

Soha
25
female
Visual Arts


In [10]:
# You can access a namedtuple the same way you access a tuple or a list:
soha[-1]

'Visual Arts'

In [11]:
# We can now create a list where we append the two namedtuples above.
# i.e., a list of namedtuples
all_students=[]
all_students.append(angela)
all_students.append(soha)
for s in all_students:
    print(s)

Student(name='Angela', age=45, gender='female', course='Python')
Student(name='Soha', age=25, gender='female', course='Visual Arts')


In [32]:
for s in all_students:
    print("- {} is {} years old.").format(s.name, s.age)

- Angela is 45 years old.
- Soha is 25 years old.


In [42]:
# We should usually get tags automatically based on input data file.
# In the input data file we have, we know that the first 12500 data points are positive/1.0 and the next 12500 are
# negative/0.0 then the next 12500 is poitive and the fourth chunk is negative.
# So basically the train_data has 25K (with the first half positive and the second half negative)
# and test_data with the same setup for class label. 
# The rest of the data in the file is unknown/neutral/-1 and we don't use that part.

def map_tags(post_index):
    # if post is positive, tag=1, if it is negative tag=0, if it is neutral, tag=-1
    tag=-1
    if post_index < 12500:
        tag=1
    elif post_index < 25000:
        tag=0
    elif post_index < 37500:
        tag=1
    elif post_index < 50000:
        tag=0
    return tag

In [79]:
from collections import namedtuple

def get_all_data():
    """
    Returns a list of namedtuples from the IMDB file.
    Each namedtuple has two named fields:
        tag= class label (0 for "negative" and 1 for "positive")
        word_list the list of words in the review
    """
    # a list to house all the data
    all_data = []  
    
    DataDoc= namedtuple('DataDoc', 'tag words')
    with open('/Users/mam/CORE/RESEARCH/DEEPLEARNING/Doc2Vec/data/aclImdb/alldata-id.txt') as alldata:
        for line_no, line in enumerate(alldata):
            post_index=int(line.split()[0].split("*")[-1])
            label=map_tags(post_index)
            word_list=line.lower().split()[1:]
            all_data.append(DataDoc(label, word_list))
    return all_data

# Call the function to get the data
all_data= get_all_data()
# The data are 100K reviews as explained earlier
# Since the last 50K are unknown, let's throw them away
all_data=all_data[:50000]
print(len(all_data))
print("*"*50)
# print the first namedtuple
print(all_data[0])
print("*"*50)
# print the last namedtuple
#print(all_data[-1])

50000
**************************************************
DataDoc(tag=1, words=['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'teachers', '"', '.', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'teachers', '"', '.', 'the', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', '.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'i', 'immediately', 'recalled', '.'

In [80]:
# The data set is big, and we want to only work with a very small sample of it.
# Let's randomize the reviews and then take only 500 of them and call them train_data.
# We will then do cross-validation on these later.
from random import shuffle
shuffle(all_data)
#-------------------------
train_data = all_data[:500]
#------------------------
print len(train_data)

500


In [81]:
# Let's get a dictionary of all the words in training data
# These will be our bag-of-words features
# We won't need this function, since we will use gensim's built-in method "Dictionary" from the corpus module
# --> corpora.Dictionary, but we provide this so that you are clear on one way of how to do this.
from collections import defaultdict
def get_space(train_data):
    """
    input is a list of namedtuples
    get a dict of word space
    key=word
    value=len of the dict at that point 
    (that will be the index of the word and it is unique since the dict grows as we loop)
    """
    word_space=defaultdict(int)
    for doc in train_data:
        for w in doc.words:
            # indexes of words won't be in sequential order as they occur in data (can you tell why?), 
            # but that doesn't matter.
            word_space[w]=len(word_space)
    return word_space

word_space=get_space(train_data)
print len(word_space)
print word_space["love"]

13848
13828


In [82]:
import numpy as np

def get_sparse_vec(data_point, space):
    # create empty vector
    sparse_vec = np.zeros((len(space)))
    for w in set(data_point.words):
        # use exception handling such that this function can also be used to vectorize 
        # data with words not in train (i.e., test and dev data)
        try:
            sparse_vec[space[w]]=1
        except:
            continue
    return sparse_vec

 

train_vecs= [get_sparse_vec(data_point, word_space) for data_point in train_data]
# Get class labels
train_tags=[train_data[i].tag for i in range(len(train_data))]
# Let's look at the last training data point
print train_tags[-1], train_vecs[-1][:10]

0 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [83]:
# scikit-learn likes to take data as numpy arrays. So, let's change our data accordingly:
train_vecs=np.array(train_vecs)
train_tags=np.array(train_tags)
print train_vecs.shape

(500, 13848)


In [84]:
# Classification with scikit-learn
# Now we have: train_tags, train_vecs, test_tags, test_vecs
# Let's use sklearn to train an svm classifier:
#-------------------------------------------------

import argparse
import codecs
import time
import sys
import os, re, glob
import nltk
from collections import defaultdict
from random import shuffle, randint
import numpy as np
from numpy import array, arange, zeros, hstack, argsort
import unicodedata
from scipy.sparse import csr_matrix
from sklearn.svm import SVC, LinearSVC
from sklearn import preprocessing
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import gensim
n_jobs = 2

#train_vecs=array(train_vecs)
train_vecs=np.array(train_vecs)
train_tags=np.array(train_tags)

print type(train_tags)
print type(train_vecs)
clf = OneVsRestClassifier(SVC(C=1, kernel = 'linear', gamma=1, verbose= False, probability=False))
clf.fit(train_vecs, train_tags)
print "\nDone fitting classifier on training data...\n"

#------------------------------------------------------------------------------------------
print "="*50, "\n"
print "Results with 5-fold cross validation:\n"
print "="*50, "\n"
#------------------------------------------------------------------------------------------
predicted = cross_validation.cross_val_predict(clf, train_vecs, train_tags, cv=5)
print "*"*20
print "\t accuracy_score\t", metrics.accuracy_score(train_tags, predicted)
print "*"*20
print "precision_score\t", metrics.precision_score(train_tags, predicted)
print "recall_score\t", metrics.recall_score(train_tags, predicted)
print "\nclassification_report:\n\n", metrics.classification_report(train_tags, predicted)
print "\nconfusion_matrix:\n\n", metrics.confusion_matrix(train_tags, predicted)

<type 'numpy.ndarray'>
<type 'numpy.ndarray'>

Done fitting classifier on training data...


Results with 5-fold cross validation:


********************
	 accuracy_score	0.644
********************
precision_score	0.657692307692
recall_score	0.657692307692

classification_report:

             precision    recall  f1-score   support

          0       0.63      0.63      0.63       240
          1       0.66      0.66      0.66       260

avg / total       0.64      0.64      0.64       500


confusion_matrix:

[[151  89]
 [ 89 171]]


In [85]:
# Usually, we calculate a basline as the majority class in training data.
# Here, to simplify, we just get the majority class in all the data (see support, which is the number of data points in each
# class, in the classification report above)
majority_class=260/500.0
print(majority_class)

0.52
