In [2]:
import numpy as np
import torch
from collections import Counter
import re
import pickle as pkl
from sklearn import svm
import pandas as pd

In [9]:
# Function to load word vectors pre-trained on Google News
# Arguments: None
# Returns: w2v (dict)
# Where, w2v (dict) is a dictionary with words as keys (lowercase) and vectors as values
def load_w2v():
    with open('./Data/w2v.pkl', 'rb') as fin:
        return pkl.load(fin)
    
def get_tokens(doc):
	tokens = re.split(r"[^A-Za-z0-9-']", doc)
	tokens = list(filter(len, tokens))
	return tokens


def word2vec_rep(docs):
    '''  DESCRIPTION:   Function to get word2vec averge representations. Input is a collection of documents. Docs are tokenized and for each line the average representation
                        is computed from its constituent tokens. Unknownwords are skipped.
    
         ARGUMENTS: docs: A list of strings, each string represents a document
         
         RETURNS:   mat (numpy.ndarray) of size (len(docs), dim) mat is a two-dimensional numpy array containing vector representation for ith document (in input list docs) in ith row
                    dim represents the dimensions of word vectors, here dim = 300 for Google News pre-trained vectors
    
    '''
    # Declare variables
    freqs = Counter()
    docFreqs = [Counter() for d in docs]
    docTokens = [[] for d in docs]
    voc = []

    # Build Vocabulary
    for i,d in enumerate(docs):
        docTokens = get_tokens(d.lower())                          # get tokens
        #isWord = [t in stopwords for t in tokens]       # see which word is stop word
        #docTokens[i] = [t for (t,i) in zip(tokens,isWord) if not i]# eliminate stopwords from furhter consideration    
        #docTokens[i] = [t for (t,i) in zip(tokens,isWord) if not i]# eliminate stopwords from furhter consideration    
        docFreqs[i] = Counter(docTokens)                   # count the freqs os this docs' tokens 
        freqs += Counter(docTokens)                        # Add the doc's freqs to total freqs


    # Dummy matrix
    dim = 300
    mat = np.zeros((len(docs), dim))
    w2v = load_w2v()

    # Create a sorted vocabulary out of the unique terms. Declare the matrix that hold the per doc
    # bag-of-word represeantions in terms of appeared token freqeuencies!
    voc = list(freqs.keys())
    voc.sort()

    # Build averaged representations
    cnt = 0
    embedding = np.zeros(dim)
    for i in range(len(docs)):
        for j, v in enumerate(voc):
            if v in w2v:
                embedding += (w2v[v] * docFreqs[i][v])
                cnt += docFreqs[i][v]
            # else: # word not in w2v, just consider it as adding 0's
                # cnt += 1
        cnt = cnt if cnt > 0 else 1
        mat[i] = embedding / cnt # average the summed embeddings
        cnt = 0
        embedding.fill(0) 


    return mat

tweetFile = './Data/training-Obama-Romney-tweets_corrected2_normalized_no_stop_words.txt'

# Read all lines of tweet file, store them as list of strings
#file1 = open(tweetFile, 'r') 
#lines = file1.readlines()
with open(tweetFile) as f:
    lines = f.read().splitlines() 

In [5]:

# Get the average word 2 vec representation of all tweets. Unknown words are omitted.
m = word2vec_rep(lines)

# Save reps to disk for future use
saveFile = './Data/avg_w2v_rep.npy'
np.save(saveFile,m)

NameError: name 'm' is not defined

In [49]:
def tweet_summary_reps(lines, lenSizeType = 'maxVal', targetDesnity = 0.7):
    lengths = []
    splitLines = []
    totalLines = 0
    for l in lines:
        splitLine = l.split(' ')
        lineLen = len(splitLine)
        totalLines += 1
        lengths.append(lineLen)
        splitLines.append(splitLine)
        
    occurence_count = Counter(lengths) 
    # Find most frequent length. Structure is (top k most common tuples (val, freq)[choose tuple][val or freq])
    freqs = occurence_count.most_common()
    occurs, avgLen = 0, 0
    maxVal = 0
    # Pick the average size derived from the vals whose total frequency is 70% of the overall data
    for i, t in enumerate(freqs):
        occurs += t[1]
        avgLen += float(t[0] * t[1])
        maxVal = t[0] if t[0] > maxVal else maxVal
        #print(occurs, avgLen, targetDesnity (def 0.7)* totalLines, i, t[0], t[1])
        if occurs>= targetDesnity * totalLines:
            break
            
    mode = int(avgLen/ occurs)
    print('At {} density tweet length mode: {}, max val: {}'.format(targetDesnity, mode, maxVal))
    targetSize= maxVal if lenSizeType == 'maxVal' else mode
    # Turn all tweets to worv2vec reps. Pad all shorter and cut all large tweets to targetSize.
   
    cnt, flag  = 0, 0    
    # Declare rep Matrix. Dimension of word2vec is 300. So, the matrix should be numOfTweets * targetSize
    dim = 300
    mat = np.zeros((len(lines), dim* targetSize))
    w2v = load_w2v()
    embedding = np.zeros(dim)
     # Build  representations. if a tweet is longer than targetSize cut it to target size
    for i, l in enumerate(splitLines):
        if len(l) > targetSize:
            l = l[:targetSize]
        for j, w in enumerate(l):
            # if word is known add its represention, otherwise treat it as 0.              
            if w in w2v:
                mat[i, j*300: (j+1)*300] = w2v[w]
    
    print("Tweet_word2vec size: {}".format(mat.shape))

    return mat

m = tweet_summary_reps(lines)
saveFile = './Data/tweet_w2v_rep.npy'
np.save(saveFile, m)

At 0.7 density tweet length mode: 10, max val: 14
Tweet_word2vec size: (5624, 4200)


## Get labels

In [51]:
labelFile = './Data/training-Obama-Romney-tweets_corrected2_normalized_no_stop_words_labels.txt'
labels = np.loadtxt(labelFile)
print(m.shape, labels.shape)

(5624, 300) (5624,)


## SVM Sentiment CLassification
### Train Classifier

In [56]:
n = 0.8  # for 2 random indices
dataSize = m.shape[0]
trainSize = int(n * dataSize)
trainIdxs = np.random.choice(dataSize, trainSize, replace=False)  
print(trainIdxs)

[1376 4711 2137 ... 5419 4648 3384]


In [52]:
# One vs one case
#clf = svm.SVC()
#clf.fit(m, labels)
# One-vs rest
lin_clf = svm.LinearSVC()
lin_clf.fit(m, labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [53]:
dec = lin_clf.decision_function([[1]])
dec.shape[1]

ValueError: X has 1 features per sample; expecting 300