### SPARSE REPRESENTATION

In [1]:
import math
import random
import os

from collections import Counter

import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
from copy import copy
from scipy.optimize import minimize

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import confusion_matrix



In [2]:

DATA_DIR = '../data/svm/data/'
POS_DATA_DIR = DATA_DIR + 'pos/'
NEG_DATA_DIR = DATA_DIR + 'neg/'
TRAIN_SPLIT_COUNT = 1500


In [3]:

# read review file into a list of strings
# each string = a token from the review
def read_file(filepath):
    f = open(filepath)
    lines = f.read().split(' ')
    symbols = '${}()[].,:;+-*/&|<>=~" '
    words = map(lambda Element: Element.translate(str.maketrans("", "", symbols)).strip(), lines)
    words = filter(None, words)
    return Counter(list(words))

# return list of file names in a directory
# skips directories
def list_dir_files(dirpath):
    file_names = []
    children = os.listdir(dirpath)

    for child in children:
        if os.path.isfile(os.path.join(dirpath, child)):
            file_names.append(child)
    
    return file_names

# read all files in the directory
def read_dir(dirpath):
    file_contents = []
    file_names = list_dir_files(dirpath)

    for file_name in file_names:
        filepath = os.path.join(dirpath, file_name)
        file_content = read_file(filepath)
        file_contents.append(file_content)

    return file_contents

In [4]:
pos_reviews = read_dir(POS_DATA_DIR)
neg_reviews = read_dir(NEG_DATA_DIR)

NUM_POS_REVIEWS = len(pos_reviews)
NUM_NEG_REVIEWS = len(neg_reviews)

pos_review_points = list(zip(pos_reviews, [1] * NUM_POS_REVIEWS, range(1, NUM_POS_REVIEWS + 1)))
neg_review_points = list(zip(neg_reviews, [-1] * NUM_NEG_REVIEWS, range(1, NUM_NEG_REVIEWS + 1)))

all_points = pos_review_points.copy()
all_points.extend(neg_review_points)
random.shuffle(all_points)

train = all_points[:TRAIN_SPLIT_COUNT]
test = all_points[TRAIN_SPLIT_COUNT:]

train_unzip = list(zip(*train))
train_X = list(train_unzip[0])
train_Y = list(train_unzip[1])
train_indices = list(train_unzip[2])

test_unzip = list(zip(*test))
test_X = list(test_unzip[0])
test_Y = list(test_unzip[1])
test_indices = list(test_unzip[2])


In [5]:
NUM_NEG_REVIEWS

1000

### SVM with PEGASOS

In [6]:
"""
Dot product between sparse representation of dictionaries/counters
"""
def sparse_dot(d1, d2):
    if len(d1) < len(d2):
        return sparse_dot(d2, d1)
    
    return sum(d1[k] * v for k, v in d2.items())

"""
Inplace linear combination update for d1 += z * d2
"""
def sparse_lc(d1, z, d2):
    for k, v in d2.items():
        d1[k] += z * v

In [7]:
d1 = Counter(['this', 'is', 'nikhil'])
d2 = Counter(['who', 'is'])

d = copy(d1)
sparse_lc(d, 1.5, d2)
d

Counter({'this': 1, 'is': 2.5, 'nikhil': 1, 'who': 1.5})

In [50]:

class Pegasos():
    def __init__(self, l2_reg = 1):
        self.w = Counter()
        self.l2_reg = l2_reg
    
    def fit(self, X, y, max_epochs = 1000):
        epoch = 0
        m = len(X)
        t = 1
        while epoch < max_epochs:
            for j in range(m):
                x_j = X[j]
                y_j = y[j]

                t += 1
                step_size = 1 / (t * self.l2_reg)
                
                if y_j * sparse_dot(self.w, x_j) < 1:
                    sparse_lc(self.w, -step_size * self.l2_reg, self.w)
                    sparse_lc(self.w, step_size * y_j, x_j)
                else:
                    sparse_lc(self.w, -step_size * self.l2_reg, self.w)
            
                
            epoch += 1
        
        return self.w, epoch


In [57]:
class Pegasos_opt():
    def __init__(self, l2_reg = 1):
        self.w = Counter()
        self.s = 1
        self.l2_reg = l2_reg
    
    def fit(self, X, y, max_epochs = 1000):
        epoch = 0
        m = len(X)
        t = 1
        s_ = 1
        while epoch < max_epochs:
            for j in range(m):
                x_j = X[j]
                y_j = y[j]

                t += 1
                step_size = 1 / (t * self.l2_reg)

                s_ = (1 - step_size * self.l2_reg) * s_

                if s_ == 0:
                    s_ = 1
                    self.w = Counter()
                else:
                    if y_j * sparse_dot(self.w, x_j) < 1:
                        sparse_lc(self.w, step_size * y_j/s_, x_j)
                
            epoch += 1
        
        sparse_lc(self.w, s_ - 1, self.w)

        return self.w, epoch

In [58]:
pegasos = Pegasos_opt(1)
w, e = pegasos.fit(train_X, train_Y, max_epochs = 2)
w.most_common(20)

[('most', 0.03932022659112988),
 ('quite', 0.037987337554142186),
 ('many', 0.036987670776412074),
 ('also', 0.0353215594801668),
 ('see', 0.0339886704431791),
 ('and', 0.03365544818393573),
 ('you', 0.03332222592469236),
 ('seen', 0.03232255914694804),
 ('well', 0.0303232255914736),
 ('very', 0.02999000333221602),
 ('movies', 0.028323892035984954),
 ('own', 0.02665778073975389),
 ('life', 0.024658447184265242),
 ('jackie', 0.02432522492502187),
 ('cauldron', 0.02432522492502187),
 ('will', 0.023325558147277548),
 ('back', 0.022992335888034177),
 ('animation', 0.022992335888034177),
 ('great', 0.022659113628790806),
 ('world', 0.022659113628790806)]

In [49]:
pegasos = Pegasos(1)
w, e = pegasos.fit(train_X, train_Y, max_epochs = 2)
w.most_common(20)

[('you', 0.069310229923359),
 ('most', 0.06764411862712418),
 ('also', 0.06097967344218593),
 ('very', 0.056981006331222966),
 ('his', 0.05464845051649458),
 ('seen', 0.05364878373875374),
 ('and', 0.046651116294568494),
 ('well', 0.04465178273908694),
 ('life', 0.04431856047984016),
 ('see', 0.04331889370209925),
 ("it's", 0.04031989336887705),
 ('quite', 0.03965344885038322),
 ('many', 0.03765411529490181),
 ('movies', 0.03765411529490179),
 ('fun', 0.03732089303565483),
 ('films', 0.036654448517160966),
 ('best', 0.03565478173942022),
 ('world', 0.03532155948017329),
 ('will', 0.03532155948017317),
 ('people', 0.034655114961679445)]