### SPARSE REPRESENTATION

In [13]:
import math
import random
import os

from collections import Counter

import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
from copy import copy
from scipy.optimize import minimize

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import confusion_matrix



In [14]:

# root folder of the data set; the 'pos/' and 'neg/' sub-folders hang off it
DATA_DIR = '../data/svm/data/'
POS_DATA_DIR = f'{DATA_DIR}pos/'
NEG_DATA_DIR = f'{DATA_DIR}neg/'
# number of data points that go to the training split
TRAIN_SPLIT_COUNT = 1500


In [15]:

# read a review file and count its tokens
# returns a Counter mapping each cleaned token to its number of occurrences
def read_file(filepath):
    symbols = '${}()[].,:;+-*/&|<>=~" '
    # build the deletion table once instead of once per token
    strip_symbols = str.maketrans("", "", symbols)

    # the context manager closes the file handle, even if reading fails
    with open(filepath) as f:
        raw_tokens = f.read().split(' ')

    # the text is split on single spaces only, so tokens may still carry
    # newlines or tabs: delete the symbols, trim the surrounding whitespace,
    # and skip tokens that end up empty
    cleaned = (token.translate(strip_symbols).strip() for token in raw_tokens)
    return Counter(token for token in cleaned if token)

# list the names of the regular files directly inside a directory
# (sub-directories are left out)
def list_dir_files(dirpath):
    return [
        entry
        for entry in os.listdir(dirpath)
        if os.path.isfile(os.path.join(dirpath, entry))
    ]

# parse every file found directly in the directory
# returns one read_file() result per file, in list_dir_files() order
def read_dir(dirpath):
    return [
        read_file(os.path.join(dirpath, name))
        for name in list_dir_files(dirpath)
    ]

In [23]:
pos_reviews = read_dir(POS_DATA_DIR)
neg_reviews = read_dir(NEG_DATA_DIR)

NUM_POS_REVIEWS = len(pos_reviews)
NUM_NEG_REVIEWS = len(neg_reviews)

# each data point is a (review, label, 1-based index within its class) tuple;
# reviews from the pos folder are labelled 1, those from the neg folder -1
pos_review_points = [(review, 1, idx) for idx, review in enumerate(pos_reviews, start=1)]
neg_review_points = [(review, -1, idx) for idx, review in enumerate(neg_reviews, start=1)]

# pool both classes, then shuffle in place before cutting the split
all_points = pos_review_points + neg_review_points
random.shuffle(all_points)

train, test = all_points[:TRAIN_SPLIT_COUNT], all_points[TRAIN_SPLIT_COUNT:]

# transpose the list of tuples into parallel columns
train_unzip = list(zip(*train))
train_X, train_Y, train_indices = (
    list(train_unzip[0]),
    list(train_unzip[1]),
    list(train_unzip[2]),
)

test_unzip = list(zip(*test))
test_X, test_Y, test_indices = (
    list(test_unzip[0]),
    list(test_unzip[1]),
    list(test_unzip[2]),
)


### SVM with PEGASOS

In [None]:
def sparse_dot(d1, d2):
    """
    Dot product between sparse representation of dictionaries/counters

    d1, d2: mappings from key to numeric value; a key missing from either
    mapping counts as 0, so plain dicts work as well as Counters.
    Returns the sum of d1[k] * d2[k] over the shared keys (0 if there are none).
    """
    # Iterate over the shorter mapping and look keys up in the longer one.
    # Swapping the local names (instead of recursing) guarantees termination.
    if len(d1) < len(d2):
        d1, d2 = d2, d1

    return sum(d1.get(k, 0) * v for k, v in d2.items())

def sparse_lc(d1, z, d2):
    """
    Inplace linear combination update for d1 += z * d2

    d1: mapping updated in place; keys it lacks are treated as 0, so a plain
        dict works as well as a Counter.
    z:  scalar multiplier applied to every value of d2.
    d2: mapping whose entries are added; it may be the same object as d1,
        because in that case only existing keys are reassigned.
    Returns None.
    """
    for k, v in d2.items():
        d1[k] = d1.get(k, 0) + z * v

In [24]:

class Pegasos():
    """
    Linear SVM trained with the Pegasos stochastic sub-gradient method.

    The weight vector is kept sparse in a Counter, so features that were
    never seen read as 0.
    """

    def __init__(self, l2_reg = 1):
        # l2_reg is the regularization strength (lambda); it must be non-zero
        # because the step size is 1 / (t * l2_reg)
        self.w = Counter()
        self.l2_reg = l2_reg

    def fit(self, X, y, max_epochs = 1000):
        """
        Run max_epochs full passes over the data, one update per example.

        X: sequence of sparse feature mappings (feature -> value)
        y: sequence of labels in {-1, +1}, aligned with X
        Returns the learned weights (the same object as self.w).

        At step t, with step size eta = 1 / (t * l2_reg):
            w <- (1 - eta * l2_reg) * w                if y * <w, x> >= 1
            w <- (1 - eta * l2_reg) * w + eta * y * x  otherwise
        """
        t = 0
        # one epoch is one complete pass over the training set
        for _ in range(max_epochs):
            for x_j, y_j in zip(X, y):
                t += 1
                step_size = 1 / (t * self.l2_reg)

                # the margin uses the weights from before this step; looping
                # over the example's features keeps it sparse, and
                # Counter lookups of unseen features give 0
                margin = y_j * sum(self.w[k] * v for k, v in x_j.items())

                # regularization shrinks every weight on every step
                # (values only, so the Counter's size does not change)
                shrink = 1 - step_size * self.l2_reg
                for k in self.w:
                    self.w[k] *= shrink

                # margin violated: also move towards the example
                if margin < 1:
                    for k, v in x_j.items():
                        self.w[k] += step_size * y_j * v

        return self.w
        