In [None]:
# Imports
import numpy as np
import pandas as pd
import re
from collections import defaultdict
from quadprog_wrapper import solve_quadprog
from utils import *

In [None]:
# Load data
# Only the first ROW_LIMIT rows of each CSV are read; the cap was introduced
# because loading the full files ran into memory issues.
ROW_LIMIT = 1000
train_data, test_data = [
    pd.read_csv(csv_name, nrows=ROW_LIMIT) for csv_name in ('train.csv', 'test.csv')
]

# Preprocess data
# cleanData_1 comes from the star-import of utils; both splits go through the
# same cleaning step, train first and then test.
train_data, test_data = [cleanData_1(frame) for frame in (train_data, test_data)]

In [None]:
# Build the vocabulary: occurrence count of every token seen in the training text.
# NOTE(review): this iterates each entry of the 'text' column directly, so it
# assumes cleanData_1 left a sequence of tokens there; if an entry is a raw string
# the "words" would be single characters. Confirm against utils.cleanData_1.
vocabulary = defaultdict(int)
for tokens in train_data['text']:
    for token in tokens:
        vocabulary[token] += 1

# Give every vocabulary word a column index (first-seen order).
word_ids = {token: column for column, token in enumerate(vocabulary)}


def _bag_of_words(texts, word_to_column):
    """Return a (len(texts), len(word_to_column)) matrix of per-document word counts.

    Words that are missing from ``word_to_column`` are ignored, which is how
    test-set words never seen during training are dropped.
    """
    counts = np.zeros((len(texts), len(word_to_column)))
    for row, tokens in enumerate(texts):
        for token in tokens:
            column = word_to_column.get(token)
            if column is not None:
                counts[row, column] += 1
    return counts


# Convert text data to numerical features (one row per document), using the
# training vocabulary for both splits.
train_X = _bag_of_words(train_data['text'], word_ids)
test_X = _bag_of_words(test_data['text'], word_ids)

# Convert label data to numerical labels: 'real' -> 1, anything else -> -1.
train_is_real = train_data['label'] == 'real'
test_is_real = test_data['label'] == 'real'
train_label = np.where(train_is_real, 1, -1)
test_label = np.where(test_is_real, 1, -1)

In [None]:
# Implementation of SVM loosely based off HW 3 Linear Kernel model
class SVM:
    """
    Soft-margin kernel SVM trained by solving its dual quadratic program.

    Data matrices are laid out with one example per COLUMN, i.e. shape (d, n)
    for n examples with d features. ``self.params`` selects the kernel
    ('linear' by default, or 'rbf', which additionally needs a 'sigma' entry)
    and the box constraint 'C' on the dual variables.
    """

    def __init__(self):
        self.params = {'kernel': 'linear', 'C': 1.0}

    def rbf_kernel(self, row_data, col_data, sigma):
        """
        Compute the Gram matrix between row_data and col_data for the Gaussian radial-basis function (RBF) kernel.

        :param row_data: ndarray of shape (d, m), where each column is a data example
        :type row_data: ndarray
        :param col_data: ndarray of shape (d, n), where each column is a data example
        :type col_data: ndarray
        :param sigma: scalar quantity that scales the Euclidean distance inside the exponent of the RBF value
        :type sigma: float
        :return: an (m, n) matrix whose (i, j) entry is the kernel value between row_data[:, i] and col_data[:, j]
        :rtype: ndarray
        """
        # Squared distances via (x - y).(x - y) = x.x + y.y - 2 x.y, computed for
        # all pairs at once: (m, 1) column norms + (1, n) row norms broadcast to (m, n).
        row_sq_norms = np.sum(pow(row_data, 2), axis=0, keepdims=True).T
        col_sq_norms = np.sum(pow(col_data, 2), axis=0, keepdims=True)
        sq_dists = row_sq_norms + col_sq_norms - 2 * row_data.T.dot(col_data)
        return np.exp(sq_dists / (-2 * sigma * sigma))

    def linear_kernel(self, row_data, col_data):
        """
        Compute the Gram matrix between row_data and col_data for the linear kernel.

        :param row_data: ndarray of shape (d, m), where each column is a data example
        :type row_data: ndarray
        :param col_data: ndarray of shape (d, n), where each column is a data example
        :type col_data: ndarray
        :return: an (m, n) matrix whose (i, j) entry is the kernel value between row_data[:, i] and col_data[:, j]
        :rtype: ndarray
        """
        return row_data.T.dot(col_data)

    def _gram_matrix(self, row_data, col_data, params):
        """Evaluate the kernel named by params['kernel'] ('rbf' or, otherwise, linear)."""
        if params['kernel'] == 'rbf':
            # 'sigma' has no default in self.params; it must be set before using 'rbf'.
            return self.rbf_kernel(row_data, col_data, params['sigma'])
        return self.linear_kernel(row_data, col_data)

    # Training
    def train(self, X, y):
        """
        Fit the SVM and store the result in ``self.model``.

        ``self.model`` is a dict with keys 'support_vectors', 'alphas',
        'sv_labels', 'bias' and 'params' (a snapshot of ``self.params``).

        :param X: ndarray of shape (d, n), one training example per column
        :param y: length-n array of labels; the dual formulation below expects +1 / -1
        :return: None (the fitted model is kept on the instance)
        """
        X = np.asarray(X)
        y = np.asarray(y)

        gram_matrix = self._gram_matrix(X, X, self.params)
        # symmetrize to help correct minor numerical errors
        gram_matrix = (gram_matrix + gram_matrix.T) / 2

        n = gram_matrix.shape[0]

        # Setting up the inputs to the quadratic programming solver that solves:
        # minimize      0.5 x^T (hessian) x - (weights)^T x
        # subject to    (eq_coeffs) x = (eq_constants)
        #   and         (lower_bounds) <= x <= (upper_bounds)
        hessian = np.outer(y, y) * gram_matrix
        weights = np.ones(n)

        eq_coeffs = np.zeros((1, n))
        eq_coeffs[0, :] = y
        eq_constants = np.zeros(1)

        lower_bounds = np.zeros(n)
        upper_bounds = self.params['C']

        # Call quadratic program with provided inputs (no inequality constraints,
        # hence the two None arguments).
        alphas = solve_quadprog(hessian, weights, eq_coeffs, eq_constants, None,
                                None, lower_bounds, upper_bounds)

        self.model = dict()

        # process optimized alphas to only store support vectors and alphas that have nonnegligible support
        tolerance = 1e-6
        sv_indices = alphas > tolerance
        self.model['support_vectors'] = X[:, sv_indices]
        self.model['alphas'] = alphas[sv_indices]
        # Snapshot the kernel type and parameters so that later edits to
        # self.params cannot change how an already-trained model predicts.
        self.model['params'] = dict(self.params)
        self.model['sv_labels'] = y[sv_indices]

        # find all alphas that represent points on the decision margin
        margin_alphas = np.logical_and(
            alphas > tolerance, alphas < self.params['C'] - tolerance)

        # compute the bias value as the average of y_j - sum_i alpha_i y_i K(x_i, x_j)
        # over the margin points j
        if np.any(margin_alphas):
            self.model['bias'] = np.mean(y[margin_alphas].T - (alphas * y).T.dot(gram_matrix[:, margin_alphas]))
        else:
            # there were no support vectors on the margin (this should only happen due to numerical errors)
            self.model['bias'] = 0

    # Prediction
    def predict(self, X):
        """
        Classify the examples in X with the trained model.

        The kernel stored with the model is used, so a model trained with
        'rbf' is also evaluated with the RBF kernel.

        :param X: ndarray of shape (d, m), one example per column
        :return: tuple (labels, scores): labels is a length-m array in {-1, 1},
                 scores is the length-m array of raw decision values
        """
        gram_matrix = self._gram_matrix(
            X, self.model['support_vectors'], self.model['params'])
        scores = gram_matrix.dot(
            self.model['alphas'] * self.model['sv_labels']) + self.model['bias']
        scores = scores.ravel()
        labels = 2 * (scores > 0) - 1  # threshold and map to {-1, 1}

        return labels, scores

In [None]:
# Train model
# SVM expects one example per column, so the (n_samples, n_features) count
# matrices are transposed before being handed over.
svm = SVM()
svm.train(train_X.T, train_label)

# Test model
# predict returns a (labels, scores) tuple; unpack it so that only the
# predicted labels are compared against the ground truth below.
predictions = svm.predict(test_X.T)
predicted_labels, predicted_scores = predictions

# Evaluate model: fraction of test examples whose predicted label matches.
accuracy = np.mean(predicted_labels == test_label)
print('Accuracy:', accuracy)