In [2]:
import os
from six.moves import urllib

# Remote location of the sentiment data files and the local directory they are saved to.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/kscanne/5750/master/data/"
TRAIN_URL = "{}{}".format(DOWNLOAD_ROOT, "ga-sentiment-train.csv")
TEST_URL = "{}{}".format(DOWNLOAD_ROOT, "ga-sentiment-test.csv")
VOCAB_URL = "{}{}".format(DOWNLOAD_ROOT, "ga-sentiment-vocab.txt")
# Relative to the notebook's working directory.
DATASET_PATH = os.path.join(*["datasets", "ga-sentiment"])

def fetch_data(train_url=TRAIN_URL, test_url=TEST_URL, vocab_url=VOCAB_URL, dataset_path=DATASET_PATH):
    """Download the train, test and vocabulary files into ``dataset_path``.

    The directory is created if it does not exist. The files are always
    (re-)downloaded and saved as ``train.csv``, ``test.csv`` and ``vocab.csv``,
    overwriting any earlier copies.

    Args:
        train_url: URL of the training data file.
        test_url: URL of the test data file.
        vocab_url: URL of the vocabulary file.
        dataset_path: Local directory that receives the three files.
    """
    # exist_ok=True replaces the isdir()-then-makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(dataset_path, exist_ok=True)
    downloads = (
        (train_url, "train.csv"),
        (test_url, "test.csv"),
        # Saved under a .csv name regardless of the URL's extension;
        # load_data looks for "vocab.csv".
        (vocab_url, "vocab.csv"),
    )
    for url, filename in downloads:
        urllib.request.urlretrieve(url, os.path.join(dataset_path, filename))


In [3]:
# Download the three data files into DATASET_PATH; every run of this cell hits the network again.
fetch_data()

In [4]:
import csv

def _read_csv_rows(path):
    """Return every row of the comma-delimited file at ``path`` as a list of string lists."""
    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation asks for file objects passed to csv.reader.
    with open(path, newline="") as csvfile:
        return list(csv.reader(csvfile, delimiter=","))


def load_data(dataset_path=DATASET_PATH):
    """Read the downloaded train, test and vocabulary files.

    Each file is parsed as comma-delimited text using the platform's default
    text encoding; no type conversion is applied, so every field is a string.

    Args:
        dataset_path: Directory containing ``train.csv``, ``test.csv`` and
            ``vocab.csv``.

    Returns:
        A ``(train, test, vocab)`` tuple; each element is a list of rows and
        each row is a list of strings.

    Raises:
        OSError: If one of the three files cannot be opened.
    """
    train = _read_csv_rows(os.path.join(dataset_path, "train.csv"))
    test = _read_csv_rows(os.path.join(dataset_path, "test.csv"))
    vocab = _read_csv_rows(os.path.join(dataset_path, "vocab.csv"))
    return train, test, vocab

In [12]:
# Parse the downloaded files into lists of rows (each row is a list of strings).
train, test, vocab = load_data()

In [13]:
import numpy as np

def separate_data(data, n_features):
    """Split parsed rows into a 0/1 feature matrix and a +1/-1 label vector.

    The first field of each row is the label: a value that parses to the
    integer 1 becomes +1, anything else becomes -1. Every remaining field is
    parsed as an integer column index, and that column is set to 1 for the row.

    Args:
        data: Iterable of rows, each a sequence whose fields can be passed to
            ``int()``.
        n_features: Number of columns in the feature matrix.

    Returns:
        ``(features, labels)`` where ``features`` is an integer array of shape
        ``(n_rows, n_features)`` and ``labels`` is an integer array of shape
        ``(n_rows,)``. Empty input yields arrays of shape ``(0, n_features)``
        and ``(0,)``.

    Raises:
        ValueError: If a field cannot be parsed as an integer.
        IndexError: If a row is empty or an index is outside the feature range.
    """
    rows = list(data)
    # Allocate the matrix once instead of building a Python list of lists and
    # converting it afterwards, which needs far more memory for large inputs.
    features = np.zeros((len(rows), n_features), dtype=int)
    labels = np.empty(len(rows), dtype=int)
    for i, row in enumerate(rows):
        labels[i] = 1 if int(row[0]) == 1 else -1
        for field in row[1:]:
            features[i, int(field)] = 1
    return features, labels
        

In [14]:
# One feature column per vocabulary row; labels become +1 / -1.
X_train, y_train = separate_data(train, len(vocab))
X_test, y_test = separate_data(test, len(vocab))

In [23]:
from sklearn.utils import shuffle

def perceptron_train(arrays, labels, max_iter, random_state=None):
    """Train a perceptron with the classic mistake-driven update rule.

    For ``max_iter`` epochs the instances are visited in a fresh random order.
    Whenever ``(w . x + b) * y <= 0`` the instance counts as a mistake and the
    model is updated with ``w += y * x`` and ``b += y``.

    Args:
        arrays: Sequence of feature vectors, all of the same length.
        labels: Sequence of labels aligned with ``arrays``; the update rule
            assumes they are +1 / -1.
        max_iter: Number of passes (epochs) over the data.
        random_state: ``None`` (default) draws the visiting order from NumPy's
            global random state, so ``np.random.seed`` still applies. An
            existing ``np.random.RandomState`` is used as is; any other value
            seeds a private ``RandomState`` for a repeatable run.

    Returns:
        ``(w, b)``: the weight vector as a float array with one entry per
        feature, and the bias.

    Raises:
        IndexError: If ``arrays`` is empty.
    """
    n_features = len(arrays[0])
    n_instances = len(arrays)
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    w = np.zeros(n_features)
    b = 0
    for _ in range(max_iter):
        # Permute indices rather than the data itself: this gives the same
        # random visiting order without copying the whole dataset each epoch.
        for i in rng.permutation(n_instances):
            x = arrays[i]
            y = labels[i]
            a = np.dot(w, x) + b
            # a == 0 counts as a mistake, so the all-zero start always updates.
            if a * y <= 0:
                w += np.multiply(y, x)
                b += y
    return w, b

In [24]:
# Five shuffled passes over the training set.
w, b = perceptron_train(X_train, y_train, 5)

5001
18000


In [11]:
# Peek at the training feature matrix.
print(X_train)

[[1 0 1 ... 0 0 1]
 [1 0 0 ... 0 0 1]
 [0 0 1 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]


In [15]:
# Number of training instances.
len(X_train)

18000

In [17]:
# Feature vector of the first training instance.
X_train[0]

array([1, 0, 1, ..., 0, 0, 1])

In [16]:
# Number of features per instance.
len(X_train[0])

5001

In [25]:
def perceptron_test(arrays, labels, weights, bias):
    """Return the fraction of instances a perceptron classifies correctly.

    An instance counts as correct when its activation ``weights . x + bias``
    has the same sign as its label. A zero activation is counted as wrong,
    matching the mistake rule used during training.

    Args:
        arrays: 2-D array-like of feature vectors, one row per instance.
        labels: Labels aligned with ``arrays`` (expected to be +1 / -1).
        weights: Weight vector with one entry per feature.
        bias: Scalar bias term.

    Returns:
        Accuracy as a float between 0 and 1.

    Raises:
        ValueError: If ``arrays`` contains no instances.
    """
    features = np.asarray(arrays)
    targets = np.asarray(labels)
    n_instances = len(features)
    if n_instances == 0:
        raise ValueError("cannot compute accuracy on an empty dataset")
    # Use the model passed in as arguments, not whatever `w` / `b` happen to
    # exist in the notebook's global namespace.
    activations = features.dot(weights) + bias
    correct = int(np.count_nonzero(activations * targets > 0))
    return correct / n_instances

In [10]:
# Accuracy on the held-out test set, as a fraction between 0 and 1.
print(perceptron_test(X_test, y_test, w, b))

0.707
