In [1]:
import pandas as pd
import numpy as np
import math
from collections import Counter
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import string

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
class smart_dict(dict):
    """Dictionary whose item lookup yields ``-1`` for keys it does not hold.

    Only ``d[key]`` is affected: a failed lookup returns ``-1`` without
    inserting the key, so membership tests and ``len`` are unchanged.
    """

    def __missing__(self, key):
        # Invoked by dict item lookup when ``key`` is absent.
        return -1

In [3]:
def parseLines(path):
    """Return the lines of the text file at ``path`` as a list of strings.

    Newline characters are removed from every line; all other characters,
    including leading/trailing spaces, are kept as they are.
    """
    with open(path, 'r') as handle:
        return [raw.replace('\n', '') for raw in handle]

In [4]:
def parseFeatures(lines,data_tokens,data_dict):
    """Build a binary word-presence matrix for ``lines``.

    Parameters
    ----------
    lines : sequence of str
        One sentence per entry; each is split on whitespace.
    data_tokens : sized collection
        Only its length is used: it fixes the number of columns.
    data_dict : mapping
        Maps a word to the column index that represents it.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(lines), len(data_tokens))`` holding 1 where
        a word of the sentence is a key of ``data_dict`` and 0 elsewhere.
    """
    matrix = np.zeros(shape=(len(lines), len(data_tokens)))
    for row, sentence in enumerate(lines):
        # Words missing from the mapping are ignored.
        columns = [data_dict[word] for word in sentence.split() if word in data_dict]
        if columns:
            matrix[row, columns] = 1
    return matrix

In [5]:
def parseLabels(path):
    """Read one integer label per line from ``path`` and return them as an int array.

    A label equal to 0 is mapped to -1 so that binary 0/1 labels become
    -1/+1; every other integer is returned unchanged.

    The value is parsed as an integer *before* the mapping is applied.
    (A textual replace of the character ``0`` would also rewrite labels
    such as ``10`` into the unparsable ``1-1``.)

    Blank lines (for example a trailing empty line at the end of the
    file) are skipped.

    Raises
    ------
    ValueError
        If a non-blank line is not a valid integer.
    """
    labels = []
    with open(path, 'r') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Nothing to parse on an empty line.
                continue
            value = int(stripped)
            labels.append(-1 if value == 0 else value)

    return np.asarray(labels, dtype=int)

In [6]:
#Standard Perceptron
class Perceptron(object):
    """Perceptron without a bias term that records metrics after every epoch.

    Predictions are ``np.sign`` of the dot product of the inputs with the
    weights, so a score of exactly 0 yields prediction 0 and never equals
    a -1/+1 label.

    Attributes filled in by ``train``:
        iterTrainMistakes / iterTrainAccuracy -- mistakes made while
            updating during each epoch, and the matching accuracy.
        iterTestMistakes / iterTestAccuracy -- mistakes on the test data
            measured with the weights at the end of each epoch.
    """

    def __init__(self, no_of_inputs, training_inputs, training_labels,testing_inputs, testing_labels,threshold=20, learning_rate=1):
        # ``threshold`` is the number of passes made over the training data.
        self.threshold = threshold
        self.learning_rate = learning_rate

        # Per-epoch histories, one entry appended per training pass.
        self.iterTrainAccuracy = []
        self.iterTestAccuracy = []
        self.iterTrainMistakes = []
        self.iterTestMistakes = []

        self.training_data = training_inputs
        self.training_labels = training_labels
        self.testing_data = testing_inputs
        self.testing_labels = testing_labels

        # Weights start at zero; there is no separate bias term.
        self.weights = np.zeros(no_of_inputs)

    def predict(self, inputs):
        """Return the sign (-1, 0 or +1) of ``inputs . weights``."""
        score = np.dot(inputs, self.weights)
        return np.sign(score)

    def finalTrain(self):
        """Return the training accuracy recorded for the last epoch."""
        return self.iterTrainAccuracy[-1]

    def finalTest(self):
        """Return the testing accuracy recorded for the last epoch."""
        return self.iterTestAccuracy[-1]

    def test(self):
        """Evaluate the current weights on the test data and record the result."""
        wrong = sum(
            1
            for sample, target in zip(self.testing_data, self.testing_labels)
            if self.predict(sample) != target
        )
        total = len(self.testing_data)
        self.iterTestMistakes.append(wrong)
        self.iterTestAccuracy.append((total - wrong) / total)

    def train(self):
        """Run ``threshold`` passes of perceptron updates and return the weights.

        After each pass the test data is evaluated and the number of
        mistakes made during that pass is recorded.
        """
        for _epoch in range(self.threshold):
            wrong = 0
            for sample, target in zip(self.training_data, self.training_labels):
                if self.predict(sample) == target:
                    continue
                # Misclassified: move the weights towards the correct side.
                wrong += 1
                self.weights += self.learning_rate * (target) * sample

            self.test()
            total = len(self.training_data)
            self.iterTrainMistakes.append(wrong)
            self.iterTrainAccuracy.append((total - wrong) / total)
        return self.weights

In [14]:
#Standard Averaged Perceptron
class aperceptron(object): 
    """Averaged perceptron with a bias term for -1/+1 labels.

    Besides the working weights/bias, the model keeps "cached" sums
    (``u`` for the weights, ``beta`` for the bias) in which every update
    is scaled by the value of the example counter ``count`` at the time
    of the update. At the end of ``train`` the averaged parameters are
    obtained as ``weights - u / count`` and ``bias - beta / count``.

    Attributes
    ----------
    trainingAccuracy : float
        Accuracy observed while updating during the last training epoch
        (i.e. measured with the un-averaged weights).
    testingAccuracy : float
        Accuracy stored by the most recent ``testAccuracy`` call.
    """

    def __init__(self,no_of_inputs, training_inputs, training_labels,threshold=20, learning_rate=1):
        # ``threshold`` is the number of passes made over the training data.
        self.threshold = threshold
        self.learning_rate = learning_rate
        self.weights = np.zeros(no_of_inputs)
        # Counter-weighted sums of the updates, used for the final averaging.
        self.u = np.zeros(no_of_inputs)
        self.bias = 0
        self.beta = 0
        self.training_data = training_inputs
        self.training_labels = training_labels
        # Example counter; starts at 1 and advances once per example seen.
        self.count = 1
        self.trainingAccuracy = 0
        self.testingAccuracy = 0
        
    def predict(self,inputs,label):
        """Return the sign of ``inputs . weights + bias``.

        ``label`` is not used; it is kept so existing callers keep working.
        """
        return np.sign(np.dot(inputs, self.weights) + self.bias)
        
    def train(self):
        """Train for ``threshold`` epochs and return the averaged weights.

        After the call ``self.weights`` holds the averaged weights and
        ``self.bias`` the averaged bias (stored as a 1-element array, as
        before).
        """
        for _ in range(self.threshold):
            mistakes = 0
            for inputs, label in zip(self.training_data, self.training_labels):
                prediction = self.predict(inputs,label)
                if (prediction != label):
                    mistakes+=1
                    self.weights += label * inputs * self.learning_rate
                    self.bias += label * self.learning_rate
                    self.u += label * self.count * inputs * self.learning_rate
                    self.beta +=  label * self.count * self.learning_rate
                # The counter must advance for every example, whether or
                # not it triggered an update; otherwise the cached sums do
                # not weight each update by how long it stayed in effect.
                self.count = self.count + 1
            self.trainingAccuracy = ((len(self.training_data)-mistakes)/len(self.training_data))
        # Turn the final weights/bias into their running averages.
        self.weights = self.weights - (1/self.count)*self.u
        self.bias = np.array([self.bias- (1/self.count)*self.beta]) 
        return self.weights
    
    def testAccuracy(self,testing_inputs,testing_labels):
        """Store in ``testingAccuracy`` the fraction of correctly predicted examples.

        An example counts as a mistake when the predicted sign differs
        from its label (so correctly predicted -1 examples are not
        penalised).
        """
        mistakes = 0
        for inputs, label in zip(testing_inputs, testing_labels):
            prediction = self.predict(inputs,label)
            # np.any copes with the 1-element array produced once the
            # averaged bias is an array.
            if np.any(prediction != label):
                mistakes += 1
        self.testingAccuracy = ((len(testing_inputs)-mistakes)/len(testing_inputs))  

In [15]:
#PART 1---------------------------------------------------------------------
# Text data: build a vocabulary from the training sentences (minus stop
# words), turn train/test sentences into binary word-presence features and
# train both perceptron variants on them.

#READ AND PARSE STOPLIST--------------------------
with open("stoplist.txt") as file:
    stop_words = file.read().replace('\n', ' ')

stopwords = stop_words.split()
# Counter is used here only for fast membership tests.
stop_words_dict = Counter(stopwords)
#-------------------------------------------------

#READ AND ESTABLISH VOCABULARY--------------------
with open('traindata.txt', 'r') as file:
    text = file.read().replace('\n', ' ')

# Distinct training words that are not stop words, in sorted order.
token = {word for word in text.split() if word not in stop_words_dict}
training_data_tokens = sorted(token)

# Word -> column index in the feature matrix.
training_data_dict = {
    word: column for column, word in enumerate(training_data_tokens)
}
#-------------------------------------------------

#CONSTRUCT TRAINING DATA AND LABELS---------------
train_lines = parseLines('traindata.txt')
training_labels = parseLabels('trainlabels.txt')
features = parseFeatures(train_lines, training_data_tokens, training_data_dict)
#-------------------------------------------------

#CONSTRUCT TESTING DATA AND LABELS----------------
# Test sentences are encoded with the *training* vocabulary.
test_lines = parseLines('testdata.txt')
testing_labels = parseLabels('testlabels.txt')
testfeatures = parseFeatures(test_lines, training_data_tokens, training_data_dict)
#-------------------------------------------------

perceptron = Perceptron(
    features.shape[1], features, training_labels, testfeatures, testing_labels
)
perceptron.train()

Aperceptron = aperceptron(features.shape[1], features, training_labels)
Aperceptron.train()
Aperceptron.testAccuracy(testfeatures, testing_labels)

#PART 2---------------------------------------------------------------------

# Vowels map to 1; any other key falls through to smart_dict.__missing__
# and therefore looks up as -1.
vowel_dict = smart_dict(dict.fromkeys('aeiou', 1))

def readIn(path):
    """Load a whitespace-delimited OCR file into a two-column DataFrame.

    Each row of the file is read as four fields named ``Num``, ``im``,
    ``label`` and ``del``. ``Num`` and ``del`` are dropped, and the literal
    prefix ``im`` is removed from the ``im`` field.

    Parameters
    ----------
    path : str or path-like
        File to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``im`` (string with the ``im`` prefix removed) and
        ``label`` (string).
    """
    data = pd.read_table(
        path,
        # ``sep=r'\s+'`` is the supported spelling of the deprecated
        # ``delim_whitespace=True``.
        sep=r'\s+',
        names=['Num', 'im', 'label', 'del'],
        dtype={'Num': np.int64, 'im': str, 'label': str},
    )
    data = data.drop(columns=['Num', 'del'])
    # Remove only the leading literal "im"; str.lstrip('im') would strip any
    # run of the characters 'i' and 'm' instead of the prefix.
    data['im'] = data['im'].map(lambda x: x[2:] if x.startswith('im') else x)
    return data

# Training set: label is 1 for a vowel and -1 otherwise (via vowel_dict);
# each character of the ``im`` string becomes one integer feature.
ocr_train_frame = readIn('ocr_train.txt')
ocr_training_labels = np.array(
    [vowel_dict[letter] for letter in ocr_train_frame['label']], dtype=int
)
ocr_training_features = np.array(
    list(map(list, ocr_train_frame['im'])), dtype=int
)

# Test set, encoded exactly like the training set.
ocr_test_frame = readIn('ocr_test.txt')
ocr_testing_labels = np.array(
    [vowel_dict[letter] for letter in ocr_test_frame['label']], dtype=int
)
ocr_testing_features = np.array(
    list(map(list, ocr_test_frame['im'])), dtype=int
)

perceptronOCR = Perceptron(
    ocr_training_features.shape[1],
    ocr_training_features,
    ocr_training_labels,
    ocr_testing_features,
    ocr_testing_labels,
)
perceptronOCR.train()

AperceptronOCR = aperceptron(
    ocr_training_features.shape[1], ocr_training_features, ocr_training_labels
)
AperceptronOCR.train()
AperceptronOCR.testAccuracy(ocr_testing_features, ocr_testing_labels)

def _write_report(text_file, title, standard, averaged):
    """Write one experiment's section of the report to ``text_file``.

    ``standard`` supplies the per-iteration mistake/accuracy histories and
    the final accuracies of the standard perceptron; ``averaged`` supplies
    the training/testing accuracy of the averaged perceptron.
    """
    print(title, file=text_file)
    mistake_rows = zip(standard.iterTrainMistakes, standard.iterTestMistakes)
    for number, (train_mistakes, test_mistakes) in enumerate(mistake_rows, start=1):
        print("iteration-", number, " train-mistakes: ", train_mistakes, " test-mistakes: ", test_mistakes, file=text_file)
    print("\n", file=text_file)
    accuracy_rows = zip(standard.iterTrainAccuracy, standard.iterTestAccuracy)
    for number, (train_accuracy, test_accuracy) in enumerate(accuracy_rows, start=1):
        print("iteration-", number, " training-Accuracy :", train_accuracy, " testing-Accuracy: ", test_accuracy, file=text_file)

    print("\ntraining-accuracy-standard-perceptron: ", standard.finalTrain(), " training-accuracy-averaged-perceptron: ", averaged.trainingAccuracy, file=text_file)
    print("testing-accuracy-standard-perceptron: ", standard.finalTest(), " testing-accuracy-averaged-perceptron: ", averaged.testingAccuracy, file=text_file)


# Both experiments share the same report layout, separated by a blank block.
with open("output.txt", "w") as text_file:
    _write_report(text_file, "FORTUNE COOKIE DATA\n", perceptron, Aperceptron)
    print("\n", file=text_file)
    _write_report(text_file, "\nOCR DATA\n", perceptronOCR, AperceptronOCR)

In [9]:
Inputs: the learning rate N, the training threshold T, the candidate pairs of input feature vectors P, and the signs L of the corresponding candidate scores (in the same order as P).
Output: w, the final weight vector.
1. init the weights w = 0
2. init the bias = 0
3. for each training iteration _ in 1..T do
4.    for each pair (kt, lt), yt in zip(P, L) do
5.        prediction = sign((kt dot w + bias) - (lt dot w + bias))
6.        if prediction != yt then
7.            w = w + N * yt * (kt - lt)
8.            bias = bias + N * yt
9.        end if
10.    end for
11. end for
12. return the weight vector w

SyntaxError: invalid syntax (<ipython-input-9-e98bb0546419>, line 1)