In [34]:
import numpy as np
from sklearn.base import BaseEstimator

class LinearClassifier(BaseEstimator):
    """
    General class for binary linear classifiers. Implements the predict
    function, which is the same for all binary linear classifiers. There are
    also two utility functions.
    """

    def decision_function(self, X):
        """
        Computes the decision function for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.
        """
        return X.dot(self.w)

    def predict(self, X):
        """
        Predicts the outputs for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.
        """

        # First compute the output scores
        scores = self.decision_function(X)

        # Select the positive or negative class label, depending on whether
        # the score was positive or negative.
        out = np.select([scores >= 0.0, scores < 0.0],
                        [self.positive_class,
                         self.negative_class])
        return out

    def find_classes(self, Y):
        """
        Finds the set of output classes in the output part Y of the training set.
        If there are exactly two classes, one of them is associated to positive
        classifier scores, the other one to negative scores. If the number of
        classes is not 2, an error is raised.
        """
        classes = sorted(set(Y))
        if len(classes) != 2:
            raise Exception("this does not seem to be a 2-class problem")
        self.positive_class = classes[1]
        self.negative_class = classes[0]

    def encode_outputs(self, Y):
        """
        A helper function that converts all outputs to +1 or -1.
        """
        return np.array([1 if y == self.positive_class else -1 for y in Y])


class Pegasus(LinearClassifier):
    """
    A straightforward implementation of the perceptron learning algorithm.
    """

    def __init__(self, n_iter=20):#,lambda_=0.1):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter  = n_iter
        #self.lambda_ = lambda_

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
    
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features)
        self.lambda_ = 1/n_samples

        # Perceptron algorithm:
        for t in range(1,self.n_iter+1):

            idx = np.random.randint(n_samples)

            # Print out the index
            print("Index:", idx)

            eta = 1/(self.lambda_*t)

            #for x, y in zip(X, Ye):
            x, y = X[idx], Ye[idx]

            # Compute the output score for this instance.
            score = self.w.dot(x)
            # If there was an error, update the weights.
            if y*score < 1:
                self.w = (1-eta*self.lambda_)*self.w + x*eta*y
            else:
                self.w *= 1-eta*self.lambda_

In [36]:
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# from aml_perceptron import Perceptron, SparsePerceptron

# This function reads the corpus, returns a list of documents, and a list
# of their corresponding polarity labels. 
def read_data(corpus_file):
    X = []
    Y = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            _, y, _, x = line.split(maxsplit=3)
            X.append(x.strip())
            Y.append(y)
    return X, Y


if __name__ == '__main__':
    
    # Read all the documents.
    X, Y = read_data('DAT341-Applied-Machine-Learning/PA4/data/all_sentiment_shuffled.txt')
    
    # Split into training and test parts.
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0)

    # Set up the preprocessing steps and the classifier.
    pipeline = make_pipeline(
        TfidfVectorizer(),
        SelectKBest(k=1000),
        Normalizer(),
        
        Pegasus(n_iter=10000)#,lambda_=1)  
    )

    # Train the classifier.
    t0 = time.time()
    pipeline.fit(Xtrain, Ytrain)
    t1 = time.time()
    print('Training time: {:.2f} sec.'.format(t1-t0))

    # Evaluate on the test set.
    Yguess = pipeline.predict(Xtest)
    print('Accuracy: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))

Index: 8449
Index: 4168
Index: 5307
Index: 684
Index: 3299
Index: 359
Index: 2479
Index: 6503
Index: 4734
Index: 8593
Index: 616
Index: 2548
Index: 3661
Index: 3499
Index: 7177
Index: 889
Index: 2723
Index: 9167
Index: 2439
Index: 1965
Index: 921
Index: 8672
Index: 2684
Index: 4135
Index: 6940
Index: 7568
Index: 3327
Index: 341
Index: 176
Index: 6213
Index: 5573
Index: 2572
Index: 985
Index: 8525
Index: 6928
Index: 8141
Index: 5682
Index: 3385
Index: 1763
Index: 1190
Index: 5193
Index: 7484
Index: 5820
Index: 2335
Index: 2209
Index: 4527
Index: 6148
Index: 7998
Index: 2143
Index: 2850
Index: 4279
Index: 6177
Index: 3096
Index: 4780
Index: 4251
Index: 6524
Index: 6164
Index: 991
Index: 5117
Index: 6223
Index: 5413
Index: 2707
Index: 6409
Index: 4017
Index: 4160
Index: 4243
Index: 8677
Index: 7820
Index: 4444
Index: 3538
Index: 239
Index: 1677
Index: 3069
Index: 4069
Index: 8016
Index: 4807
Index: 1541
Index: 5322
Index: 7152
Index: 5335
Index: 8803
Index: 9308
Index: 9464
Index: 9378
In