In [1]:
# models.py

from sentiment_data import *
from utils import *

from collections import Counter
import string
import numpy as np

class FeatureExtractor(object):
    """
    Feature extraction base type. Takes a sentence and returns an indexed list of features.
    """
    def get_indexer(self):
        raise Exception("Don't call me, call my subclasses")

    def extract_features(self, sentence: List[str], add_to_indexer: bool=False) -> Counter:
        """
        Extract features from a sentence represented as a list of words. Includes a flag add_to_indexer to
        :param sentence: words in the example to featurize
        :param add_to_indexer: True if we should grow the dimensionality of the featurizer if new features are encountered.
        At test time, any unseen features should be discarded, but at train time, we probably want to keep growing it.
        :return: A feature vector. We suggest using a Counter[int], which can encode a sparse feature vector (only
        a few indices have nonzero value) in essentially the same way as a map. However, you can use whatever data
        structure you prefer, since this does not interact with the framework code.
        """
        raise Exception("Don't call me, call my subclasses")


class UnigramFeatureExtractor(FeatureExtractor): # 1.29 TODO: make sure works with data structure (SentimentExample object)
    """
    Extracts unigram bag-of-words features from a sentence. It's up to you to decide how you want to handle counts
    and any additional preprocessing you want to do.
    """
    def __init__(self, indexer: Indexer):
        self.indexer = Indexer
        
        #raise Exception("Must be implemented")

    def extract_features(self, sentence: List[str], add_to_indexer: bool=False) -> Counter:
        """
        Extracts unigram bag-of-words features from a sentence reflecting feature counts. 
        The sentence preprocessing involves: lower casing, punctutation removal, contraction expansion
        :param sentence: words in the example to featurize.
        :param add_to_indexer: True if we should grow the dimensionality of the featurizer if new features are encountered.
        :return: A feature vector.
        """        
        # import string # punctuation removal
        
        global features
        
        punctuations = string.punctuation
        sentence = [word.lower() for word in sentence if word not in punctuations] 
        features = Counter(sentence)
        return features
    
        
    def feature_vector_size(self) -> int:
        """
        Get the size of the feature vector.
        :return: vector size.
        """
        return len(features)
        
    # data of data.txt is: [<class '__main__.SentimentExample'>, <class '__main__.SentimentExample'>, ...]
    # (data[0].words)        

class BigramFeatureExtractor(FeatureExtractor):
    """
    Bigram feature extractor analogous to the unigram one.
    """
    def __init__(self, indexer: Indexer):
        raise Exception("Must be implemented")


class BetterFeatureExtractor(FeatureExtractor):
    """
    Better feature extractor...try whatever you can think of!
    """
    def __init__(self, indexer: Indexer):
        raise Exception("Must be implemented")


class SentimentClassifier(object):
    """
    Sentiment classifier base type
    """
    def predict(self, sentence: List[str]) -> int:
        """
        :param sentence: words (List[str]) in the sentence to classify
        :return: Either 0 for negative class or 1 for positive class
        """
        raise Exception("Don't call me, call my subclasses")


class TrivialSentimentClassifier(SentimentClassifier):
    """
    Sentiment classifier that always predicts the positive class.
    """
    def predict(self, sentence: List[str]) -> int:
        return 1


class PerceptronClassifier(SentimentClassifier):
    """
    Implement this class -- you should at least have init() and implement the predict method from the SentimentClassifier
    superclass. Hint: you'll probably need this class to wrap both the weight vector and featurizer -- feel free to
    modify the constructor to pass these in.
    """
    
    #raise Exception("Must be implemented")
    
    def __init__(self, feat_extractor: FeatureExtractor, train_exs: List[SentimentExample]):
        self.feat_extractor = feat_extractor
        self.train_exs = train_exs
        
        self.corpus_vocab = Counter()
        self.weight_vector = Counter()
        
    def create_corpus_vocab(self) -> Counter: #TODO: ??cache??
        """
        Creates the corpus vocabulary by aggregating sparse vectors of each sample (i.e. sentence) 
            in the training dataset.
        :return: A Counter of all the words in the corpus and their frequencies.
        """         
        
        data_size = len(self.train_exs)
        
        # for each example in training data
        for i in range(data_size):
            
            # compute sparse feature vector 
            sentence_feature_vector = self.feat_extractor.extract_features(self.train_exs[i])
            
            # update corpus 
            self.corpus_vocab += sentence_feature_vector
            
        return self.corpus_vocab

                  
    
    def initialize_weight_vector(self) -> Counter: # should be the same size as the corpus vocab
        """
        Initializes the weight vector (same size as the corpus vocabulary) with zeroes.
        :return: A Counter of words with zeroes as weights.
        """           

        for word in list(self.corpus_vocab): 
            self.weight_vector.update({word: 0})
        
        return self.weight_vector
    
    
    @staticmethod
    def dot_product(corpus_weight_Counter: Counter, sentence_feature_Counter: Counter) -> float: 
        """
        Computes the dot product of the weight vector and the feature vector for one sentence. 
        :param corpus_weight_Counter: vector of all corpus words and their current weights
        :param sentence_feature_Counter: vector of sentence features and feature frequencies
        :return: dot product value
        """   
        
        result = 0
        for word in list(sentence_feature_Counter):
            result += corpus_weight_Counter[word]
        return result
    
    
    def predicted_label(self, sentence_feature_Counter: Counter) -> bool: #?? needs self??
        """
        Predicts a sentiment label for a single sample (i.e. sentence).
        :param sentence_feature_Counter: vector of sentence features and feature frequencies
        :return: label 
        """     
        
        if dot_product(self.weight_vector, sentence_feature_vector) > 0:
            return 1
        else:
            return 0 


    def update_weights(self, sentence_feature_Counter, operation = "add"):
        """
        Updates weight vector using feature vector values.
        :param X: 
        :return:  
        """         

        for word in list(sentence_feature_Counter):
            if operation == "add":
                self.weight_vector[word] += 1
            else:
                self.weight_vector[word] -= 1
        
        
    def train_classifier(self, epochs = 10):
        # ?? need to compute loss anywhere??
        
        weight_vector = self.weight_vector.initialize_weight_vector() # ?? not sure this is right
        
        for i in range(epochs): 
            for j in range(len(self.train_exs)): # >> shuffle data
                feature_vector = self.feat_extractor.extract_features(train_exs[j])
                predicted_label = predicted_label(feature_vector)
                
                # TODO: learning rate??
                # pred 0, true label = 1
                if predicted_label < train_exs[j].label:  # ??? pay attention to data structure format here<<<
                    #weight_vector += feature_vector ## TODO: fix feature vector
                    update_weights(feature_vector, operation = "add")
                    
                
                # pred 1, true label = 0
                elif predicted_label > train_exs[j].label:  # ??? pay attention to data structure format here<<<
                    #weight_vector -= feature_vector ## TODO: fix feature vector
                    update_weights(feature_vector, operation = "subtract")
            
            # ?? no need to uodate self.weight_vector since the update_weights does it automatically???
            #self.weight_vector =  weight_vector  # ?? not sure this is right
            
            
        # return anything here??
            
        
        


        
        # Make a Perceptron Classifier Model:
        
            #DONE 1 convert and example into features DONE
            #DONE 2 initialize weight vector
            
            #3 run epochs
            #4 compute label Done
            #5 compute loss 
            #6 update weight vector
                # repeat 3-6
  

 
                
class LogisticRegressionClassifier(SentimentClassifier):
    """
    Implement this class -- you should at least have init() and implement the predict method from the SentimentClassifier
    superclass. Hint: you'll probably need this class to wrap both the weight vector and featurizer -- feel free to
    modify the constructor to pass these in.
    """
    def __init__(self):
        raise Exception("Must be implemented")


def train_perceptron(train_exs: List[SentimentExample], feat_extractor: FeatureExtractor) -> PerceptronClassifier:
    """
    Train a classifier with the perceptron.
    :param train_exs: training set, List of SentimentExample objects
    :param feat_extractor: feature extractor to use
    :return: trained PerceptronClassifier model
    """
    raise Exception("Must be implemented")


def train_logistic_regression(train_exs: List[SentimentExample], feat_extractor: FeatureExtractor) -> LogisticRegressionClassifier:
    """
    Train a logistic regression model.
    :param train_exs: training set, List of SentimentExample objects
    :param feat_extractor: feature extractor to use
    :return: trained LogisticRegressionClassifier model
    """
    raise Exception("Must be implemented")


def train_model(args, train_exs: List[SentimentExample], dev_exs: List[SentimentExample]) -> SentimentClassifier:
    """
    Main entry point for your modifications. Trains and returns one of several models depending on the args
    passed in from the main method. You may modify this function, but probably will not need to.
    :param args: args bundle from sentiment_classifier.py
    :param train_exs: training set, List of SentimentExample objects
    :param dev_exs: dev set, List of SentimentExample objects. You can use this for validation throughout the training
    process, but you should *not* directly train on this data.
    :return: trained SentimentClassifier model, of whichever type is specified
    """
    # Initialize feature extractor
    if args.model == "TRIVIAL":
        feat_extractor = None
    elif args.feats == "UNIGRAM":
        # Add additional preprocessing code here
        feat_extractor = UnigramFeatureExtractor(Indexer())
    elif args.feats == "BIGRAM":
        # Add additional preprocessing code here
        feat_extractor = BigramFeatureExtractor(Indexer())
    elif args.feats == "BETTER":
        # Add additional preprocessing code here
        feat_extractor = BetterFeatureExtractor(Indexer())
    else:
        raise Exception("Pass in UNIGRAM, BIGRAM, or BETTER to run the appropriate system")

    # Train the model
    if args.model == "TRIVIAL":
        model = TrivialSentimentClassifier()
    elif args.model == "PERCEPTRON":
        model = train_perceptron(train_exs, feat_extractor)
    elif args.model == "LR":
        model = train_logistic_regression(train_exs, feat_extractor)
    else:
        raise Exception("Pass in TRIVIAL, PERCEPTRON, or LR to run the appropriate system")
    return model




###############################################------ Unit Tests ------################################################
    
### UnigramFeatureExtractor ###
#unigram_extractor = UnigramFeatureExtractor(Indexer)
#print(unigram_extractor) 
#print(unigram_extractor.extract_features(["I", "am", "here", ".", "I", "made", "this", ",", "Class"]))
#print(unigram_extractor.feature_vector_size()) 
    
### PerceptronClassifier ###
#train_exs = read_blind_sst_examples("data/train.txt")[0]
    
train_exs = ["I", "am", "here", ".", "I", "made", "this", ",", "Class"]
perceptron = PerceptronClassifier(UnigramFeatureExtractor, train_exs)  
#print(type(perceptron))  
#print(perceptron.feat_extractor)
#print(perceptron.train_exs) # prints first sentence of the dataset
#print(perceptron.corpus_vocab)
#print(perceptron.weight_vector)

print(perceptron.create_corpus_vocab())









#testing = FeatureExtractor()
#testing.echo()

TypeError: extract_features() missing 1 required positional argument: 'sentence'