In [1]:
import numpy as np
import pandas as pd
import random, unicodedata, re
pd.options.mode.chained_assignment = None

In [15]:
!pwd

/Users/Matteo/projects/fundamentals-of-ml/kaggle/arxiv_classification/notebooks


In [20]:
def load_data():
    """Loads data from hardcoded directory"""
    train = pd.read_csv("~/projects/fundamentals-of-ml/kaggle/arxiv_classification/data/train.csv")
    test = pd.read_csv("~/projects/fundamentals-of-ml/kaggle/arxiv_classification/data/test.csv")
    return train, test

def split_training_set(df, ratio=0.7):
    """Splits training set into train and validation at a ratio of 70/30"""
    df.sample(frac=1)
    train = df[:int(ratio*df.shape[0])]
    validation = df[int(ratio*df.shape[0]):]
    return train, validation

In [21]:
# Data Cleanup
def process(df, t):
    df[t] = df[t].apply(lambda x : re.sub('[0-9]', '', x))
    df[t] = df[t].apply(lambda x : re.sub('\[[^]]*\]', '', x))
    df[t] = df[t].apply(lambda x : re.sub("<$\.*?>", '', x))
    df[t] = df[t].apply(lambda x : re.sub('(\${1,2})(?:(?!\1)[\s\S])*\1', "", x))
    df[t] = df[t].apply(lambda x : re.sub("\n", " ", x))
    df[t] = df[t].apply(lambda x : x.lower())
    df[t] = df[t].apply(lambda x : x.strip())
    
    
    # remove stop words
    stopwords = ["a", "about", "above", "after", "again", "against", "ain", "all", "am", "an", 
                 "and", "any", "are", "aren", "aren't", "as", "at", "be", "because", "been", 
                 "before", "being", "below", "between", "both", "but", "by", "can", "couldn", 
                 "couldn't", "d", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", 
                 "doing", "don", "don't", "down", "during", "each", "few", "for", "from", 
                 "further", "had", "hadn", "hadn't", "has", "hasn", "hasn't", "have", "haven", 
                 "haven't", "having", "he", "her", "here", "hers", "herself", "him", "himself", 
                 "his", "how", "i", "if", "in", "into", "is", "isn", "isn't", "it", "it's", "its", 
                 "itself", "just", "ll", "m", "ma", "me", "mightn", "mightn't", "more", "most", 
                 "mustn", "mustn't", "my", "myself", "needn", "needn't", "no", "nor", "not", "now", 
                 "o", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", 
                 "out", "over", "own", "re", "s", "same", "shan", "shan't", "she", "she's", "should", 
                 "should've", "shouldn", "shouldn't", "so", "some", "such", "t", "than", "that", 
                 "that'll", "the", "their", "theirs", "them", "themselves", "then", "there", "these", 
                 "they", "this", "those", "through", "to", "too", "under", "until", "up", "ve", 
                 "very", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "when", 
                 "where", "which", "while", "who", "whom", "why", "will", "with", "won", "won't", 
                 "wouldn", "wouldn't", "y", "you", "you'd", "you'll", "you're", "you've", "your", 
                 "yours", "yourself", "yourselves", "could", "he'd", "he'll", "he's", "here's", 
                 "how's", "i'd", "i'll", "i'm", "i've", "let's", "ought", "she'd", "she'll", "that's", 
                 "there's", "they'd", "they'll", "they're", "they've", "we'd", "we'll", "we're", "we've", 
                 "what's", "when's", "where's", "who's", "why's", "would"]
    df[t] = df[t].apply(lambda text: ["".join(w) for w in text.split(" ") if w not in stopwords])
    return df

In [26]:
class NBBernoulliClassifier:
    
    def __init__(self, X_train, y_train, X_test, y_test):
        
        # Data related attributes
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        
        # Other useful attributes
        self.bin_counts = {}
        self.vocab = []
        self.priors = {}
    
    def get_vocabulary(self):
        for wordstring in self.X_train['Abstract']:
            for word in wordstring:
                if word not in self.vocab:
                    self.vocab.append(word)
    
    def get_priors(self):
        self.unique_classes = set(self.y_train)
        for cat in self.unique_classes:
            self.priors[cat] = list(self.y_train).count(cat)/self.y_train.shape[0]
    
    def get_binary_dict(self):  
        for cat in self.unique_classes:
            self.bin_counts[cat] = {}
        
            for word in self.vocab:
                self.bin_counts[cat].update({word: 0})
        
        # Here we perform a step of getting conditional probabilities which is to get the sum of indicator values of each words in each category.
        for obs_id, wordstring, cat in zip(self.X_train['Id'], self.X_train['Abstract'], self.y_train):                
            for word in wordstring:
                if word in self.vocab:
                    self.bin_counts[cat][word] += 1
    
    def get_conditional_probs(self, alpha=1):
        cat_idx = {}
        
        # Get indices of observations in class cat.
        for cat in self.unique_classes:
            cat_idx[cat] = self.y_train.index[self.y_train == cat].tolist()
        
        # Get num and denom for p(w|C) calculation.
        for cat, worddict in self.bin_counts.items():
            for key in worddict.keys():
                self.bin_counts[cat][key] = self.bin_counts[cat][key] + alpha
                self.bin_counts[cat][key] /= (len(cat_idx[cat]) + alpha*len(self.unique_classes))

    def get_unique_wordlist(self):
        all_values_nested = [[k for k in self.bin_counts[cat].keys()] for cat in self.unique_classes]
        self.unique_wordlist = set([i for nl in all_values_nested for i in nl])
    
    def get_posteriors(self, test_case, alpha):
        """Get posterior for a single test case."""
        self.posteriors = {}
        cat_idx = {}
        
        # Get indices of observations in class cat.
        for cat in self.unique_classes:
            cat_idx[cat] = self.y_train.index[self.y_train == cat].tolist()
        
        self.posteriors[cat] = {}
        for cat in self.unique_classes:
            prob = 0
            
            # Get prior
            prior = self.priors[cat]
            
            # Sum prior and log conditional probs
            for test_word in test_case:
                if test_word not in self.unique_wordlist:
                    prob *= alpha / (len(cat_idx[cat]) + alpha*len(self.unique_classes))
                else:
                    prob *= self.bin_counts[cat][test_word]
                
            self.posteriors[cat] = prob
    
    def train(self):
        nbb.get_vocabulary()
        nbb.get_priors()
        nbb.get_binary_dict()
        nbb.get_conditional_probs()
    
    def predict(self, X, alpha):
        preds = []
        self.get_unique_wordlist()
        for sample in X["Abstract"]:
            self.get_posteriors(sample, alpha)
            preds.append(max(self.posteriors, key=self.posteriors.get))
        X["Pred_Category"] = preds
    
    def error_rate(self, y_pred, y_true):
        er = round(1 - sum(y_pred == y_true)/len(y_pred), 9)
        print(f"Error Rate = {er}")

In [27]:
%%time
train, test = load_data()
train, validation = split_training_set(train)

train = process(train, 'Abstract')
validation = process(validation, 'Abstract')
test = process(test, 'Abstract')

CPU times: user 12.6 s, sys: 221 ms, total: 12.8 s
Wall time: 13.9 s


In [28]:
%%time
nbb = NBBernoulliClassifier(X_train=train[["Id", "Abstract"]], y_train=train["Category"],
                            X_test=validation[["Id", "Abstract"]], y_test=validation["Category"])
nbb.train()

CPU times: user 3min 39s, sys: 1.47 s, total: 3min 41s
Wall time: 3min 55s


In [29]:
for a in [0.001, 0.01, 0.05, 0.1, 0.3, 0.5]:
    nbb.predict(nbb.X_test, alpha=a)
    print(f"alpha: {a}")
    nbb.error_rate(y_pred=nbb.X_test['Pred_Category'], y_true=nbb.y_test)

alpha: 0.001
Error Rate = 0.488444444
alpha: 0.01
Error Rate = 0.488444444
alpha: 0.05
Error Rate = 0.488444444
alpha: 0.1
Error Rate = 0.488444444
alpha: 0.3
Error Rate = 0.488444444
alpha: 0.5
Error Rate = 0.488444444
