In [3]:
from math import log
import glob
from collections import Counter
import numpy as np
from nltk import SnowballStemmer, WordNetLemmatizer
import pandas as pd
import scipy.sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
stop_words = stopwords.words('english')

In [4]:
def preprocess(text):
    token = RegexpTokenizer(r'[a-zA-Z]+')
    wnl = WordNetLemmatizer()
    ps = SnowballStemmer('english')
    
    tokens = token.tokenize(text.lower())
    
    tokens_l = [wnl.lemmatize(token) for token in tokens]
    
    tokens_ls = [ps.stem(token) for token in tokens_l]
    
    return tokens_ls

In [5]:
train = pd.read_csv('reddit-comment-classification-comp-551/reddit_train.csv')
test = pd.read_csv('reddit-comment-classification-comp-551/reddit_test.csv')

In [6]:
y_labels_train = train['subreddits'].values
x_comments_train = train['comments'].values
printable='0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ \n\''

x_comments_test = test['comments'].values


In [7]:
labels = np.unique(y_labels_train)
labels_to_index = {w: int(np.where(labels==w)[0]) for w in labels}
index_to_labels={v:k for k,v in labels_to_index.items()}
train['labels'] = train['subreddits'].map(labels_to_index)

In [8]:
y = train['labels'].values

In [9]:
cv_unigram = CountVectorizer(lowercase=True,
                     stop_words=stop_words,
                     tokenizer = preprocess,
                    binary=True,min_df=2)




In [10]:
x_train_unigram = cv_unigram.fit_transform(x_comments_train)

X_train, X_test, y_train, y_test = train_test_split(x_train_unigram, y, test_size=0.1, random_state=42)
# x_train_bigram = cv_bigram.fit_transform(x_comments_train)
# x_train_unigram_tfidf = cv_unigram_tfidf.fit_transform(x_comments_train)
# x_train_bigram_tfidf = cv_bigram_tfidf.fit_transform(x_comments_train)


features_unigram = cv_unigram.vocabulary_
# features_bigram = cv_bigram.vocabulary_

  'stop_words.' % sorted(inconsistent))


In [11]:
class BernoulliNB(object):

    def __init__(self,smoothing=True,alpha=0.1):
        self.class_probs = None
        self.feat_probs = None
        self.smoothing = smoothing
        self.alpha=alpha
        self.class_counts=None
    def fit(self, X, y):
        """Train a Bernoulli naive Bayes classifier

        Args:
            X (array or sparse matrix): Each element in this array
                is a feature vector of text
            labels (array): The ground truth label for
                each document
        """

        """Compute log( P(Y) )
        """
        self.class_counts = Counter(y)
        K = len(self.class_counts)
        denominator = float(len(y))
        self.class_probs = np.array([log(v/denominator) for _, v in self.class_counts.items()])

        """Compute log( P(X|Y) )

           Use Laplace smoothing
           n1 + 1 / (n1 + n2 + 2)
        """
        feats=X.shape[1]
        
        self.feat_probs = np.zeros((K,feats),dtype=float)
       
        # Step through each document
        for idx,i in enumerate(y):
            
            for f in X[idx].indices:
                
                self.feat_probs[i][f] += 1.
        
        for i in self.class_counts.keys():
            N = float(self.class_counts[i])
            if self.smoothing==True:
                self.feat_probs[i] = (self.feat_probs[i]+self.alpha)/(N +2.)
            else:
                self.feat_probs[i][:] = (self.feat_probs[i][:])/(N)
            
   
        self.log_feat_probs = np.log(self.feat_probs)
        self.one_minus_log_feat_probs = np.log(1 - self.feat_probs)
        
    def predict(self, X):
        """Make a prediction from text
        """

        y = np.zeros(X.shape[0],dtype=float)

        ones = np.ones(X.shape)        
        temp = X.dot(self.log_feat_probs.transpose()) + (ones - X).dot((self.one_minus_log_feat_probs).transpose())
         
        scores = temp + self.class_probs.transpose()
        pred = np.argmax(scores, axis=1)
 
        return np.array(pred.squeeze())

In [12]:
clf=BernoulliNB()
clf.fit(X_train,y_train)

In [13]:
# print(X_test)
pred=clf.predict(X_test)

In [14]:
print((pred==y_test).sum()/len(y_test))

0.5102857142857142
