In [None]:
# Please run this notebook on Google Colab: it uses google.colab to upload the
# data files, and its indentation follows the Colab editor's settings.

In [3]:
# This is used to import files from local system to Google Colab 
from google.colab import files
import joblib

In [4]:
# This is used to upload files to Google Colab
uploaded = files.upload()

Saving 20news-bydate.tar.gz to 20news-bydate.tar.gz


In [None]:
# This is used to extract the tar file
!tar xvzf 20news-bydate.tar.gz

In [8]:
# This is used to extract the mini_newsgroups tar file (the archive must
# already be present in the working directory)
!tar xvzf mini_newsgroups.tar.gz

In [7]:
import string 
import pandas as pd
import numpy as np
import nltk

In [9]:
from os import listdir
from os.path import isfile,join
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
folder_path = '20news-bydate-train'

# NOTE(review): rebinding `files` here shadows the `files` module imported from
# google.colab earlier in the notebook, so the upload cell cannot be re-run
# after this cell without re-running the import cell first.
files = []
files.append([f for f in listdir(folder_path)])

# checking the class folders that were found
print(files)

# texts holds, for each class folder, the list of document file names in it
texts=[]
# documents holds the word tokens of each document
documents = []
# y_train holds the class (folder) name of each document, aligned with documents
y_train = []


for file in files :
    for file_name in file :
        # file_path is the folder of the current class
        file_path = join(folder_path,file_name)
        texts.append(listdir(file_path))
        doc = texts[-1]
        for f in doc :
            # new_path gives us the path of each document
            new_path = join(file_path,f)
            # the with-block closes every file right after it is read, so
            # thousands of handles are not left open; errors='ignore' drops
            # bytes that are not valid utf8 instead of raising
            with open(new_path,encoding="utf8", errors='ignore') as d :
                # lines contains the text in the document
                lines = d.read()
            # word_tokenize converts the text of the document to a list of
            # word tokens
            documents.append(word_tokenize(lines))
            y_train.append(file_name)

[['alt.atheism', 'sci.crypt', 'rec.autos', 'comp.sys.ibm.pc.hardware', 'comp.graphics', 'rec.sport.hockey', 'misc.forsale', 'talk.politics.mideast', 'talk.religion.misc', 'talk.politics.misc', 'soc.religion.christian', 'rec.sport.baseball', 'sci.med', 'comp.windows.x', 'sci.space', 'rec.motorcycles', 'comp.sys.mac.hardware', 'talk.politics.guns', 'comp.os.ms-windows.misc', 'sci.electronics']]


In [12]:
# converting list to pandas
y_train = pd.DataFrame({'col':y_train})

In [13]:
# checking whether y_train is computed correctly or not
print(y_train.iloc[1000,0])

sci.crypt


In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
# storing stopwords in stop
stop = stopwords.words('english')

In [16]:
# adding punctuations to stopwords
punctuations = list(string.punctuation)
stop = stop+punctuations

In [17]:
# Initialising lemmatizer
lemmatizer = WordNetLemmatizer()

In [18]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [19]:
# function to convert POS TAGS to simpler forms so that they can be
# used in lemmatization

def get_simple_pos(tag) :
    """Reduce a detailed POS tag to the WordNet POS constant used for lemmatization.

    The tag is matched on its first letter: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag is treated as a noun for simplicity.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, simple_pos in prefix_to_pos :
        if tag.startswith(prefix) :
            return simple_pos
    # none of the known prefixes matched: fall back to noun
    return wordnet.NOUN

In [20]:
# function to remove stop words and perform lemmatization on one document

def clean_review(words) :
    """Drop stop words / punctuation from a token list and lemmatize the rest.

    Parameters
    ----------
    words : iterable of str
        The tokens of one document.

    Returns
    -------
    list of str
        The lower-cased lemma of every token whose lower-cased form is not in
        the module-level `stop` collection, in the original order.
    """
    # `stop` is a list, so every `in` test against it scans the whole list;
    # a set built once per document gives constant-time lookups instead
    stop_words = set(stop)
    output_words=[]
    for w in words :
        if w.lower() in stop_words :
            continue
        # the word is tagged in its original case on purpose: lower-casing it
        # first would lose some information about its part of speech
        pos = pos_tag([w])
        clean_word = lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
        output_words.append(clean_word.lower())
    return output_words

In [21]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [22]:
# calling clean_review function for each document in order to remove stop words
# from each document and lemmatize the document
documents = [clean_review(document) for document in documents]

In [23]:
print(len(documents))

11314


In [24]:
# converting all the words present in each document to sentences
text_documents = [" ".join(document) for document in documents]

In [25]:
print(text_documents[0])

bdunn cco.caltech.edu brendan dunn subject amusing atheist agnostic organization california institute technology pasadena lines 8 nntp-posting-host punisher.caltech.edu thanks whoever post wonderful parody people post without reading faq laugh good 5 minute part faq n't mention think might one two ... please n't tell n't joke 'm ready hear yet ... brendan


In [26]:
# Build a bag-of-words vocabulary limited to 2000 features (max_features=2000),
# where each feature is a single word or a pair of adjacent words
# (ngram_range=(1,2)).

count_vec = CountVectorizer(max_features = 2000,ngram_range=(1,2))
# a is the sparse count matrix of the training documents: one row per document
# and one column per feature
a = count_vec.fit_transform(text_documents)
# convert the sparse matrix to a dense one so the notebook displays its values
a.todense()

matrix([[0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 1, 0, 1],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [27]:
a.shape

(11314, 2000)

In [28]:
# this is the list of the 2000 feature names learned by count_vec.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(), so prefer the new method and only fall back to the
# old one on releases that do not have it yet.
if hasattr(count_vec, "get_feature_names_out") :
    feature_names = count_vec.get_feature_names_out().tolist()
else :
    feature_names = count_vec.get_feature_names()
feature_names

['00',
 '000',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '0d',
 '0d 0d',
 '0t',
 '0t 0t',
 '10',
 '100',
 '11',
 '12',
 '128',
 '13',
 '14',
 '145',
 '145 145',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1988',
 '1990',
 '1991',
 '1992',
 '1993',
 '1993apr15',
 '1993apr16',
 '1993apr20',
 '1993apr5',
 '1d9',
 '1d9 1d9',
 '1st',
 '1t',
 '20',
 '200',
 '2000',
 '21',
 '22',
 '23',
 '24',
 '25',
 '250',
 '256',
 '26',
 '27',
 '28',
 '29',
 '2di',
 '2di 2di',
 '2nd',
 '2tm',
 '30',
 '300',
 '31',
 '32',
 '33',
 '34',
 '34u',
 '35',
 '36',
 '37',
 '38',
 '386',
 '39',
 '3d',
 '3t',
 '40',
 '400',
 '408',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '486',
 '49',
 '50',
 '500',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '64',
 '65',
 '66',
 '6ei',
 '6um',
 '70',
 '71',
 '72',
 '75',
 '75u',
 '75u 75u',
 '76',
 '7ey',
 '7u',
 '80',
 '800',
 '81',
 '82',
 '85',
 '86',
 '89',
 '90',
 '91',
 '92',
 '93',
 '95',
 '9v',
 '__',
 '__ __',
 '___',
 '___ ___',

In [29]:
folder_path = '20news-bydate-test'

files = []
files.append([f for f in listdir(folder_path)])
# checking the class folders that were found
print(files)

# texts holds, for each class folder, the list of document file names in it
texts=[]
# test_documents holds the word tokens of each test document
test_documents = []
# y_test holds the class (folder) name of each document, aligned with
# test_documents
y_test = []


for file in files :
    for file_name in file :
        # file_path is the folder of the current class
        file_path = join(folder_path,file_name)
        texts.append(listdir(file_path))
        doc = texts[-1]
        for f in doc :
            # new_path gives us the path of each document
            new_path = join(file_path,f)
            # the with-block closes every file right after it is read, so
            # thousands of handles are not left open; errors='ignore' drops
            # bytes that are not valid utf8 instead of raising
            with open(new_path,encoding="utf8", errors='ignore') as d :
                # lines contains the text in the document
                lines = d.read()
            # word_tokenize converts the text of the document to a list of
            # word tokens
            test_documents.append(word_tokenize(lines))
            y_test.append(file_name)

[['alt.atheism', 'sci.crypt', 'rec.autos', 'comp.sys.ibm.pc.hardware', 'comp.graphics', 'rec.sport.hockey', 'misc.forsale', 'talk.politics.mideast', 'talk.religion.misc', 'talk.politics.misc', 'soc.religion.christian', 'rec.sport.baseball', 'sci.med', 'comp.windows.x', 'sci.space', 'rec.motorcycles', 'comp.sys.mac.hardware', 'talk.politics.guns', 'comp.os.ms-windows.misc', 'sci.electronics']]


In [30]:
# converting list to pandas
y_test = pd.DataFrame({'col':y_test})

In [31]:
# checking whether y_test is correctly computed or not
print(y_test.iloc[1000,0])

rec.autos


In [32]:
# calling clean_review function for each document in order to remove stop words
# from each document and lemmatize the document
test_documents = [clean_review(document) for document in test_documents]

In [33]:
print(len(test_documents))
print(y_test.shape)

7532
(7532, 1)


In [34]:
# converting all the words present in each document to sentences
test_text_documents = [" ".join(document) for document in test_documents]

In [35]:
# transform the test documents with the vectorizer that was already fitted on
# the training documents, so the columns are the same training features
x_test_features = count_vec.transform(test_text_documents)

In [36]:
x_test_features.shape

(7532, 2000)

In [37]:
# fit builds and returns a dictionary of word counts: the total over all
# documents, plus, for every class, its total and its count of each feature.

def fit(X_train,Y_train) :
    """Collect the word counts needed by the Naive Bayes classifier.

    Parameters
    ----------
    X_train : pandas.DataFrame
        One row per document and one column per feature; each cell is the
        count of that feature in that document.
    Y_train : pandas.DataFrame
        The class label of each document in its first column, aligned with
        the rows of X_train by position.

    Returns
    -------
    dict
        result["total_data"] is the sum of all counts in X_train. For every
        class c, result[c]["total_count"] is the sum of all counts in the rows
        of class c, and result[c][j] is the sum of the j-th feature column
        over those rows.
    """
    # work on plain arrays: growing a DataFrame one row at a time per class is
    # quadratic, while a boolean mask selects the class rows in one step
    counts = X_train.to_numpy()
    labels = Y_train.iloc[:,0].to_numpy()

    # class_values are the different values that Y_train can take
    class_values = set(Y_train.iloc[:,0])

    result = {}
    # total occurrences of all features in all documents
    result["total_data"] = counts.sum()
    print(class_values)

    num_features = counts.shape[1]
    for current_class in class_values :
        # rows of X_train whose label is current_class
        class_counts = counts[labels == current_class]
        # per-feature occurrences within this class
        feature_sums = class_counts.sum(axis=0)
        class_result = {"total_count": feature_sums.sum()}
        for j in range(num_features) :
            class_result[j] = feature_sums[j]
        result[current_class] = class_result

    return result

In [38]:
# this function returns the log-probability score (log prior plus the sum of
# the log word likelihoods) of the given document x for current_class

def probability(dictionary, x, current_class):
    """Return the log-probability score of document x for current_class.

    Parameters
    ----------
    dictionary : dict
        Count dictionary: dictionary["total_data"] is the corpus-wide count,
        dictionary[c]["total_count"] the count for class c and
        dictionary[c][i] the count of feature i in class c.
    x : iterable of str
        The words of the document. Words that are not features of the
        module-level `count_vec` are ignored.
    current_class : hashable
        Key of the class in `dictionary`.

    Returns
    -------
    float
        log prior of the class plus the sum of the Laplace-smoothed log
        likelihoods of the known words. The prior is estimated from word
        counts (class total / corpus total), not from document counts.
    """
    # vocabulary_ maps every feature string to its column index, so each word
    # is resolved with one dictionary lookup instead of rebuilding the feature
    # list on every call and scanning it twice per word
    vocabulary = count_vec.vocabulary_
    class_counts = dictionary[current_class]
    total_count = class_counts["total_count"]
    # the class dictionary holds one entry per feature plus "total_count";
    # only the features belong in the Laplace denominator
    num_features = sum(1 for key in class_counts if key != "total_count")
    log_den = np.log(total_count + num_features)

    # here i have used logarithmic probability
    output = np.log(total_count) - np.log(dictionary["total_data"])
    for word in x :
        index = vocabulary.get(word)
        # find the probability of a word only if it is present in features
        if index is None :
            continue
        # 1 is added for laplace correction
        num = class_counts[int(index)] + 1
        output = output + (np.log(num) - log_den)
    return output

In [39]:
# this function finds and returns the most probable class for a single test
# document x
def predictSinglePoint(dictionary, x):
    """Return the class whose score for document x is the highest.

    Every key of `dictionary` except "total_data" is treated as a class and
    scored with probability(). On equal scores the class seen first is kept.
    Returns -1 when the dictionary contains no class.
    """
    best_class = -1
    # None marks "no class scored yet", so the first class is always accepted
    best_p = None
    for candidate in dictionary:
        if candidate == "total_data":
            continue
        score = probability(dictionary, x, candidate)
        if best_p is None or score > best_p:
            best_p, best_class = score, candidate
    return best_class

In [50]:
def predict(dictionary, X_test, verbose=False):
    """Predict a class for every document in X_test.

    Parameters
    ----------
    dictionary : dict
        Count dictionary passed on to predictSinglePoint().
    X_test : iterable
        The documents to classify; each one is passed unchanged to
        predictSinglePoint().
    verbose : bool, optional
        When True, print the predicted class and the running index of every
        document. Off by default so a large test set does not flood the
        notebook output with one line per document.

    Returns
    -------
    list
        The predicted class of each document, in input order.
    """
    y_pred = []
    for count, x in enumerate(X_test):
        # for each point we predict the class and append it to y_pred
        x_class = predictSinglePoint(dictionary, x)
        if verbose:
            print(x_class,count)
        y_pred.append(x_class)
    return y_pred

In [41]:
print(a.shape)
print(y_train.shape)

(11314, 2000)
(11314, 1)


In [42]:
# converting the sparse matrix a to dataframe; the isinstance guard keeps the
# cell safe to re-run, because a DataFrame has no toarray() method
if not isinstance(a, pd.DataFrame) :
    a = pd.DataFrame(a.toarray())

In [43]:
# calling the fit function on a and y_train
dictionary = fit(a,y_train)

{'rec.sport.baseball', 'talk.politics.mideast', 'rec.autos', 'talk.religion.misc', 'sci.crypt', 'comp.graphics', 'talk.politics.misc', 'soc.religion.christian', 'comp.sys.ibm.pc.hardware', 'comp.windows.x', 'sci.space', 'misc.forsale', 'comp.sys.mac.hardware', 'talk.politics.guns', 'comp.os.ms-windows.misc', 'rec.sport.hockey', 'sci.electronics', 'sci.med', 'rec.motorcycles', 'alt.atheism'}


In [44]:
z='rec.sport.baseball'
print(dictionary[z]["total_count"])

54907


In [None]:
# calling the  predict function on dictionary and test_documents
Y_pred = predict(dictionary,test_documents)

In [52]:
print(len(Y_pred))
print(len(y_test))

7532
7532


In [53]:
# classification report and confusion matrix for the hand-written Naive Bayes
# implementation (Y_pred comes from predict(dictionary, test_documents))
print(classification_report(y_test,Y_pred))
print(confusion_matrix(y_test,Y_pred))
# Author's recorded result for this run: an accuracy of 0.73.
# NOTE(review): the accuracy row is not visible in the saved output - confirm.

                          precision    recall  f1-score   support

             alt.atheism       0.65      0.55      0.59       319
           comp.graphics       0.49      0.71      0.58       389
 comp.os.ms-windows.misc       0.50      0.00      0.01       394
comp.sys.ibm.pc.hardware       0.46      0.60      0.52       392
   comp.sys.mac.hardware       0.60      0.68      0.63       385
          comp.windows.x       0.67      0.66      0.66       395
            misc.forsale       0.76      0.75      0.76       390
               rec.autos       0.74      0.81      0.77       396
         rec.motorcycles       0.82      0.85      0.83       398
      rec.sport.baseball       0.77      0.80      0.79       397
        rec.sport.hockey       0.94      0.80      0.86       399
               sci.crypt       0.86      0.82      0.84       396
         sci.electronics       0.59      0.64      0.61       393
                 sci.med       0.76      0.68      0.72       396
         

In [55]:
# Baseline for comparison: the Multinomial Naive Bayes already implemented in
# scikit-learn, trained on the same count features.
clf = MultinomialNB()
# pass the labels as a 1-D Series: fitting on the one-column DataFrame makes
# scikit-learn emit a column_or_1d conversion warning
clf.fit(a, y_train.iloc[:,0])
# to persist the fitted model: joblib.dump(clf, 'finalized_model.sav')
Y_pred = clf.predict(x_test_features)
print(classification_report(y_test,Y_pred))
print(confusion_matrix(y_test,Y_pred))
# Author's recorded result for this run: an accuracy of 0.88.
# NOTE(review): the per-class recalls visible in the saved output are all at
# or below 0.89 - re-check this figure against the report's accuracy row.

  y = column_or_1d(y, warn=True)


                          precision    recall  f1-score   support

             alt.atheism       0.65      0.66      0.65       319
           comp.graphics       0.48      0.72      0.58       389
 comp.os.ms-windows.misc       0.50      0.00      0.01       394
comp.sys.ibm.pc.hardware       0.45      0.59      0.51       392
   comp.sys.mac.hardware       0.57      0.69      0.62       385
          comp.windows.x       0.72      0.66      0.69       395
            misc.forsale       0.65      0.83      0.73       390
               rec.autos       0.67      0.79      0.72       396
         rec.motorcycles       0.66      0.89      0.76       398
      rec.sport.baseball       0.74      0.80      0.77       397
        rec.sport.hockey       0.93      0.75      0.83       399
               sci.crypt       0.91      0.78      0.84       396
         sci.electronics       0.57      0.61      0.59       393
                 sci.med       0.81      0.63      0.71       396
         