<a href="https://colab.research.google.com/github/paddy-03/20Newsgroup-Classification/blob/master/20newsgroups.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **20 News groups classification**

In [0]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

In [0]:
os.chdir('/home/swarup03/Study_Material/Cognizance codes/20newsgroups')
folders = os.listdir()
path = str(os.getcwd())
folders = sorted(folders) # consists of all the 20 groups is alphabetical order

In [0]:
word_dict={}  # Used for storing all the words that we encounter in our documents

In [0]:
stop_words = stopwords.words('english') # list of stopwords

In [0]:
data = []  # Used to store all our data, we will use this to make X_train
y_train = [] # for y_train 
m=0 # denoting the folder number or group number

# Extracting all the words from text files

In [0]:
for folder in folders:
    #print(folder)
    
    files = os.listdir(os.chdir(str(path) + '/'+folder))
    new_path = os.getcwd() #path for each sub-folder
    os.chdir(path)
    for file in files:
       
        try:
            
            
            reader = open(new_path+'/'+file,'r',errors='ignore')
            text=reader.read()
            text=text.lower() # text consists of the content
    
            tokenizer = RegexpTokenizer(r'\w+') # filtering out all punctuations
            tokens = tokenizer.tokenize(text) # tokens consists of all the words in the text
            
            wordnet_lemmatizer = WordNetLemmatizer() # grouping together diff types of same word
            tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # eg. write and writes 
                     
            # Filtering out all stopwords, meaning less words (1993apr) and words less than 5 alphabets
            tokens=[w for w in tokens if len(w)>=4 and w not in stop_words and w.isalpha()]
            
            data.append(tokens) # will be used for making X_train
            y_train.append(m)
            
            counter = nltk.Counter(tokens) # storing count of words
            
            for word in counter.keys():
                if word not in word_dict.keys():
                    word_dict[word]=counter[word]
                else:
                    word_dict[word]+=counter[word]
            
        except:
            continue 
    m+=1
    
        
     

In [0]:
len(data)

16315

# Extracting out top 2000 words with ignoring the first 15 which occur in almost each document

In [0]:
import operator
vocabulary=sorted(word_dict.items(),key=operator.itemgetter(1),reverse=True)[15:2000]


In [0]:
vocabulary = [x[0] for x in vocabulary]
#print(vocabulary)
# words in vocabulary will be used as features

# Formatting our dataset

In [0]:
X_train = np.zeros((len(data),len(vocabulary)))
index=0
vocab_index=0

In [0]:
for doc in data:
    for word in doc:
        vocab_index=0
        for feature in vocabulary:
            if feature == word:
                X_train[index][vocab_index]+=1
            vocab_index+=1
    index+=1
            

In [0]:
len(X_train),len(y_train)

(16315, 16315)

# Preparing our Test Data

In [0]:
os.chdir('/home/swarup03/Study_Material/Cognizance codes/test20ng')
folders = os.listdir()
folders=[int(i) for i in folders if i.isdigit()]
path = str(os.getcwd())
folders = sorted(folders)
#folders

In [0]:
test_data = []
y_test = []
m=0 

In [0]:
for folder in folders:
    #print(folder)
    
    files = os.listdir(os.chdir(str(path) + '/'+str(folder)))
    new_path = os.getcwd()
    os.chdir(path)
    for file in files:
        try:
                 
            reader = open(new_path+'/'+file,'r',errors='ignore')
            
            text=str(reader.read())
            text=text.lower()
        
            tokenizer = RegexpTokenizer(r'\w+') # filtering out all punctuations
            tokens = tokenizer.tokenize(text)
            
            wordnet_lemmatizer = WordNetLemmatizer() # grouping together diff types of same word
            tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # eg. write and writes 
            
            tokens=[w for w in tokens if len(w)>=4 and w not in stop_words and w.isalpha()]
            
            test_data.append(tokens)
            y_test.append(m)
            
        except:
            continue
    m+=1
     

In [0]:
X_test = np.zeros((len(test_data),len(vocabulary)))
index=0
vocab_index=0

In [0]:
for doc in test_data:
    for word in doc:
        vocab_index=0
        for feature in vocabulary:
            if feature == word:
                X_test[index][vocab_index]+=1
            vocab_index+=1
    index+=1
            

# Implementing MNB (sklearn)

In [0]:
clf = MultinomialNB(alpha=1)
clf.fit(X_train,y_train)


MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [0]:
clf.score(X_test,y_test)

0.8465507876154263

In [0]:
clf.score(X_train,y_train)

0.8889365614465216

In [0]:
y_pred_clf = clf.predict(X_test)
print(classification_report(y_test,y_pred_clf))

             precision    recall  f1-score   support

          0       0.75      0.87      0.81       168
          1       0.86      0.74      0.80       171
          2       0.80      0.85      0.82       192
          3       0.64      0.73      0.68       190
          4       0.69      0.75      0.72       176
          5       0.89      0.79      0.84       175
          6       0.89      0.93      0.91       177
          7       0.80      0.88      0.84       174
          8       0.89      0.91      0.90       182
          9       0.94      0.94      0.94       198
         10       0.95      0.94      0.94       200
         11       0.94      0.97      0.96       171
         12       0.88      0.85      0.86       207
         13       0.95      0.84      0.89       175
         14       0.95      0.92      0.93       198
         15       0.96      0.99      0.98       200
         16       0.85      0.87      0.86       170
         17       0.90      0.89      0.89   

# Saving the training and testing data in csv format

In [0]:
os.chdir('/home/swarup03/Study_Material/Cognizance codes')

In [0]:
np.savetxt('X_train.csv',X_train,delimiter=',',fmt='%1.0f')

In [0]:
np.savetxt('Y_train.csv',y_train,fmt='%1.0f')

In [0]:
np.savetxt('X_test.csv',X_test,delimiter=',',fmt='%1.0f')

In [0]:
np.savetxt('Y_test.csv',y_test,fmt='%1.0f')

# My own Implementation of MNB 

In [0]:
y_train=np.array(y_train)
y_test=np.array(y_test)

In [0]:
def probability(x,c,res):
    output= np.log(res[c]['total_count']) - np.log(res['total_data'])
    num_features = len(res[c].keys())-2
    for j in range(num_features):
        if x[j] > 0:
            output+= np.log(x[j]) + np.log(res[c][j]+1) - np.log(res[c]['total_words']+num_features)
     
    return output
    

In [0]:
def predictsinglepoint(x,res):
    classes = res.keys()
    best_p=-1
    best_class=-1
    first_run = True
    for c in classes:
        if c!= 'total_data':
           # print(c)
            p_current = probability(x,c,res)
            #print(res[c])
            #print('%.20f'%p_current)
            if(first_run or p_current>best_p):
                best_p = p_current
                best_class = c
            #print(best_p)
            first_run=False
    return best_class

In [0]:
def predict(X,res):
    y_pred=[]
    
    for x in X:
        #print(x[:10])
        x_class = predictsinglepoint(x,res)
        y_pred.append(x_class)
    return y_pred

In [0]:
def fit(X,y):
    res={}
    classes = set(y)
    res['total_data']=len(y)
    for c in classes:
        res[c]={}
        #print(y==0)
        c_rows = (y==c)
        X_current = X[c_rows]
        y_current = y[c_rows]
        num_features=X.shape[1]
        res[c]['total_words']=0
        res[c]['total_count']=len(y_current)
        for j in range(num_features):
            res[c][j]=X_current[:,j].sum()
            res[c]['total_words']+=res[c][j]
    return res
    

In [0]:
res=fit(X_train,y_train)

In [0]:
y_pred = predict(X_test,res)

In [0]:
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.70      0.91      0.79       168
          1       0.82      0.75      0.78       171
          2       0.83      0.80      0.81       192
          3       0.66      0.72      0.69       190
          4       0.65      0.80      0.71       176
          5       0.91      0.78      0.84       175
          6       0.91      0.94      0.93       177
          7       0.86      0.89      0.87       174
          8       0.89      0.92      0.91       182
          9       0.96      0.97      0.97       198
         10       0.98      0.97      0.98       200
         11       0.96      0.96      0.96       171
         12       0.84      0.90      0.87       207
         13       0.94      0.83      0.88       175
         14       0.96      0.86      0.91       198
         15       0.98      0.99      0.99       200
         16       0.83      0.92      0.87       170
         17       0.92      0.90      0.91   

In [0]:
def score(y,y_pred):
    ans=0
    for i in range(len(y_pred)):
        if y_pred[i]==y_test[i]:
            ans+=1
        
    return ans/len(y_pred)

In [0]:
score(y_test,y_pred)

0.8511678435632808

# **Comparison**

## **1) Accuracy score**

### a) MNB sklearn

Score = **0.8465507876154263**

### b) MNB (own implementation)

Score = **0.8511678435632808**

## **2) Classification Report**

### a) MNB (sklearn)

      precision    recall  f1-score   support

          0       0.75      0.87      0.81       168
          1       0.86      0.74      0.80       171
          2       0.80      0.85      0.82       192
          3       0.64      0.73      0.68       190
          4       0.69      0.75      0.72       176
          5       0.89      0.79      0.84       175
          6       0.89      0.93      0.91       177
          7       0.80      0.88      0.84       174
          8       0.89      0.91      0.90       182
          9       0.94      0.94      0.94       198
         10       0.95      0.94      0.94       200
         11       0.94      0.97      0.96       171
         12       0.88      0.85      0.86       207
         13       0.95      0.84      0.89       175
         14       0.95      0.92      0.93       198
         15       0.96      0.99      0.98       200
         16       0.85      0.87      0.86       170
         17       0.90      0.89      0.89       175
         18       0.75      0.66      0.70       174
         19       0.68      0.61      0.65       209

    avg / total   0.85      0.85      0.85      3682


### b) MNB (own implementation)

      precision    recall  f1-score   support

          0       0.70      0.91      0.79       168
          1       0.82      0.75      0.78       171
          2       0.83      0.80      0.81       192
          3       0.66      0.72      0.69       190
          4       0.65      0.80      0.71       176
          5       0.91      0.78      0.84       175
          6       0.91      0.94      0.93       177
          7       0.86      0.89      0.87       174
          8       0.89      0.92      0.91       182
          9       0.96      0.97      0.97       198
         10       0.98      0.97      0.98       200
         11       0.96      0.96      0.96       171
         12       0.84      0.90      0.87       207
         13       0.94      0.83      0.88       175
         14       0.96      0.86      0.91       198
         15       0.98      0.99      0.99       200
         16       0.83      0.92      0.87       170
         17       0.92      0.90      0.91       175
         18       0.77      0.70      0.73       174
         19       0.73      0.53      0.61       209

    avg / total   0.86      0.85      0.85      3682