#### Text Classification Comparisons

The objective of this notebook is to compare various text classification models. The goal each model is to classify input texts as one of the ten categories. We choose the model that maximizes the accuracy of classification prediction 

- Naive Bayesian Classifier

- Support Vector Machine (optimized through Stochastic Gradient Descent)

- Logistic Regression Classifier



In [0]:
# Packages
import pandas as pd
import math, scipy, numpy
import re
import os
from os import listdir
from os.path import isfile, join
from numpy import random
import time

# used in the count of words/
import string

# natural language tool kit
import nltk.data 

# for tokenizing sentences according by the words
from nltk.tokenize import sent_tokenize, word_tokenize # $ pip install nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
nltk.download('punkt')
nltk.download('stopwords')

# Metrics and Feature Extraction
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Pipeline 
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
from sklearn.externals import joblib



In [0]:
# Run this cell to mount your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Loading Data Files

Puts essays into data frame with associated labels

In [0]:
'''
    For the given path, get the List of all files in the directory tree 
    From: https://thispointer.com/python-how-to-get-list-of-files-in-directory-and-sub-directories/
'''
def getListOfFiles(dirName):
    # create a list of file and sub directories 
    # names in the given directory 
    listOfFile = os.listdir(dirName)
    allFiles = list()
    # Iterate over all the entries
    for entry in listOfFile:
        # Create full path
        fullPath = os.path.join(dirName, entry)
        # If entry is a directory then get the list of files in this directory 
        if os.path.isdir(fullPath):
            allFiles = allFiles + getListOfFiles(fullPath)
        else:
            allFiles.append(fullPath)
                
    return allFiles

In [0]:
tags = ['business',
 'education',
 'entertainment',
 'health',
 'ideas',
 'international',
 'politics',
 'science',
 'technology']

In [0]:

start_time = time.time()
# Creating list of essays and associated labels for Atlantic essays
essay = [] 
label1 = [] # word label
for k in range(len(tags)):
  #name of directory
  
    dirname = '/content/drive/My Drive/writrly_proj_files/Atlantic_essays/'+tags[k] #gdrive implementation
    #dirname = './Atlantic_subj_essays/'+tags[k]

    # getting raw list of files
    raw_file_list = getListOfFiles(dirname)

    # eliminating the .DS Store files
    raw_file_list = [x for x in raw_file_list if not ('.DS_Store' in x)];
    
    for elem in raw_file_list:   

        with open(elem, 'r') as file:
            data = file.read().replace('\n\n', '   ')
            essay.append(data)
            label1.append(tags[k])
    
    
# Creating list of short stories 
tag_short = 'short-story'
  
dirname = '/content/drive/My Drive/writrly_proj_files/short_stories/' #gdrive implementation
#dirname = './Atlantic_subj_essays/'+tags[k]

# getting raw list of files
raw_file_list = getListOfFiles(dirname)

# eliminating the .DS Store files
raw_file_list = [x for x in raw_file_list if not ('.DS_Store' in x)];

for elem in raw_file_list:   

  with open(elem, 'r') as file:
      data = file.read().replace('\n\n', '   ')
      essay.append(data)
      label1.append(tag_short)
            
print('Run Time:', str(time.time()-start_time), ' sec')

Run Time: 2.501847982406616  sec


In [0]:
#Checking length
len(essay)

2174

In [0]:
## cleaning text

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text

In [0]:
## cleaning essay text
essay = [clean_text(elem) for elem in essay]

In [0]:
# create a dataframe using essays and labels
masterDF = pd.DataFrame()
masterDF['essay'] = essay
masterDF['topic'] = label1

In [0]:
# save data frame to csv [already saved]
# masterDF.to_csv('master_df.csv')

In [0]:
# Split data into training and test sets 
train_x, test_x, train_y, test_y = train_test_split(masterDF['essay'], masterDF['topic'], random_state = 42)

In [0]:
# classes for classifier keyed on index
reverse_encode = ['business',
 'education',
 'entertainment',
 'health',
 'ideas',
 'international',
 'politics',
 'science',
 'short-story',
 'technology']

### Naive Bayes Classifier

In [0]:
# Naive bayesian classifier pipeline
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(train_x, train_y)

# %%time
pred_y = nb.predict(test_x)

print('accuracy %s' % accuracy_score(pred_y, test_y))
print(classification_report(test_y, pred_y,target_names=reverse_encode))

accuracy 0.6525735294117647
               precision    recall  f1-score   support

     business       0.72      0.49      0.58        47
    education       0.57      0.95      0.72        62
entertainment       0.96      0.73      0.83        62
       health       1.00      0.21      0.35        62
        ideas       0.30      0.56      0.40        57
international       1.00      0.03      0.07        29
     politics       0.61      0.87      0.72        55
      science       0.72      0.84      0.78        56
  short-story       1.00      0.96      0.98        47
   technology       0.78      0.63      0.69        67

     accuracy                           0.65       544
    macro avg       0.77      0.63      0.61       544
 weighted avg       0.75      0.65      0.63       544



### Linear Support Vector Machine

In [0]:
# Support vector machine pipeline
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(train_x, train_y)
pred_y = sgd.predict(test_x)

print('accuracy %s' % accuracy_score(pred_y, test_y))
print(classification_report(test_y, pred_y,target_names=reverse_encode))

accuracy 0.7996323529411765
               precision    recall  f1-score   support

     business       0.72      0.72      0.72        47
    education       0.87      0.85      0.86        62
entertainment       0.78      0.90      0.84        62
       health       0.85      0.66      0.75        62
        ideas       0.73      0.42      0.53        57
international       0.83      0.86      0.85        29
     politics       0.75      0.91      0.82        55
      science       0.77      0.96      0.86        56
  short-story       0.90      1.00      0.95        47
   technology       0.80      0.76      0.78        67

     accuracy                           0.80       544
    macro avg       0.80      0.81      0.80       544
 weighted avg       0.80      0.80      0.79       544



### Multiclass Logistic Regression

In [0]:
# Multiclass logistic regression pipeline
logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(solver = 'liblinear', 
                                           multi_class = 'ovr' ,
                                           n_jobs=1, C=1e5)),
               ])
logreg.fit(train_x, train_y)
pred_y = logreg.predict(test_x)

print('accuracy %s' % accuracy_score(pred_y, test_y))
print(classification_report(test_y, pred_y,target_names=reverse_encode))

accuracy 0.8308823529411765
               precision    recall  f1-score   support

     business       0.70      0.79      0.74        47
    education       0.93      0.82      0.87        62
entertainment       0.82      0.90      0.86        62
       health       0.89      0.76      0.82        62
        ideas       0.71      0.60      0.65        57
international       0.81      0.86      0.83        29
     politics       0.84      0.89      0.87        55
      science       0.84      0.95      0.89        56
  short-story       0.94      1.00      0.97        47
   technology       0.82      0.79      0.80        67

     accuracy                           0.83       544
    macro avg       0.83      0.84      0.83       544
 weighted avg       0.83      0.83      0.83       544

