## Import all necessary libraries

In [33]:
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import  CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.svm import SVC
import numpy as np
from sklearn import metrics
from sklearn.metrics import classification_report
import re
from sklearn.linear_model import LogisticRegression
import glob
import shutil
import xlrd
import os
from io import StringIO

## Data

* Create a list of investment-related documents and save it to a text file

In [34]:
# Read the sheet names of the investment workbook. The workbook is opened with
# on_demand=True, so its underlying resources stay allocated until they are
# released explicitly; release them as soon as the names have been read.
xls = xlrd.open_workbook(r'KPI_extraction_file.xlsx', on_demand=True)
try:
    sheets = xls.sheet_names()
finally:
    xls.release_resources()

# Write one sheet name per line into the text file.
with open(r"investment_related.txt", "w") as myF:
    myF.writelines(sheet_name + "\n" for sheet_name in sheets)

* Create a labelled dataset to train the classifier

In [35]:
# Build the labelled dataset: every text file whose name (without extension)
# is listed in investment_related.txt is copied to ./data/1/, every other text
# file is copied to ./data/0/.
source_dir = "./TextFilesAllPapers/"
destination0 = "./data/0/"
destination1 = "./data/1/"

# shutil.copy does not create missing folders, so make sure both targets exist.
os.makedirs(destination0, exist_ok=True)
os.makedirs(destination1, exist_ok=True)

# gets list of all files in the folder
allFiles = os.listdir(source_dir)

# gets a list of filenames where the document is investment related.
# The file holds one name per line; reading it line by line keeps names that
# contain spaces intact. '.pdf' is dropped so the names can be compared with
# the names of the text files.
with open('investment_related.txt', 'r') as file:
    true_files = [
        line.strip().replace('.pdf', '')
        for line in file.read().splitlines()
        if line.strip()
    ]

# A set gives constant-time membership tests inside the loop below.
true_names = set(true_files)

# divides investment related and non related files in 2 different folders
for file_name in allFiles:
    source_path = os.path.join(source_dir, file_name)
    # Only regular .txt files are documents; skip sub-folders and other files.
    if not os.path.isfile(source_path) or not file_name.endswith(".txt"):
        continue
    # splitext removes only the final extension, so dots inside the name
    # itself (e.g. "a.b.txt") are preserved.
    stem = os.path.splitext(file_name)[0]
    target_dir = destination1 if stem in true_names else destination0
    shutil.copy(source_path, target_dir)

## Load the dataset

In [2]:
# load_files treats every sub-folder of ./data (here "0" and "1") as one category.
datafiles = load_files(container_path="./data")

## Create Training and Test datasets.
* The dataset has been split into Training and Test sets using the 80:20 ratio.

In [3]:
# Hold out 20% of the documents for testing; the fixed random_state makes the
# split repeatable.
(
    doc_str_list_train,
    doc_str_list_test,
    doc_class_list_train,
    doc_class_list_test,
) = train_test_split(
    datafiles.data,
    datafiles.target,
    test_size=0.2,
    random_state=0,
)

In [36]:
def getChList(docStr):
    """Split a document into space-separated chunks after light clean-up.

    Two substitutions are applied before splitting:
    a space followed by one or more digits is removed, then a space followed
    by a single non-word character is removed.

    Parameters
    ----------
    docStr : str, bytes or any object
        The document. Bytes are decoded as UTF-8 (undecodable bytes are
        dropped) so that the text itself is processed rather than the
        ``b'...'`` representation ``str()`` would produce. Any other object
        is converted with ``str()``.

    Returns
    -------
    list of str
        The cleaned text split on single spaces. An empty document yields
        ``['']``.
    """
    if isinstance(docStr, (bytes, bytearray)):
        text = docStr.decode("utf-8", errors="ignore")
    else:
        text = str(docStr)
    # Raw strings keep \d and \W as regex classes instead of (invalid)
    # string escape sequences.
    without_numbers = re.sub(r" \d+", "", text)
    without_symbols = re.sub(r" \W", "", without_numbers)
    return without_symbols.split(" ")

## Tokenize the Documents

* Tokenize the documents and run Machine Learning algorithms for classification.

### A. TF-IDF Vectorizer

In [37]:
# TF-IDF vectorizer with binary term counts; bytes that cannot be decoded are ignored.
vectorizer = TfidfVectorizer(decode_error="ignore", binary=True)
word_tokenizer = vectorizer.build_tokenizer()

# Cleaned-up chunk lists for every training and test document.
doc_terms_list_train = [getChList(doc_str) for doc_str in doc_str_list_train]
doc_terms_list_test = [getChList(doc_str) for doc_str in doc_str_list_test]

# Fit on the training documents only, then encode the test documents with the
# fitted vectorizer.
doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
doc_test_vec = vectorizer.transform(doc_str_list_test)

### A.1 Multinomial Naive Bayes

In [5]:
# Train Naive Bayes on the TF-IDF training matrix, then label the test documents.
clf = MultinomialNB()
clf.fit(doc_train_vec, doc_class_list_train)
doc_test_predicted = clf.predict(doc_test_vec)

In [11]:
target_names = ["0", "1"]

# Accuracy: share of test documents whose predicted label equals the true label.
acc = np.mean(doc_test_predicted == doc_class_list_test)
print("The accuracy for Multinomial Naive Bayes Classifier is ", acc * 100)

nb_tfidf_report = classification_report(
    doc_class_list_test,
    doc_test_predicted,
    target_names=target_names,
)
print("The classification report for Multinomial Naive Bayes Classifier is as followed:\n\n", nb_tfidf_report)

# Flatten the 2x2 confusion matrix row by row into its four counts.
nb_tfidf_confusion = metrics.confusion_matrix(doc_class_list_test, doc_test_predicted)
tn, fp, fn, tp = nb_tfidf_confusion.ravel()
print("Confusion matrix for Multinomial Naive Bayes Classifier is:\n\n", tn, fp, fn, tp)

The accuracy for Multinomial Naive Bayes Classifier is  70.0
The classification report for Multinomial Naive Bayes Classifier is as followed:

               precision    recall  f1-score   support

           0       0.70      1.00      0.82        98
           1       0.00      0.00      0.00        42

   micro avg       0.70      0.70      0.70       140
   macro avg       0.35      0.50      0.41       140
weighted avg       0.49      0.70      0.58       140

Confusion matrix for Multinomial Naive Bayes Classifier is:

 98 0 42 0


### A.2 Linear Support Vector Machine

In [12]:
# Linear-kernel SVM trained on the TF-IDF features.
clf1 = SVC(kernel="linear")
clf1.fit(doc_train_vec, doc_class_list_train)

doc_test_predicted1 = clf1.predict(doc_test_vec)

In [13]:
# Accuracy: share of test documents whose predicted label equals the true label.
acc = np.mean(doc_test_predicted1 == doc_class_list_test)
print("The accuracy for Linear SVM Classifier is ", acc * 100)

svm_tfidf_report = classification_report(
    doc_class_list_test,
    doc_test_predicted1,
    target_names=target_names,
)
print("The classification report for Linear SVM Classifier is as followed:\n\n", svm_tfidf_report)

svm_tfidf_confusion = metrics.confusion_matrix(doc_class_list_test, doc_test_predicted1)
print("Confusion matrix for Linear SVM Classifier is:\n\n", svm_tfidf_confusion)

The accuracy for Linear SVM Classifier is  80.71428571428572
The classification report for Linear SVM Classifier is as followed:

               precision    recall  f1-score   support

           0       0.84      0.89      0.87        98
           1       0.70      0.62      0.66        42

   micro avg       0.81      0.81      0.81       140
   macro avg       0.77      0.75      0.76       140
weighted avg       0.80      0.81      0.80       140

Confusion matrix for Linear SVM Classifier is:

 [[87 11]
 [16 26]]


In [14]:
# Flatten the 2x2 confusion matrix row by row into its four counts.
tn, fp, fn, tp = metrics.confusion_matrix(
    doc_class_list_test,
    doc_test_predicted1,
).ravel()
print(
    "Confusion matrix for Linear SVM Classifier is:\n\n",
    tn, fp, fn, tp,
)

Confusion matrix for Linear SVM Classifier is:

 87 11 16 26


### A.3 Logistic Regression

In [16]:
# Multinomial logistic regression (lbfgs solver, fixed seed) on the TF-IDF features.
clf2 = LogisticRegression(
    multi_class="multinomial",
    solver="lbfgs",
    random_state=0,
)
clf2.fit(doc_train_vec, doc_class_list_train)

doc_test_predicted2 = clf2.predict(doc_test_vec)

In [17]:
# Accuracy: share of test documents whose predicted label equals the true label.
acc = np.mean(doc_test_predicted2 == doc_class_list_test)
print("The accuracy for Logistic Regression Classifier is ", acc * 100)

logreg_tfidf_report = classification_report(
    doc_class_list_test,
    doc_test_predicted2,
    target_names=target_names,
)
print("The classification report for Logistic Regression Classifier is as followed:\n\n", logreg_tfidf_report)

logreg_tfidf_confusion = metrics.confusion_matrix(doc_class_list_test, doc_test_predicted2)
print("Confusion matrix for Logistic Regression Classifier is:\n\n", logreg_tfidf_confusion)

The accuracy for Logistic Regression Classifier is  82.85714285714286
The classification report for Logistic Regression Classifier is as followed:

               precision    recall  f1-score   support

           0       0.86      0.91      0.88        98
           1       0.75      0.64      0.69        42

   micro avg       0.83      0.83      0.83       140
   macro avg       0.80      0.78      0.79       140
weighted avg       0.82      0.83      0.82       140

Confusion matrix for Logistic Regression Classifier is:

 [[89  9]
 [15 27]]


In [18]:
# Flatten the 2x2 confusion matrix row by row into its four counts.
tn, fp, fn, tp = metrics.confusion_matrix(
    doc_class_list_test,
    doc_test_predicted2,
).ravel()
print(
    "Confusion matrix for Logistic Regression Classifier is:\n\n",
    tn, fp, fn, tp,
)

Confusion matrix for Logistic Regression Classifier is:

 89 9 15 27


### B. Count Vectorizer

In [21]:
# Count vectorizer with binary term counts; bytes that cannot be decoded are
# ignored. The raw documents are vectorized directly.
vectorizer2 = CountVectorizer(decode_error="ignore", binary=True)
word_tokenizer2 = vectorizer2.build_tokenizer()

# Build the vocabulary from the training documents, then encode the test
# documents with that same vocabulary.
doc_train_vec1 = vectorizer2.fit_transform(doc_str_list_train)
doc_test_vec1 = vectorizer2.transform(doc_str_list_test)

### B.1 Multinomial Naive Bayes

In [22]:
# Train Naive Bayes on the count-vectorizer training matrix, then label the test documents.
clf3 = MultinomialNB()
clf3.fit(doc_train_vec1, doc_class_list_train)
doc_test_predicted3 = clf3.predict(doc_test_vec1)

In [23]:
target_names = ["0", "1"]

# Accuracy: share of test documents whose predicted label equals the true label.
acc = np.mean(doc_test_predicted3 == doc_class_list_test)
print("The accuracy for Multinomial Naive Bayes Classifier is ", acc * 100)

nb_count_report = classification_report(
    doc_class_list_test,
    doc_test_predicted3,
    target_names=target_names,
)
print("The classification report for Multinomial Naive Bayes Classifier is as followed:\n\n", nb_count_report)

nb_count_confusion = metrics.confusion_matrix(doc_class_list_test, doc_test_predicted3)
print("Confusion matrix for Multinomial Naive Bayes Classifier is:\n\n", nb_count_confusion)

The accuracy for Multinomial Naive Bayes Classifier is  75.71428571428571
The classification report for Multinomial Naive Bayes Classifier is as followed:

               precision    recall  f1-score   support

           0       0.86      0.78      0.82        98
           1       0.58      0.71      0.64        42

   micro avg       0.76      0.76      0.76       140
   macro avg       0.72      0.74      0.73       140
weighted avg       0.78      0.76      0.76       140

Confusion matrix for Multinomial Naive Bayes Classifier is:

 [[76 22]
 [12 30]]


In [24]:
# Flatten the 2x2 confusion matrix row by row into its four counts.
tn, fp, fn, tp = metrics.confusion_matrix(
    doc_class_list_test,
    doc_test_predicted3,
).ravel()
print(
    "Confusion matrix for Multinomial Naive Bayes Classifier is:\n\n",
    tn, fp, fn, tp,
)

Confusion matrix for Multinomial Naive Bayes Classifier is:

 76 22 12 30


### B.2 Linear Support Vector Machine

In [25]:
# Linear-kernel SVM trained on the count-vectorizer features.
clf4 = SVC(kernel="linear")
clf4.fit(doc_train_vec1, doc_class_list_train)

doc_test_predicted4 = clf4.predict(doc_test_vec1)

In [26]:
# Accuracy: share of test documents whose predicted label equals the true label.
acc = np.mean(doc_test_predicted4 == doc_class_list_test)
print("The accuracy for Linear SVM Classifier is ", acc * 100)

svm_count_report = classification_report(
    doc_class_list_test,
    doc_test_predicted4,
    target_names=target_names,
)
print("The classification report for Linear SVM Classifier is as followed:\n\n", svm_count_report)

svm_count_confusion = metrics.confusion_matrix(doc_class_list_test, doc_test_predicted4)
print("Confusion matrix for Linear SVM Classifier is:\n\n", svm_count_confusion)

The accuracy for Linear SVM Classifier is  81.42857142857143
The classification report for Linear SVM Classifier is as followed:

               precision    recall  f1-score   support

           0       0.87      0.87      0.87        98
           1       0.69      0.69      0.69        42

   micro avg       0.81      0.81      0.81       140
   macro avg       0.78      0.78      0.78       140
weighted avg       0.81      0.81      0.81       140

Confusion matrix for Linear SVM Classifier is:

 [[85 13]
 [13 29]]


In [27]:
# Flatten the 2x2 confusion matrix row by row into its four counts.
tn, fp, fn, tp = metrics.confusion_matrix(
    doc_class_list_test,
    doc_test_predicted4,
).ravel()
print(
    "Confusion matrix for Linear SVM Classifier Classifier is:\n\n",
    tn, fp, fn, tp,
)

Confusion matrix for Linear SVM Classifier Classifier is:

 85 13 13 29


### B.3 Logistic Regression

In [28]:
# Multinomial logistic regression (lbfgs solver, fixed seed) on the
# count-vectorizer features.
clf5 = LogisticRegression(
    multi_class="multinomial",
    solver="lbfgs",
    random_state=0,
)
clf5.fit(doc_train_vec1, doc_class_list_train)

doc_test_predicted5 = clf5.predict(doc_test_vec1)

In [29]:
# Accuracy: share of test documents whose predicted label equals the true label.
acc = np.mean(doc_test_predicted5 == doc_class_list_test)
print("The accuracy for Logistic Regression Classifier is ", acc * 100)

logreg_count_report = classification_report(
    doc_class_list_test,
    doc_test_predicted5,
    target_names=target_names,
)
print("The classification report for Logistic Regression Classifier is as followed:\n\n", logreg_count_report)

logreg_count_confusion = metrics.confusion_matrix(doc_class_list_test, doc_test_predicted5)
print("Confusion matrix for Logistic Regression Classifier is:\n\n", logreg_count_confusion)

The accuracy for Logistic Regression Classifier is  80.71428571428572
The classification report for Logistic Regression Classifier is as followed:

               precision    recall  f1-score   support

           0       0.86      0.87      0.86        98
           1       0.68      0.67      0.67        42

   micro avg       0.81      0.81      0.81       140
   macro avg       0.77      0.77      0.77       140
weighted avg       0.81      0.81      0.81       140

Confusion matrix for Logistic Regression Classifier is:

 [[85 13]
 [14 28]]


In [30]:
# Flatten the 2x2 confusion matrix row by row into its four counts.
tn, fp, fn, tp = metrics.confusion_matrix(
    doc_class_list_test,
    doc_test_predicted5,
).ravel()
print(
    "Confusion matrix for Logistic Regression Classifier is:\n\n",
    tn, fp, fn, tp,
)

Confusion matrix for Logistic Regression Classifier is:

 85 13 14 28
