## Document Classification for MIT Dataset

In [46]:
import io
import re
import string
import os
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

    Set of English stopwords (from the NLTK stopwords corpus)

In [15]:
# English stopwords from NLTK, held in a set so that the per-word membership
# test during pre-processing is a hash lookup instead of a list scan.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

    Import the labels from the Excel file and store them in a list

In [16]:
# Load columns A and B of the label workbook; cells containing 'NA' are read
# as missing values.
file_loc = "Labels.xlsx"
df = pd.read_excel(
    file_loc,
    index_col=None,
    na_values=['NA'],
    usecols="A,B",
)

# Keep the "course" column as a plain Python list.
course_column = df["course"]
sub_name = course_column.tolist()

    Lists holding the subject name, the final processed text and the file path of each document

In [17]:
# Three parallel accumulators, filled index-aligned by the loading loop:
#   subject_name  - label (sub-folder name) of each document
#   final_txt     - pre-processed text of each document
#   complete_path - full path of the file the text was read from
subject_name, final_txt, complete_path = [], [], []

    Read each document and pre-process its text

In [23]:
# Root folder with one sub-folder per subject; each sub-folder holds the
# documents as .txt files. The sub-folder name is used as the class label.
# base = "C:\\Users\\vnitin\\Documents\\Dataset\\"
base = "C:\\Users\\vnitin\\Documents\\Processed\\"

# Start from empty lists so that re-running this cell does not append every
# document a second time; duplicated documents could end up on both sides of
# the train/test split.
subject_name = []
final_txt = []
complete_path = []

# Lemmatization maps a word to its dictionary form. One lemmatizer serves the
# whole corpus, so it is created once instead of once per file.
lemmatizer = WordNetLemmatizer()

# sorted() makes the document order independent of the arbitrary order in
# which os.listdir returns entries.
for path in sorted(os.listdir(base)):
    subject_dir = os.path.join(base, path)
    if not os.path.isdir(subject_dir):
        # A plain file next to the subject folders would make os.listdir raise.
        continue

    for file in sorted(os.listdir(subject_dir)):
        if not file.endswith(".txt"):
            continue
        full_path = os.path.join(subject_dir, file)

        # The context manager closes the handle as soon as the text is read.
        with open(full_path, encoding="utf8") as pdf:
            # Convert all content to lowercase.
            pdf_content = pdf.read().lower()

        # Remove tokens of one to three word characters.
        cleared = re.sub(r'\b\w{1,3}\b', '', pdf_content)

        # Collapse every run of non-letter characters (digits, punctuation,
        # whitespace, ...) into a single space.
        cleared_text = re.sub('[^a-zA-Z]+', ' ', cleared)

        # Drop stopwords and lemmatize what is left; every kept word is
        # followed by one space, exactly as the original concatenation did.
        processed_txt = "".join(
            lemmatizer.lemmatize(word) + " "
            for word in cleared_text.split()
            if word not in stop_words
        )

        # Append to all three lists only after the file was processed, so the
        # lists stay index-aligned even if reading a file fails.
        complete_path.append(full_path)
        subject_name.append(path)
        final_txt.append(processed_txt)
#print(final_txt)

    Split the dataset into training and test set

In [24]:
# Hold out 20% of the documents for evaluation. Texts, labels and file paths
# are split in one call so the three outputs stay index-aligned; the fixed
# random_state makes the split repeatable for the same input order.
(
    X_train, X_test,
    y_train, y_test,
    z_train, z_test,
) = train_test_split(
    final_txt,
    subject_name,
    complete_path,
    test_size=0.2,
    random_state=42,
)

    Build the pipeline with CountVectorizer, TfidfTransformer and the model

In [30]:
#clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

In [50]:
# Text classification pipeline: raw token counts -> TF-IDF weighting ->
# linear classifier trained with SGD (hinge loss, L2 penalty).
sgd_classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', sgd_classifier),
    ]
)

In [47]:
#clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MLPClassifier(activation='relu', solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1))])

    Fit the model, e.g. a Multinomial Naive Bayes classifier (the active pipeline uses SGDClassifier)

In [51]:
# Train the pipeline on the training texts, then label the held-out texts.
# fit() returns the fitted pipeline itself, so the two calls can be chained.
predicted = clf.fit(X_train, y_train).predict(X_test)

    Prediction testing and accuracy check

In [52]:
# Uncomment to list the predicted subject next to each test document's path.
#for doc, category in zip(z_test, predicted):
#    print(doc,"=>",category)

# Accuracy: fraction of held-out documents whose predicted subject matches
# the true one.
accuracy = np.mean(predicted == y_test)
print(accuracy)

0.7271274470635237
