In [None]:
# In this notebook we'll do NLP on the titles of math dissertations.

# First thing to do is load the combined corpus and thesis titles.

In [1]:
import matplotlib.pyplot as plt
import nltk
#nltk.download('averaged_perceptron_tagger')
import numpy as np
import os
import pandas as pd
import pickle
import re
import seaborn as sns
#import spacy
import string
from tqdm import tqdm_notebook as tqdm

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [3]:
# Load the pickled inputs written by earlier runs of this project.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("20190722/combined_msc_corpus.pickle", "rb") as corpus_file:
    combined_msc_corpus = pickle.load(corpus_file)
with open("20190722/thesis_msc.pickle", "rb") as thesis_file:
    thesis_msc = pickle.load(thesis_file)
# A previously fitted pipeline could be restored here instead of refitting:
#with open('count_rf_20190724.pickle', 'rb') as f:
#    winning_pipe = pickle.load(f)
# These two names are assigned again further down, once predictions are made.
with open('20190724/msc_to_fill_20190724.pickle', 'rb') as predictions_file:
    msc_to_fill = pickle.load(predictions_file)
with open('20190724/thesis_msc_titled_to_fill_20190724.pickle', 'rb') as titles_file:
    thesis_msc_titled_to_fill = pickle.load(titles_file)

In [4]:
thesis_msc.head()  # not the last expression of the cell, so nothing is displayed
# msc == -1 is the "no classification" marker; an empty string means "no title".
has_msc = thesis_msc['msc'] != -1
has_title = thesis_msc['thesis'] != ""
thesis_msc_filled = thesis_msc[has_msc].copy()
thesis_msc_unfilled = thesis_msc[~has_msc].copy()
thesis_msc_titled = thesis_msc[has_title].copy()
thesis_msc_untitled = thesis_msc[~has_title].copy()

In [5]:
# Report how the full table divides along each of the two partitions.
total_count = len(thesis_msc)
print(f"msc filled: {total_count} = {len(thesis_msc_filled)} filled + {len(thesis_msc_unfilled)} unfilled")
print(f"msc titled: {total_count} = {len(thesis_msc_titled)} titled +  {len(thesis_msc_untitled)} untitled")

msc filled: 246182 = 138183 filled + 107999 unfilled
msc titled: 246182 = 220190 titled +  25992 untitled


In [6]:
# There are 25,992 theses that are not titled in this list.
# While we can still use their classifications for mapping purposes,
# we cannot use them for NLP purposes. Drop them here.

# This means merely using thesis_msc_titled from here on.
# Among titled theses, separate those with an MSC code from those marked -1.
titled_has_msc = thesis_msc_titled['msc'] != -1
thesis_msc_titled_filled = thesis_msc_titled[titled_has_msc].copy()
thesis_msc_titled_unfilled = thesis_msc_titled[~titled_has_msc].copy()

In [7]:
# One-line summary of the titled subset and its filled / unfilled parts.
titled_summary = (
    f"msc titled: {len(thesis_msc_titled)} = "
    f"{len(thesis_msc_titled_filled)} filled + {len(thesis_msc_titled_unfilled)} unfilled"
)
print(titled_summary)

msc titled: 220190 = 132116 filled + 88074 unfilled


In [8]:
# Next, we need to prepare the vocabulary. Our MSC corpus keys are the MSC subject codes.
# Displaying the keys shows which subject codes the corpus covers.
combined_msc_corpus.keys()
# We can examine each key's values to see how we should process our keywords.

dict_keys([58, 68, 74, 30, 3, 6, 65, 62, 8, 90, 0, 44, 54, 97, 5, 57, 52, 91, 37, 14, 35, 78, 60, 49, 47, 76, 94, 26, 20, 11, 85, 41, 83, 39, 42, 55, 53, 46, 13, 16, 81, 22, 51, 70, 34, 93, 92, 43, 18, 15, 17, 32, 86, 40, 1, 82, 45, 31, 28, 80, 33, 12, 19])

In [9]:
# If there is {} with ", see " inside, remove the {} from that entry.
# This will clean up the MSC data.
#
# [^{}] keeps every match inside a single pair of braces. The earlier greedy
# ".*" pattern matched from the first "{" to the last "}" of an entry, so an
# entry with more than one brace group lost the text between them as well.
see_also_regex = r"\{[^{}]*, see [^{}]*\}"
see_also = re.compile(see_also_regex)

cleaned_combined_msc_corpus = {}
for msc_code, entries in tqdm(combined_msc_corpus.items()):
    # Just drop everything less than 5 char; not even an MSC code.
    cleaned_combined_msc_corpus[msc_code] = [
        see_also.sub("", entry) for entry in entries if len(entry) >= 5
    ]

HBox(children=(IntProgress(value=0, max=63), HTML(value='')))




We have two primary NLP questions to explore:
    1. Can we classify the unclassified thesis titles according to MSC?
    2. Can we discover an evolution of topics, with possible similarities across "field", 
        over time, based solely on thesis titles?

In [26]:
# function from stackoverflow
# https://stackoverflow.com/questions/48865150/pipeline-for-text-cleaning-processing-in-python
# modified by MC
# https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # or LancasterStemmer, RegexpStemmer, SnowballStemmer
from nltk.stem.wordnet import wordnet, WordNetLemmatizer

# Shared defaults used as default arguments by the text-cleaning helpers below.
default_lemmatizer = WordNetLemmatizer()
default_stemmer = PorterStemmer()
default_stopwords = set(stopwords.words('english')) 
# or any other list of your choice - wrapped in set

def get_wordnet_pos(word):
    """Return the WordNet POS constant that ``lemmatize()`` accepts for ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of that tag selects adjective, verb or adverb. A noun tag and any
    unrecognised tag both yield ``wordnet.NOUN``.
    """
    treebank_tag = nltk.pos_tag([word])[0][1]
    initial = treebank_tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and everything else: it's a noun if it's not found.
    return wordnet.NOUN

def tokenize_text(text):
    """Split ``text`` into sentences, then words, and return one flat token list."""
    tokens = []
    for sentence in sent_tokenize(text):
        tokens.extend(word_tokenize(sentence))
    return tokens

def remove_special_characters(text, characters=string.punctuation.replace('-', '')):
    """Delete ``characters`` from every token of ``text`` and re-join with spaces.

    By default this removes all of ``string.punctuation`` except the hyphen.
    Tokens that end up empty are dropped from the result.
    """
    words = tokenize_text(text)
    char_class = re.compile('[{}]'.format(re.escape(characters)))
    stripped = (char_class.sub('', word) for word in words)
    return ' '.join(piece for piece in stripped if piece)

def lemmatize_text(text, lemmatizer=default_lemmatizer):
    """Lemmatize each token of ``text`` using its WordNet POS; join with spaces."""
    lemmas = []
    for token in tokenize_text(text):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return ' '.join(lemmas)

def stem_text(text, stemmer=default_stemmer):
    """Stem each token of ``text`` with ``stemmer`` and join the stems with spaces."""
    return ' '.join(map(stemmer.stem, tokenize_text(text)))

def remove_stopwords(text, stop_words=default_stopwords):
    """Drop every token of ``text`` found in ``stop_words``; join the rest with spaces."""
    kept = []
    for token in tokenize_text(text):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

    # cleaning pipeline in this function: 
    # remove extra spaces, lowercase, remove stopwords, stem_or_lem
    
def clean_text(text, stem_or_lem = 'stem'):
    """Run the cleaning pipeline on one string and return the cleaned string.

    Steps, in order: strip surrounding spaces, lowercase, remove punctuation
    and symbols, remove stopwords, then stem (``'stem'``) or lemmatize
    (``'lem'``). Any other ``stem_or_lem`` value intentionally skips that
    last step.
    """
    lowered = text.strip(' ').lower()
    without_symbols = remove_special_characters(lowered)
    without_stopwords = remove_stopwords(without_symbols)

    if stem_or_lem == 'stem':
        return stem_text(without_stopwords)
    if stem_or_lem == 'lem':
        return lemmatize_text(without_stopwords)
    # Neither stemming nor lemmatizing was requested.
    # Open question kept from the original: remove stems and lems of optional stopwords?
    return without_stopwords

In [11]:
# The MSC subject codes (the corpus keys) as a plain list.
categories = list(cleaned_combined_msc_corpus)

In [51]:
#with open('cleaned_combined_msc_corpus_20190729.pickle', 'wb') as f:
#    pickle.dump(cleaned_combined_msc_corpus, f)

In [None]:
# https://medium.com/nanonets/topic-modeling-with-lsa-psla-lda-and-lda2vec-555ff65b0b05
# LDA - not using... this is just a note.

In [27]:
# Confirm which columns the labelled, titled theses carry.
thesis_msc_titled_filled.columns

Index(['thesis', 'msc'], dtype='object')

In [28]:
# 20190729: Must add blanks with classification msc = -1 for "no classification"

In [29]:
# Let's generate 5 blank titles to classify as msc -1 ("no classification").
blank_titles = ["", " ", "  ", "   ", "\t"]
blanks = pd.DataFrame({'thesis': blank_titles, 'msc': [-1] * len(blank_titles)})

In [30]:
# DataFrame.append was deprecated and then removed in pandas 2.0; pd.concat is
# the supported equivalent and also works on older pandas.
# The frame was built from rows with msc != -1, so any msc == -1 rows can only
# be blanks added by a previous run of this cell. Filtering them out first
# keeps the cell safe to re-run without stacking duplicate blank rows.
classified_rows = thesis_msc_titled_filled[thesis_msc_titled_filled['msc'] != -1]
thesis_msc_titled_filled = pd.concat([classified_rows, blanks], ignore_index=True)

In [48]:
# save thesis_msc_titled_filled for train-validation-test in both NLP suites
#with open('thesis_msc_titled_filled_20190729.pickle', 'wb') as f:
#    pickle.dump(thesis_msc_titled_filled, f)

In [31]:
# thesis_msc_titled_filled is used for training and testing;
# thesis_msc_titled_unfilled holds the titles that actually need classifying.
# msc titled: 220190 = 132116 filled + 88074 unfilled

# Titles are the single feature, MSC codes are the target.
X, y = (thesis_msc_titled_filled[column].copy() for column in ('thesis', 'msc'))
feature_cols = ['thesis']

In [32]:
# Two successive splits with the same settings: first hold out 30% as the test
# set, then hold out 30% of the remainder as the validation set.
split_options = dict(random_state=42, test_size=0.3)
X_tv, X_test, y_tv, y_test = train_test_split(X, y, **split_options)
X_train, X_valid, y_train, y_valid = train_test_split(X_tv, y_tv, **split_options)
print(
    "Created train-validation-test split for X, y with sizes "
    f"train: {len(X_train)}, "
    f"validation: {len(X_valid)}, "
    f"test: {len(X_test)}."
)

Created train-validation-test split for X, y with sizes train: 64741, validation: 27747, test: 39638.


In [33]:
# pipe.fit has problems with pandas data frames, so the titles are handed over
# as plain Python lists and the labels as NumPy arrays.
X_train_list, X_valid_list, X_test_list = (
    list(titles) for titles in (X_train, X_valid, X_test)
)

y_train_array, y_valid_array, y_test_array = (
    np.array(labels) for labels in (y_train, y_valid, y_test)
)

In [38]:
# Sanity check: titles should be lists, labels should be NumPy arrays.
tuple(
    type(container)
    for container in (X_train_list, y_train_array, X_valid_list,
                      y_valid_array, X_test_list, y_test_array)
)

(list, numpy.ndarray, list, numpy.ndarray, list, numpy.ndarray)

In [39]:
# Add cleaned_combined_msc_corpus to the training set.
# Every corpus entry becomes one extra training "title" labelled with its MSC code.
corpus_titles = []
corpus_labels = []
for msc_code, entries in cleaned_combined_msc_corpus.items():
    corpus_titles.extend(entries)
    corpus_labels.extend([msc_code] * len(entries))

# Rebuild from the untouched split (X_train / y_train) rather than extending
# X_train_list / y_train_array in place: re-running this cell then gives the
# same result instead of adding the corpus a second time. The label array is
# also built once, not re-created for every MSC code.
X_train_list = list(X_train) + corpus_titles
y_train_array = np.array(list(y_train) + corpus_labels)
assert len(X_train_list) == len(y_train_array), "titles and labels are out of step"

In [40]:
# Sizes after adding the corpus: only the training pair should have grown.
tuple(map(len, (X_train_list, y_train_array, X_valid_list,
                y_valid_array, X_test_list, y_test_array)))

(203580, 203580, 27747, 27747, 39638, 39638)

In [41]:
# Sizes of the original pandas splits, for comparison with the lists above.
tuple(map(len, (X_train, y_train, X_valid, y_valid, X_test, y_test)))

(64741, 64741, 27747, 27747, 39638, 39638)

In [42]:
# We need to add French and German common words to the stop words list.
# Other than that, these topics look relatively good.
# NOTE(review): 'problmes', 'fr' and 'ber' look like words whose accented
# letters were dropped from the titles - confirm against the data.
french_stop = [
    'problmes', 'le', 'par', 'non', 'une', 'para', 'du', 'dans',
    'pour', 'sur', 'les', 'en', 'la', 'des', 'et',
]
german_stop = [
    'theorie', 'bei', 'eine', 'ein', 'im', 'auf', 'des', 'mit',
    'fr', 'ber', 'zur', 'die', 'von', 'und', 'der',
    'den', 'unter', 'durch', 'einer', 'eines', 'das',
]
# English stop words plus both extra lists, combined in a single union.
stop_words = ENGLISH_STOP_WORDS.union(french_stop, german_stop)

In [49]:
#with open('custom_stop_words.pickle', 'wb') as f:
#    pickle.dump(stop_words, f)

In [None]:
# Now that we've integrated our MSC corpus into our training titles,
# we can run different models to explore accuracy, etc.
# Each (vectorizer, classifier) pair is fitted on the training data and scored
# on the validation data; one tuple per pair is collected in `results`.
results = []

# Recent scikit-learn releases validate `stop_words` and reject a frozenset,
# so pass a list. Older releases accept a list as well.
stop_word_list = sorted(stop_words)

vectorizers = [('count', CountVectorizer(stop_words=stop_word_list)),
               ('tfidf', TfidfVectorizer(stop_words=stop_word_list))]

# MultinomialNB and KNeighborsClassifier have no `verbose` parameter; passing
# one raises a TypeError. The 'knn' entry was also missing its trailing comma,
# which made Python try to call the tuple with the next entry as its argument.
classifiers = [('nb', MultinomialNB()),
               ('rf', RandomForestClassifier(n_estimators=10, random_state=42, verbose=10)),
               ('knn', KNeighborsClassifier()),
               ('lr', LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=100,
                                         random_state=42, verbose=10)),
               ('svc', LinearSVC(max_iter=1000, random_state=42, verbose=10)),
               ]
# TODO add some boosters
# count + rf appears to work fantastically...

for vec_name, vectorizer in vectorizers:
    for clf_name, classifier in classifiers:
        print(f"({vec_name}, {clf_name}): ", end="")
        pipe = Pipeline(steps=[(vec_name, vectorizer), (clf_name, classifier)])
        pipe.fit(X_train_list, y_train_array)
        labels = pipe.predict(X_valid_list)

        # Restrict the weighted scores to the classes that were actually
        # predicted, to avoid "F-score is ill-defined" warnings:
        # https://stackoverflow.com/questions/43162506/
        # undefinedmetricwarning-f-score-is-ill-defined-and-being-set-to-0-0-in-labels-wi
        predicted_classes = np.unique(labels)

        acc = round(accuracy_score(y_valid_array, labels), 4)
        # average='macro' vs 'weighted'
        f1  = round(f1_score(y_valid_array, labels, average='weighted', labels=predicted_classes), 4)
        pre = round(precision_score(y_valid_array, labels, average='weighted', labels=predicted_classes), 4)
        rec = round(recall_score(y_valid_array, labels, average='weighted', labels=predicted_classes), 4)

        print(f" acc: {acc} / f1: {f1} / pre: {pre} / rec: {rec}")
        results.append((vec_name, clf_name, acc, f1, pre, rec))

## HERE IS THE NEARLY-99% ACCURACY MODEL
(possibly overfit random forest?)

In [None]:
# (count, rf): acc: 0.9875 / f1: 0.9875 / pre: 0.9876 / rec: 0.9875
# That's some absurdly high accuracy. I think we have a winner.
#
# Refit the count + random-forest combination on its own. The vectorizer uses
# the same custom stop words (English + French + German) as the comparison
# that selected it; the earlier stop_words='english' silently fitted a
# different configuration from the one whose score is quoted above.
# The stop words are passed as a list because recent scikit-learn releases
# reject a frozenset for `stop_words`.
winning_steps = [
    ('count', CountVectorizer(stop_words=sorted(stop_words))),
    ('rf', RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=42, verbose=0)),
]
winning_pipe = Pipeline(steps=winning_steps)
winning_pipe.fit(X_train_list, y_train_array)
winning_labels = winning_pipe.predict(X_valid_list)

In [None]:
# Score the winning pipeline on the validation split.
# The weighted averages only cover classes that appear among the predictions.
# average='macro' vs 'weighted'
valid_scoring = dict(average='weighted', labels=np.unique(winning_labels))
winning_acc = round(accuracy_score(y_valid_array, winning_labels), 4)
winning_f1, winning_pre, winning_rec = (
    round(score_fn(y_valid_array, winning_labels, **valid_scoring), 4)
    for score_fn in (f1_score, precision_score, recall_score)
)

print(f"VALIDATION: acc: {winning_acc} / f1: {winning_f1} / pre: {winning_pre} / rec: {winning_rec}")

In [None]:
# Turn the held-out test titles into a NumPy array and confirm the type.
X_test_array = np.asarray(X_test_list)
#X_test_array.shape
type(X_test_array)

In [None]:
# Now to run the real test data.
# Same scoring as for validation, this time on the held-out test split.
real_test_labels = winning_pipe.predict(X_test_array)
# average='macro' vs 'weighted'
test_scoring = dict(average='weighted', labels=np.unique(real_test_labels))
real_acc = round(accuracy_score(y_test_array, real_test_labels), 4)
real_f1, real_pre, real_rec = (
    round(score_fn(y_test_array, real_test_labels, **test_scoring), 4)
    for score_fn in (f1_score, precision_score, recall_score)
)

print(f"TEST: acc: {real_acc} / f1: {real_f1} / pre: {real_pre} / rec: {real_rec}")

In [None]:
# Persist the fitted pipeline so it can be reloaded without refitting.
# Earlier runs wrote to 'count_rf_20190724.pickle':
#with open('count_rf_20190724.pickle', 'wb') as f:
with open('count_rf_20190729.pickle', 'wb') as model_file:
    pickle.dump(winning_pipe, model_file)

In [None]:
# thesis_msc_titled_unfilled is now ready to have their MSC classified.
len(thesis_msc_titled_unfilled)  # not the last expression, so nothing is displayed
# Only the titles go to the classifier, as a NumPy array.
unfilled_titles = thesis_msc_titled_unfilled['thesis']
thesis_msc_titled_to_fill = np.array(unfilled_titles)

In [None]:
# Number of titles that still need an MSC code.
thesis_msc_titled_to_fill.shape

In [None]:
# Predict an MSC code for every unclassified title with the winning pipeline.
msc_to_fill = winning_pipe.predict(thesis_msc_titled_to_fill)

In [None]:
# Save the predicted codes together with the titles they belong to.
# NOTE(review): the loads at the top of the notebook read these file names
# from a '20190724/' subdirectory, while this cell writes to the working
# directory - confirm which location is intended.
outputs = (
    ('msc_to_fill_20190724.pickle', msc_to_fill),
    ('thesis_msc_titled_to_fill_20190724.pickle', thesis_msc_titled_to_fill),
)
for target_path, payload in outputs:
    with open(target_path, 'wb') as sink:
        pickle.dump(payload, sink)