# 1. Import Libraries

Import all the libraries used in this project.



In [0]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
import re
import gensim
import gensim.corpora as corpora
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pyLDAvis
from pyLDAvis import sklearn as sklearn_lda 


# 2. Load data file

Load the data file containing research papers into a dataframe called 'dataset'.

In [0]:
# Load the research-paper data file into a DataFrame named `dataset`.
# The codec name is written with plain ASCII hyphens ('ISO-8859-1', i.e.
# Latin-1); a typographic en dash in the name only resolves through lenient
# codec-name normalisation and is easy to break when copying the code.
dataset = pd.read_csv(r'research_papers.csv', encoding='ISO-8859-1')
# Preview the first rows to check that the file parsed as expected.
dataset.head()

# 3. Clean Data

I dropped the unnecessary columns ('Id', 'Reference', 'Codes', 'Authors', 'Year', 'Conference/ Journal') and focused solely on the 'Abstract' and 'Conclusion' columns of each paper entry. For papers with no conclusion, I filled the empty cell with the text "No conclusion". Next, I merged the two columns 'Abstract' and 'Conclusion' to form a new column called 'Paper_Text'. 

In [0]:
# Remove the columns that are not needed for topic modelling.
# `columns=` already names the axis, so no separate `axis` argument is needed.
dataset = dataset.drop(columns=['Id', 'Reference', 'Codes', 'Authors', 'Year', 'Conference/ Journal'])

# Fill in the empty cells. Note that this replaces missing values in every
# remaining column, not only in 'Conclusion'.
dataset = dataset.fillna('No conclusion')

# Merge abstract and conclusion. The single space keeps the last word of the
# abstract from fusing with the first word of the conclusion, which would
# otherwise become one bogus token when the text is split on whitespace.
dataset['Paper_Text'] = dataset["Abstract"] + ' ' + dataset["Conclusion"]

# Show the first 5 records.
dataset.head()

# 4. Preprocess Data

Tokenize each paper's text into a list of words, remove punctuation, convert to lowercase, remove stop words and words of three characters or fewer, and then lemmatize.

In [0]:
# Lemmatization helper
def get_lemma(word):
    """Return the WordNet base form of ``word``, or ``word`` itself as a fallback.

    The lookup is done with ``wn.morphy``; when it returns ``None`` the
    input word is handed back unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

# Tokenization and punctuation clean-up.
# The characters - , ( ) ! ? are deleted, periods become spaces, the text is
# lowercased, and only then is it split on whitespace. Splitting *after* the
# period replacement matters: replacing '.' inside an already-split token
# leaves tokens such as 'study ' or 'e g ', whose stray spaces slip past both
# the stop-word check and the length check below.
def _clean_and_tokenize(text):
    """Return the lowercase, punctuation-free tokens of one document."""
    text = re.sub(r'[-,()!?]', '', text)
    text = text.replace('.', ' ')
    return text.lower().split()


tokenized_data = dataset['Paper_Text'].apply(_clean_and_tokenize)

# Remove stop words and words of 3 characters or fewer.
stop_words = stopwords.words('english')
stop_words.extend(['from','use', 'using','uses','user', 'users', 'well', 'study', 'survey', 'think'])
# Membership tests against a set are O(1) per token; `stop_words` itself stays
# a list so any other code relying on it is unaffected.
stop_word_set = set(stop_words)
tokenized_data = tokenized_data.apply(
    lambda tokens: [tok for tok in tokens if tok not in stop_word_set and len(tok) > 3]
)

# Lemmatize every remaining token with get_lemma.
tokenized_data = tokenized_data.apply(lambda tokens: [get_lemma(tok) for tok in tokens])


# 5. Create Bigram and Trigram
Bigrams are two words that frequently occur together in the documents. Trigrams are three words that frequently occur together.

Some examples in our corpus are: ‘visually_impaired’, ‘programming_language’, ‘block_based_programming’, ‘programming_environment’, etc.

Gensim’s Phrases model can build and apply bigrams, trigrams, quadgrams and more. The two important arguments to Phrases are min_count and threshold. The higher the values of these parameters, the harder it is for words to be combined into phrases.

In [0]:
# Train the phrase detectors.
# First pass: learn word pairs (a higher threshold yields fewer phrases).
bigram = gensim.models.Phrases(tokenized_data, min_count=5, threshold=10)
# Second pass: train on the bigram-merged corpus so that an already merged
# pair can be joined with a neighbouring token.
bigram_corpus = bigram[tokenized_data]
trigram = gensim.models.Phrases(bigram_corpus, threshold=10)

# Wrap both models in Phraser objects, a faster way to apply them to a document.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Print the phrases detected in the first document as an example.
first_doc = tokenized_data[0]
print(trigram_mod[bigram_mod[first_doc]])

In [0]:
# Helpers that apply the trained phrase models to a whole corpus.
def make_bigrams(texts):
    """Return a list with the bigram model applied to each document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Return a list with the bigram, then the trigram model applied to each document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs


In [0]:
# Form bigrams: apply the bigram model to every tokenized document.
tokenized_data_bigrams = make_bigrams(tokenized_data)

# Form trigrams: apply the bigram and then the trigram model to every
# tokenized document.
tokenized_data_trigrams = make_trigrams(tokenized_data)

In [0]:
# De-tokenization: join each document's tokens back into a single
# space-separated string. Iterating over the token lists directly replaces the
# manual index loop and its per-item append.
detokenized_data = [' '.join(tokens) for tokens in tokenized_data_trigrams]

# Store the cleaned text next to the original columns and keep a handle on it
# for the vectorizer and the word cloud.
dataset['clean_text'] = detokenized_data
documents = dataset['clean_text']

# 6. Perform Exploratory Analysis
To verify that the preprocessing worked correctly, we’ll make a word cloud using the wordcloud package to get a visual representation of the most common words. This is key to understanding the data, confirming we are on the right track, and deciding whether any more preprocessing is necessary before training the model.

In [0]:
# Build one comma-separated string out of all the cleaned paper texts.
long_string = ','.join(documents.tolist())

# Configure the word cloud.
wordcloud = WordCloud(
    background_color="white",
    max_words=1000,
    contour_width=3,
    contour_color='steelblue',
)

# Compute the cloud from the combined text.
wordcloud.generate(long_string)

# Render the cloud as an image (last expression, so it is the cell output).
wordcloud.to_image()

# 7. Create Document-Term Matrix
This is the first step towards topic modeling. We need to represent each term and each document as a vector. We will use sklearn's TfidfVectorizer to create a document-term matrix using at most 1000 terms (words) from our corpus.

In [0]:
# Maximum number of terms (vocabulary size) kept in the document-term matrix.
no_terms = 1000

# NMF is run on tf-idf weights, so use the tf-idf vectorizer.
# Terms found in more than half of the documents or in fewer than two are
# ignored, at most `no_terms` terms are kept, and English stop words are dropped.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, max_features=no_terms, stop_words='english')
# Fit the vocabulary and transform the documents into the tf-idf matrix.
document_matrix = vectorizer.fit_transform(documents)

# Get the term behind each matrix column. `get_feature_names` was removed in
# scikit-learn 1.2 in favour of `get_feature_names_out`, so prefer the new
# method and fall back to the old one on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()


# 8. Apply Topic Model
We will use the document-term matrix and decompose it into multiple matrices. We will use sklearn's NMF to perform the task of matrix decomposition. The number of topics can be specified by using the n_components parameter.

In [0]:
# Set the number of topics to extract and the number of top words listed per topic.
no_topics = 10
no_top_words = 10

# Function for displaying topics
def display_topic(model, feature_names, num_topics, no_top_words, model_name):
    """Build a table of the top words per topic and save it as a CSV file.

    Args:
        model: Fitted topic model exposing ``components_``, indexed by topic,
            where each row holds one weight per term.
        feature_names: Sequence mapping a term index to the term itself.
        num_topics: Number of topics (rows of ``components_``) to report.
        no_top_words: Number of highest-weighted terms to list per topic.
        model_name: Base name of the output file; the table is written to
            ``<model_name>.csv`` in the current working directory.

    Returns:
        A pandas DataFrame with one column per topic ('Topic # 00',
        'Topic # 01', ...) whose rows list that topic's terms from the
        highest to the lowest weight.
    """
    print("Model Result:")
    word_dict = {}
    for topic_idx in range(num_topics):
        # argsort is ascending, so walk the tail backwards to obtain the
        # `no_top_words` largest weights, largest first.
        top_ids = model.components_[topic_idx].argsort()[:-no_top_words - 1:-1]
        # Map the term ids back to the terms so the topic is readable.
        word_dict['Topic # ' + '{:02d}'.format(topic_idx)] = [feature_names[idx] for idx in top_ids]
    topics_df = pd.DataFrame(word_dict)
    topics_df.to_csv('%s.csv' % model_name)
    return topics_df

# Build the NMF topic model and fit it to the document-term matrix.
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 in
# favour of `alpha_W` / `alpha_H` - confirm the installed version accepts it.
nmf_model = NMF(
    n_components=no_topics,
    random_state=42,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd',
)
nmf_model.fit(document_matrix)
# Tabulate the top words of every topic; the table is also written to
# NMF_Model_Result.csv.
display_topic(nmf_model, feature_names, no_topics, no_top_words, 'NMF_Model_Result')


# 9. Analyzing our NMF model

To analyze the model, we visualize the topics for interpretability. To do so, we’ll use a popular visualization package, pyLDAvis, which generates an inter-topic distance map. This map is designed to help in understanding and interpreting individual topics, and in understanding the relationships between the topics. The closer the topics are to one another and the more they overlap, the worse the model performs. 

Looking at the plot below, we can see that, for the most part, NMF produced topics that are well separated and do not overlap. This means that NMF produces distinctive topics.
									
									

In [0]:
# Visualize the topics
# Enable pyLDAvis notebook mode before preparing the visualization.
pyLDAvis.enable_notebook()
# Prepare the visualization from the fitted model, the document-term matrix
# and the vectorizer that produced it.
LDAvis_prepared = sklearn_lda.prepare(nmf_model, document_matrix, vectorizer)
# Last expression of the cell, so the prepared visualization is the cell output.
LDAvis_prepared

# 10. Classify papers under topics

Using the 10 topics generated by our NMF model, we categorize each paper in our corpus under one of the 10 topics.

In [0]:
# Score every paper against every topic with the fitted NMF model.
nmf_topic_values = nmf_model.transform(document_matrix)
# A paper is assigned the topic with the highest weight in its row.
dominant_topic = nmf_topic_values.argmax(axis=1)
dataset['NMF Topic'] = dominant_topic

# Write the labelled dataframe to a csv file and preview the first 10 rows.
dataset.to_csv('final_results.csv')
dataset.head(10)