In [2]:
# Attach Google Drive to this Colab runtime at a named mount point.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import gensim
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem.porter import *
import re
from gensim.utils import simple_preprocess

In [5]:
# Location of the papers CSV on the mounted Drive.
PAPERS_CSV = '/content/drive/MyDrive/Semester 6/Natural Language Processing/Assignment 2/Papers.csv'

data = pd.read_csv(PAPERS_CSV)
data.head(5)

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [6]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7241 entries, 0 to 7240
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          7241 non-null   int64 
 1   year        7241 non-null   int64 
 2   title       7241 non-null   object
 3   event_type  2422 non-null   object
 4   pdf_name    7241 non-null   object
 5   abstract    7241 non-null   object
 6   paper_text  7241 non-null   object
dtypes: int64(2), object(5)
memory usage: 396.1+ KB


In [7]:
# Keep only the full paper text column.
# Selecting with a list of column names returns a DataFrame that keeps the
# source frame's own index, so no row can be turned into NaN by reindexing
# against range(len(data)). The copy keeps `df` independent of `data`.
df = data[['paper_text']].copy()
df.head()

Unnamed: 0,paper_text
0,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,Bayesian Query Construction for Neural\nNetwor...
4,"Neural Network Ensembles, Cross\nValidation, a..."


In [8]:
# (rows, columns) of the single-column text frame.
df.shape

(7241, 1)

## Preprocessing

In [4]:
# Seed NumPy's global random generator with a named constant.
SEED = 123
np.random.seed(SEED)

In [12]:
# Fetch the WordNet corpus through NLTK's downloader.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [13]:
# Build both normalisers once; they are reused for every token instead of
# constructing a new WordNetLemmatizer on each call.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Return the Snowball stem of the verb-lemmatised form of ``text``.

    Parameters
    ----------
    text : str
        The string to normalise (called with one token at a time by
        ``preprocess_text``).

    Returns
    -------
    str
        ``text`` lemmatised with ``pos='v'`` and then stemmed with the
        English Snowball stemmer.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess_text(text):
    """Tokenise ``text`` and return the normalised tokens that survive filtering.

    Tokens produced by ``simple_preprocess`` are dropped when they appear in
    ``STOPWORDS`` or have three or fewer characters. Every remaining token is
    passed through ``lemmatize_stemming``, and the original order is preserved.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

In [14]:
# Run preprocess_text over every paper. The result is a Series of token lists
# (despite the `_df` suffix in the name).
processed_df = df['paper_text'].map(preprocess_text)

## Feature Extraction

In [15]:
# Build the gensim Dictionary from the tokenised papers.
dictionary = gensim.corpora.Dictionary(processed_df)

In [16]:
# Convert each tokenised paper into its bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, processed_df))

In [17]:
# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same
# corpus to obtain the TF-IDF weighted representation.
tfidf = gensim.models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

## Running LDA using TF-IDF

In [18]:
# Train a 10-topic LDA model on the TF-IDF weighted corpus.
# random_state is passed explicitly so the model's initialisation does not
# depend on whichever cell last touched NumPy's global random generator.
# NOTE(review): multi-worker training may still introduce some run-to-run
# variation — confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=123,
)

In [19]:
# Print every topic (-1 requests all of them) as its id followed by its
# weighted terms. The loop variable is not called `id`, so the builtin id()
# stays usable in later cells.
for topic_id, topic in lda_model.print_topics(-1):
    print(topic_id, '\n', topic)

0 
 0.001*"cluster" + 0.001*"neuron" + 0.001*"spike" + 0.001*"kernel" + 0.001*"network" + 0.001*"imag" + 0.001*"cell" + 0.001*"layer" + 0.001*"graph" + 0.000*"posterior"
1 
 0.001*"imag" + 0.001*"kernel" + 0.001*"cluster" + 0.001*"network" + 0.001*"queri" + 0.000*"layer" + 0.000*"neuron" + 0.000*"train" + 0.000*"hash" + 0.000*"cell"
2 
 0.001*"kernel" + 0.001*"regret" + 0.001*"submodular" + 0.001*"convex" + 0.001*"rank" + 0.001*"neuron" + 0.001*"imag" + 0.001*"graph" + 0.001*"matrix" + 0.001*"layer"
3 
 0.001*"label" + 0.001*"imag" + 0.001*"kernel" + 0.001*"classifi" + 0.001*"train" + 0.001*"cluster" + 0.001*"graph" + 0.000*"neuron" + 0.000*"spike" + 0.000*"network"
4 
 0.001*"neuron" + 0.001*"cluster" + 0.001*"kernel" + 0.001*"polici" + 0.001*"network" + 0.001*"spike" + 0.001*"chip" + 0.001*"graph" + 0.001*"imag" + 0.001*"train"
5 
 0.001*"spike" + 0.001*"regret" + 0.001*"polici" + 0.001*"action" + 0.001*"kernel" + 0.001*"neuron" + 0.001*"reward" + 0.001*"network" + 0.000*"imag" + 0.0