### Import librerie

In [1]:
import nltk
#nltk.download('punkt')
from nltk import word_tokenize 
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('words')
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from time import time

In [3]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np

In [4]:
import gensim
from gensim.models import LdaSeqModel

### Caricamento dataset

In [40]:
df = pd.read_csv('C:/Users/micky/OneDrive/Desktop/Tesi DS/data/NIPS.csv')
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [41]:
df = df[df['abstract']!= 'Abstract Missing']

In [42]:
df.shape

(3924, 7)

### Pre-processing

In [43]:
df = df[['year', 'abstract']]

In [44]:
import random
random.seed(1234)
df = df.sample(frac=0.02, replace=False, random_state=1) #campionamento del dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78 entries, 6224 to 6222
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   year      78 non-null     int64 
 1   abstract  78 non-null     object
dtypes: int64(1), object(1)
memory usage: 1.8+ KB


In [45]:
df = df.sample(frac=0.3, replace=False, random_state=1)

In [46]:
df['abstract'] = df['abstract'].str.lower() #minuscolo 
df['abstract'] = df['abstract'].str.replace(r'[0-9]+', ' ') #rimozione numeri
df['abstract'] = df['abstract'].str.replace(r'http\S+', ' ') #rimozione link
df['abstract'] = df['abstract'].str.replace(r'[^a-zA-Z0-9 ]', ' ') #rimozione caratteri speciali
df['abstract'] = df['abstract'].str.replace('\\b\\w{1,2}\\b', ' ') #rimozione singole lettere
df['abstract'] = df['abstract'].str.replace(' +', ' ') #rimozione spazi extra

In [47]:
df = df.sort_values(by=['year'])  #ordinamento cronologico necessario per il DTM
df.head()

Unnamed: 0,year,abstract
2751,2008,randomized neural networks are immortalized th...
3280,2010,partially observable markov decision processes...
4212,2012,multi agent plan recognition mapr aims recogni...
4117,2012,unsupervised clustering scattered noisy and hi...
4549,2013,for classifying time series nearest neighbor a...


In [48]:
df = df[df['abstract'].map(lambda x: len(x)!=0)]

In [49]:
series_slices = df.groupby(["year"]).size().values
series_slices

array([1, 1, 2, 3, 1, 3, 3, 9], dtype=int64)

In [50]:
corp = df['abstract']

In [51]:
stemmer = SnowballStemmer(language='english')

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems


vect = CountVectorizer(tokenizer=tokenize, 
                       stop_words='english', 
                       lowercase=True) 


X = vect.fit_transform(corp.values.astype('U')) #astype per convertire in stringa

doc_term_matrix = pd.DataFrame(X.toarray(), columns=vect.get_feature_names())

doc_term_matrix.head()

Unnamed: 0,abl,absolut,abstract,access,accomplish,account,accur,accuraci,achiev,action,...,wherea,whi,wide,wire,word,work,world,worthi,written,yield
0,0,0,0,0,0,0,0,0,0,0,...,0,2,0,2,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,0,0,1,0,1,0,0,1,0,...,1,0,1,0,0,0,0,0,0,0


In [52]:
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False) 
id_map = dict((v, k) for k, v in vect.vocabulary_.items())

In [53]:
t0 = time()
ldaseq = LdaSeqModel(corpus=corpus, time_slice=series_slices, id2word=id_map, num_topics=3)
print("done in %0.3fs." % (time() - t0))

done in 55.665s.
