In [1]:
# https://www.kaggle.com/datasets/benhamner/nips-papers?select=papers.csv

In [2]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
from bs4 import BeautifulSoup
import string
from nltk.stem import PorterStemmer
from tqdm import tqdm
from joblib import Parallel, delayed

In [3]:
nltk.download('english')
tqdm.pandas()

[nltk_data] Error loading english: Package 'english' not found in
[nltk_data]     index


In [4]:
df = pd.read_csv('papers.csv')

In [5]:
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [6]:
df.sample(5)

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
580,1530,1998,A Precise Characterization of the Class of Lan...,,1530-a-precise-characterization-of-the-class-o...,Abstract Missing,A Precise Characterization of the Class of\nLa...
4043,466,1991,A Contrast Sensitive Silicon Retina with Recip...,,466-a-contrast-sensitive-silicon-retina-with-r...,Abstract Missing,A Contrast Sensitive Silicon Retina with\nReci...
11,1008,1994,Multidimensional Scaling and Data Clustering,,1008-multidimensional-scaling-and-data-cluster...,Abstract Missing,Multidimensional Scaling and Data Clustering\n...
3261,3954,2010,Learning Kernels with Radiuses of Minimum Encl...,,3954-learning-kernels-with-radiuses-of-minimum...,"In this paper, we point out that there exist s...",Learning Kernels with Radiuses of Minimum\nEnc...
2441,3214,2007,Markov Chain Monte Carlo with People,,3214-markov-chain-monte-carlo-with-people.pdf,Abstract Missing,Markov Chain Monte Carlo with People\n\nAdam N...


In [7]:
df.shape

(7241, 7)

In [8]:
df.isnull().sum()

id               0
year             0
title            0
event_type    4819
pdf_name         0
abstract         0
paper_text       0
dtype: int64

### Preprocessing data

In [9]:
punc = string.punctuation

In [10]:
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
stop_words = set(stopwords.words('english'))

In [12]:
new_words = ['fig', 'figure', 'image', 'sample', 'using', 'show', 'result', 'large', 'also', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
stop_words = list(stop_words.union(new_words))

In [13]:
stop_words

['four',
 'above',
 'those',
 'isn',
 'an',
 "isn't",
 "she's",
 'these',
 'itself',
 'ma',
 'same',
 "haven't",
 "should've",
 're',
 'again',
 'me',
 'for',
 'nine',
 'most',
 'about',
 'before',
 'this',
 'against',
 'we',
 'just',
 'sample',
 'who',
 'three',
 'because',
 "you're",
 'below',
 'that',
 'from',
 'only',
 "you'll",
 'whom',
 'when',
 's',
 'large',
 'her',
 'as',
 'he',
 'using',
 'image',
 'theirs',
 'mightn',
 'it',
 "aren't",
 "didn't",
 'so',
 "mightn't",
 'off',
 'were',
 'yourselves',
 'am',
 'wouldn',
 'should',
 'under',
 'will',
 "that'll",
 'out',
 'haven',
 'until',
 'hasn',
 "needn't",
 'my',
 'very',
 'which',
 'm',
 "you'd",
 'further',
 'the',
 'mustn',
 'to',
 'them',
 'into',
 'can',
 've',
 'no',
 'more',
 'all',
 'nor',
 'she',
 'six',
 'their',
 'll',
 'do',
 'weren',
 'two',
 'down',
 't',
 'five',
 'other',
 'they',
 "weren't",
 'fig',
 'now',
 'result',
 "mustn't",
 'his',
 'its',
 'aren',
 'himself',
 'once',
 'than',
 "wouldn't",
 'such',
 'fi

In [14]:
df.sample(5)

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
1034,1945,2001,Natural Language Grammar Induction Using a Con...,,1945-natural-language-grammar-induction-using-...,Abstract Missing,Natural Language Grammar Induction using a\nCo...
1683,253,1989,Subgrouping Reduces Complexity and Speeds Up L...,,253-subgrouping-reduces-complexity-and-speeds-...,Abstract Missing,638\n\nZipser\n\nSubgrouping Reduces Complexit...
4412,4994,2013,Optimal Neural Population Codes for High-dimen...,Poster,4994-optimal-neural-population-codes-for-high-...,How does neural population process sensory inf...,Fisher-Optimal Neural Population Codes for\nHi...
6363,6756,2017,A Scale Free Algorithm for Stochastic Bandits ...,Poster,6756-a-scale-free-algorithm-for-stochastic-ban...,Existing strategies for finite-armed stochasti...,A Scale Free Algorithm for Stochastic Bandits ...
3066,3779,2009,Graph Zeta Function in the Bethe Free Energy a...,,3779-graph-zeta-function-in-the-bethe-free-ene...,We propose a new approach to the analysis of L...,Graph Zeta Function in the Bethe Free Energy a...


In [15]:
# preprocessing on text dataset

In [16]:
stemming = PorterStemmer()

In [17]:
def preprocessing(text):
    text = text.lower()

    text = BeautifulSoup(text, 'html.parser')
    text = text.get_text()
    if not text.strip():
        return 'empty_text'
    text = nltk.word_tokenize(text)

    text = [word for word in text if word not in stop_words and word not in punc]

    text = [word for word in text if len(word) > 3]

    text = [word for word in text]

    text = [stemming.stem(word) for word in text]


    return ' '.join(text)


In [18]:
preprocessing("""1 1run 10 runner running lover loving love  This is PYth!.>n is meant to represent 
punctuation marks that you want to filter out, you need to
include a proper condition to check whether
the word is not
a punctuation mark. Here's the correcte
<html>
</html>
              """)

'1run runner run lover love love pyth meant repres punctuat mark want filter need includ proper condit check whether word punctuat mark correct'

In [19]:
docs = df['paper_text'].progress_apply(preprocessing)


# n_jobs = 16  # Number of jobs (adjust based on your CPU cores)
# processed_texts = Parallel(n_jobs=n_jobs)(
#     delayed(preprocessing)(text) for text in tqdm(df['paper_text'], desc="Processing")
# )



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 7241/7241 [11:45<00:00, 10.26it/s]


In [20]:
docs

0       self-organ associ databas applic hisashi suzuk...
1       mean field theori layer visual cortex applic a...
2       store covari associ long term potenti depress ...
3       bayesian queri construct neural network model ...
4       neural network ensembl cross valid activ learn...
                              ...                        
7236    singl transistor learn synaps paul hasler chri...
7237    bia varianc combin least squar estim ronni mei...
7238    real time cluster cmo neural engin serrano-got...
7239    learn direct global motion class psychophysica...
7240    correl interpol network real-tim express analy...
Name: paper_text, Length: 7241, dtype: object

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.90, max_features=5000, ngram_range=(1, 3))

In [22]:
word_count_vectors = cv.fit_transform(tqdm(df['paper_text'], desc="Vectorizing"))

Vectorizing: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 7241/7241 [02:08<00:00, 56.23it/s]


In [23]:
word_count_vectors.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 2, ..., 2, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
word_count_vectors.toarray().shape

(7241, 5000)

### Tfidf

In [25]:
from sklearn.feature_extraction.text import TfidfTransformer

In [26]:
tfidf = TfidfTransformer(smooth_idf=True, use_idf=True)

In [27]:
type(word_count_vectors)

scipy.sparse._csr.csr_matrix

In [28]:
tfidf_transformer = tfidf.fit(word_count_vectors)

In [29]:
type(tfidf_transformer)

sklearn.feature_extraction.text.TfidfTransformer

In [30]:
type(tfidf_transformer)

sklearn.feature_extraction.text.TfidfTransformer

### KeyWords

In [31]:
df.sample(2)

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
1513,2376,2003,Iterative Scaled Trust-Region Learning in Kryl...,,2376-iterative-scaled-trust-region-learning-in...,Abstract Missing,Iterative scaled trust-region learning in\nKry...
5251,5754,2015,COEVOLVE: A Joint Point Process Model for Info...,Oral,5754-coevolve-a-joint-point-process-model-for-...,Information diffusion in online social network...,COEVOLVE: A Joint Point Process Model for\nInf...


In [32]:
cv.get_feature_names_out()

array(['00', '00 00', '000', ..., 'zk', 'zn', 'zt'], dtype=object)

In [33]:
len(cv.get_feature_names_out())

5000

In [34]:
feature_names = cv.get_feature_names_out()

In [35]:
feature_names

array(['00', '00 00', '000', ..., 'zk', 'zn', 'zt'], dtype=object)

In [None]:
def get_keywords(idx, docs, topN=10):
    # getting words count and importance
    docs_words_count = tfidf_transformer.transform(cv.transform([docs[idx]]))

    # sorting sparse matrix
    docs_words_count = docs_words_count.tocoo()
    tuples = zip(docs_words_count.col, docs_words_count.data)
    sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

    # getting top 10 keywords
    sorted_items = sorted_items[:topN]


    score_vals = []
    features_vals = []

    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        features_vals.append(feature_names[idx])


    # final result
    results = {}
    for idx in range(len(features_vals)):
        results[features_vals[idx]] = score_vals[idx]

    return results
    
    

def print_keywords(idx, keywords, df):
    print("======== [Title] ========")
    print(df["title"][idx])
    
    print()
    
    print('======== [Abstract] ========')
    print(df['abstract'][idx])

    print()
    
    print('======== [KeyWords] ========')

    for k in keywords:
        print(k, keywords[k])

idx = 3339
keywords = get_keywords(idx, docs)
print_keywords(idx, keywords, df)

Approximate inference in continuous time Gaussian-Jump processes

We present a novel approach to inference in conditionally Gaussian continuous time stochastic processes, where the latent process is a Markovian jump process. We first consider the case of jump-diffusion processes, where the drift of a linear stochastic differential equation can jump at arbitrary time points. We derive partial differential equations for exact inference and present a very efficient mean field approximation. By introducing a novel lower bound on the free energy, we then generalise our approach to Gaussian processes with arbitrary covariance, such as the non-Markovian RBF covariance. We present results on both simulated and real data, showing that the approach is very accurate in capturing latent dynamics and can be useful in a number of real data modelling tasks.

process 0.618
infer 0.304
gaussian 0.224
posterior 0.213
gaussian process 0.211
time 0.165
model 0.157
smooth 0.156
backward 0.137
system 0.135


In [37]:
import pickle

pickle.dump(cv, open('count_vectorizer.pkl', 'wb'))
pickle.dump(tfidf_transformer, open('tfidf_transformer.pkl', 'wb'))
pickle.dump(feature_names, open('feature_names.pkl', 'wb'))
