In [5]:
import pandas as pd

In [6]:
papers = pd.read_csv('dblp-v10.csv') # from https://www.kaggle.com/datasets/nechbamohammed/research-papers-dataset

In [7]:
papers = papers[papers['n_citation'] > 10]
papers = papers[~papers['references'].isna()]
papers = papers[~papers['abstract'].isna()]
papers = papers[papers['year'] >= 2010]
papers = papers[papers['venue'].isin((papers['venue'].value_counts() > 1).index)]
papers['title'] = papers['title'].str.replace('"', '').str.replace('\\', '') # fix string formatting for sql/neo4j compatibility
papers

Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id
6,A CMOS oscillator employing differential trans...,"['Andrea Mazzanti', 'Pietro Andreani']",50,"['0a09db01-264a-4bdf-942c-d33cceb35d3c', '36c9...",A Push–Pull Class-C CMOS VCO,IEEE Journal of Solid-state Circuits,2013,4ab3f7cd-140b-4e29-99d4-f4e8006c4f65
12,Performance analysts profile their programs to...,"['Todd Mytkowicz', 'Amer Diwan', 'Matthias Hau...",95,"['0b279971-aa22-4622-aaf5-5cd6d4f70756', '16f8...",Evaluating the accuracy of Java profilers,programming language design and implementation,2010,4ab4c0a1-3c5a-44c6-bdd4-3a0618d303ca
25,"In this paper, a weighting-delay-based method ...","['Huaguang Zhang', 'Zhenwei Liu', 'Guang-Bin H...",266,"['11560f85-2543-466f-b338-5ef10a073db7', '14fe...",Novel Weighting-Delay-Based Stability Criteria...,IEEE Transactions on Neural Networks,2010,4ab5e3f4-9b58-4fbb-9bde-ee2f2185cc61
26,Search for information is no longer exclusivel...,['Jian-Yun Nie'],108,"['05da32dc-92b9-49a6-86f0-54b6dfc51502', '0d2d...",Cross-Language Information Retrieval,Synthesis Lectures on Human Language Technologies,2010,4ab5e4bd-08e2-4007-825c-d34ce7cb231f
40,"Although microRNAs (miRNAs), other non-coding ...","['Jun-Hao Li', 'Shun Liu', 'Hui Zhou', 'Liang-...",315,"['1776fa43-e959-4b2d-976a-f25e9532d169', '1fcb...","starBase v2.0: decoding miRNA-ceRNA, miRNA-ncR...",Nucleic Acids Research,2014,4ab6c0ff-2d1a-4cb0-acdc-8ca7517cb14a
...,...,...,...,...,...,...,...,...
999952,The incorporation of cognitive radio (CR) capa...,"['Fan Zhang', 'Tao Jing', 'Yan Huo', 'Kaiwei J...",50,"['093b1875-3e4d-4874-beaf-4d67d7db5575', '1387...",Outage Probability Minimization for Energy Har...,Sensors,2017,f323b4a1-e0bc-4f4c-be91-f3257b5937e2
999975,"In the Big Data Era, the management of energy ...","['Amine Roukh', 'Ladjel Bellatreche', 'Selma B...",50,"['027f2cc3-296c-4204-9aa4-4fb48dee7e1b', '05aa...",Eco-Physic: Eco-Physical design initiative for...,Information Systems,2017,fb32d855-1a45-46ff-b0bd-895b8862b979
999993,Zero-shot Learning (ZSL) can leverage attribut...,"['Yang Long', 'Li Liu', 'Ling Shao']",50,"['09ba177f-b3f2-43b3-b991-d2517b8f261d', '120b...",Towards Fine-Grained Open Zero-Shot Learning: ...,workshop on applications of computer vision,2017,fe9797ad-9ea5-4339-ab07-9d75735b08db
999994,Abstract The article presents a narrative revi...,"['Jingbo Meng', 'Lourdes Martinez', 'Amanda J....",50,"['0d3ddc81-b0e9-401a-96ab-cf4bed7db6f5', '134a...",Research on Social Networking Sites and Social...,"Cyberpsychology, Behavior, and Social Networking",2017,feb7d7ee-58d8-4a48-853e-c6751957997e


In [8]:
authors = (
    papers['authors']
    .str.strip('[')
    .str.strip(']')
    .str.replace('"', '')
    .str.replace("'", '')
    #.str.replace(r'\s+', '', regex=True)
    .str.split(', ')
    .str[:3]
).explode().reset_index()

authors = pd.merge(left=papers[['id']], right=authors, left_index=True, right_on='index', how='left').drop(columns=['index']).reset_index(drop=True)
authors.columns = ['paper_id', 'name']
authors['relation'] = 'authored'
author_paper = authors[['name', 'paper_id', 'relation']]
author_paper['name'] = author_paper['name'].str.replace('(TYPE=name) (SCHEME=Vancouver) ', '') #fix weird name
author_paper = author_paper.sort_values('name')
author_paper

Unnamed: 0,name,paper_id,relation
86367,A A El-Sherif,8d794d7d-328b-44a2-aeea-ddf788de1cfd,authored
233139,A Aart Blokhuis,fe533809-930a-4b4c-9e17-0454794cd062,authored
121468,A Aart Blokhuis,a87798c8-309c-47e2-a65c-d11940448a5b,authored
212337,A Ahsan Shabbir,ee8ffea8-8e3d-4c8a-a9ad-6408f61dc241,authored
326172,A Aleksandra Kuzmanovska,1cac7914-173d-4cbc-ad6a-a39cc1b3bc1a,authored
...,...,...,...
280032,Šárka Gergelitsová,ecbe9960-5d98-456c-b002-1137ec87c6ae,authored
86485,Žarko Mijajlović,8d93d8a1-7c04-474f-8d7d-d13e8fb7a303,authored
162192,Žarko Čučej,c786f75d-cbd8-4355-87fa-7da1692bb025,authored
130923,Željka Stojanac,afb3d2e2-b65b-49ca-8d9a-f39c477794c6,authored


In [9]:
authors = pd.DataFrame(author_paper['name'].unique())
authors.columns = ['name']
authors

Unnamed: 0,name
0,A A El-Sherif
1,A Aart Blokhuis
2,A Ahsan Shabbir
3,A Aleksandra Kuzmanovska
4,A André Kuijsters
...,...
194900,Šárka Gergelitsová
194901,Žarko Mijajlović
194902,Žarko Čučej
194903,Željka Stojanac


In [10]:
references = (
    papers['references']
    .str.strip('[')
    .str.strip(']')
    .str.replace("'", '')
    .str.split(', ')
).explode().reset_index()
references = references[references['references'].isin(papers['id'])]
references = pd.merge(left=papers[['id']], right=references, left_index=True, right_on='index', how='left').drop(columns=['index']).reset_index(drop=True)
references.columns = ['paper_id', 'ref_id']
references = references.dropna()
references['relation'] = 'cites'
references

Unnamed: 0,paper_id,ref_id,relation
0,4ab3f7cd-140b-4e29-99d4-f4e8006c4f65,b8b564ca-0651-4fd4-8db0-0f7710e25ce1,cites
1,4ab3f7cd-140b-4e29-99d4-f4e8006c4f65,bb1ed638-e3c3-46f7-adf9-6977afca564a,cites
2,4ab3f7cd-140b-4e29-99d4-f4e8006c4f65,e3a35338-4747-46f7-b752-5690a5c3c84a,cites
6,4ab6c0ff-2d1a-4cb0-acdc-8ca7517cb14a,4ba5fec7-0d8e-4fe3-98f7-a3b4a7bb2895,cites
7,4ab6c0ff-2d1a-4cb0-acdc-8ca7517cb14a,c0604659-e1b2-42d4-9401-fa5f00bfd98f,cites
...,...,...,...
273488,fe9797ad-9ea5-4339-ab07-9d75735b08db,e2f7a74a-8430-4463-94ce-fe85dfd309f9,cites
273489,fe9797ad-9ea5-4339-ab07-9d75735b08db,f2097934-77d0-49cd-9ae2-06f4c54aa781,cites
273490,feb7d7ee-58d8-4a48-853e-c6751957997e,ff612321-75f5-460c-8f10-68b1c5052a2e,cites
273491,ff87696d-35fb-4872-aab5-5ff6285f10e6,5abebb43-207b-403b-a6f2-4f31e9ac0fef,cites


In [11]:
abstracts = papers[['id', 'abstract']].reset_index(drop=True)
abstracts.columns = ['paper_id', 'abstract']
abstracts

Unnamed: 0,paper_id,abstract
0,4ab3f7cd-140b-4e29-99d4-f4e8006c4f65,A CMOS oscillator employing differential trans...
1,4ab4c0a1-3c5a-44c6-bdd4-3a0618d303ca,Performance analysts profile their programs to...
2,4ab5e3f4-9b58-4fbb-9bde-ee2f2185cc61,"In this paper, a weighting-delay-based method ..."
3,4ab5e4bd-08e2-4007-825c-d34ce7cb231f,Search for information is no longer exclusivel...
4,4ab6c0ff-2d1a-4cb0-acdc-8ca7517cb14a,"Although microRNAs (miRNAs), other non-coding ..."
...,...,...
134283,f323b4a1-e0bc-4f4c-be91-f3257b5937e2,The incorporation of cognitive radio (CR) capa...
134284,fb32d855-1a45-46ff-b0bd-895b8862b979,"In the Big Data Era, the management of energy ..."
134285,fe9797ad-9ea5-4339-ab07-9d75735b08db,Zero-shot Learning (ZSL) can leverage attribut...
134286,feb7d7ee-58d8-4a48-853e-c6751957997e,Abstract The article presents a narrative revi...


In [12]:
venue_paper = papers[['venue', 'id']].reset_index(drop=True).sort_values('venue')
venue_paper['relation'] = 'published'
venue_paper

Unnamed: 0,venue,id,relation
55977,A Quarterly Journal of Operations Research,bd56daa6-67f2-480c-9ed6-1d8c20ca4aa0,published
58759,A Quarterly Journal of Operations Research,c2febfe6-2fcb-4945-bcd0-f0c99878ba57,published
57570,A Quarterly Journal of Operations Research,c086ad01-909f-44ff-b15b-72bfa011d956,published
26878,A Quarterly Journal of Operations Research,8232108f-036e-477a-9d33-aa806811b077,published
74265,A Quarterly Journal of Operations Research,e2ca2598-86fa-4996-8dd3-301ddabb2e81,published
...,...,...,...
97892,worst case execution time analysis,9ef4aa7c-6683-4d4d-95dc-f9d85b0d5b13,published
9004,worst case execution time analysis,5d2add4b-ff53-4ea9-9a1b-b035919dd064,published
103019,worst case execution time analysis,7c023e34-dd88-4e11-9e08-4d36c5825a74,published
32664,worst case execution time analysis,8dd2dc85-9257-41bd-ac36-ba01191756ae,published


In [13]:
venues = pd.DataFrame(venue_paper['venue'].unique())
venues.columns = ['name']
venues

Unnamed: 0,name
0,A Quarterly Journal of Operations Research
1,ACM Communications in Computer Algebra
2,ACM Computing Surveys
3,ACM Crossroads Student Magazine
4,ACM Journal of Experimental Algorithms
...,...
2874,world congress on computational intelligence
2875,world haptics conference
2876,world of wireless mobile and multimedia networks
2877,world summit on the knowledge society


In [14]:
papers = papers[['id', 'title', 'year', 'n_citation']]
papers

Unnamed: 0,id,title,year,n_citation
6,4ab3f7cd-140b-4e29-99d4-f4e8006c4f65,A Push–Pull Class-C CMOS VCO,2013,50
12,4ab4c0a1-3c5a-44c6-bdd4-3a0618d303ca,Evaluating the accuracy of Java profilers,2010,95
25,4ab5e3f4-9b58-4fbb-9bde-ee2f2185cc61,Novel Weighting-Delay-Based Stability Criteria...,2010,266
26,4ab5e4bd-08e2-4007-825c-d34ce7cb231f,Cross-Language Information Retrieval,2010,108
40,4ab6c0ff-2d1a-4cb0-acdc-8ca7517cb14a,"starBase v2.0: decoding miRNA-ceRNA, miRNA-ncR...",2014,315
...,...,...,...,...
999952,f323b4a1-e0bc-4f4c-be91-f3257b5937e2,Outage Probability Minimization for Energy Har...,2017,50
999975,fb32d855-1a45-46ff-b0bd-895b8862b979,Eco-Physic: Eco-Physical design initiative for...,2017,50
999993,fe9797ad-9ea5-4339-ab07-9d75735b08db,Towards Fine-Grained Open Zero-Shot Learning: ...,2017,50
999994,feb7d7ee-58d8-4a48-853e-c6751957997e,Research on Social Networking Sites and Social...,2017,50


## LDA

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

df = abstracts.copy()

# Fill missing values and ensure text is treated as a string
df['abstract'] = df['abstract'].fillna('').astype(str)

# Create a CountVectorizer object
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Fit and transform the abstracts
dtm = vectorizer.fit_transform(df['abstract'])

# Create and fit the LDA model
lda = LatentDirichletAllocation(n_components=15, random_state=42)  # n_components is # of topics
lda.fit(dtm)

# Get the topic distribution for each document
topic_distribution = lda.transform(dtm)

# Add a new column to the dataframe that indicates the most probable topic for each document
df['topic'] = topic_distribution.argmax(axis=1)

# Display examples from each topic
for topic_num in range(lda.n_components):
    print(f"Topic {topic_num} examples:")
    # Filter documents belonging to the topic
    topic_examples = df[df['topic'] == topic_num]['abstract'].sample(n=3, random_state=42)
    for i, example in enumerate(topic_examples):
        print(f"Example {i + 1}: {example}")
    print("\n")

# Now DataFrame 'df' has a new column 'topic' which represents the assigned topic for each document.
df

Topic 0 examples:
Example 1: Using the quantitative analysis of real-time myocardial contrast echocardiography (RT MCE), clinicians can assess the myocardial perfusion of patients, noninvasively and accurately. We designed a workstation to assist clinicians to automatically implement the accurate analysis of RT MCE. The workstation can compute some hemodynamic parameters of myocardial microcirculation, e.g., myocardial blood flow, myocardial blood flow mean velocity, and myocardial blood volume. Our new methods involved in the quantitative analysis of RT MCE are summarized as follows. 1) A novel orthogonal array optimization (OAO) technique was proposed and used to estimate the unknown parameters of the nonlinear model to guarantee numerical stability. 2) Brox's coarse-to-fine warping optical flow technique was employed to automatically track the region of interest located inside the myocardial area to ensure the accuracy of the quantitative analysis. Finally, we illustrate some exampl

Unnamed: 0,paper_id,abstract,topic
0,4ab3f7cd-140b-4e29-99d4-f4e8006c4f65,A CMOS oscillator employing differential trans...,4
1,4ab4c0a1-3c5a-44c6-bdd4-3a0618d303ca,Performance analysts profile their programs to...,11
2,4ab5e3f4-9b58-4fbb-9bde-ee2f2185cc61,"In this paper, a weighting-delay-based method ...",1
3,4ab5e4bd-08e2-4007-825c-d34ce7cb231f,Search for information is no longer exclusivel...,11
4,4ab6c0ff-2d1a-4cb0-acdc-8ca7517cb14a,"Although microRNAs (miRNAs), other non-coding ...",9
...,...,...,...
134283,f323b4a1-e0bc-4f4c-be91-f3257b5937e2,The incorporation of cognitive radio (CR) capa...,7
134284,fb32d855-1a45-46ff-b0bd-895b8862b979,"In the Big Data Era, the management of energy ...",8
134285,fe9797ad-9ea5-4339-ab07-9d75735b08db,Zero-shot Learning (ZSL) can leverage attribut...,2
134286,feb7d7ee-58d8-4a48-853e-c6751957997e,Abstract The article presents a narrative revi...,3


In [19]:
df['topic'].value_counts()

topic
3     16537
7     15506
2     14075
11    11798
10    10754
13     9634
4      8567
8      7267
14     7259
9      7204
5      6428
0      5378
1      5200
6      4975
12     3706
Name: count, dtype: int64

## Combine topic

In [20]:
papers = papers.merge(df, left_on='id', right_on='paper_id', how='inner')
papers

Unnamed: 0,id,title,year,n_citation,paper_id,abstract,topic
0,4ab3f7cd-140b-4e29-99d4-f4e8006c4f65,A Push–Pull Class-C CMOS VCO,2013,50,4ab3f7cd-140b-4e29-99d4-f4e8006c4f65,A CMOS oscillator employing differential trans...,4
1,4ab4c0a1-3c5a-44c6-bdd4-3a0618d303ca,Evaluating the accuracy of Java profilers,2010,95,4ab4c0a1-3c5a-44c6-bdd4-3a0618d303ca,Performance analysts profile their programs to...,11
2,4ab5e3f4-9b58-4fbb-9bde-ee2f2185cc61,Novel Weighting-Delay-Based Stability Criteria...,2010,266,4ab5e3f4-9b58-4fbb-9bde-ee2f2185cc61,"In this paper, a weighting-delay-based method ...",1
3,4ab5e4bd-08e2-4007-825c-d34ce7cb231f,Cross-Language Information Retrieval,2010,108,4ab5e4bd-08e2-4007-825c-d34ce7cb231f,Search for information is no longer exclusivel...,11
4,4ab6c0ff-2d1a-4cb0-acdc-8ca7517cb14a,"starBase v2.0: decoding miRNA-ceRNA, miRNA-ncR...",2014,315,4ab6c0ff-2d1a-4cb0-acdc-8ca7517cb14a,"Although microRNAs (miRNAs), other non-coding ...",9
...,...,...,...,...,...,...,...
134283,f323b4a1-e0bc-4f4c-be91-f3257b5937e2,Outage Probability Minimization for Energy Har...,2017,50,f323b4a1-e0bc-4f4c-be91-f3257b5937e2,The incorporation of cognitive radio (CR) capa...,7
134284,fb32d855-1a45-46ff-b0bd-895b8862b979,Eco-Physic: Eco-Physical design initiative for...,2017,50,fb32d855-1a45-46ff-b0bd-895b8862b979,"In the Big Data Era, the management of energy ...",8
134285,fe9797ad-9ea5-4339-ab07-9d75735b08db,Towards Fine-Grained Open Zero-Shot Learning: ...,2017,50,fe9797ad-9ea5-4339-ab07-9d75735b08db,Zero-shot Learning (ZSL) can leverage attribut...,2
134286,feb7d7ee-58d8-4a48-853e-c6751957997e,Research on Social Networking Sites and Social...,2017,50,feb7d7ee-58d8-4a48-853e-c6751957997e,Abstract The article presents a narrative revi...,3


In [21]:
papers = papers.drop(columns=['paper_id'])[['id', 'title', 'year', 'n_citation', 'topic']]

In [22]:
papers

Unnamed: 0,id,title,year,n_citation,topic
0,4ab3f7cd-140b-4e29-99d4-f4e8006c4f65,A Push–Pull Class-C CMOS VCO,2013,50,4
1,4ab4c0a1-3c5a-44c6-bdd4-3a0618d303ca,Evaluating the accuracy of Java profilers,2010,95,11
2,4ab5e3f4-9b58-4fbb-9bde-ee2f2185cc61,Novel Weighting-Delay-Based Stability Criteria...,2010,266,1
3,4ab5e4bd-08e2-4007-825c-d34ce7cb231f,Cross-Language Information Retrieval,2010,108,11
4,4ab6c0ff-2d1a-4cb0-acdc-8ca7517cb14a,"starBase v2.0: decoding miRNA-ceRNA, miRNA-ncR...",2014,315,9
...,...,...,...,...,...
134283,f323b4a1-e0bc-4f4c-be91-f3257b5937e2,Outage Probability Minimization for Energy Har...,2017,50,7
134284,fb32d855-1a45-46ff-b0bd-895b8862b979,Eco-Physic: Eco-Physical design initiative for...,2017,50,8
134285,fe9797ad-9ea5-4339-ab07-9d75735b08db,Towards Fine-Grained Open Zero-Shot Learning: ...,2017,50,2
134286,feb7d7ee-58d8-4a48-853e-c6751957997e,Research on Social Networking Sites and Social...,2017,50,3


### export cleaned data to csv