In [11]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from profile_api import fetch_scholar_profile, convert_to_csv
from constants import *

In [None]:
# Topics used as search queries: each entry is passed to
# fetch_scholar_profile() by the fetch loop further down in this notebook.
research_topics = [
    "Natural Language Processing",
    "Information Retrieval",
    "Artificial Intelligence",
    "Machine Learning",
    "Data Mining",
    "Computer Vision",
    "Human-Computer Interaction",
    "Quantum Computing",
    "Cryptography",
    "Blockchain Technology",
    "Virtual Reality",
    "Augmented Reality",
    "Internet of Things",
    "Cloud Computing",
    "Edge Computing",
    "Cybersecurity",
    "Robotics",
    "Bioinformatics",
    "Algorithm Design",
    "Big Data Analytics",
    "Software Engineering",
    "Database Systems",
    "Network Security",
    "Parallel Computing",
    "Distributed Systems",
    "Mobile Computing",
    "Computer Graphics",
    "Game Development",
    "Computational Biology",
    "Neural Networks",
    "Deep Learning",
    "Reinforcement Learning",
    "Semantic Web",
    "Computer Architecture",
    "Operating Systems",
    "Digital Signal Processing",
    "Wireless Communications",
    "Compiler Design",
    "High-Performance Computing",
    "Embedded Systems",
    "3D Printing",
    "Pattern Recognition",
    "E-commerce Technology",
    "Web Development Technologies",
    "Green Computing",
    "Ubiquitous Computing",
    "Grid Computing",
    "Computational Chemistry",
    "Digital Forensics",
    "E-learning Technologies",
]

In [None]:
# Query the profile API once per topic, pool every returned record into a
# single list, then write the pooled records to one CSV file.
all_data = []
for topic in research_topics:
    print(f"Fetching data for: {topic}")
    data = fetch_scholar_profile(topic)
    all_data += data  # in-place list extension, same effect as .extend(data)
convert_to_csv(all_data, "authors_search_output.csv")

In [12]:
# Load the author search results from disk; the file name matches the one the
# fetch cell passes to convert_to_csv().
csv_author_df = pd.read_csv('authors_search_output.csv')

In [13]:
# Number the rows within each query 0, 1, 2, ... in their current row order
# (GroupBy.cumcount). This is a per-query position, not a globally unique id.
# Presumably the CSV keeps the order in which results were returned, which
# would make this the search rank -- TODO confirm.
csv_author_df['position_id'] = csv_author_df.groupby('query').cumcount()

In [14]:
# Load the author-id -> author-name mapping from a pickle file.
# AUTHORID_TO_AUTHOR_NAME_PATH is not defined in this notebook; presumably it
# is supplied by `from constants import *` -- verify.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only point this at a file from a trusted source.
with open(AUTHORID_TO_AUTHOR_NAME_PATH, "rb") as f:
    authorid_to_author_name = pickle.load(f)

In [15]:
# Flatten the mapping into a two-column frame: one row per (key, value) pair,
# with the key as author_id and the value as author_name.
json_author_df = pd.DataFrame(list(authorid_to_author_name.items()), columns=['author_id', 'author_name'])

In [16]:
# Preview the lookup table (pandas truncates the display of large frames).
json_author_df

Unnamed: 0,author_id,author_name
0,2312688602,Makoto Satoh
1,2482909946,Ryo Muramatsu
2,2128134587,Mizue Kayama
3,2101782692,Kazunori Itoh
4,2114054191,Masami Hashimoto
...,...,...
4398133,2497555398,Kasper Damgård
4398134,2478958856,Hans Guldager
4398135,2114300893,Jonas Lindstrøm Jensen
4398136,2494318089,Pascal Paillier


In [17]:
# Attach author ids to the search results by exact name match.
# Names are not unique in the lookup table, so a single search result can
# match several author_id values and is repeated once per match.
# Both frames carry an `author_id` column, so pandas suffixes them:
# author_id_x comes from the search CSV, author_id_y from the lookup table.
author_merged_df = pd.merge(
    csv_author_df,
    json_author_df,
    left_on=['name'],
    right_on=['author_name'],
    how='inner'  # keep only search results whose name appears in the lookup table
)
# Row count after the join (includes the repeated rows described above).
len(author_merged_df)

45391

In [18]:
# Inspect the first rows of the join; repeated names with different
# author_id_y values show the one-to-many matches from the name-only key.
author_merged_df.head(10)

Unnamed: 0,name,link,serpapi_link,author_id_x,affiliations,email,cited_by,interests,thumbnail,id,query,position_id,author_id_y,author_name
0,Nigel Collier,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=ZMel...,ZMelBa0AAAAJ,"Professor of Natural Language Processing, Univ...",Verified email at cam.ac.uk,10624.0,"[{'title': 'Natural language processing', 'ser...",https://scholar.googleusercontent.com/citation...,0,Natural Language Processing,0,2112598428,Nigel Collier
1,Nigel Collier,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=ZMel...,ZMelBa0AAAAJ,"Professor of Natural Language Processing, Univ...",Verified email at cam.ac.uk,10624.0,"[{'title': 'Natural language processing', 'ser...",https://scholar.googleusercontent.com/citation...,0,Natural Language Processing,0,2707106106,Nigel Collier
2,Nigel Collier,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=ZMel...,ZMelBa0AAAAJ,"Professor of Natural Language Processing, Univ...",Verified email at cam.ac.uk,10624.0,"[{'title': 'Natural language processing', 'ser...",https://scholar.googleusercontent.com/citation...,0,Natural Language Processing,0,2893285974,Nigel Collier
3,Nigel Collier,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=ZMel...,ZMelBa0AAAAJ,"Professor of Natural Language Processing, Univ...",Verified email at cam.ac.uk,10624.0,"[{'title': 'Natural language processing', 'ser...",https://scholar.googleusercontent.com/citation...,0,Natural Language Processing,0,2691406354,Nigel Collier
4,Nigel Collier,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=ZMel...,ZMelBa0AAAAJ,"Professor of Natural Language Processing, Univ...",Verified email at cam.ac.uk,10624.0,"[{'title': 'Natural language processing', 'ser...",https://scholar.googleusercontent.com/citation...,0,Natural Language Processing,0,2661055139,Nigel Collier
5,Nigel Collier,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=ZMel...,ZMelBa0AAAAJ,"Professor of Natural Language Processing, Univ...",Verified email at cam.ac.uk,10624.0,"[{'title': 'Natural language processing', 'ser...",https://scholar.googleusercontent.com/citation...,0,Natural Language Processing,0,2306401722,Nigel Collier
6,Nigel Collier,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=ZMel...,ZMelBa0AAAAJ,"Professor of Natural Language Processing, Univ...",Verified email at cam.ac.uk,10624.0,"[{'title': 'Natural language processing', 'ser...",https://scholar.googleusercontent.com/citation...,0,Natural Language Processing,0,2988943520,Nigel Collier
7,Siegfried Handschuh,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=zl_3...,zl_3HgQAAAAJ,Chair of Data Science and Natural Language Pro...,Verified email at unisg.ch,9187.0,"[{'title': 'Data Science', 'serpapi_link': 'ht...",https://scholar.googleusercontent.com/citation...,1,Natural Language Processing,1,2012478052,Siegfried Handschuh
8,Siegfried Handschuh,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=zl_3...,zl_3HgQAAAAJ,Chair of Data Science and Natural Language Pro...,Verified email at unisg.ch,9187.0,"[{'title': 'Data Science', 'serpapi_link': 'ht...",https://scholar.googleusercontent.com/citation...,1,Natural Language Processing,1,2569908838,Siegfried Handschuh
9,Siegfried Handschuh,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=zl_3...,zl_3HgQAAAAJ,Chair of Data Science and Natural Language Pro...,Verified email at unisg.ch,9187.0,"[{'title': 'Data Science', 'serpapi_link': 'ht...",https://scholar.googleusercontent.com/citation...,1,Natural Language Processing,1,2786482469,Siegfried Handschuh


In [33]:
# Collapse the one-to-many matches from the name join: keep a single row per
# distinct (position_id, query, author_name) combination.
author_merged_unique_df = author_merged_df[['position_id','query','author_name']].drop_duplicates()

In [34]:
# Turn each per-query search position into a graded relevance label `rel`:
# positions 0-19 -> 5, 20-39 -> 4, 40-59 -> 3, ... (one grade lower for every
# further block of 20 positions).
#
# The frame is rebuilt from author_merged_df in a single chain instead of
# renaming and overwriting `rel` in place. The in-place version was not
# idempotent: running the cell a second time fed the already-converted grades
# back through the same formula and collapsed every label to 5.
#
# Floor division on the integer column gives the same result as the former
# row-wise int(x / 20) because cumcount positions are non-negative integers.
#
# NOTE(review): positions >= 120 produce negative grades; confirm that no
# query returns that many results or clamp the label if it should stay >= 0.
author_merged_unique_df = (
    author_merged_df[['position_id', 'query', 'author_name']]
    .drop_duplicates()
    .rename(columns={'position_id': 'rel'})
    .assign(rel=lambda frame: 5 - frame['rel'] // 20)
)

In [27]:
# Number of labelled rows after de-duplication.
len(author_merged_unique_df)

3339

In [28]:
# Random row-level 75/25 split of the labelled rows.
# NOTE(review): because rows are split individually, results for the same
# query can land in both train and test. If these files feed a ranking
# evaluation, a query-level (grouped) split may be needed -- confirm.
train_df, test_df = train_test_split(author_merged_unique_df, test_size=0.25, random_state=42)  # random_state for reproducibility

In [36]:
# Persist the full labelled set and both splits.
# to_csv writes the DataFrame index as an unnamed first column by default, so
# readers of these files need to account for it (e.g. index_col=0).
author_merged_unique_df.to_csv('train_test_author_data.csv')
train_df.to_csv('train_author_data.csv')
test_df.to_csv('test_author_data.csv')

In [30]:
# Size of the held-out split.
len(test_df)

835