In [None]:
from tqdm import tqdm
import json
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from paper_api import fetch_scholar_profile, convert_to_csv
# from constants import *

In [None]:
# !pip3 install serpapi
!pip3 install requests pandas python-dotenv
# !python3.11 -m pip install --upgrade pip

In [None]:
# Search queries, one per research area. The order is significant: topics are
# fetched, and their results written out, in exactly this sequence.
research_topics = [
    "Natural Language Processing",
    "Information Retrieval",
    "Artificial Intelligence",
    "Machine Learning",
    "Data Mining",
    "Computer Vision",
    "Human-Computer Interaction",
    "Quantum Computing",
    "Cryptography",
    "Blockchain Technology",
    "Virtual Reality",
    "Augmented Reality",
    "Internet of Things",
    "Cloud Computing",
    "Edge Computing",
    "Cybersecurity",
    "Robotics",
    "Bioinformatics",
    "Algorithm Design",
    "Big Data Analytics",
    "Software Engineering",
    "Database Systems",
    "Network Security",
    "Parallel Computing",
    "Distributed Systems",
    "Mobile Computing",
    "Computer Graphics",
    "Game Development",
    "Computational Biology",
    "Neural Networks",
    "Deep Learning",
    "Reinforcement Learning",
    "Semantic Web",
    "Computer Architecture",
    "Operating Systems",
    "Digital Signal Processing",
    "Wireless Communications",
    "Compiler Design",
    "High-Performance Computing",
    "Embedded Systems",
    "3D Printing",
    "Pattern Recognition",
    "E-commerce Technology",
    "Web Development Technologies",
    "Green Computing",
    "Ubiquitous Computing",
    "Grid Computing",
    "Computational Chemistry",
    "Digital Forensics",
    "E-learning Technologies",
]

In [None]:
# Call fetch_scholar_profile once per topic, pooling whatever each call
# returns into a single list, then hand the pooled list to convert_to_csv.
all_data = []
for topic in research_topics:
    print(f"Fetching data for: {topic}")
    data = fetch_scholar_profile(topic)
    all_data += data

convert_to_csv(all_data, "paper_search_output.csv")

In [None]:
# Read the search results back from disk and show the frame as the cell output.
csv_df = pd.read_csv("paper_search_output.csv")
csv_df

In [None]:
# Number the rows of each query 0, 1, 2, ... in their current order, so that
# position_id is the row's position within its own query group.
csv_df['position_id'] = csv_df.groupby(by='query').cumcount()

In [None]:
# Parse the paper dump at PAPER_DATA_PATH into one flat record per paper and
# collect the records in json_df.
# NOTE(review): PAPER_DATA_PATH and TOTAL_PAPER_COUNT are not defined anywhere
# in this notebook (the `from constants import *` line is commented out), so
# they must already be in scope for this cell to run -- confirm their source.
data = []
# JSON text is UTF-8; pass the encoding explicitly so reading does not depend
# on the platform's default locale encoding.
with open(PAPER_DATA_PATH, "r", encoding="utf-8") as f:
    for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT+2)):
        # Line 0 and line TOTAL_PAPER_COUNT+1 are skipped without parsing
        # (presumably the opening/closing brackets of a JSON array -- TODO confirm).
        if i == 0 or i == TOTAL_PAPER_COUNT+1:
            continue
        # Every record after the first is parsed without its first character
        # (presumably a separating comma -- TODO confirm).
        doc = json.loads(line) if i == 1 else json.loads(line[1:])

        # Skip papers without a usable first author. `.get` also covers an
        # `authors` key that is present but empty or null, which would
        # otherwise fail on the `[0]` lookup below.
        if not doc.get('authors'):
            continue
        data.append({
            'paper_id': doc['id'],
            'author_name': doc['authors'][0]['name'],
            'title': doc['title'],
            'year': doc['year'],
            'doi_link': doc['doi'],
        })
json_df = pd.DataFrame(data)

In [None]:
# Keep only the search results whose title and year both equal those of a
# parsed paper; the cell output is the number of matched rows.
title_merged_df = csv_df.merge(json_df, how='inner', on=['title', 'year'])
len(title_merged_df)

In [None]:
def _doi_from_link(link):
    """Return the tail of ``link`` beginning at the ``10.`` that follows the first ``/10.``.

    Returns ``None`` when ``link`` is not a string (e.g. a missing value) or
    does not contain ``/10.``.
    """
    if not isinstance(link, str):
        return None
    start = link.find('/10.')
    return link[start + 1:] if start != -1 else None


# One pass over the column instead of three `csv_df.iloc[i]` row lookups per row.
doi_link = [_doi_from_link(link) for link in tqdm(csv_df['link'])]
csv_df['doi_link'] = doi_link

# pandas matches null join keys against each other, so search results without
# a DOI would be paired with every paper whose DOI is null. Drop null keys on
# both sides so only real DOI values can match.
doi_merged_df = pd.merge(
    csv_df.dropna(subset=['doi_link']),
    json_df.dropna(subset=['doi_link']),
    left_on='doi_link',
    right_on='doi_link',
    how='inner'
)
len(doi_merged_df)

In [None]:
# Stack the title-based and DOI-based matches, keep the three columns used
# from here on, and drop rows that are identical in all three.
paper_merged_unique_df = (
    pd.concat([title_merged_df, doi_merged_df])
    .loc[:, ['query', 'position_id', 'paper_id']]
    .drop_duplicates()
)

In [None]:
# Rename the columns to rel / docid, then turn the within-query position into
# a graded label: positions 0-19 give 5, 20-39 give 4, and so on, dropping by
# one for every further 20 positions.
paper_merged_unique_df = paper_merged_unique_df.rename(
    columns={'position_id': 'rel', 'paper_id': 'docid'}
)
paper_merged_unique_df['rel'] = paper_merged_unique_df['rel'].apply(
    lambda position: 5 - int(position / 20)
)

In [None]:
# Keep only the rows whose docid appears in the pickled docid list.
# NOTE(review): DOCID_LIST_PATH is not defined in this notebook (the
# `from constants import *` line is commented out) -- confirm its source.
# NOTE(review): pickle.load can execute arbitrary code; only load a file from
# a trusted source.
print("Load docid list")
with open(DOCID_LIST_PATH, 'rb') as f:
    docid_list = pickle.load(f)
keep_mask = paper_merged_unique_df['docid'].isin(docid_list)
paper_merged_unique_df = paper_merged_unique_df[keep_mask]

In [None]:
# Hold out a quarter of the labelled rows for testing; the fixed random_state
# makes the split repeatable.
train_df, test_df = train_test_split(
    paper_merged_unique_df,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Write each split to its own CSV file, train first and then test.
for split_df, out_path in (
    (train_df, 'train_paper_data.csv'),
    (test_df, 'test_paper_data.csv'),
):
    split_df.to_csv(out_path)

In [None]:
# Report the row count of the train split, then of the test split.
for split_df in (train_df, test_df):
    print(len(split_df))