# Area of Interest - Recommendation System

In [1]:
import os
import json
import pandas as pd
from datasets import load_dataset

## Arxiv Dataset

In [2]:
# Location of the arXiv metadata snapshot, relative to the current working directory.
arxiv_filepath = os.path.join(os.getcwd(), "datasets", "articles", "arxiv", "arxiv-metadata-oai-snapshot.json")

# The snapshot is parsed as one JSON document per line. Whitespace-only lines
# (e.g. a trailing blank line) are skipped because json.loads() raises
# JSONDecodeError on them, which would abort the whole load.
with open(arxiv_filepath, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# One row per parsed record; columns come from the keys of the JSON objects.
df = pd.DataFrame(data)

print(df.head(10))

          id           submitter  \
0  0704.0001      Pavel Nadolsky   
1  0704.0002        Louis Theran   
2  0704.0003         Hongjun Pan   
3  0704.0004        David Callan   
4  0704.0005  Alberto Torchinsky   
5  0704.0006        Yue Hin Pong   
6  0704.0007   Alejandro Corichi   
7  0704.0008        Damian Swift   
8  0704.0009         Paul Harvey   
9  0704.0010  Sergei Ovchinnikov   

                                             authors  \
0  C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...   
1                    Ileana Streinu and Louis Theran   
2                                        Hongjun Pan   
3                                       David Callan   
4           Wael Abu-Shammala and Alberto Torchinsky   
5                           Y. H. Pong and C. K. Law   
6  Alejandro Corichi, Tatjana Vukasinac and Jose ...   
7                                    Damian C. Swift   
8  Paul Harvey, Bruno Merin, Tracy L. Huard, Luis...   
9                                 Sergei Ov

In [9]:
# Summarise the arXiv frame: number of entries, column names, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2362706 entries, 0 to 2362705
Data columns (total 14 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   id              object
 1   submitter       object
 2   authors         object
 3   title           object
 4   comments        object
 5   journal-ref     object
 6   doi             object
 7   report-no       object
 8   categories      object
 9   license         object
 10  abstract        object
 11  versions        object
 12  update_date     object
 13  authors_parsed  object
dtypes: object(14)
memory usage: 252.4+ MB


## Computer Science Research Papers Dataset

In [10]:
# Root folder of the CS research papers dataset (relative to the working directory);
# each month's spreadsheets live in their own "YYYY-MM" sub-folder.
cs_dataset_dir = os.path.join(os.getcwd(), "datasets", "articles", "CS_dataset")
cs_dataset_oct, cs_dataset_nov = (
    os.path.join(cs_dataset_dir, month_folder)
    for month_folder in ("2023-10", "2023-11")
)

### October & November dataset

In [11]:
# read each excel file from the directory and combine them into one dataframe
def read_and_concat(directory):
    """Read every Excel workbook in ``directory`` into a single DataFrame.

    Files are selected by extension (``.xls`` / ``.xlsx``, case-insensitive)
    and read in sorted file-name order so the row order of the result is
    reproducible from run to run.

    Parameters
    ----------
    directory : str
        Path of the folder to scan (sub-folders are not searched).

    Returns
    -------
    pandas.DataFrame
        All workbooks concatenated with a fresh 0..n-1 index, or an empty
        DataFrame when the folder contains no matching files.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` if ``directory`` cannot be listed.
    """
    frames = []
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for file_name in sorted(os.listdir(directory)):
        # "~$name.xlsx" files are lock files Excel creates while a workbook
        # is open; they carry the .xlsx extension but are not workbooks.
        if file_name.startswith("~$"):
            continue
        if file_name.lower().endswith((".xls", ".xlsx")):
            frames.append(pd.read_excel(os.path.join(directory, file_name)))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Load October first, then November, each into its own DataFrame.
df_oct, df_nov = (
    read_and_concat(month_dir) for month_dir in (cs_dataset_oct, cs_dataset_nov)
)

# Preview the first rows of both frames, separated by a row of asterisks.
print("DataFrame for October:", df_oct.head(), '*' * 70, sep="\n")
print("\nDataFrame for November:", df_nov.head(), sep="\n")


DataFrame for October:
   Unnamed: 0                id  \
0           0  arXiv:2310.03744   
1           1  arXiv:2310.03743   
2           2  arXiv:2310.03742   
3           3  arXiv:2310.03740   
4           4  arXiv:2310.03739   

                                               title  \
0  Improved Baselines with Visual Instruction Tun...   
1  The Un-Kidnappable Robot: Acoustic Localizatio...   
2  A High-Performance Design, Implementation, Dep...   
3  ContactGen: Generative Contact Modeling for Gr...   
4  Aligning Text-to-Image Diffusion Models with R...   

                    title_zh  \
0              通过可视化指令调整改进基线   
1          不可绑架的机器人：偷窥者的声学定位   
2  Slim Fly网络的高性能设计、实现、部署和评估   
3   ContactGen：用于抓取生成的生成接触建模   
4       使用奖励反向传播对齐文本到图像的扩散模型   

                                             authors  \
0  \nAuthors:\nHaotian Liu, \n\nChunyuan Li, \n\n...   
1  \nAuthors:\nMengyu Yang, \n\nPatrick Grady, \n...   
2  \nAuthors:\nNils Blach, \n\nMaciej Besta, \n\n...   
3  \nAuthors

### DataFrames Visualization

In [12]:
# Summarise the October frame: entry count, per-column non-null counts, dtypes and memory usage.
df_oct.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9429 entries, 0 to 9428
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     9429 non-null   int64 
 1   id             9429 non-null   object
 2   title          9429 non-null   object
 3   title_zh       9429 non-null   object
 4   authors        9429 non-null   object
 5   url            9429 non-null   object
 6   subjects       9429 non-null   object
 7   subject_split  9429 non-null   object
dtypes: int64(1), object(7)
memory usage: 589.4+ KB


In [13]:
# Summarise the November frame: entry count, per-column non-null counts, dtypes and memory usage.
df_nov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4110 entries, 0 to 4109
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     4110 non-null   int64 
 1   id             4110 non-null   object
 2   title          4110 non-null   object
 3   title_zh       4110 non-null   object
 4   authors        4110 non-null   object
 5   url            4110 non-null   object
 6   subjects       4110 non-null   object
 7   subject_split  4110 non-null   object
dtypes: int64(1), object(7)
memory usage: 257.0+ KB


In [14]:
# Show a single October record to inspect the raw content of every column.
df_oct.head(1)

Unnamed: 0.1,Unnamed: 0,id,title,title_zh,authors,url,subjects,subject_split
0,0,arXiv:2310.03744,Improved Baselines with Visual Instruction Tun...,通过可视化指令调整改进基线,"\nAuthors:\nHaotian Liu, \n\nChunyuan Li, \n\n...",https://arxiv.org/pdf/2310.03744.pdf,\nSubjects: Computer Vision and Pattern Recogn...,['Computer Vision and Pattern Recognition (cs....


## Preprocess Dataset - combine and remove unused data

In [25]:
# combine the October + November datasets into one frame with a fresh index
cs_df = pd.concat([df_oct, df_nov], ignore_index=True)

# Columns that are not needed downstream:
#   * "Unnamed: N" - the entry number stored as the first spreadsheet column.
#     It is matched by name instead of by position, so a real column (e.g. `id`)
#     is never dropped by accident if a file lacks that column.
#   * "title_zh"   - the title in Chinese.
#   * "subjects"   - the keywords as one raw string.
unused_columns = [col for col in cs_df.columns if str(col).startswith("Unnamed:")]
unused_columns += ["title_zh", "subjects"]

# errors="ignore" keeps the cell working when a column is already absent.
cs_df = cs_df.drop(columns=unused_columns, errors="ignore")

In [26]:
# Print a heading followed by the first five rows of the combined frame.
print("Combined DataFrame:", cs_df.head(), sep="\n")

Combined DataFrame:
                 id                                              title  \
0  arXiv:2310.03744  Improved Baselines with Visual Instruction Tun...   
1  arXiv:2310.03743  The Un-Kidnappable Robot: Acoustic Localizatio...   
2  arXiv:2310.03742  A High-Performance Design, Implementation, Dep...   
3  arXiv:2310.03740  ContactGen: Generative Contact Modeling for Gr...   
4  arXiv:2310.03739  Aligning Text-to-Image Diffusion Models with R...   

                                             authors  \
0  \nAuthors:\nHaotian Liu, \n\nChunyuan Li, \n\n...   
1  \nAuthors:\nMengyu Yang, \n\nPatrick Grady, \n...   
2  \nAuthors:\nNils Blach, \n\nMaciej Besta, \n\n...   
3  \nAuthors:\nShaowei Liu, \n\nYang Zhou, \n\nJi...   
4  \nAuthors:\nMihir Prabhudesai, \n\nAnirudh Goy...   

                                    url  \
0  https://arxiv.org/pdf/2310.03744.pdf   
1  https://arxiv.org/pdf/2310.03743.pdf   
2  https://arxiv.org/pdf/2310.03742.pdf   
3  https://arxiv.org/pdf/2