## Import libraries

In [None]:
!pip install PyGithub requests

In [None]:
import functools as ft
import os
import re
from getpass import getpass

import numpy as np
import pandas as pd
import requests
from github import Github

In [None]:
#Mount Google Drive in the Colab runtime; the datasets are read from it below
from google.colab import drive
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## Import datasets

To answer our research questions, we used the Papers with Code platform, which provides free and open Machine Learning papers, code, datasets, methods and evaluation tables. From its website we downloaded three datasets, which are hosted in the About section:
- Papers with abstracts 
- Links between papers and code 
- Methods

All data is licensed under the CC BY-SA license. 
To reproduce the results, the data used in the thesis can be accessed at https://doi.org/10.5281/zenodo.6788250


In [None]:
#Load abstracts
abstracts_path = "/content/drive/MyDrive/readjson/Papers_with_abstracts.json"
abstracts = (
    pd.read_json(abstracts_path)
    #Join papers title and abstract. assign() adds the column to the freshly
    #loaded frame; assigning it on a column subset (as before) raised
    #SettingWithCopyWarning.
    .assign(text=lambda papers: papers['title'] + ' ' + papers['abstract'])
    #Keep only paper url and text
    [['paper_url', 'text']]
)

In [None]:
#Read the links between papers and code, keeping just the paper url and the
#repository url
links_path = "/content/drive/MyDrive/readjson/Links_between_papers_and_code.json"
github = pd.read_json(links_path)[['paper_url', 'repo_url']]

In [None]:
#Read the methods dataset, which carries the label information, keeping the
#three columns used below
methods_path = "/content/drive/MyDrive/readjson/Methods.json"
label = pd.read_json(methods_path)[['url', 'paper', 'collections']]

## Treat multilabeled instances

In [None]:
#Show one multilabeled instance: the entry at position 33 of the collections
#column
paper_data = label['collections'].tolist()
paper_data[33]

[{'area': 'Reinforcement Learning',
  'area_id': 'reinforcement-learning',
  'collection': 'Environment Design Methods'},
 {'area': 'General',
  'area_id': 'general',
  'collection': 'Adversarial Training'}]

First, for each observation, we collect the paper's URL and its corresponding labels.

In [None]:
paper_data = list(label['paper'])
label_data = list(label['collections'])
paper_url = []
categ = []
#For each instance in the dataframe
for paper, collections in zip(paper_data, label_data):
  #Extract paper url from paper column; instances without a paper get None.
  #(The previous else-branch also appended to a `title` list that was never
  #defined, which raised NameError on the first instance without a paper.)
  paper_url.append(paper['url'] if paper is not None else None)
  #Extract labels from collections column
  #Collect labels in multi list in case of multilabelled observations,
  #keeping first-seen order and setting each area only once
  multi = []
  for collection in collections:
    if collection['area'] not in multi:
      multi.append(collection['area'])
  categ.append(multi)
#Attach the new columns with assign() instead of item assignment: `label` is a
#column subset of the loaded frame, so writing into it in place raises
#SettingWithCopyWarning. Then keep only the paper url and its labels.
label = label.assign(paper_url=paper_url, label=categ)[['paper_url', 'label']]

In [None]:
#Merge all three datasets one after another, using paper_url as the key
dfs = [abstracts, github, label]
df_final = dfs[0]
for frame in dfs[1:]:
  df_final = df_final.merge(frame, on='paper_url')

In [None]:
#Give every label of a multilabeled instance its own row and renumber the
#rows from zero, so that each row corresponds to only one label
df_final = df_final.explode('label').reset_index(drop=True)

df_final

Unnamed: 0,paper_url,text,repo_url,label
0,https://paperswithcode.com/paper/sniper-effici...,SNIPER: Efficient Multi-Scale Training We pres...,https://github.com/Hwang64/PSIS,[Computer Vision]
1,https://paperswithcode.com/paper/sniper-effici...,SNIPER: Efficient Multi-Scale Training We pres...,https://github.com/MahyarNajibi/SNIPER,[Computer Vision]
2,https://paperswithcode.com/paper/sniper-effici...,SNIPER: Efficient Multi-Scale Training We pres...,https://github.com/starimpact/arm_SNIPER,[Computer Vision]
3,https://paperswithcode.com/paper/sniper-effici...,SNIPER: Efficient Multi-Scale Training We pres...,https://github.com/PaddlePaddle/PaddleDetection,[Computer Vision]
4,https://paperswithcode.com/paper/hierarchical-...,Hierarchical interpretations for neural networ...,https://github.com/csinva/hierarchical-dnn-int...,[General]
...,...,...,...,...
28647,https://paperswithcode.com/paper/auco-resnet-a...,AUCO ResNet: an end-to-end network for Covid-1...,https://github.com/vincenzodentamaro/aucoresnet,[Audio]
28648,https://paperswithcode.com/paper/recurrent-tre...,Recurrent Trend Predictive Neural Network for ...,https://github.com/mertnakip/Recurrent-Trend-P...,[Sequential]
28649,https://paperswithcode.com/paper/simple-baseli...,Simple Baselines for Image Restoration Althoug...,https://github.com/megvii-research/NAFNet,[Computer Vision]
28650,https://paperswithcode.com/paper/deit-iii-reve...,DeiT III: Revenge of the ViT A Vision Transfor...,https://github.com/facebookresearch/deit,[Computer Vision]


## Get README files from Github API

In [None]:
#Personal access token for accessing public repositories.
#The token is read from the GITHUB_TOKEN environment variable, or asked for
#interactively, instead of being written in the notebook: a token stored in a
#shared notebook is a leaked credential and must be revoked.
#A token can be generated through Developer settings in Github
#For more information refer to Github's documentation
github_token = os.environ.get('GITHUB_TOKEN') or getpass('GitHub personal access token: ')
g = Github(github_token)
readmes = []
#READMEs already downloaded, keyed by repository name. The same repository
#appears in several rows, so reusing a successful download saves API calls.
#Failures are not cached, so every row still gets its own attempt.
readme_cache = {}
#Iterate over the values directly so the loop does not depend on the
#dataframe having a 0..n-1 index
for repo_url in df_final['repo_url']:
  #Access different repositories in dataset and obtain their README files
  try:
    #Strip the host so that only the "owner/name" part is passed to get_repo
    repo_name = re.sub('https://github.com/', '', repo_url)
    if repo_name not in readme_cache:
      #NOTE(review): decoded_content is presumably raw bytes, so the saved
      #CSV would hold its repr - confirm this is what downstream code expects
      readme_cache[repo_name] = g.get_repo(repo_name).get_readme().decoded_content
    readmes.append(readme_cache[repo_name])
  except Exception:
    #Best effort: a repository that cannot be read (missing, no README, API
    #error, malformed url) gets None. Catching Exception rather than using a
    #bare except keeps the loop interruptible with KeyboardInterrupt.
    readmes.append(None)
#Add readmes to our dataset
df_final['readmes'] = readmes

## Save final dataset

In [None]:
#Save final dataset.
#The path is handed to to_csv directly so that pandas opens the file itself;
#a text handle opened without newline='' (as before) can produce doubled line
#endings on platforms that translate newlines.
df_final.to_csv("/content/drive/MyDrive/TFM_FINAL/scrapped_data.csv", encoding='utf-8-sig')