### Restructuring data to create an efficient dataframe

In [1]:
# Import the required packages
import json
from tqdm import tqdm
import requests
from random import choice
import pandas as pd

In [2]:
# Load the CSV produced by '03_Get_metadata.ipynb' into the working dataframe.
# NOTE(review): this is an absolute, machine-specific path.
df = pd.read_csv(
    filepath_or_buffer='/home/sreenath_a/Projects/OSS/nsf-oss/Data_Collection/Julia_data_collection/Julia_Packages_List_Repo_Deps.csv'
)

In [3]:
# Function to get information like name, dependencies and authors from the json metadata
def get_info(j):
    """Extract name, dependencies and authors from one JSON metadata string.

    Parameters
    ----------
    j : str
        JSON text describing a package. Non-text values (for example the NaN
        pandas uses for a missing cell, or None) are treated as "no metadata".

    Returns
    -------
    dict
        Keys 'name', 'deps' and 'authors' (in that order). 'deps' and
        'authors' are ", "-joined strings; any missing entry is "".

    Raises
    ------
    json.JSONDecodeError
        If `j` is text but not valid JSON.
    """
    info = {'name': "", 'deps': "", 'authors': ""}

    # A missing cell arrives as NaN/None; json.loads would raise TypeError on it.
    if not isinstance(j, (str, bytes, bytearray)):
        return info

    d = json.loads(j)
    if not isinstance(d, dict):
        return info

    info['name'] = d.get('name', "")

    # `or {}` also covers an explicit JSON null. Iterating a dict yields its
    # keys, so this matches the previous `.keys()` behaviour.
    info['deps'] = ", ".join(d.get('deps') or {})

    authors = d.get('authors') or []
    # A bare string would otherwise be joined character by character.
    if isinstance(authors, str):
        authors = [authors]
    info['authors'] = ", ".join(authors)

    return info

In [4]:
# Run get_info() over every entry of the 'Metadata' column, keeping the
# resulting dicts in a single list in row order
parsed = list(map(get_info, df['Metadata']))

In [5]:
# Split the parsed records into three parallel lists: names, dependencies, authors
pname, deps, authors = (
    [record[field] for record in parsed]
    for field in ('name', 'deps', 'authors')
)

In [6]:
# Store the dependency and author strings as new dataframe columns
df['dependencies'], df['authors'] = deps, authors

In [7]:
# Preview the first five rows, including the two new columns
df.head(n=5)

Unnamed: 0,Package,RepoURL,Metadata,dependencies,authors
0,REPLTreeViews,https://github.com/pfitzseb/REPLTreeViews.jl.git,"{""name"": ""REPLTreeViews"", ""uuid"": ""00000000-11...","IterTools, REPL, TreeViews",
1,TuringGLM,https://github.com/TuringLang/TuringGLM.jl.git,"{""name"": ""TuringGLM"", ""uuid"": ""0004c1f4-53c5-4...","Distributions, GLM, LazyArrays, LinearAlgebra,...","Jose Storopoli <jose@storopoli.io>, Rik Huijze..."
2,BayesHistogram,https://github.com/francescoalemanno/BayesHist...,"{""name"": ""BayesHistogram"", ""uuid"": ""000d9b38-6...",,Francesco Alemanno <francescoalemanno710[at]gm...
3,KCP_jll,https://github.com/JuliaBinaryWrappers/KCP_jll...,"{""name"": ""KCP_jll"", ""uuid"": ""000eeb74-f857-587...","Pkg, Libdl",
4,ProgressView,https://github.com/eschnett/ProgressView.jl.git,"{""name"": ""ProgressView"", ""uuid"": ""001e2b2b-9a5...",Test,Erik Schnetter <schnetter@gmail.com>


In [8]:
# Sample repository URL (same value as the RepoURL of the first row shown above).
# NOTE(review): open question - is the goal of the following cells to scrape further
# per-repository information? (README, license and downloads are being extracted.)
u = "https://github.com/pfitzseb/REPLTreeViews.jl.git"

In [9]:
# Load the pool of GitHub API tokens that get_license() picks from.
# NOTE(review): where does API_keys.json come from? Its provenance is not
# documented; the code only shows that it is a JSON object with a 'keys' entry.
with open('/home/sreenath_a/Projects/OSS/API_keys.json', 'r', encoding='utf-8') as myfile:
    data = myfile.read()
keys = json.loads(data)['keys']

# Fail fast here: an empty pool would otherwise only surface later as an
# IndexError from random.choice() inside get_license().
if not keys:
    raise ValueError("API_keys.json has no entries under 'keys'")

In [10]:
def get_license(u, timeout=30):
    """Return the SPDX license id GitHub reports for a repository.

    Parameters
    ----------
    u : str
        Repository URL whose last two path segments are the owner and the
        repository name, e.g. ``https://github.com/<owner>/<repo>.git``.
    timeout : float, optional
        Seconds to wait for the GitHub API before giving up.

    Returns
    -------
    str
        The ``spdxId`` from the repository's ``licenseInfo``. Returns "" when
        the URL cannot be parsed, the request fails, or no license is reported.

    Notes
    -----
    Uses the notebook-level ``keys`` sequence and picks one token at random
    per call. An empty ``keys`` still raises IndexError from ``choice``.
    """
    try:
        parts = u.rstrip("/").split("/")
        owner, repo = parts[-2], parts[-1]
    except (AttributeError, IndexError):
        # Non-string input (e.g. NaN for a missing URL) or a URL without a path.
        return ""

    # Strip the clone suffix only when present, rather than slicing blindly.
    if repo.endswith(".git"):
        repo = repo[:-len(".git")]

    api_token = choice(keys)
    url = 'https://api.github.com/graphql'
    headers = {'Authorization': 'token %s' % api_token}

    # Owner and name are passed as GraphQL variables, so quotes or other
    # special characters in them cannot break the query text.
    query = """
    query($owner: String!, $name: String!) {
      repository(owner: $owner, name: $name) {
        licenseInfo {
          name
          spdxId
        }
      }
    }
    """
    payload = {'query': query, 'variables': {'owner': owner, 'name': repo}}

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        data = response.json()
        # `repository` or `licenseInfo` may be null; subscripting None raises
        # TypeError, which is handled below.
        return data['data']['repository']['licenseInfo']['spdxId'] or ""
    except (requests.RequestException, KeyError, TypeError, ValueError):
        # Narrow handling keeps Ctrl-C (KeyboardInterrupt) working during the
        # long per-row run, which a bare `except:` would swallow.
        return ""

In [11]:
# Look up the license of every repository and store it in a 'license' column
df['license'] = df['RepoURL'].map(get_license)

In [12]:
def get_readme(u, branches=("master", "main"), timeout=30):
    """Download the README.md of a GitHub repository as text.

    Parameters
    ----------
    u : str
        Repository URL whose last two path segments are the owner and the
        repository name, e.g. ``https://github.com/<owner>/<repo>.git``.
    branches : sequence of str, optional
        Branch names to try, in order. "master" is tried first, as before;
        "main" is added so repositories using that branch are no longer missed.
    timeout : float, optional
        Seconds to wait for each HTTP request.

    Returns
    -------
    str
        The README text from the first branch that answers with HTTP 200.
        Returns "" if none does, the URL cannot be parsed, or a request fails.
    """
    try:
        parts = u.rstrip("/").split("/")
        owner, repo = parts[-2], parts[-1]
        # Strip the clone suffix only when present, rather than slicing blindly.
        if repo.endswith(".git"):
            repo = repo[:-len(".git")]

        for branch in branches:
            url = "https://raw.githubusercontent.com/{user}/{repo_name}/{branch}/README.md".format(
                user=owner, repo_name=repo, branch=branch
            )
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response.text
        return ""
    except (requests.RequestException, AttributeError, IndexError):
        # AttributeError covers non-string input such as NaN. Narrow handling
        # keeps Ctrl-C working during the long download loop.
        return ""

In [13]:
# Fetch the README of every repository, in row order, with a progress bar
readme = [get_readme(rurl) for rurl in tqdm(df['RepoURL'].tolist())]

100%|██████████| 8857/8857 [21:27<00:00,  6.88it/s]  


In [14]:
# Attach the fetched README texts as a new 'Readme' column
# (the list was built in the row order of df['RepoURL'])
df['Readme'] = readme

In [15]:
def get_downloads(Package, timeout=30):
    """Query the pkgs.genieframework.com badge API for a package.

    Parameters
    ----------
    Package : str
        Package name, inserted into the badge endpoint URL.
    timeout : float, optional
        Seconds to wait for the HTTP request.

    Returns
    -------
    int
        The badge's ``message`` field converted to int (presumably the
        download count - confirm against the API). Returns -1 when the request
        fails, the status is not 200, or the payload cannot be converted.
    """
    try:
        url = f"https://pkgs.genieframework.com/api/v1/badge/{Package}"
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return -1
        return int(response.json()['message'])
    except (requests.RequestException, KeyError, TypeError, ValueError):
        # Narrow handling keeps Ctrl-C (KeyboardInterrupt) working, which a
        # bare `except:` would swallow.
        return -1

In [16]:
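# NOTE(review): download collection is disabled (commented out), so get_downloads() is not called in the cells shown here.
# If re-enabled, rename the loop variable: `pname` would overwrite the list of package names built earlier.
# downloads = list()
# for pname in tqdm(df['Package'].tolist()):
#     downloads.append(get_downloads(pname))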

In [17]:
# df['Downloads'] = downloads

In [20]:
# Persist the enriched dataframe as an Excel workbook.
# index=False: the frame already has an 'Unnamed: 0' column (see the df.head()
# output above). Writing the index too would add a second unnamed column when
# the workbook is read back.
# NOTE(review): Excel limits a cell to 32,767 characters, so very long README
# texts may not survive this round-trip - confirm.
df.to_excel(
    '/home/sreenath_a/Projects/OSS/nsf-oss/Data_Collection/Julia_data_collection/Julia_Packages_all.xlsx',
    engine='xlsxwriter',
    index=False,
)

In [3]:
# Reload the saved workbook into the dataframe
df = pd.read_excel(
    io='/home/sreenath_a/Projects/OSS/nsf-oss/Data_Collection/Julia_data_collection/Julia_Packages_all.xlsx'
)

In [4]:
# Export the dataframe to CSV without writing the index column
df.to_csv(
    path_or_buf='/home/sreenath_a/Projects/OSS/nsf-oss/Data_Collection/Julia_data_collection/Julia_Packages_all.csv',
    index=False,
)