In [1]:
import os
import sys
import time
import math
import json
import subprocess
import pandas as pd

In [22]:
#-------------------------------------------------
# using the csv file of CRITICALITY_SCORE and searching for 
# repos from 200_repos; this excludes new/current projects
# download: https://storage.cloud.google.com/ossf-criticality-score
# UPDATE: looks like a monthly run is published here (wish I had known!)
#-------------------------------------------------

# Load the 200-repo sheet from '200_repos.csv' into DataFrame df.
# Empty cells come back from pandas as NaN.
df = pd.read_csv(filepath_or_buffer='200_repos.csv')

In [None]:
# small slices of df for quick test runs
# .copy() gives independent frames so later assignments do not touch df
df10 = df.head(10).copy()
df33 = df.head(33).copy()

In [23]:
# Look up the published criticality score for every GitHub repo in df.
# Result: dfs gains a 'criticality' column (NaN where no score was found)
# and num holds the number of repos that were matched.
dfs = df[['CMC_id', 'source_code', 'forge']].copy()
dfc = pd.read_csv('project_criticality_all.csv')

# Build a url -> score lookup once instead of rescanning dfc for every repo.
# setdefault keeps the FIRST row per url, which is the row the previous
# scan-and-break loop would have matched.
score_by_url = {}
for crit_url, crit_score in zip(dfc['url'], dfc['criticality_score']):
    score_by_url.setdefault(crit_url, crit_score)

num = 0
for row in dfs.itertuples():
    # only search if github
    if row.forge == 'github':
        # only search for strings; floats (NaN) are skipped
        if isinstance(row.source_code, str):
            url = str(row.source_code)
            if url in score_by_url:
                dfs.at[row.Index, 'criticality'] = score_by_url[url]
                num += 1
            # one progress dot per repo that was searched
            sys.stdout.write(".")
            sys.stdout.flush()
print(str(num), 'criticality scores found and updated')

........................................................................................................................................................................66 criticality scores found and updated


In [26]:
# update MERGED sheet with new data
# 'CMC_id' is the key; drop 'source_code' and 'forge' before the merge
# to prevent duplicate columns (df already carries both).
# errors='ignore' keeps this cell re-runnable: on a second run the columns
# are already gone and a plain drop would raise KeyError.
dfs = dfs.drop(columns=['source_code', 'forge'], errors='ignore')
dfm = pd.merge(df, dfs, on=['CMC_id'], how='outer')

In [28]:
# persist the score-only sheet and the merged sheet, without the index column
for frame, path in ((dfs, '200_crit.csv'), (dfm, '200_merged.csv')):
    frame.to_csv(path, encoding='utf-8', index=False)

In [None]:
#-------------------------------------------------
# directly call CRITICALITY_SCORE
# require github token and command line access
#-------------------------------------------------
#
# >> repo: https://github.com/ossf/criticality_score
# 0. make sure the github access token is exported as an environment variable (see methodology notes)
# 1. install: pip3 install criticality-score
# 2. check PATH: WARNING: The script criticality_score is installed in '/home/user/.local/bin' which is not on PATH.
# >> export PATH="/home/user/.local/bin:$PATH"
# 3. get 'GITHUB_AUTH_TOKEN' and export path on command line or set env variable in jupyter
# 
# Set the environment variable 'GITHUB_AUTH_TOKEN'
# (this is a short-cut; in the future look into pycrosskit)
# >key = 'GITHUB_AUTH_TOKEN'
# >os.environ[key] = 'secret'
#
# read out the value
# >value = os.getenv(key)
# >print("Value of 'GITHUB_AUTH_TOKEN' environment variable :", value) 
#
# 4. run: criticality_score --repo https://github.com/bitcoin/bitcoin
#
'''     
['name: bitcoin',
 'url: https://github.com/bitcoin/bitcoin',
 'language: C++',
 'created_since: 142',
 'updated_since: 0',
 'contributor_count: 961',
 'org_count: 4',
 'commit_frequency: 54.8',
 'recent_releases_count: 3',
 'updated_issues_count: 1920',
 'closed_issues_count: 1467',
 'comment_frequency: 2.7',
 'dependents_count: 348588',
 'criticality_score: 0.86651']
'''

In [45]:
# Reload the repo sheet and keep only the columns the scorer needs:
# the 'source_code' location and the 'forge' it is hosted on.
df = pd.read_csv(filepath_or_buffer='200_repos.csv')
df_in = df.loc[:, ['source_code', 'forge']].copy()
# subset dataframes for testing
#df_in = df_in.iloc[96:101].copy()
#df33 = dfs.iloc[:33].copy()

In [44]:
# ------------------------------------------------
# dfParse builds a dataFrame using bash output
# ------------------------------------------------
def dfParse(output, firstUpdate, dataframe):
    """Turn one criticality_score run into a DataFrame row and accumulate it.

    Parameters
    ----------
    output : list of str
        Captured command-line output, one 'metric: value' entry per line
        (e.g. 'name: bitcoin').
    firstUpdate : bool
        True on the first call; the parsed row then becomes the result and
        `dataframe` is ignored.
    dataframe : pandas.DataFrame
        Rows collected so far; the parsed row is appended to it.

    Returns
    -------
    pandas.DataFrame
        One column per metric, named exactly as printed (trailing colon
        included, e.g. 'url:'), values kept as strings. `dataframe` is
        returned unchanged when the output is empty or starts with a
        traceback.
    """
    lines = [str(line) for line in output]

    # nothing to parse: the command printed no output at all
    if not lines:
        return dataframe

    # catch a possible traceback
    if 'Traceback' in lines[0]:
        print('found traceback')
        return dataframe

    # Split each line once, on the first run of whitespace, so values that
    # themselves contain spaces (e.g. 'language: Jupyter Notebook') stay whole.
    record = {}
    for line in lines:
        parts = line.split(None, 1)
        if not parts:
            continue  # blank line
        record[parts[0]] = parts[1].strip() if len(parts) > 1 else None

    row = pd.DataFrame([record])

    # The first row starts the table; this also covers a caller that has no
    # real DataFrame to extend yet.
    if firstUpdate or not isinstance(dataframe, pd.DataFrame):
        return row

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    return pd.concat([dataframe, row], ignore_index=True)

# ------------------------------------------------
# variables
# ------------------------------------------------
start = time.time()
total = 0
updated = 0
firstUpdate = True
# an empty frame (an instance, not the DataFrame class) so df_out is still a
# usable DataFrame downstream when no repo could be scored
df_out = pd.DataFrame()
# ------------------------------------------------
# main loop requires dataFrame: 'df_in'
#                     returns: 'df_out'
# df_out does not have CMC_id and some will be missed;
# should be able to merge back on 'url:'='source_code'
# ------------------------------------------------
# This takes a while, criticality_score has a built-in
# rate limiter for handling github API limit.
# ------------------------------------------------
for row in df_in.itertuples():
    # proceed if github and source_code is a string and not private
    if (row.forge == 'github') and isinstance(row.source_code, str) and (row.source_code != 'private'):
        # Pass the repo as a separate argument instead of building a shell
        # line, so nothing in the CSV value is interpreted by a shell.
        # stderr is folded into stdout so a traceback is visible to dfParse.
        try:
            proc = subprocess.run(
                ['criticality_score', '--repo', row.source_code.strip()],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
            )
            output = proc.stdout.splitlines()
        except OSError:
            # command could not be started; count the repo as missing
            output = []
        # if first element is ['name':'bitcoin'], output is as expected, can parse
        # (the emptiness check avoids an IndexError when nothing was printed)
        if output and 'name' in output[0]:
            df_out = dfParse(output, firstUpdate, df_out)
            firstUpdate = False
            updated += 1
    total += 1
    sys.stdout.write(".")
    sys.stdout.flush()

# log some output with a timer
print('\n',str(df_in.shape[0]), 'total projects evaluated\n', 
      str(updated), 'criticality scores updated\n', 
      str(total - updated), 'repos private or missing\n', 
      'Total time elapsed:', round((time.time() - start)/60, 1), 'minutes')


.....
 5 total projects evaluated
 1 criticality scores updated
 4 repos private or missing
 Total time elapsed: 0.1 minutes


In [None]:
# write out new data (the parsed scores are in df_out; df_in only has 'source_code' and 'forge')
#df_out.to_csv('crit_output_all.csv', encoding='utf-8', index = 0)
#dft = df_out[['url:','criticality_score:']].copy()
#dft.to_csv('200_crit.csv', encoding='utf-8', index = 0)

In [None]:
# Display the input frame (the 'source_code' and 'forge' columns selected above).
# NOTE(review): the parsed criticality scores are collected in df_out, not
# df_in — confirm which frame was meant to be inspected here.
df_in