In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
data = 'libraries'
_DATA_DIR = '../data/' + data

## 1. Load data

In [3]:
df = pd.read_json(_DATA_DIR + '/code2lib.json')
df.head()

Unnamed: 0,file,method_name,references
0,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,read_file,[os.path.join]
1,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,setup_method,"[flexmock.flexmock.should_receive.and_return, ..."
2,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_ncaaf_integration_returns_correct_attribu...,[sportsipy.ncaaf.teams.Teams]
3,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_ncaaf_integration_dataframe_returns_dataf...,"[pandas.DataFrame, sportsipy.ncaaf.teams.Teams..."
4,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_ncaaf_integration_all_teams_dataframe_ret...,[sportsipy.ncaaf.teams.Teams.dataframes.drop_d...


In [4]:
def get_project_name(path):
    s = path.split('/')
    return f"{s[5]}--{s[6]}"

In [5]:
# Get unique libraries
df['libraries'] = df.references.map(lambda x: list(set([s.split('.')[0] for s in x])))
df['references'] = df['references'].apply(lambda x: list(set(x))) # Dont store duplicates
df['project'] = df.file.map(lambda x: get_project_name(x))
df.head()

Unnamed: 0,file,method_name,references,libraries,project
0,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,read_file,[os.path.join],[os],roclark--sportsipy.git
1,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,setup_method,"[sportsipy.ncaaf.teams.Teams, flexmock.flexmoc...","[flexmock, sportsipy]",roclark--sportsipy.git
2,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_ncaaf_integration_returns_correct_attribu...,[sportsipy.ncaaf.teams.Teams],[sportsipy],roclark--sportsipy.git
3,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_ncaaf_integration_dataframe_returns_dataf...,"[pandas.DataFrame, pandas.concat.drop_duplicat...","[pandas, sportsipy]",roclark--sportsipy.git
4,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_ncaaf_integration_all_teams_dataframe_ret...,[sportsipy.ncaaf.teams.Teams.dataframes.drop_d...,[sportsipy],roclark--sportsipy.git


## 2. Briefly inspect the data

In [6]:
stats = pd.DataFrame(np.concatenate(df.libraries.values).ravel(), columns=['library'])
stats.head()
stats['count'] = stats.groupby('library')['library'].transform('count')
stats = stats.drop_duplicates(subset=['library']).sort_values(by=['count'], ascending=False).reset_index(drop=True)
stats

Unnamed: 0,library,count
0,numpy,52846
1,tensorflow,20035
2,torch,19808
3,os,13844
4,sklearn,10199
...,...,...
4469,fairseq_cli,1
4470,yolov3,1
4471,grequests,1
4472,score_comb,1


In [7]:
imports = np.concatenate(df.references.values).ravel()
print(f"Total number of methods {len(df)}\nNumber of unique method names {len(np.unique(df.method_name))}")
print(f"Total number of libraries {len(stats)}\nTotal number of import references {len(imports)}\nTotal number of unique import references {len(np.unique(imports))}")

Total number of methods 265346
Number of unique method names 146969
Total number of libraries 4474
Total number of import references 647050
Total number of unique import references 119143


## 3. Preprocess

In [8]:
# Aggregate libraries by project name
project_libraries = df.groupby(['project'])['libraries'].sum().map(lambda x: list(set(x)))
project_libraries

project
0x454447415244--HandwritingRecognitionSystem.git                                         [tensorflow]
100--Solid.git                                                           [collections, random, numpy]
1033020837--Basic4AI.git                                  [sklearn, numpy, pandas, matplotlib, torch]
AFAgarap--cnn-svm.git                               [sys, tensorflow_datasets, tensorflow, argpars...
AIHunters--AcurusTrack.git                          [cv2, pandas, single_shot, pipeline, timeit, c...
                                                                          ...                        
zalandoresearch--fashion-mnist.git                  [sklearn, collections, datetime, numpy, config...
zalandoresearch--pytorch-dilated-rnn.git                             [drnn, collections, torch, time]
zhuzilin--NP_ML.git                                                        [scipy, matplotlib, numpy]
zqhZY--semanaly.git                                                       

In [9]:
# Check frequency of libraries
df_proj_libs = pd.DataFrame(np.concatenate(project_libraries).ravel(), columns=['library'])
df_proj_libs['count'] = df_proj_libs.groupby('library')['library'].transform('count')
df_proj_libs = df_proj_libs.drop_duplicates(subset=['library']).sort_values(by=['count'], ascending=False).reset_index(drop=True)
df_proj_libs

Unnamed: 0,library,count
0,numpy,788
1,os,614
2,collections,436
3,time,393
4,sklearn,349
...,...,...
4469,signedsageconvolution,1
4470,sgcn,1
4471,simgnn,1
4472,splitter,1


In [10]:
# Get all libraries when are referenced > 1 (i.e. not project specific)
shared_project_libraries = df_proj_libs[df_proj_libs['count'] > 1]
print(f"{len(shared_project_libraries)} out of {len(stats)} declared libraries are project specific ({len(shared_project_libraries) / len(stats):.2f}%)")
shared_libs = list(shared_project_libraries.library)
shared_project_libraries

1057 out of 4474 declared libraries are project specific (0.24%)


Unnamed: 0,library,count
0,numpy,788
1,os,614
2,collections,436
3,time,393
4,sklearn,349
...,...,...
1052,tokens,2
1053,glfw,2
1054,flasgger,2
1055,log_utils,2


In [11]:

def camel_case_split(identifier, joinToken):
    matches = re.finditer(
        '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)',
        identifier,
    )
    return f'{joinToken}'.join([m.group(0).lower() for m in matches])

def snake_case_split(identifier, joinToken):
    return f'{joinToken}'.join([x for x in identifier.split('_') if x != ''])

In [12]:
df_processed = df.copy()
# Drop all library references that are project specific
df_processed['references'] = df_processed['references'].apply(lambda x: [s for s in x if s.split('.')[0] in shared_libs])
df_processed['libraries'] = df_processed['libraries'].apply(lambda x: [s for s in x if s in shared_libs])
# Some rows may now include no references - drop these
df_processed = df_processed[~df_processed.references.str.len().eq(0)]
df_processed['references'] = df_processed['references'].apply(lambda x: ",".join(x))

# split camel/snake case method names
df_processed['method_name'] = df_processed.method_name.map(lambda x: snake_case_split(x, '|'))
df_processed['method_name'] = df_processed.method_name.map(lambda x: camel_case_split(x, '|'))

df_processed

Unnamed: 0,file,method_name,references,libraries,project
0,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,read|file,os.path.join,[os],roclark--sportsipy.git
1,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,setup|method,flexmock.flexmock.should_receive.and_return,[flexmock],roclark--sportsipy.git
3,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test|ncaaf|integration|dataframe|returns|dataf...,"pandas.DataFrame,pandas.concat.drop_duplicates",[pandas],roclark--sportsipy.git
5,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test|ncaaf|empty|page|returns|no|teams,"flexmock.flexmock.should_receive.and_return,fl...",[flexmock],roclark--sportsipy.git
9,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test|invalid|default|year|reverts|to|previous|...,flexmock.flexmock.should_receive.and_return,[flexmock],roclark--sportsipy.git
...,...,...,...,...,...
265339,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,generate|test|data,numpy.random.seed,[numpy],SeldonIO--alibi.git
265340,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,init,numpy.random.seed,[numpy],SeldonIO--alibi.git
265341,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,mock|kmeans,sklearn.utils.resample,[sklearn],SeldonIO--alibi.git
265342,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,call,shap.utils._legacy.DenseData,[shap],SeldonIO--alibi.git


In [13]:
tokens = df_processed['references'].str.split(',').values

num_tokens_per_method = [len(l) for l in tokens]
print(f"Max number of tokens in method {np.max(num_tokens_per_method)}\nMin number of tokens in a method {np.min(num_tokens_per_method)}\nAverage number of tokens per method {np.mean(num_tokens_per_method):.2f}")

Max number of tokens in method 74
Min number of tokens in a method 1
Average number of tokens per method 2.33


In [14]:
tokens

array([list(['os.path.join']),
       list(['flexmock.flexmock.should_receive.and_return']),
       list(['pandas.DataFrame', 'pandas.concat.drop_duplicates']), ...,
       list(['sklearn.utils.resample']),
       list(['shap.utils._legacy.DenseData']), list(['numpy.sum'])],
      dtype=object)

In [15]:
print(f"Number of unique method names: {len(df_processed)}\nNumber of unique tokens {len(np.unique(tokens))}")

Number of unique method names: 220486
Number of unique tokens 96436


In [16]:
df_processed = df_processed.method_name.to_frame().merge(df_processed.references, left_index=True, right_index=True)

## 4. Partition into sets

In [17]:
train, test = train_test_split(df_processed, test_size=0.2, shuffle=True)
print(f"{len(train)} train samples\n{len(test)} test samples")
train

176388 train samples
44098 test samples


Unnamed: 0,method_name,references
62945,test|print|tensor|dtype,"paddle.rand,paddle.disable_static,paddle.enabl..."
106230,test|pow,"pyro.distributions.Uniform,pyro.distributions...."
99389,test|op|grad,trax.tf_numpy.jax_tests.test_util
201657,get|top|n,numpy.argsort
159796,call,"tensorflow,tensorflow.concat"
...,...,...
16807,test|txt,numpy.loadtxt
119508,walk,dm_control.rl.control.Environment
120701,test|init,mock.MagicMock
262324,test|extract|box|classifier|features|returns|e...,"tensorflow.shape,tensorflow.global_variables_i..."


## 5. Save

In [18]:
import csv
train.to_csv(_DATA_DIR+'/train.csv', encoding='utf-8', sep=" ", index=False, header=None, quoting = csv.QUOTE_NONE, escapechar = ' ')
test.to_csv(_DATA_DIR+'/test.csv', encoding='utf-8', sep=" ", index=False, header=None, quoting = csv.QUOTE_NONE, escapechar = ' ')