In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
DATASET_NAME = 'libraries_small'
_DATA_DIR = f'../data/{DATASET_NAME}'

## 1. Load data

In [3]:
df = pd.read_json(_DATA_DIR + '/libraries.json')
df.head()

Unnamed: 0,file,method_name,references
0,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,read_file,[os.path.join]
1,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,__init__,[sportsipy.constants.LOSS]
2,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,setup_method,"[flexmock.flexmock.should_receive.and_return, ..."
3,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_nfl_integration_returns_correct_attribute...,[sportsipy.nfl.teams.Teams]
4,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_nfl_integration_dataframe_returns_dataframe,"[pandas.DataFrame, sportsipy.nfl.teams.Teams, ..."


In [4]:
# Optional - filter out rows where the method name doesnt contain any of the chosen subtokens
classes = { 0: 'train', 1: 'save', 2: 'process', 3: 'forward', 4: 'predict' }

df = df[df.method_name.str.contains("|".join(classes.values()))]
df

Unnamed: 0,file,method_name,references
531,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_no_save_returns_none,"[mock.PropertyMock, mock.PropertyMock]"
532,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_save_returns_name,"[mock.PropertyMock, mock.PropertyMock]"
591,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_single_goalies,"[mock.PropertyMock, mock.PropertyMock, mock.Pr..."
592,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_multiple_goalies_empty_field,"[mock.PropertyMock, mock.PropertyMock, mock.Pr..."
593,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_multiple_goalies_empty_field,"[mock.PropertyMock, mock.PropertyMock, mock.Pr..."
...,...,...,...
265114,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,_get_negative_prediction,"[spotlight.sampling.sample_items, spotlight.to..."
265115,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,_get_multiple_negative_predictions,[spotlight.torch_utils.gpu.view]
265116,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,predict,"[spotlight.torch_utils.gpu.train, numpy.atleas..."
265153,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,test_predict_movielens,[spotlight.datasets.movielens.get_movielens_da...


In [5]:
# Assign categories based on method name
df['category'] = df.method_name.map(lambda x: np.array([x.find(s) for s in classes.values()]).argmax())
df

Unnamed: 0,file,method_name,references,category
531,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_no_save_returns_none,"[mock.PropertyMock, mock.PropertyMock]",1
532,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_save_returns_name,"[mock.PropertyMock, mock.PropertyMock]",1
591,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_single_goalies,"[mock.PropertyMock, mock.PropertyMock, mock.Pr...",1
592,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_multiple_goalies_empty_field,"[mock.PropertyMock, mock.PropertyMock, mock.Pr...",1
593,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_multiple_goalies_empty_field,"[mock.PropertyMock, mock.PropertyMock, mock.Pr...",1
...,...,...,...,...
265114,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,_get_negative_prediction,"[spotlight.sampling.sample_items, spotlight.to...",4
265115,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,_get_multiple_negative_predictions,[spotlight.torch_utils.gpu.view],4
265116,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,predict,"[spotlight.torch_utils.gpu.train, numpy.atleas...",4
265153,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,test_predict_movielens,[spotlight.datasets.movielens.get_movielens_da...,4


In [6]:
def get_project_name(path):
    s = path.split('/')
    return f"{s[5]}--{s[6]}"

In [7]:
# Get unique libraries
df['libraries'] = df.references.map(lambda x: list(set([s.split('.')[0] for s in x])))
df['references'] = df['references'].apply(lambda x: list(set(x))) # Dont store duplicates
df['project'] = df.file.map(lambda x: get_project_name(x))
df.head()

Unnamed: 0,file,method_name,references,category,libraries,project
531,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_no_save_returns_none,[mock.PropertyMock],1,[mock],roclark--sportsipy.git
532,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_save_returns_name,[mock.PropertyMock],1,[mock],roclark--sportsipy.git
591,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_single_goalies,[mock.PropertyMock],1,[mock],roclark--sportsipy.git
592,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_multiple_goalies_empty_field,[mock.PropertyMock],1,[mock],roclark--sportsipy.git
593,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_multiple_goalies_empty_field,[mock.PropertyMock],1,[mock],roclark--sportsipy.git


## 2. Briefly inspect the data

In [8]:
stats = pd.DataFrame(np.concatenate(df.libraries.values).ravel(), columns=['library'])
stats.head()
stats['count'] = stats.groupby('library')['library'].transform('count')
stats = stats.drop_duplicates(subset=['library']).sort_values(by=['count'], ascending=False).reset_index(drop=True)
stats

Unnamed: 0,library,count
0,torch,4576
1,numpy,3236
2,tensorflow,1306
3,os,1144
4,chainer,983
...,...,...
1207,chazutsu,1
1208,eta,1
1209,robo,1
1210,aup,1


In [9]:
imports = np.concatenate(df.references.values).ravel()
print(f"Total number of methods {len(df)}\nNumber of unique method names {len(np.unique(df.method_name))}")
print(f"Total number of libraries {len(stats)}\nTotal number of import references {len(imports)}\nTotal number of unique import references {len(np.unique(imports))}")

Total number of methods 19529
Number of unique method names 7085
Total number of libraries 1212
Total number of import references 48058
Total number of unique import references 13996


## 3. Preprocess

In [10]:
# Aggregate libraries by project name
project_libraries = df.groupby(['project'])['libraries'].sum().map(lambda x: list(set(x)))
project_libraries

project
1033020837--Basic4AI.git                                                       [numpy, torch]
AFAgarap--cnn-svm.git                                                     [tensorflow, torch]
AIHunters--AcurusTrack.git                                                        [numpy, os]
AKSHAYUBHAT--ComputationalHealthcare.git                            [collections, entity, re]
ANSSI-FR--SecuML.git                        [copy, conf, train, prediction, pandas, numpy,...
                                                                  ...                        
yzhao062--pyod.git                                            [numpy, xgboost, pyod, sklearn]
zalandoresearch--pytorch-dilated-rnn.git                                  [time, torch, drnn]
zhuzilin--NP_ML.git                                                            [numpy, scipy]
zqhZY--semanaly.git                                                                   [torch]
zyfra--ebonite.git                          [pyjacks

In [11]:
# Check frequency of libraries
df_proj_libs = pd.DataFrame(np.concatenate(project_libraries).ravel(), columns=['library'])
df_proj_libs['count'] = df_proj_libs.groupby('library')['library'].transform('count')
df_proj_libs = df_proj_libs.drop_duplicates(subset=['library']).sort_values(by=['count'], ascending=False).reset_index(drop=True)
df_proj_libs

Unnamed: 0,library,count
0,numpy,426
1,os,262
2,torch,257
3,tensorflow,193
4,sklearn,160
...,...,...
1207,_docstring_check,1
1208,babi,1
1209,chainerx,1
1210,thin_stack,1


In [12]:
# Get all libraries when are referenced > 1 (i.e. not project specific)
shared_project_libraries = df_proj_libs[df_proj_libs['count'] > 1]
print(f"{len(shared_project_libraries)} out of {len(stats)} declared libraries are project specific ({len(shared_project_libraries) / len(stats):.2f}%)")
shared_libs = list(shared_project_libraries.library)
shared_project_libraries

263 out of 1212 declared libraries are project specific (0.22%)


Unnamed: 0,library,count
0,numpy,426
1,os,262
2,torch,257
3,tensorflow,193
4,sklearn,160
...,...,...
258,ray,2
259,skll,2
260,net,2
261,GPy,2


In [13]:

def camel_case_split(identifier, joinToken):
    matches = re.finditer(
        '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)',
        identifier,
    )
    return f'{joinToken}'.join([m.group(0).lower() for m in matches])

def snake_case_split(identifier, joinToken):
    return f'{joinToken}'.join([x for x in identifier.split('_') if x != ''])

In [14]:
df_processed = df.copy()
# Drop all library references that are project specific
df_processed['references'] = df_processed['references'].apply(lambda x: [s for s in x if s.split('.')[0] in shared_libs])
df_processed['libraries'] = df_processed['libraries'].apply(lambda x: [s for s in x if s in shared_libs])
# Some rows may now include no references - drop these
df_processed = df_processed[~df_processed.references.str.len().eq(0)]
df_processed['references'] = df_processed['references'].apply(lambda x: ",".join(x))

# split camel/snake case method names
df_processed['method_name'] = df_processed.method_name.map(lambda x: snake_case_split(x, '|'))
df_processed['method_name'] = df_processed.method_name.map(lambda x: camel_case_split(x, '|'))

df_processed

Unnamed: 0,file,method_name,references,category,libraries,project
531,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test|no|save|returns|none,mock.PropertyMock,1,[mock],roclark--sportsipy.git
532,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test|save|returns|name,mock.PropertyMock,1,[mock],roclark--sportsipy.git
591,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test|away|saves|single|goalies,mock.PropertyMock,1,[mock],roclark--sportsipy.git
592,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test|away|saves|multiple|goalies|empty|field,mock.PropertyMock,1,[mock],roclark--sportsipy.git
593,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test|away|saves|multiple|goalies|empty|field,mock.PropertyMock,1,[mock],roclark--sportsipy.git
...,...,...,...,...,...,...
265065,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,user|based|train|test|split,"numpy.logical_not,numpy.iinfo.max,numpy.iinfo.min",0,[numpy],maciejkula--spotlight.git
265077,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,predict|process|ids,torch.from_numpy,2,[torch],maciejkula--spotlight.git
265109,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,forward,torch.nn.functional.softmax.unsqueeze.expand_a...,3,[torch],maciejkula--spotlight.git
265116,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,predict,"numpy.atleast_2d,torch.from_numpy",4,"[numpy, torch]",maciejkula--spotlight.git


In [15]:
tokens = df_processed['references'].str.split(',').values

num_tokens_per_method = [len(l) for l in tokens]
print(f"Max number of tokens in method {np.max(num_tokens_per_method)}\nMin number of tokens in a method {np.min(num_tokens_per_method)}\nAverage number of tokens per method {np.mean(num_tokens_per_method):.2f}")

Max number of tokens in method 28
Min number of tokens in a method 1
Average number of tokens per method 2.24


In [16]:
tokens

array([list(['mock.PropertyMock']), list(['mock.PropertyMock']),
       list(['mock.PropertyMock']), ...,
       list(['torch.nn.functional.softmax.unsqueeze.expand_as', 'torch.nn']),
       list(['numpy.atleast_2d', 'torch.from_numpy']),
       list(['pandas.DataFrame', 'pandas.values', 'pandas.sort_values.groupby.first', 'pandas'])],
      dtype=object)

In [17]:
print(f"Number of unique method names: {len(df_processed)}\nNumber of unique tokens {len(np.unique(tokens))}")

Number of unique method names: 16774
Number of unique tokens 7981


In [18]:
df_processed = df_processed.method_name.to_frame().merge(df_processed.references, left_index=True, right_index=True)

## 4. Partition into sets

In [19]:
train_size, val_size, test_size = 0.8, 0.1, 0.1
train, remainder = train_test_split(df_processed, test_size=(1-train_size), shuffle=True)
validate, test =  train_test_split(remainder, test_size=test_size/(test_size + val_size))

print(f"{len(train)} train samples\n{len(validate)} validation samples\n{len(test)} test samples")
train

13419 train samples
1677 validation samples
1678 test samples


Unnamed: 0,method_name,references
206828,forward,"torch.Tensor.to,torch.nn"
45368,predict,"compat.pmdarima.get_X,compat.sklearn.check_is_..."
69266,feedforward|q|function,"tensorflow.keras.Model,tree.map_structure_up_t..."
88838,forward,"nni.nas.pytorch.mutables.LayerChoice,torch.nn...."
259351,forward,allennlp.custom_extensions._ext.highway_lstm_l...
...,...,...
173615,train,"torch.optim.Adam,tqdm.tqdm"
219993,forward|cpu,"chainer.utils.conv_nd.shape,chainer.backend.ra..."
259530,train|model,allennlp.data.iterators.data_iterator.DataIter...
141942,forward,"torch.sigmoid,torch.nn"


## 5. Save

In [20]:
import csv
train.to_csv(_DATA_DIR+'/train.csv', encoding='utf-8', sep=" ", index=False, header=None, quoting = csv.QUOTE_NONE, escapechar = ' ')
validate.to_csv(_DATA_DIR+'/val.csv', encoding='utf-8', sep=" ", index=False, header=None, quoting = csv.QUOTE_NONE, escapechar = ' ')
test.to_csv(_DATA_DIR+'/test.csv', encoding='utf-8', sep=" ", index=False, header=None, quoting = csv.QUOTE_NONE, escapechar = ' ')