In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
data = 'tokens'
_DATA_DIR = '../data/' + data

## 1. Load data

In [3]:
df = pd.read_json(_DATA_DIR + '/code2lib.json')
df.head()

Unnamed: 0,file,method_name,tokens
0,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,read_file,"[filepath, read, filename, __file__, open, pat..."
1,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,mock_pyquery,"[opp_contents, read_file, status_code, html_co..."
2,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,mock_request,"[status_code, html_contents, MockRequest, url,..."
3,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,__init__,"[year, self, month]"
4,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,setup_method,"[MONTH, team_conference, flexmock, self, and_r..."


## 2. Preprocess

In [4]:
def camel_case_split(identifier, joinToken):
    matches = re.finditer(
        '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)',
        identifier,
    )
    return f'{joinToken}'.join([m.group(0).lower() for m in matches])

def snake_case_split(identifier, joinToken):
    return f'{joinToken}'.join([x for x in identifier.split('_') if x != ''])

In [5]:
df_processed = df.copy()

df['tokens'] = df['tokens'].apply(lambda x: list(np.unique(x))) # Dont store duplicates
df_processed = df_processed[~df_processed.tokens.str.len().eq(0)]
df_processed['tokens'] = df_processed['tokens'].apply(lambda x: [snake_case_split(s, ',') for s in x] )
df_processed['tokens'] = df_processed['tokens'].apply(lambda x: ",".join([camel_case_split(s, ',') for s in x]))

# split camel/snake case method names
df_processed['method_name'] = df_processed.method_name.map(lambda x: snake_case_split(x, '|'))
df_processed['method_name'] = df_processed.method_name.map(lambda x: camel_case_split(x, '|'))

df_processed

Unnamed: 0,file,method_name,tokens
0,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,read|file,"filepath,read,filename,file,open,path,join,os,..."
1,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,mock|pyquery,"opp,contents,read,file,status,code,html,conten..."
2,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,mock|request,"status,code,html,contents,mock,request,url,tex..."
3,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,init,"year,self,month"
4,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,setup|method,"month,team,conference,flexmock,self,and,return..."
...,...,...,...
583042,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,test|get|quantiles,"get,quantiles,rand,num,points,batch,size,input..."
583043,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,test|adaptive|grid,"adaptive,grid,minimum,satisfied,min,bin,points..."
583044,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,uncollect|if|n|features|more|than|input|dim,"features,n,features,kwargs,len"
583045,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,test|explain,"rand,isinstance,constant,value,default,data,al..."


In [6]:
tokens = df_processed['tokens'].str.split(',').values
num_tokens_per_method = [len(l) for l in tokens]
print(f"Max number of tokens in method {np.max(num_tokens_per_method)}\nMin number of tokens in a method {np.min(num_tokens_per_method)}\nAverage number of tokens per method {np.mean(num_tokens_per_method):.2f}")

Max number of tokens in method 986
Min number of tokens in a method 1
Average number of tokens per method 19.24


In [7]:
tokens

array([list(['filepath', 'read', 'filename', 'file', 'open', 'path', 'join', 'os', 'dirname']),
       list(['opp', 'contents', 'read', 'file', 'status', 'code', 'html', 'contents', 'adv', 'opp', 'contents', 'div', 'url', 'text', 'self', 'basic', 'stats', 'url', 'basic', 'opponent', 'stats', 'url', 'advanced', 'opponent', 'stats', 'url', 'advanced', 'stats', 'url', 'basic', 'contents', 'mock', 'pq', 'adv', 'contents', 'year']),
       list(['status', 'code', 'html', 'contents', 'mock', 'request', 'url', 'text', 'self', 'str', 'year']),
       ..., list(['features', 'n', 'features', 'kwargs', 'len']),
       list(['rand', 'isinstance', 'constant', 'value', 'default', 'data', 'ale', 'attr', 'n', 'features', 'feature', 'values', 'exp', 'zip', 'data', 'predictor', 'mock', 'ale', 'explainer', 'feature', 'deciles', 'shape', 'np', 'a0', 'ale', 'values', 'out', 'dim', 'target', 'names', 'default', 'meta', 'ale', 'len', 'featv', 'all', 'alev', 'features', 'float', 'explain', 'batch', 'size', 'a

In [8]:
print(f"Number of unique method names: {len(df_processed)}\nNumber of unique tokens {len(np.unique(tokens))}")

Number of unique method names: 583047
Number of unique tokens 455033


In [9]:
df_processed.drop(columns=['file'], inplace=True)

## 3. Partition into sets

In [10]:
train, test = train_test_split(df_processed, test_size=0.2, shuffle=True)
print(f"{len(train)} train samples\n{len(test)} test samples")
train

466437 train samples
116610 test samples


Unnamed: 0,method_name,tokens
12796,setup|dataset,"dataset,class,dataset,dataset,iterator,subset,..."
470705,test|div|forward|cpu,"forward,cpu,self"
6650,read|data,"as,str,read,filename,tf,zip,file,namelist,zipf..."
525876,test|get|parameters,"segmentation,size,get,parameters,iaa,params,cs..."
113465,generate|hyperparameters|samples,"n,samples,optimize,sample,inference,n,burnin,r..."
...,...,...
124336,create|yaml,"open,yaml,error,print,yaml,exc,yf,data,dump"
331079,forward,"elu,slope,momentum,act,leaky,relu,running,mean..."
378479,scheduler|dicts,"self,scheduler,dicts"
44594,process,"statement,response,confidence"


## 4. Save

In [11]:
import csv
train.to_csv(_DATA_DIR+'/train.csv', encoding='utf-8', sep=" ", index=False, header=None, quoting = csv.QUOTE_NONE, escapechar = ' ')
test.to_csv(_DATA_DIR+'/test.csv', encoding='utf-8', sep=" ", index=False, header=None, quoting = csv.QUOTE_NONE, escapechar = ' ')