In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
DATASET_NAME = 'tokens_small'
_DATA_DIR = f'../data/{DATASET_NAME}'

## 1. Load data

In [3]:
df = pd.read_json(_DATA_DIR + '/tokens.json')
df.head()

Unnamed: 0,file,method_name,tokens
0,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,read_file,"[filepath, join, os, dirname, open, path, __fi..."
1,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,mock_pyquery,"[status_code, self, read_file, YEAR, text, Moc..."
2,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,mock_request,"[status_code, self, YEAR, text, MockRequest, s..."
3,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,__init__,"[self, year, month]"
4,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,__init__,"[self, LOSS, week, result]"


In [4]:
# Optional - filter out rows where the method name doesnt contain any of the chosen subtokens
classes = { 0: 'train', 1: 'save', 2: 'process', 3: 'forward', 4: 'predict' }

df = df[df.method_name.str.contains("|".join(classes.values()))]
df

Unnamed: 0,file,method_name,tokens
723,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_no_save_returns_none,"[PropertyMock, self, fake_save, save, game, _s..."
724,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_save_returns_name,"[PropertyMock, self, fake_save, save, game, _s..."
795,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_single_goalies,"[PropertyMock, fake_saves, self, _away_goalies..."
796,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_multiple_goalies_empty_field,"[PropertyMock, fake_saves, self, _away_goalies..."
797,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_multiple_goalies_empty_field,"[PropertyMock, fake_saves, self, _away_goalies..."
...,...,...,...
583591,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,save,"[elapsed, self, _filename, _hash, write, test_..."
583657,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,predict,"[self, args, kwargs, __call__]"
583792,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,_run_forward,"[target, linalg, gather, _select_target, isins..."
583878,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,_preprocess_img,"[self, custom_segmentation, image_preproc, cop..."


In [5]:
# Assign categories based on method name
df['category'] = df.method_name.map(lambda x: np.array([x.find(s) for s in classes.values()]).argmax())
df

Unnamed: 0,file,method_name,tokens,category
723,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_no_save_returns_none,"[PropertyMock, self, fake_save, save, game, _s...",1
724,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_save_returns_name,"[PropertyMock, self, fake_save, save, game, _s...",1
795,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_single_goalies,"[PropertyMock, fake_saves, self, _away_goalies...",1
796,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_multiple_goalies_empty_field,"[PropertyMock, fake_saves, self, _away_goalies...",1
797,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test_away_saves_multiple_goalies_empty_field,"[PropertyMock, fake_saves, self, _away_goalies...",1
...,...,...,...,...
583591,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,save,"[elapsed, self, _filename, _hash, write, test_...",1
583657,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,predict,"[self, args, kwargs, __call__]",4
583792,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,_run_forward,"[target, linalg, gather, _select_target, isins...",3
583878,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,_preprocess_img,"[self, custom_segmentation, image_preproc, cop...",2


In [6]:
df.groupby('category').size()

category
0     8255
1     4257
2     4433
3    11549
4     6722
dtype: int64

## 2. Preprocess

In [7]:
def camel_case_split(identifier, joinToken):
    matches = re.finditer(
        '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)',
        identifier,
    )
    return f'{joinToken}'.join([m.group(0).lower() for m in matches])

def snake_case_split(identifier, joinToken):
    return f'{joinToken}'.join([x for x in identifier.split('_') if x != ''])

In [8]:
df_processed = df.copy()

df['tokens'] = df['tokens'].apply(lambda x: list(np.unique(x))) # Dont store duplicates
df_processed = df_processed[~df_processed.tokens.str.len().eq(0)]
df_processed['tokens'] = df_processed['tokens'].apply(lambda x: [snake_case_split(s, ',') for s in x] )
df_processed['tokens'] = df_processed['tokens'].apply(lambda x: ",".join([camel_case_split(s, ',') for s in x]))

# split camel/snake case method names
df_processed['method_name'] = df_processed.method_name.map(lambda x: snake_case_split(x, '|'))
df_processed['method_name'] = df_processed.method_name.map(lambda x: camel_case_split(x, '|'))

df_processed

Unnamed: 0,file,method_name,tokens,category
723,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test|no|save|returns|none,"property,mock,self,fake,save,save,game,save,type",1
724,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test|save|returns|name,"property,mock,self,fake,save,save,game,save,type",1
795,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test|away|saves|single|goalies,"property,mock,fake,saves,self,away,goalies,awa...",1
796,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test|away|saves|multiple|goalies|empty|field,"property,mock,fake,saves,self,away,goalies,awa...",1
797,/home/marcus/Datasets/MLCODE/roclark/sportsipy...,test|away|saves|multiple|goalies|empty|field,"property,mock,fake,saves,self,away,goalies,awa...",1
...,...,...,...,...
583591,/home/marcus/Datasets/MLCODE/maciejkula/spotli...,save,"elapsed,self,filename,hash,write,test,mrr,dump...",1
583657,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,predict,"self,args,kwargs,call",4
583792,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,run|forward,"target,linalg,gather,select,target,isinstance,...",3
583878,/home/marcus/Datasets/MLCODE/SeldonIO/alibi.gi...,preprocess|img,"self,custom,segmentation,image,preproc,copy,sh...",2


In [9]:
tokens = df_processed['tokens'].str.split(',').values
num_tokens_per_method = [len(l) for l in tokens]
print(f"Max number of tokens in method {np.max(num_tokens_per_method)}\nMin number of tokens in a method {np.min(num_tokens_per_method)}\nAverage number of tokens per method {np.mean(num_tokens_per_method):.2f}")

Max number of tokens in method 570
Min number of tokens in a method 1
Average number of tokens per method 23.97


In [10]:
print(f"Number of unique method names: {len(df_processed)}\nNumber of unique tokens {len(np.unique(tokens))}")

Number of unique method names: 35216
Number of unique tokens 28228


In [11]:
df_processed.drop(columns=['file'], inplace=True)

## 3. Partition into sets

In [12]:
train_size, val_size, test_size = 0.8, 0.1, 0.1
train, remainder = train_test_split(df_processed, test_size=(1-train_size), shuffle=True)
validate, test =  train_test_split(remainder, test_size=test_size/(test_size + val_size))

print(f"{len(train)} train samples\n{len(validate)} validation samples\n{len(test)} test samples")
train

28172 train samples
3522 validation samples
3522 test samples


Unnamed: 0,method_name,tokens,category
63200,predict,"self,apply,along,axis,predictions,predict,row,...",4
304943,train,"labels,item,model,train,nn,enumerate,sum,filte...",0
14065,run|prediction,"std,image,from,json,pre,process,config,file,bu...",4
199373,test|save|tmp|checkpoint,"experiment,self,elog,exists,join,os,time,check...",1
225796,link|and|save|file,"readlink,exists,base,path,abspath,wandb,path,j...",1
...,...,...,...
179038,process,"img,np,fliplr",2
109548,forward|pass,"self,train,embeddings,params,moments,data,trai...",3
126429,train,"self,reg,loss,labels,label,ce,loss,batch,loss,...",0
172967,predict,"input,path,format,output,path,exception,json,f...",4


## 4. Save

In [13]:
import csv
train.to_csv(_DATA_DIR+'/train.csv', encoding='utf-8', sep=" ", index=False, header=None, quoting = csv.QUOTE_NONE, escapechar = ' ')
validate.to_csv(_DATA_DIR+'/val.csv', encoding='utf-8', sep=" ", index=False, header=None, quoting = csv.QUOTE_NONE, escapechar = ' ')
test.to_csv(_DATA_DIR+'/test.csv', encoding='utf-8', sep=" ", index=False, header=None, quoting = csv.QUOTE_NONE, escapechar = ' ')