In [None]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Name of the dataset sub-directory under ../data used by this notebook.
DATASET_NAME = 'tokens'
# Directory the notebook reads its input from and writes its CSV partitions to.
_DATA_DIR = '../data/' + DATASET_NAME

## 1. Load data

In [None]:
# Read the raw token dump from the dataset directory and preview the first rows.
tokens_json_path = f'{_DATA_DIR}/tokens.json'
df = pd.read_json(tokens_json_path)
df.head()

In [None]:
# Optional - filter out rows where the method name doesn't contain any of the chosen subtokens
# classes = { 0: 'train', 1: 'save', 2: 'process', 3: 'forward', 4: 'predict' }

# df = df[df.method_name.str.contains("|".join(classes.values()))]
# df

In [None]:
# Assign categories based on method name
# df['category'] = df.method_name.map(lambda x: np.array([x.find(s) for s in classes.values()]).argmax())
# df

In [None]:
# df.groupby('category').size()

## 2. Preprocess

In [None]:
def camel_case_split(identifier, joinToken):
    """Split a camelCase / PascalCase identifier into lower-cased words.

    A word boundary is placed between a lower-case letter and a following
    upper-case letter (``fooBar`` -> ``foo``, ``Bar``) and before the last
    capital of an upper-case run that is followed by a lower-case letter
    (``HTTPResponse`` -> ``HTTP``, ``Response``).

    Args:
        identifier: The identifier string to split.
        joinToken: Separator placed between the resulting words; it is
            formatted into a string before joining.

    Returns:
        The lower-cased words joined by ``joinToken``. An empty identifier
        yields an empty string.
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    separator = f'{joinToken}'
    words = []
    for match in re.finditer(boundary_pattern, identifier):
        words.append(match.group(0).lower())
    return separator.join(words)

def snake_case_split(identifier, joinToken):
    """Split a snake_case identifier on underscores and re-join the parts.

    Empty fragments (produced by leading, trailing or repeated underscores)
    are discarded. The case of the fragments is left untouched.

    Args:
        identifier: The identifier string to split.
        joinToken: Separator placed between the resulting parts; it is
            formatted into a string before joining.

    Returns:
        The non-empty underscore-separated parts joined by ``joinToken``.
    """
    separator = f'{joinToken}'
    # Empty strings are the only falsy fragments str.split can return.
    non_empty_parts = filter(None, identifier.split('_'))
    return separator.join(non_empty_parts)

In [None]:
# Work on a copy so the raw frame loaded from disk stays untouched and this
# cell can be re-run from the same input.
df_processed = df.copy()

# Don't store duplicates. The de-duplication must target df_processed (the
# frame that is transformed and saved below); applying it to `df` after the
# copy was taken has no effect on the processed data.
# Note: np.unique also sorts the tokens.
df_processed['tokens'] = df_processed['tokens'].apply(lambda x: list(np.unique(x)))

# Drop methods without any tokens; .copy() makes the filtered frame independent
# so the column assignments below do not write into a slice of another frame.
df_processed = df_processed[~df_processed.tokens.str.len().eq(0)].copy()

# Split every token on snake_case and then camelCase boundaries; the result is
# one comma-separated string of lower-cased sub-tokens per method.
df_processed['tokens'] = df_processed['tokens'].apply(lambda x: [snake_case_split(s, ',') for s in x])
df_processed['tokens'] = df_processed['tokens'].apply(lambda x: ",".join([camel_case_split(s, ',') for s in x]))

# split camel/snake case method names (sub-tokens joined with '|')
df_processed['method_name'] = df_processed.method_name.map(lambda x: snake_case_split(x, '|'))
df_processed['method_name'] = df_processed.method_name.map(lambda x: camel_case_split(x, '|'))

df_processed

In [None]:
# Per-method lists of sub-tokens, recovered from the comma-separated strings.
tokens = df_processed['tokens'].str.split(',').values
num_tokens_per_method = list(map(len, tokens))
print(
    f"Max number of tokens in method {np.max(num_tokens_per_method)}\n"
    f"Min number of tokens in a method {np.min(num_tokens_per_method)}\n"
    f"Average number of tokens per method {np.mean(num_tokens_per_method):.2f}"
)

In [None]:
# `tokens` holds one list of sub-tokens per method, so the lists must be
# flattened before counting; calling np.unique on the array of lists would
# count distinct token *lists* instead of distinct tokens.
unique_tokens = {token for method_tokens in tokens for token in method_tokens}
print(f"Number of unique method names: {len(np.unique(df_processed.method_name.values))}\nNumber of unique tokens {len(unique_tokens)}")

In [None]:
# Drop the source-file column before saving. Reassigning (instead of
# inplace=True) together with errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
df_processed = df_processed.drop(columns=['file'], errors='ignore')

## 3. Partition into sets

In [None]:
# Fixed seed so the train/validation/test partition (and therefore the saved
# files) is the same on every Restart & Run All.
RANDOM_SEED = 42

train_size, val_size, test_size = 0.9, 0.05, 0.05
# First split: hold out everything that is not training data.
train, remainder = train_test_split(
    df_processed, test_size=(1-train_size), shuffle=True, random_state=RANDOM_SEED
)
# Second split: divide the held-out rows between validation and test
# according to their relative sizes.
validate, test = train_test_split(
    remainder, test_size=test_size/(test_size + val_size), random_state=RANDOM_SEED
)

print(f"{len(train)} train samples\n{len(validate)} validation samples\n{len(test)} test samples")
train

## 4. Save

In [None]:
import csv

# Write each partition as a space-separated, header-less, unquoted text file
# into the dataset directory.
# NOTE(review): escapechar is the same character as the separator, so a field
# containing a space would presumably be written with an extra space in front
# of it - confirm the downstream reader expects this format.
_csv_options = dict(
    encoding='utf-8',
    sep=" ",
    index=False,
    header=None,
    quoting=csv.QUOTE_NONE,
    escapechar=' ',
)
for partition, file_name in ((train, 'train.csv'), (validate, 'val.csv'), (test, 'test.csv')):
    partition.to_csv(_DATA_DIR + '/' + file_name, **_csv_options)