In [15]:
import os
import sys
# Put the parent of the current working directory on sys.path (once).
# Presumably this is what lets project-local modules such as `common` be imported — confirm.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
import ast
import csv
import json
import common
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [16]:
# Notebook configuration:
#   dataset_name - prefix used in the names of every file written under ./data/
#   max_tokens   - longest token list kept by the length filter; also the pad width
#                  used when the token lists are stringified
dataset_name, max_tokens = 'cubert', 500

In [None]:
# Read the line-delimited JSON dump and relabel its columns with the single name
# 'json_element' (the assignment only succeeds if the file parsed to exactly one column).
raw_jsonl_path = './data/output.jsonl'
df_inter = pd.read_json(raw_jsonl_path, lines=True)
df_inter.columns = ['json_element']
df_inter

In [None]:
# Decode every entry of 'json_element' with json.loads.
# NOTE: this rebinds df_inter from a DataFrame to a Series, so the cell cannot be
# re-run on its own; re-run the loading cell first.
encoded_elements = df_inter['json_element']
df_inter = encoded_elements.apply(json.loads)

In [None]:
# Flatten the decoded JSON objects held in df_inter into a tabular DataFrame.
# Later cells read the `tokens` and `method_name` columns of the result, so each
# decoded object is expected to provide those fields.
df = pd.json_normalize(df_inter)

In [None]:
def _strip_spaces(token_list):
    """Return a new list in which every space character is removed from each token."""
    return [token.replace(' ', '') for token in token_list]

# The output files written later are space-separated, so tokens must not contain spaces.
df['tokens'] = df['tokens'].map(_strip_spaces)

In [None]:
# Keep only the samples whose token list has at most max_tokens entries.
# .copy() turns the filtered result into an independent frame: a new column
# (stringified_tokens) is assigned to `df` further down, and doing that on a
# boolean-indexed slice raises pandas' SettingWithCopyWarning.
df = df[df.tokens.map(len) <= max_tokens].copy()

In [None]:
# Target histogram: one "<method_name> <count>" row per distinct method name,
# space-separated, without header or index column.
target_counts = df.method_name.value_counts().reset_index()
target_counts.to_csv(f'./data/{dataset_name}.histo.tgt.c2v', sep=" ", header=False, index=False)

In [None]:
# Histogram for tokens: flatten every token list into one long Series, count the
# occurrences of each distinct token and write "<token> <count>" rows.
# Series.explode has no column parameter (that belongs to DataFrame.explode); the
# former 'tokens' argument was silently passed as the ignore_index flag.
(
    df.tokens.explode()
    .value_counts()
    .reset_index()
    .to_csv(f'./data/{dataset_name}.histo.ori.c2v', sep=" ", header=False, index=False)
)

In [None]:
def pad_and_stringify(tokens, pad_to=None):
    """Render a token list as one space-separated, right-padded string.

    Each token is wrapped in single quotes and the quoted tokens are joined with
    single spaces. One extra space is then appended for every token the list
    falls short of the pad width.

    Args:
        tokens: sequence of token strings.
        pad_to: pad width in tokens. When omitted, the notebook-level
            ``max_tokens`` setting is used, which is the original behaviour.

    Returns:
        The quoted, joined tokens followed by ``pad_to - len(tokens)`` spaces
        (no padding when the list is already at least ``pad_to`` long).
    """
    # Resolve the global lazily so an explicit pad_to works without notebook state.
    target_len = max_tokens if pad_to is None else pad_to
    quoted = " ".join(f"'{tok}'" for tok in tokens)
    # A negative repeat count yields '', so over-long lists are returned unpadded.
    return quoted + " " * (target_len - len(tokens))

def save_dictionaries(dataset_name, token_to_count, target_to_count,
                      num_training_examples):
    """Pickle the two count dictionaries and the training-set size into one file.

    The three objects are written back to back, in the order token_to_count,
    target_to_count, num_training_examples, to ``./data/<dataset_name>.dict.c2v``.
    A reader has to call ``pickle.load`` three times on the same file handle to
    get them back. The ``./data`` directory must already exist.
    """
    out_path = './data/{}.dict.c2v'.format(dataset_name)
    payload = (token_to_count, target_to_count, num_training_examples)
    with open(out_path, 'wb') as sink:
        for obj in payload:
            pickle.dump(obj, sink)
    print('Dictionaries saved to: {}'.format(out_path))

In [None]:
# Stringify for csv: quote, join and pad every token list into a single string column.
stringified = df.tokens.map(pad_and_stringify)
df['stringified_tokens'] = stringified
df

In [None]:
# Shuffled 80/20 split. The seed is fixed so that the split, and therefore every
# file and count derived from `train`/`test` below, is the same after a kernel
# restart instead of changing on each run.
train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)
print(f"{len(train)} train samples\n{len(test)} test samples")
train

In [None]:
# Sizes that are passed on when the histograms are reloaded and the dictionaries saved.
num_training_examples = len(train)
target_vocab_size = len(train.method_name.unique())
# Series.explode has no column parameter (that belongs to DataFrame.explode); the
# former 'tokens' argument was silently passed as the ignore_index flag.
# NOTE(review): token vocabulary is counted over all of `df`, target vocabulary
# over `train` only — confirm this asymmetry is intended.
token_vocab_size = len(df.tokens.explode().unique())
print(f"Unique method names: {target_vocab_size}\nUnique tokens: {token_vocab_size}")

In [None]:
# Write both splits in the same format: "<method_name> <stringified tokens>" rows,
# space-separated, no header/index, no CSV quoting (',' is used as escape character).
split_outputs = {
    f'./data/{dataset_name}.train.c2v': train,
    f'./data/{dataset_name}.test.c2v': test,
}
csv_options = dict(encoding='utf-8', sep=" ", index=False, header=None,
                   quoting=csv.QUOTE_NONE, escapechar=',')
for out_path, split_df in split_outputs.items():
    split_df[['method_name', 'stringified_tokens']].to_csv(out_path, **csv_options)

In [None]:
 word_histogram_data = common.common.load_vocab_from_histogram(f'./data/{dataset_name}.histo.ori.c2v', start_from=1,
                                                                  max_size=int(
                                                                      token_vocab_size),
                                                                  return_counts=True)
_, _, _, word_to_count = word_histogram_data
_, _, _, target_to_count = common.common.load_vocab_from_histogram(f'./data/{dataset_name}.histo.tgt.c2v', start_from=1,
                                                                   max_size=int(
                                                                       target_vocab_size),
                                                                   return_counts=True)

save_dictionaries(dataset_name=dataset_name, token_to_count=word_to_count, target_to_count=target_to_count,
                  num_training_examples=num_training_examples)