In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Create the output directory (if missing) that the plot cells below save their PDFs into.
!mkdir -p data/plots
import jsonlines
import sentencepiece as spm

import pandas as pd
import numpy as np

import plotnine as p9

# Load the SentencePiece model from disk; `sp` is used further down to
# count how many subword pieces each function name is split into.
SP_MODEL_PATH = '../data/codesearchnet_javascript/csnjs_8k_9995p_unigram_url.model'
sp = spm.SentencePieceProcessor()
sp.Load(SP_MODEL_PATH)

In [None]:
%%time
# Read every record of the supervised training split into memory.
with jsonlines.open('../data/codesearchnet_javascript/javascript_train_supervised.jsonl') as f:
    data = [record for record in f]

# One row per record, plus three derived length columns:
#   func_name_len          - characters in the function name
#   code_len               - characters in the function source
#   func_name_subword_len  - SentencePiece ids the function name encodes to
df = pd.DataFrame(data).assign(
    func_name_len=lambda frame: frame['func_name'].str.len(),
    code_len=lambda frame: frame['code'].str.len(),
    func_name_subword_len=lambda frame: frame['func_name'].apply(
        lambda name: len(sp.EncodeAsIds(name))
    ),
)

In [None]:
# Give every distinct function name an integer id.
# Ids are assigned in order of first appearance in `df` (dict.fromkeys keeps
# insertion order). Iterating a set of strings instead would yield a different
# order in every kernel session because of string hash randomization, making
# `func_name_idx` and any plot built on it irreproducible.
func_name_ids = {
    name: idx for idx, name in enumerate(dict.fromkeys(df['func_name']))
}
# Every name in the column is a key of the mapping by construction.
df['func_name_idx'] = df['func_name'].map(func_name_ids)
# Number of distinct function names.
print(len(func_name_ids))

In [None]:
import matplotlib.pyplot as plt
from collections import Counter
# Tally how many times each function name occurs in the data frame.
x = Counter(name for name in df['func_name'])

In [None]:
# Show only the most frequent function names; printing the whole Counter
# emits one entry per distinct name and floods the cell output.
x.most_common(20)

In [None]:
# Function names that occur more than once.
hot_tokens = [name for name, count in x.items() if count > 1]
# Rank-frequency curve: occurrence counts ordered from most to least frequent.
plt.plot(sorted(x.values(), reverse=True))
# Cell output: how many names are repeated.
len(hot_tokens)

In [None]:
# Empirical CDF over the integer ids assigned to function names.
(
    p9.ggplot(data=df, mapping=p9.aes(x='func_name_idx'))
    + p9.stat_ecdf()
    + p9.xlab('Identifier ID')
    + p9.ylab('CDF')
    + p9.theme_classic(16)
)

In [None]:
# Empirical CDF of function-name length in characters; drawn and saved as a PDF.
plt_identifier_len = (
    p9.ggplot(data=df, mapping=p9.aes(x='func_name_len'))
    + p9.stat_ecdf()
    + p9.xlab('Identifier length')
    + p9.ylab('CDF')
    + p9.theme_classic(16)
)
plt_identifier_len.draw()
plt_identifier_len.save('data/plots/identifier_length.pdf')

# Empirical CDF of function source length in characters; drawn and saved as a PDF.
# The x axis is zoomed to [0, 10000] with coord_cartesian rather than xlim:
# xlim sets scale limits, which discards rows above 10000 *before* the ECDF is
# computed, so the curve would describe the truncated data and wrongly reach
# 1.0 at the right edge. coord_cartesian only changes the visible window.
plt_code_len = (
    p9.ggplot(data=df, mapping=p9.aes(x='code_len'))
    + p9.stat_ecdf()
    + p9.xlab('Method body length')
    + p9.ylab('CDF')
    + p9.theme_classic(16)
    + p9.coord_cartesian(xlim=(0, 10000))
)
plt_code_len.draw()
plt_code_len.save('data/plots/code_length.pdf')

# Empirical CDF of the number of SentencePiece subwords per function name;
# drawn and saved as a PDF. Uses the same theme_classic(16) as the other ECDF
# figures in this notebook so the saved PDFs share one look and font size.
n_subwords = (
    p9.ggplot(data=df, mapping=p9.aes(x='func_name_subword_len'))
    + p9.stat_ecdf()
    + p9.xlab('Identifier subword length')
    + p9.ylab('CDF')
    + p9.theme_classic(16)
)
n_subwords.draw()
n_subwords.save('data/plots/identifier_subword_length.pdf')