In [16]:
# Always reload modules so that edits to imported project code are picked up
# without restarting the kernel (autoreload mode 2).
%reload_ext autoreload
%autoreload 2

In [17]:
from ranking.util import json_lines as jl
from ranking.util import dataset_paths as dp
import pandas as pd

# Source corpus location, taken from the project's dataset_paths module.
input_file = dp.raw_corpus
# Destination for the grouped functions (written with jl.to_jsonl below).
output_file = 'all-unique-functions.jsonl'

In [18]:
def strip_extra_spaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed as well, because
    ``str.split()`` without arguments yields no empty fields.
    """
    words = text.split()
    return " ".join(words)

def equalize_docItem(docItem):
    """Normalize a docItem string so equivalent items compare equal.

    Every occurrence of the literal ``() =>`` (an empty context) is removed,
    then whitespace is collapsed with ``strip_extra_spaces``.
    """
    without_empty_context = docItem.replace('() =>', '')
    return strip_extra_spaces(without_empty_context)

# Sanity checks for equalize_docItem: an empty `() =>` context is dropped,
# and an already normalized item passes through unchanged.
sign = '($$!) :: () => (i -> r) -> Number r i -> r'
exptectedSign = '($$!) :: (i -> r) -> Number r i -> r'

assert equalize_docItem(sign) == exptectedSign
assert equalize_docItem(exptectedSign) == exptectedSign

In [19]:
def group_unique_functions(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``storageId`` that acts as a group id to each function.

    Rows whose ``docItem`` is identical after ``equalize_docItem`` form one
    group. Every row of a group receives the same ``storageId`` (the group
    number from ``GroupBy.ngroup``) and the group's first non-null
    ``docContent``, ``docItem`` and ``docType``.

    Args:
        df: Frame with at least the columns ``docId``, ``docItem``,
            ``docContent`` and ``docType``. It is not modified.

    Returns:
        A new frame with the columns ``docId``, ``storageId``,
        ``docContent``, ``docItem`` and ``docType``, indexed like ``df``.
    """
    # assign() returns a new frame, so the caller's frame is never mutated
    # (and no SettingWithCopyWarning is raised when a filtered slice is
    # passed in). Series.map replaces the row-wise DataFrame.apply: it only
    # touches the one column that is needed and still yields a Series when
    # the frame is empty.
    work = df.assign(equalizedDocItem=df['docItem'].map(equalize_docItem))
    groups = work.groupby('equalizedDocItem')

    work['storageId'] = groups.ngroup()
    for column in ('docContent', 'docItem', 'docType'):
        # 'first' picks the first non-null value within each group.
        work[column] = groups[column].transform('first')
    return work[['docId', 'storageId', 'docContent', 'docItem', 'docType']]


In [20]:
# Read the corpus, drop entries without a docItem (those are not functions),
# attach the group ids, order by group and write the result.
df = jl.read_jsonl(input_file)
df = df.loc[df['docItem'] != '']
df = group_unique_functions(df)
df = df.sort_values('storageId')
jl.to_jsonl(df, output_file)
