In [1]:
import os
os.putenv("CUDA_VISIBLE_DEVICES", "")


from typing import Union, Iterable, List
from pathlib import Path
import json
import torch
import re
from catalyst.utils import set_global_seed
import random
import numpy as np


def read_jsonl(path):
    with open(path, 'r') as istream:
        return [json.loads(l) for l in istream]


SEED = 33
set_global_seed(SEED)

In [2]:
DATA_FOLDER = Path.home() / "data/method_name_prediction/python/final/jsonl"


train = read_jsonl(DATA_FOLDER / "train_preprocessed.jsonl")
valid = read_jsonl(DATA_FOLDER / "valid_preprocessed.jsonl")
test = read_jsonl(DATA_FOLDER / "test_preprocessed.jsonl")

In [3]:
train[0]['function_name']

'zmq_device'

In [4]:
train_names = {e['function_name'] for e in train}
valid_names = {e['function_name'] for e in valid}
test_names = {e['function_name'] for e in test}

In [5]:
def get_and_flatten(data, key):
    return [
        [
            tok for line in e[key]
            for tok in line
        ]
        for e in data
    ]


In [6]:
body_key = 'function_body_tokenized'
train_sentences = get_and_flatten(train, body_key)
valid_sentences = get_and_flatten(valid, body_key)
test_sentences = get_and_flatten(test, body_key)

name_key = 'function_name_tokenized'
train_names = get_and_flatten(train, name_key)
valid_names = get_and_flatten(valid, name_key)
test_names = get_and_flatten(test, name_key)

In [7]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity

In [8]:
dictionary = Dictionary(train_sentences)

In [9]:
def get_bow(dictionary, corpus):
    return [dictionary.doc2bow(d) for d in corpus]

In [10]:
bow_train = get_bow(dictionary, train_sentences)
bow_valid = get_bow(dictionary, valid_sentences)
bow_test = get_bow(dictionary, test_sentences)

In [11]:
tfidf_train = TfidfModel(bow_train)[bow_train]

In [12]:
index = SparseMatrixSimilarity(tfidf_train, num_features=len(dictionary))

In [13]:
tfidf_valid = TfidfModel(bow_valid)[bow_valid]
tfidf_test = TfidfModel(bow_test)[bow_test]

In [14]:
from tqdm import tqdm
import pandas as pd
from utils import compute_metrics


def evaluate_tfidf(index, tokenized_candidates, tfidf_corpus, tokenized_names):
    metrics = []
    for i, example in tqdm(enumerate(tfidf_corpus)):
        top_5_idx = np.argsort(
            index.get_similarities(example)
        )[-1:-5:-1]
        candidates = [tokenized_candidates[j] for j in top_5_idx]
        metrics.append(compute_metrics(tokenized_names[i], candidates))
    return pd.DataFrame(metrics)


In [15]:
EOS_TOKEN = '\\u\\u\\uNEWLINE\\u\\u\\u_'
train_names = [name[:name.index(EOS_TOKEN)] for name in train_names]
test_names = [name[:name.index(EOS_TOKEN)] for name in test_names]

In [16]:
test_metrics = evaluate_tfidf(index, train_names, tfidf_test, test_names)

21877it [1:09:36,  5.24it/s]


In [17]:
test_metrics.mean()

exact-top-1        0.035974
exact-top-5        0.052567
precision-top-1    0.094529
precision-top-5    0.183505
recall-top-1       0.091906
recall-top-5       0.174692
f1-top1            0.090522
f1-top5            0.173129
dtype: float64