In [23]:
# Enable IPython's autoreload extension so edited project modules are picked up
# without restarting the kernel (reload_ext is safe to re-run if already loaded).
%reload_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before each cell runs.
%autoreload 2


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from ranking.util import json_lines as jl
import pandas as pd
import numpy as np


In [25]:
def calculate_tfidf_for_dataset(corpus) -> pd.DataFrame:
    """Fit a TF-IDF model on ``corpus`` and return the weights as a sparse DataFrame.

    Args:
        corpus: Iterable of document strings; each element becomes one row.

    Returns:
        A sparse-backed DataFrame with one row per document (in input order,
        labelled positionally 0..n-1) and one column per vocabulary term
        produced by ``TfidfVectorizer`` with its default settings.
    """
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(corpus)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    return pd.DataFrame.sparse.from_spmatrix(weights, columns=vocabulary)

def lookup_tfidf_weight(df_tfidf: pd.DataFrame, storage_id, word):
    """Return the weight stored for ``word`` in the row labelled ``storage_id``.

    Words that are not a column of ``df_tfidf`` yield ``0``.
    """
    if word not in df_tfidf.columns:
        return 0
    return df_tfidf.at[storage_id, word]

def apply_tfidf_weights_to_doc(tfidf: pd.DataFrame, storage_id, doc):
    """Pair every whitespace-separated token of ``doc`` with its TF-IDF weight.

    Returns a list of ``(word, weight)`` tuples in document order; repeated
    words appear once per occurrence.
    """
    weighted_words = []
    for token in doc.split():
        weighted_words.append((token, lookup_tfidf_weight(tfidf, storage_id, token)))
    return weighted_words

def create_tfidf_dataset(df: pd.DataFrame, min_unique_words=5):
    """Build an evaluation set with TF-IDF weighted words and a derived query per document.

    Args:
        df: Frame with at least the columns ``storageId`` and ``docContent``.
            Only the first row of each ``storageId`` group is used.
        min_unique_words: Documents with fewer distinct whitespace-separated
            tokens than this are dropped.

    Returns:
        A new DataFrame indexed by ``storageId`` holding the first row of each
        group plus the columns ``n_unique_words``, ``tfidf`` (list of
        ``(word, weight)`` tuples) and ``docQuery`` (space-joined top words).
    """
    groups = df.groupby('storageId')
    corpus = groups['docContent'].first()
    tfidf = calculate_tfidf_for_dataset(corpus)
    # Weights are looked up by storageId below, so the matrix rows must carry
    # those labels; positional 0..n-1 labels would silently return another
    # document's weights whenever the ids are not contiguous from zero.
    tfidf.index = corpus.index

    eval_dataset = groups.first()
    eval_dataset['n_unique_words'] = eval_dataset['docContent'].str.split().apply(lambda x: np.unique(x).size)
    # Copy after filtering so the column assignments below write to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    eval_dataset = eval_dataset[eval_dataset['n_unique_words'] >= min_unique_words].copy()

    # List comprehensions instead of DataFrame.apply(axis=1): apply returns a
    # DataFrame for an empty frame, which cannot be assigned to a single column.
    eval_dataset['tfidf'] = [
        apply_tfidf_weights_to_doc(tfidf, storage_id, doc)
        for storage_id, doc in eval_dataset['docContent'].items()
    ]
    eval_dataset['docQuery'] = [
        get_query_from_doc(row, get_n_words_to_extract(row))
        for _, row in eval_dataset.iterrows()
    ]
    return eval_dataset

def get_n_words_to_extract(row, percent=0.3):
    """Return how many words to pull from a document.

    The count is ``percent`` of ``row['n_unique_words']``, rounded with the
    built-in ``round``.
    """
    return round(row['n_unique_words'] * percent)

def get_query_from_doc(row, n):
    """Build a query from the ``n`` highest-weighted distinct words of a document.

    ``row['tfidf']`` is a sequence of ``(word, weight)`` tuples. Duplicate
    tuples are dropped (first occurrence wins), the rest are ordered by
    descending weight with ties kept in document order, and the first ``n``
    words are joined with single spaces.
    """
    ranked = list(dict.fromkeys(row['tfidf']))
    # list.sort is stable, also with reverse=True, so equal weights keep their order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ' '.join(term for term, _ in ranked[:n])

In [30]:
# Load the source documents via the project's json_lines helper
# (presumably returns a DataFrame, since it is grouped by 'storageId' below — confirm).
dataset = jl.read_dataset('lem-test-all-unique-functions.jsonl')
# Derive TF-IDF weights and a query string for each document.
tfidf_ds = create_tfidf_dataset(dataset)
# Export is currently disabled; uncomment to write the docType/docQuery columns out.
# jl.write_dataset(tfidf_ds[['docType', 'docQuery']], 'test-tfidf-evalset.jsonl')


Unnamed: 0_level_0,docContent,docItem,docType,n_unique_words,tfidf,docQuery
storageId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,lookup function partially safe min n index,(!!!) :: [a] -> Int -> Maybe a,:: [a] -> Int -> Maybe a,7,"[(lookup, 0.37809988267404765), (function, 0.2...",partially min
1,like selects nth element x wrap end x map,(!!!) :: [a] -> Int -> a,:: [a] -> Int -> a,8,"[(like, 0.30779505246155114), (selects, 0.4647...",selects nth
2,x n return element x nb vector element ascend ...,"(!!) :: (KnownNat n, Enum i) => Vec n a -> i -> a",":: (KnownNat n, Enum i) => Vec n a -> i -> a",23,"[(x, 0), (n, 0), (return, 0.08665894289489474)...",nil 14 length element index ascend maximum
3,list index subscript operator start index larg...,"(!!) :: (MonadThrow m, Integral n) => [a] -> n...",":: (MonadThrow m, Integral n) => [a] -> n -> m a",13,"[(list, 0.19063676279088834), (index, 0.460993...",throw index emptylistexception negativeindexex...
5,index must small length list otherwise result ...,(!!) :: C n => [a] -> n -> a,:: C n => [a] -> n -> a,8,"[(index, 0.24577710157203816), (must, 0.385553...",small otherwise
...,...,...,...,...,...,...
477,fee tap drinker close used tap,"(+&) :: (Closable tap, MonadCatch m) => tap m ...",":: (Closable tap, MonadCatch m) => tap m -> Si...",5,"[(fee, 0.3525574803236666), (tap, 0.7051149606...",tap drinker
478,union two area span overlap abut merge result ...,(+) :: Area -> Area -> Area,:: Area -> Area -> Area,10,"[(union, 0.09238592846794644), (two, 0.0483317...",read area union
483,add two possibly scale squantitys preserve sca...,(+) :: Num a => SQuantity s d a -> SQuantity s...,:: Num a => SQuantity s d a -> SQuantity s d a...,13,"[(add, 0.15926397978939938), (two, 0.119471036...",scale factor squantitys changerepround
484,add two number number type variable mean opera...,(+) :: Num number => number -> number -> number,:: Num number => number -> number -> number,54,"[(add, 0.18279218766389926), (two, 0.045706866...",float int number tofloat round conversion like...
