In [59]:
# Always reload modules to have the current version
%reload_ext autoreload
%autoreload 2


In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from ranking.util import json_lines as jl
import pandas as pd
import numpy as np


In [65]:
def calculate_tfidf_for_dataset(df: pd.DataFrame) -> pd.DataFrame:
    docs = df['docContent']
    vectorizer = TfidfVectorizer()
    result = vectorizer.fit_transform(docs)
    df = pd.DataFrame.sparse.from_spmatrix(result, columns=vectorizer.get_feature_names())
    return df

def lookup_tfidf_weight(df_tfidf: pd.DataFrame, document_id, word):
    if word in df_tfidf.columns:
        return df_tfidf.at[document_id, word]
    else:
        return 0

def create_tfidf_dataset(finput):
    df = jl.read_jsonl(finput)
    tfidf = calculate_tfidf_for_dataset(df)
    df['tfidf'] = df.apply(lambda row: [(word, lookup_tfidf_weight(tfidf, row.name, word)) for word in row['docContent'].split()], axis=1)
    return df

def get_query_from_doc(row, n):
    unique_weighed_words = list(dict.fromkeys(row['tfidf']))
    max_n_scored_words = sorted(unique_weighed_words, key=lambda word_weight: word_weight[1], reverse=True)[:n]
    query = ' '.join([word for word, _ in max_n_scored_words])
    return query

In [62]:

df = create_tfidf_dataset('lem-all-unique-functions.jsonl')

In [66]:
df['n_unique_words'] = df['docContent'].str.split().apply(lambda x: np.unique(x).size)
at_least_five_unique_words = df['n_unique_words'] >= 5
filtered = df[at_least_five_unique_words]
filtered['docQuery'] = filtered.apply(lambda row: get_query_from_doc(row, 5), axis=1)
filtered['expectedDocIds'] = filtered['docIds']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['docQuery'] = filtered.apply(lambda row: get_query_from_doc(row, 5), axis=1)


In [67]:
jl.to_jsonl(filtered[['expectedDocIds', 'docType', 'docQuery']], 'tf_idf.test.jsonl')