In [1]:
from pathlib import Path
import json

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


device = "cuda:0"


def get_embds_score(t5, pred, gt):
    pred_embds = t5.encode(pred, normalize_embeddings=True, show_progress_bar=False).reshape(1, -1)
    gt_embds = t5.encode(gt, normalize_embeddings=True, show_progress_bar=False).reshape(1, -1)

    res = abs((cosine_similarity(gt_embds, pred_embds)) ** 3)

    return res[0][0]


def filter_df(df):
    df = df.fillna("")

    if "subject" in df.columns:
        df = df[["original_text", "rewrite_prompt", "rewritten_text", "subject"]].reset_index(drop=True)
    
    else:
        df = df[["original_text", "rewrite_prompt", "rewritten_text"]].reset_index(drop=True)
    
    df["original_text"] = df["original_text"].apply(lambda x: str(x).strip())
    df["rewritten_text"] = df["rewritten_text"].apply(lambda x: str(x).strip())
    df["rewrite_prompt"] = df["rewrite_prompt"].apply(lambda x: str(x).strip())

    if "subject" in df.columns:
        df["subject"] = df["subject"].apply(lambda x: str(x).strip())
        df = df[df["subject"].apply(lambda x: len(x) >= 5 and len(x) <= 200)].reset_index(
                drop=True
            )

    df = df[df["original_text"].apply(lambda x: len(x) >= 300 and len(x) <= 2000)].reset_index(
        drop=True
    )
    df = df[df["rewritten_text"].apply(lambda x: len(x) >= 200 and len(x) <= 3000)].reset_index(
        drop=True
    )
    df = df[df["rewrite_prompt"].apply(lambda x: len(x) >= 5 and len(x) <= 500)].reset_index(
        drop=True
    )


    return df



def get_dataset_pub(data_path="/kaggle/input/df_with_emb.parquet"):
    df = pd.read_parquet(data_path).fillna("")
    df = df[["original_text", "rewrite_prompt", "rewritten_text"]].reset_index(drop=True)
    df = filter_df(df)
    return df


def get_dataset_gpt():
    data_list = [
        # "/kaggle/input/gemma_rewritten_text_exllama/proc_dataset_updated.csv",
        # "/kaggle/input/pedro-data/data_subject.csv",
        # "/kaggle/input/pedro-data/data_subject_2.csv",
        # "/kaggle/input/pedro-data/data_subject_3.csv",
        
        "/home/mpf/code/kaggle/llm-prompt/selected_df_optim.csv"
    ]
    df = pd.concat([pd.read_csv(data) for data in data_list], ignore_index=True)
    df = filter_df(df)

    return df


def get_embds_path(t5, text_list, path):
    path = Path(path)

    # if path.exists():
    if 0:
        return np.load(path, allow_pickle=True)
    
    
    # text_list = ["".join([t for t in text if t.isalpha() or t in (" ",)]) for text in text_list]
    # print(text_list[:10])

    embds = t5.encode(text_list, normalize_embeddings=True, show_progress_bar=True, batch_size=8)
    np.save(path, embds, allow_pickle=True)

    return embds


def calc_score(t5, prompt, embds):
    prompt_embds = t5.encode(prompt, normalize_embeddings=True, show_progress_bar=False).reshape(1, -1)
    res = ((cosine_similarity(embds, prompt_embds)) ** 3).mean()
    return res


def get_dataset_pedro():
    # prompts = json.load(open("/home/mpf/code/kaggle/pedro-llm-prompt/src/data_generation/prompts_selected.json"))
    prompts = json.load(open("/home/mpf/code/kaggle/pedro-llm-prompt/data/prompts_selected.json"))
    df = pd.DataFrame({"rewrite_prompt": prompts})
    return df

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_pub = get_dataset_pub()
# df_gpt = get_dataset_gpt()
df_gpt = get_dataset_pedro()


In [3]:
t5 = SentenceTransformer("sentence-transformers/sentence-t5-base", device=device)

embds_pub = get_embds_path(t5, df_pub["rewrite_prompt"].tolist(), "/kaggle/working/llm_prompt_embds_pub.npy")
embds_gpt = get_embds_path(t5, df_gpt["rewrite_prompt"].tolist(), "/kaggle/working/llm_prompt_embds_gpt.npy")
   

Batches: 100%|██████████| 5085/5085 [00:51<00:00, 98.91it/s] 
Batches: 100%|██████████| 202/202 [00:02<00:00, 85.54it/s] 


In [4]:
print(len(embds_pub))
print(len(embds_gpt))

40676
1616


In [5]:
embds = embds_pub
# embds = embds_gpt

print(calc_score(t5, 'Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.', embds))
print(calc_score(t5, 'Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.</s>', embds))
print(calc_score(t5, 'Improve the text to this.', embds))
print(calc_score(t5, 'Improve the text to this.</s>', embds))
print(calc_score(t5, 'Rewrite the text to this.', embds))
print(calc_score(t5, 'Rewrite the text to this.</s>', embds))
print(calc_score(t5, 'Modify text better.', embds))

0.5021735
0.5205153
0.56059414
0.6021226
0.56864166
0.61298794
0.53580457


In [40]:

# embds = embds_pub
embds = embds_gpt

# print(calc_score(t5, 'Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.', embds))
# print(calc_score(t5, 'Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.</s>', embds))
# print(calc_score(t5, 'Improve the text to this.', embds))
# print(calc_score(t5, 'Improve the text to this.</s>', embds))
# print(calc_score(t5, 'Rewrite the text to this.', embds))
# print(calc_score(t5, 'Rewrite the text to this.</s>', embds))


# print(calc_score(t5, 'Improve rephrase text manner this written to has character in style.', embds))
# print(calc_score(t5, 'Improve rephrase text manner this written to has character in style.', embds))
print(calc_score(t5, 'Improve rephrase text manner this written to has character in style to .', embds))
print(calc_score(t5, 'Improve text rephrase manner this written being sounds describe or written to .', embds))


print(calc_score(t5, 'Improve text rephrase manner this written being sounds describe or written .', embds))
print(calc_score(t5, 'Improve rephrase text manner this piece tone into written object provided .', embds))


0.6432702
0.64625883
0.6402337
0.64625883


In [7]:
tids = t5.tokenizer(['Improve the text to this.</s>'], return_tensors="pt", add_special_tokens=False).to(device)

print(t5.tokenizer.batch_decode(tids["input_ids"]))

import torch
with torch.no_grad():
    tembds = t5(tids)["sentence_embedding"].cpu().numpy()

cos_sim = (cosine_similarity(tembds, embds) ** 3).mean()
cos_sim

['Improve the text to this.</s>']


0.59822756

In [8]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

def get_most_common_ngrams(texts, n=2, top_n=5):
    vectorizer = CountVectorizer(ngram_range=(n, n))
    X = vectorizer.fit_transform(texts)
    count_values = X.toarray().sum(axis=0)
    vocabulary = vectorizer.vocabulary_
    freq_dist = Counter(dict(zip(vocabulary.keys(), count_values)))
    return freq_dist.most_common(top_n)

n_grams = get_most_common_ngrams(df_gpt["rewrite_prompt"].tolist(), n=2, top_n=10)

n_grams

[('customer support', 284),
 ('persuasive using', 252),
 ('using exclusive', 252),
 ('simpler and', 218),
 ('see the', 189),
 ('past tense', 183),
 ('and upholds', 172),
 ('emotions evoked', 169),
 ('abbreviations to', 162),
 ('concepts such', 147)]

In [34]:
bow = {}

embds = embds_gpt
text_list = df_gpt["rewrite_prompt"].tolist()


# embds = embds_pub
# text_list = df_pub["rewrite_prompt"].tolist()


for i, text in enumerate(text_list):    
    words = text.split()
    
    for word in words:
        word = "".join(filter(str.isalnum, word)).lower().strip()
        
        if not word:
            continue

        if word not in bow:
            bow[word] = 0
        
        bow[word] += 1
        
bow_tup = [(k, v) for k, v in bow.items()]
sorted_bow = sorted(bow_tup, key=lambda x: x[1], reverse=True)
sorted_bow = list(sorted_bow)[:1000]

all_words = [tup[0] for tup in sorted_bow]
# all_words = [w for w in all_words if w not in ("portrayal", "conveying", "convey", "compelling", "compel", "expressing", "improving", "retell", "reword", "engaging", "storytelling")]
all_words = [w for w in all_words if w not in ("portrayal", "conveying", "convey", "compelling", "compel", "expressing", "improving", "retell", "reword", "engaging", "storytelling", "person", "to")]

# all_words = all_words + [word + "," for word in all_words]

len(all_words), all_words[:10]

(989,
 ['the', 'a', 'and', 'this', 'of', 'text', 'rewrite', 'in', 'more', 'tone'])

In [35]:
beam_width = 50  # Number of beams to keep after each step
num_words = 15  # Total number of words to generate
all_beams = [([], 0)]  # Starting with empty sequence and 0 score

for step in range(num_words):
    new_beams = []
    for sel_words, score in all_beams:
        cur_text = " ".join(sel_words)
        if sel_words:
            cur_text += " "
        all_text = [cur_text + word for word in all_words]
       
        for i, t in enumerate(all_text):
            t = t[0].upper() + t[1:]
            if len(sel_words) >= 3:
                t = t + "."
                # t = t + " to ."
            all_text[i] = t
    
        text_embds = t5.encode(all_text, normalize_embeddings=True, show_progress_bar=False, batch_size=8)
        scores = (cosine_similarity(embds, text_embds) ** 3).mean(axis=0)
        for i, new_score in enumerate(scores):
            new_beams.append((sel_words + [all_words[i]], new_score))

    # Keep only the best `beam_width` beams
    all_beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
    
    best = all_beams[0]
    print(step, best[1], " ".join(best[0]))

# Select the best beam
best_words, best_score = max(all_beams, key=lambda x: x[1])
print(" ".join(best_words))

0 0.53978646 rephrase
1 0.5973522 rephrase text
2 0.6075368 improve rephrase text
3 0.6167569 modify text better exude
4 0.62476146 improve text this sounds if
5 0.6342309 improve text this sounds within manner
6 0.6368616 improve rephrase text manner this piece tone
7 0.6409052 improve rephrase text manner this piece tone into
8 0.6420898 improve rephrase text manner this piece tone into given
9 0.64481306 improve rephrase text manner this piece tone form product into
10 0.64510465 improve rephrase text manner this piece tone into written object provided
11 0.6475574 improve rephrase text manner this piece tone form product into written given
12 0.64904976 improve rephrase text manner this piece tone form product into from current place
13 0.6496401 improve rephrase text manner this piece tone into written object within convincing appealing same
14 0.65114844 improve rephrase text manner this piece tone into written object within convincing suit current describing
improve rephrase tex

In [None]:
# 0 0.5397863 rephrase
# 1 0.5973522 rephrase text
# 2 0.6075368 improve rephrase text
# 3 0.6167569 modify text better exude
# 4 0.62476146 improve text this sounds if
# 5 0.6342308 improve text this sounds within manner
# 6 0.6368615 improve rephrase text manner this piece tone
# 7 0.6409051 improve rephrase text manner this piece tone into
# 8 0.6431984 improve rephrase text manner this written to has character
# 9 0.64508 improve rephrase text manner this written to has character with
# 10 0.6485689 improve rephrase text manner this written to has character in style
# 11 0.650034 improve rephrase text manner this written to has character is in style
# 12 0.65163815 improve rephrase text manner this written to has character has hear within fashion
# 13 0.65164465 improve rephrase text manner this written to has character has hear there in style
# 14 0.65171796 improve rephrase text manner this written to has character has hear there in style within
# improve rephrase text manner this written to has character has hear there in style within

In [None]:
# 0 0.5397863 rephrase
# 1 0.5973522 rephrase text
# 2 0.6075368 improve rephrase text
# 3 0.6157136 improve text this describe
# 4 0.62748814 improve text this sounds put
# 5 0.6391686 improve text rephrase manner this written
# 6 0.6386987 improve text rephrase manner this sounds written
# 7 0.6417676 improve text rephrase manner this write sounds written
# 8 0.64228374 improve text rephrase manner this written sounds describe written
# 9 0.64566153 improve text rephrase manner this written sounds describe or written
# 10 0.64625907 improve text rephrase manner this written being sounds describe or written
# 11 0.64795214 improve text rephrase manner this body written sounds express or are describing
# 12 0.6480724 improve text rephrase manner this body written sounds express or are describing written
# 13 0.65008354 improve text rephrase manner this body written sounds express or are describing an written
# 14 0.6513185 improve text rephrase manner this body written sounds express or are describing a was written
# improve text rephrase manner this body written sounds express or are describing a was written