In [None]:
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
# Device used when loading the SentenceTransformer model and when moving
# manually tokenized inputs (see the `.to(device)` call further down).
device = "cpu"

def get_embds_score(t5, pred, gt):
    """Score a predicted text against a ground-truth text.

    Both texts are encoded with `t5.encode` (normalized embeddings, no
    progress bar), reshaped to single-row matrices, and compared with cosine
    similarity. The similarity is cubed and its absolute value is returned.

    Args:
        t5: Encoder object exposing an `encode` method.
        pred: Predicted text.
        gt: Ground-truth text.

    Returns:
        The scalar `abs(cos_sim ** 3)` taken from position [0][0] of the
        similarity matrix.
    """
    encode_options = {"normalize_embeddings": True, "show_progress_bar": False}
    pred_row = t5.encode(pred, **encode_options).reshape(1, -1)
    gt_row = t5.encode(gt, **encode_options).reshape(1, -1)

    similarity = cosine_similarity(gt_row, pred_row)
    return abs(similarity ** 3)[0][0]

def calc_score(t5, prompt, embds):
    """Mean cubed cosine similarity between one prompt and a set of embeddings.

    Args:
        t5: Encoder object exposing an `encode` method.
        prompt: Text to encode (normalized embedding, no progress bar).
        embds: Matrix of reference embeddings, one row per reference.

    Returns:
        The mean over all rows of `cosine_similarity(embds, prompt) ** 3`.
    """
    query = t5.encode(prompt, normalize_embeddings=True, show_progress_bar=False)
    query = query.reshape(1, -1)
    cubed_similarity = cosine_similarity(embds, query) ** 3
    return cubed_similarity.mean()

def get_dataset_pedro(path="prompts_selected.json"):
    """Load the selected rewrite prompts into a one-column DataFrame.

    Args:
        path: Location of the JSON file holding the prompts. Defaults to
            "prompts_selected.json" in the working directory.

    Returns:
        A DataFrame with a single "rewrite_prompt" column built from the
        decoded JSON content.
    """
    # Use a context manager so the file handle is closed deterministically;
    # JSON is read as UTF-8 regardless of the platform default encoding.
    with open(path, encoding="utf-8") as fh:
        prompts = json.load(fh)
    return pd.DataFrame({"rewrite_prompt": prompts})

In [None]:
# Load the sentence-T5 base checkpoint on the configured device; `t5` is the
# encoder passed to the scoring helpers and used by the cells below.
t5 = SentenceTransformer("sentence-transformers/sentence-t5-base", device=device)

In [None]:
# Load the reference rewrite prompts and encode them once (with
# normalize_embeddings=True); `embds` is reused by every scoring cell below.
df_gpt = get_dataset_pedro()
embds = t5.encode(df_gpt["rewrite_prompt"].tolist(), normalize_embeddings=True, show_progress_bar=True)

In [None]:
# Sanity check: number of reference embeddings produced by the encode call.
print(len(embds))

In [None]:
# Score a set of hand-written candidate prompts (some with a literal "</s>"
# appended) against the reference embeddings, printing one score per line.
for candidate_prompt in (
    'Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.',
    'Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.</s>',
    'Improve the text to this.',
    'Improve the text to this.</s>',
    'Rewrite the text to this.',
    'Rewrite the text to this.</s>',
    'Modify text better.',
    'Improve rephrase text manner this written to has language tone within to ',
):
    print(calc_score(t5, candidate_prompt, embds))



In [None]:
# Score the baseline prompts again together with three longer candidates,
# printing one mean cubed-similarity score per line.
for candidate_prompt in (
    'Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.',
    'Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.</s>',
    'Improve the text to this.',
    'Improve the text to this.</s>',
    'Rewrite the text to this.',
    'Rewrite the text to this.</s>',
    'Reword text better through things involved human expressed retell mentee item to create ensuing simple the following please atuin prospective',
    'Modify text better conveying described to human mentee this following body out it of language from on this is about.',
    'Improve text rephrase narrative tone to this .',
):
    print(calc_score(t5, candidate_prompt, embds))


In [None]:
# Tokenize the prompt by hand instead of going through `t5.encode`.
# add_special_tokens=False asks the tokenizer not to add special tokens itself;
# presumably the literal "</s>" in the text is meant to be the only end marker
# -- TODO confirm how the tokenizer treats it.
tids = t5.tokenizer(['Improve the text to this.</s>'], return_tensors="pt", add_special_tokens=False).to(device)

# Decode the ids back to text to inspect what the tokenizer produced.
print(t5.tokenizer.batch_decode(tids["input_ids"]))

import torch
# Run the model directly on the tokenized features without gradient tracking
# and pull the sentence embedding back as a NumPy array.
with torch.no_grad():
    tembds = t5(tids)["sentence_embedding"].cpu().numpy()

# Mean cubed cosine similarity against the reference embeddings; the last
# line displays the value as the cell output.
cos_sim = (cosine_similarity(tembds, embds) ** 3).mean()
cos_sim

In [None]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

def get_most_common_ngrams(texts, n=2, top_n=5):
    """Return the most frequent word n-grams across a collection of texts.

    Args:
        texts: Iterable of strings to analyze.
        n: Size of the n-grams to count (2 means bigrams).
        top_n: Number of (ngram, count) pairs to return.

    Returns:
        A list of `(ngram, count)` tuples, most frequent first.
    """
    import numpy as np  # local import: numpy is not imported in the visible cells

    vectorizer = CountVectorizer(ngram_range=(n, n))
    X = vectorizer.fit_transform(texts)
    # Column totals computed without densifying the document-term matrix.
    count_values = np.asarray(X.sum(axis=0)).ravel()
    # `vocabulary_` maps each n-gram to its column index. Its key order is not
    # the column order, so counts must be looked up by index rather than
    # zipped positionally against the keys.
    freq_dist = Counter(
        {term: int(count_values[col]) for term, col in vectorizer.vocabulary_.items()}
    )
    return freq_dist.most_common(top_n)

# Request the top 10 bigrams (n=2) over the reference prompts and display them.
n_grams = get_most_common_ngrams(df_gpt["rewrite_prompt"].tolist(), n=2, top_n=10)

n_grams

In [None]:
# Build a bag-of-words count over the reference prompts. Each whitespace token
# is reduced to its alphanumeric characters and lower-cased; tokens that end up
# empty are ignored.
text_list = df_gpt["rewrite_prompt"].tolist()
bow = {}
for text in text_list:
    for token in text.split():
        word = "".join(ch for ch in token if ch.isalnum()).lower().strip()
        if word:
            bow[word] = bow.get(word, 0) + 1

# Keep the 1000 most frequent words (ties keep first-seen order).
bow_tup = list(bow.items())
sorted_bow = sorted(bow_tup, key=lambda item: item[1], reverse=True)[:1000]

# Drop a hand-picked list of words from the candidate vocabulary.
excluded_words = {
    "portrayal", "conveying", "convey", "compelling", "compel", "expressing",
    "improving", "retell", "reword", "engaging", "storytelling",
}
all_words = [word for word, _count in sorted_bow if word not in excluded_words]
len(all_words), all_words[:10]

In [None]:
# Beam search over `all_words`: at every step each kept beam is extended by
# every vocabulary word, the resulting sentences are scored by their mean cubed
# cosine similarity to the reference embeddings, and the best `beam_width`
# sequences are kept for the next step.
beam_width = 50  # Number of beams to keep after each step
num_words = 12  # Total number of words to generate
all_beams = [([], 0)]  # Starting with empty sequence and 0 score
pbar = tqdm(range(num_words))
for step in pbar:
    new_beams = []
    for sel_words, score in all_beams:
        cur_text = " ".join(sel_words)
        if sel_words:
            cur_text += " "

        # Format every candidate the way it is scored: first letter
        # capitalized, and a trailing period once it is longer than 3 words.
        all_text = []
        for word in all_words:
            t = cur_text + word
            t = t[0].upper() + t[1:]
            if len(t.split()) > 3:
                t = t + "."
            all_text.append(t)

        text_embds = t5.encode(all_text, normalize_embeddings=True, show_progress_bar=False)
        # One score per candidate: average over all reference embeddings.
        scores = (cosine_similarity(embds, text_embds) ** 3).mean(axis=0)
        for i, new_score in enumerate(scores):
            new_beams.append((sel_words + [all_words[i]], new_score))
    # Keep only the best `beam_width` beams
    all_beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
    best = all_beams[0]
    # set_description takes a single string (its second positional argument is
    # `refresh`), so the score and the words are formatted into one description.
    pbar.set_description(f"{best[1]:.4f} {' '.join(best[0])}")

# Select the best beam
best_words, best_score = max(all_beams, key=lambda x: x[1])
print(" ".join(best_words))