In [None]:
import pandas as pd
from rapidfuzz import fuzz
from tqdm.auto import tqdm
# Hook tqdm into pandas so the `progress_apply` calls in later cells are available.
tqdm.pandas()

In [None]:
# Load the dataset to be cleaned. The last cell of this notebook writes the
# cleaned frame back to this same path.
data = pd.read_csv(filepath_or_buffer="../../data/data.csv")

In [None]:
# Show the row count before and after discarding rows that have a missing
# value in any column.
print(data.shape[0])
data.dropna(axis=0, how="any", inplace=True)
print(data.shape[0])

In [None]:
# Drop rows based on the content of rewrite_prompt; the remaining shape is
# printed after each of the three filtering stages.

# Stage 1: prompts that contain "translate", "from" and "to" (case-insensitive).
is_translation = (
    data.rewrite_prompt.str.contains("Translate", case=False)
    & data.rewrite_prompt.str.contains("from", case=False)
    & data.rewrite_prompt.str.contains("to", case=False)
)
data = data.loc[~is_translation]
print(data.shape)

# Stage 2: prompts that mention any of these words (literal, case-insensitive
# substring match).
words_to_remove = [
    "painting",
    "logo",
    "programming",
    "json",
    "html",
    "markdown",
    "python",
    "java",
    "audio",
    "visual",
    "storyboard",
]
for word in words_to_remove:
    mentions_word = data.rewrite_prompt.str.contains(word, regex=False, case=False)
    data = data.loc[~mentions_word]
print(data.shape)

# Stage 3: prompts that contain both "code" and "language".
is_code_language = (
    data.rewrite_prompt.str.contains("code", case=False)
    & data.rewrite_prompt.str.contains("language", case=False)
)
data = data.loc[~is_code_language]
print(data.shape)

In [None]:
## Process rewritten text
# first_phrase = first piece of the whitespace-stripped rewritten_text after
# splitting on the pattern "\\n".
# NOTE(review): the pattern is the two characters backslash + "n"; pandas
# treats multi-character split patterns as regular expressions by default, so
# this presumably splits on newline characters -- confirm against the data.
data["first_phrase"] = (
    data.rewritten_text
    .str.strip()
    .str.split("\\n", expand=True)
    .iloc[:, 0]
)

def remove_first_phrase(x):
    """Return ``x.rewritten_text`` with its leading ``x.first_phrase`` removed.

    Only the occurrence at the start of the text (ignoring leading whitespace)
    is removed; later repetitions of the same phrase inside the body are kept.
    Leading whitespace and the text that followed the phrase are preserved.

    Args:
        x: A row-like object exposing string attributes ``rewritten_text`` and
            ``first_phrase`` (e.g. a row passed by ``DataFrame.apply(axis=1)``).

    Returns:
        The text without its leading phrase, or the text unchanged when the
        phrase is empty or the text does not start with it (for example when
        the phrase was already removed by an earlier call).
    """
    phrase = x.first_phrase
    text = x.rewritten_text
    if not phrase or not text.lstrip().startswith(phrase):
        return text
    # The phrase starts with a non-whitespace character, so the first match is
    # the leading one; count=1 leaves any later repetition untouched.
    return text.replace(phrase, "", 1)

# Remove the first phrase from rewritten_text when that phrase contains a
# markdown heading marker ("##"), a bold marker ("**"), or the words "sure" /
# "here's" (the last two matched case-insensitively).
#
# All patterns are matched literally (regex=False), which avoids the invalid
# "\*" escape sequence in a normal string literal. They are OR-ed into a single
# mask so that a row matching several patterns is cleaned exactly once.
has_unwanted_first_phrase = (
    data.first_phrase.str.contains("##", regex=False)
    | data.first_phrase.str.contains("**", regex=False)
    | data.first_phrase.str.contains("sure", case=False, regex=False)
    | data.first_phrase.str.contains("here's", case=False, regex=False)
)

# Skip the assignment when nothing matches: applying a row function to an
# empty selection does not yield a Series that can be assigned to a column.
if has_unwanted_first_phrase.any():
    data.loc[has_unwanted_first_phrase, "rewritten_text"] = data.loc[
        has_unwanted_first_phrase
    ].apply(remove_first_phrase, axis=1)

In [None]:
# Second pass of the "**" first-phrase cleanup. The preceding cell already
# handles first phrases containing "**", and first_phrase is not recomputed in
# between, so this re-applies remove_first_phrase with the same phrase.
# NOTE(review): this looks like a leftover duplicate -- confirm whether the
# cell can be deleted.
#
# "**" is matched literally (regex=False) instead of through the invalid "\*"
# escape sequence in a normal string literal.
has_bold_marker = data.first_phrase.str.contains("**", regex=False)

# Skip the assignment when nothing matches: applying a row function to an
# empty selection does not yield a Series that can be assigned to a column.
if has_bold_marker.any():
    data.loc[has_bold_marker, "rewritten_text"] = data.loc[
        has_bold_marker
    ].apply(remove_first_phrase, axis=1)

In [None]:
# Keep only rows whose rewritten_text differs from original_text.
data = data.loc[data.rewritten_text != data.original_text]
print(data.shape)

In [None]:
# Substrings that disqualify a row when found in rewritten_text
# (case-insensitive match).
to_remove = [
    "i am not able", "i'm not able",
    "text does not", "text doesn't",
    "i am unable", "i'm unable",
    "i will not provide", "i won't provide",
    "inappropriate",
    "the rewritten text", "the transformed text",
    "text rewritten", "text transformed",
    "here is the text",
    "text you provided", "text provided",
    "does not describe",
    "certainly",
]

# Filter one phrase at a time; each pass only looks at the rows that survived
# the previous passes.
for word in to_remove:
    contains_phrase = data.rewritten_text.str.contains(word, case=False)
    data = data[~contains_phrase]
print(data.shape)

In [None]:
# Score each row with rapidfuzz's fuzz.ratio between the original and the
# rewritten text (progress bar via tqdm's progress_apply).
data["text_ratio"] = data.progress_apply(
    lambda row: fuzz.ratio(row.original_text, row.rewritten_text),
    axis=1,
)
# Keep rows scoring strictly below 95, i.e. drop rows with text_ratio >= 95.
data = data.loc[data.text_ratio.lt(95)]
print(data.shape)

In [None]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
def is_redundant_pattern(sentence):
    """Tell whether ``sentence`` contains "<noun> to be <past participle>".

    The sentence is tokenized and part-of-speech tagged with nltk, then scanned
    for four consecutive tokens where the first is tagged ``NN*``, the second
    is the word "to", the third is the word "be" (both compared
    case-insensitively) and the fourth is tagged ``VBN*``.

    Args:
        sentence: Text to inspect.

    Returns:
        True if at least one such four-token window exists, False otherwise.
    """
    tagged = pos_tag(word_tokenize(sentence))

    # Slide a four-token window over the (word, tag) pairs.
    windows = zip(tagged, tagged[1:], tagged[2:], tagged[3:])
    for noun, to_word, be_word, participle in windows:
        if (
            noun[1].startswith('NN')
            and to_word[0].lower() == 'to'
            and be_word[0].lower() == 'be'
            and participle[1].startswith('VBN')
        ):
            return True
    return False

In [None]:
# Flag every rewrite_prompt that matches is_redundant_pattern, then keep only
# the rows that were not flagged.
data["is_redudant"] = data["rewrite_prompt"].progress_apply(is_redundant_pattern)
data = data[~data["is_redudant"]]
print(data.shape)

In [None]:
# Keep only the output columns (helper columns added earlier are dropped),
# remove duplicate and incomplete rows, then strip surrounding whitespace from
# rewritten_text.
data = (
    data[["id", "original_text", "rewrite_prompt", "rewritten_text", "cluster"]]
    .drop_duplicates()
    .dropna()
    .assign(rewritten_text=lambda frame: frame.rewritten_text.str.strip())
)

In [None]:
# Persist the cleaned frame without the index column.
# NOTE: this is the same path the notebook reads its input from, so the source
# file is overwritten and a re-run starts from the already-filtered data.
data.to_csv(path_or_buf="../../data/data.csv", index=False)

In [None]:
## Add prompt variations and get new dataset
# import json
# prompt_variations = json.load(open("/home/llm-prompt-recovery/data/prompt_variations.json"))
# new_data = pd.read_csv("../../data/new_data.csv")
# old_data = pd.read_csv("../../data/data.csv")
# all_data = pd.concat([data, new_data])
# prompts_to_keep = list(prompt_variations.keys())
# for v in prompt_variations.values():
#     prompts_to_keep.extend(v)
# prompts_to_keep = set(prompts_to_keep)
# all_data = all_data.loc[all_data.rewrite_prompt.isin(prompts_to_keep)]
# original_text_counts = all_data.original_text.value_counts()
# texts_to_keep = original_text_counts[original_text_counts > 1].index.values
# all_data = all_data.loc[all_data.original_text.isin(texts_to_keep)]
# all_data.to_csv("../../data/new_data_for_training.csv", index=False)