# NOTES TO PUT IN REPORT

To get the dataset, run
_pip install parlai_ - This does not work with Python 3.10. I used Google Colab, but trying Python 3.9 or 3.8 could also help.

The files are located in
_/usr/local/lib/python3.11/dist-packages/data_. They are already separated into train, test, and valid splits, along with some preprocessing files I may or may not use.

MAYBE I SHOULD USE A CLASSIFICATION MODEL ON THE DATASET TOO (to determine whether a thought is even unhelpful in the first place). It would kinda suck to have a no-therapy tool that constantly reframes your already positive thoughts into different positive thoughts, making it hard to even tell if you know what you're doing or not.

**imports**

In [1]:
import pandas as pd
import json
import re
import itertools 

In [2]:
#precompile regex patterns for better performance (not redoing in each later function call)
def precompile_mappings(mappings):
    r"""Compile one case-insensitive regex per key of ``mappings``.

    Patterns are compiled once here so later calls can reuse them instead of
    recompiling the same expression for every piece of text.

    Each key is matched literally (special characters are escaped). A ``\b``
    word boundary is added only on a side of the key that is itself a word
    character: ``\b`` next to an apostrophe or other punctuation would demand
    a word character on the *outside* of the key, so keys such as "'cause" or
    "goin'" would never match at the start/end of a word.

    Args:
        mappings: dict whose keys are the texts to search for. Only the keys
            are read; the values are not used here.

    Returns:
        dict mapping every key of ``mappings`` to its compiled pattern.
    """
    precompiled = {}
    for key in mappings:
        if not key:
            #an empty key would otherwise match at every word boundary;
            #keep the entry (so lookups by key still work) but make it never match
            precompiled[key] = re.compile(r"(?!)")
            continue

        #use word boundaries to avoid partial matches in words ("I" inside "It"),
        #but only where the key's own edge is a word character
        leading = r"\b" if re.match(r"\w", key[0]) else ""
        trailing = r"\b" if re.match(r"\w", key[-1]) else ""

        #escape the key so punctuation in it is matched literally
        pattern = leading + re.escape(key) + trailing

        #turn pattern into regex object for later use (avoid recompiling regex in each check)
        precompiled[key] = re.compile(pattern, flags=re.IGNORECASE)
    return precompiled

In [3]:
#load the replacement-text JSON dictionaries used for data cleaning
mapping_files = ("contractions_dict.json", "slang_dict.json", "leftovers_dict.json")

#kept in this order so replacements run as: all contractions, then all slang, then any leftovers
all_mappings = []
for mapping_file in mapping_files:
    with open(mapping_file, "r", encoding="utf-8") as file:
        all_mappings.append(json.load(file))

#individual names for direct access to each dictionary
contractions, slang, leftovers = all_mappings


#compile each dictionary's patterns once up front (avoids recompiling every time they're needed)
#list order matches the replacement order: contractions, slang, leftovers
precompiled_regex = [
    precompile_mappings(mapping)
    for mapping in (contractions, slang, leftovers)
]

#individual names for direct access to each set of compiled patterns
precomp_contractions, precomp_slang, precomp_leftovers = precompiled_regex


#order each dictionary's keys longest -> shortest so "I'm'a" gets checked before "I'm"
#sorting once here means nothing has to be re-sorted later
#(a reversed sort is still stable, so equal-length keys keep their dictionary order)
sorted_keys = [
    sorted(mapping, key=len, reverse=True)
    for mapping in (contractions, slang, leftovers)
]

#individual names for direct access to each sorted key list
sorted_contractions, sorted_slang, sorted_leftovers = sorted_keys

In [4]:
#run each replacement text mapping on given text (needed for all models, so I will do in this step) 
def normalize_text(text):
    """Apply every replacement mapping to ``text`` and return the result.

    The mapping groups are applied one after another, in the order of the
    module-level lists ``sorted_keys``, ``precompiled_regex`` and
    ``all_mappings`` (which are walked in parallel). Within a group, keys are
    tried in the order given by the matching ``sorted_keys`` entry.

    Replacement values are inserted literally: they are never interpreted as a
    regex replacement template, so a backslash or ``\\g<...>`` inside a value
    cannot be expanded or raise ``re.error``.

    Args:
        text: the string to normalize.

    Returns:
        The string after all replacements have been applied.
    """
    #run each replacement group individually (one after another)
    for keys, patterns, mappings in zip(sorted_keys, precompiled_regex, all_mappings):
        for key in keys:
            replacement = mappings[key]
            #a function replacement makes re insert the value verbatim
            text = patterns[key].sub(lambda _match, replacement=replacement: replacement, text)
    return text

In [5]:
#dataset splits to process, in this order
datasets = ["train", "valid", "test"]

#number of rows produced per split (same order as datasets) - used to confirm the split sizes
lengths = []


def _word_count(text):
    """Return the number of whitespace-separated words in ``text``."""
    return len(text.split())


#build and save the cleaned CSVs for every split
for dataset in datasets:

    #column buffers for this split
    final_persona, final_pattern = [], []
    final_negative_thoughts, final_reframed_thoughts = [], []

    #longest texts seen so far (for knowing max token length in tokenizer)
    longest_input = longest_negative = longest_reframe = ""

    #line-delimited JSON file - every line is its own JSON object, handled separately
    with open(f"./reframe_thoughts_dataset/{dataset}.txt", encoding="utf-8") as file:
        for line in file:
            #original text files have \u2019 (on top of \u0027), and the former messed with the data
            #no working fix was found, so this runs on a copy of the dataset with them removed
            obj = json.loads(line)

            #normalize each input to remove contractions/slang/etc.
            persona, pattern, thought = (
                normalize_text(obj[field]) for field in ("persona", "pattern", "thought")
            )

            #new candidate is listed first so that a tie goes to the newest text
            longest_negative = max((thought, longest_negative), key=_word_count)

            #full model input: persona, pattern and thought joined by single spaces
            long_input = f"{persona} {pattern} {thought}"
            longest_input = max((long_input, longest_input), key=_word_count)

            #one row per reframe: the inputs are repeated so every output has its own input-output pair
            #this does increase chance of overfitting, so hyperparams will be tuned to guard against it
            for reframe in obj["reframes"]:
                reframe_thought = normalize_text(reframe["reframe"])
                longest_reframe = max((reframe_thought, longest_reframe), key=_word_count)

                #each stored value carries a trailing newline
                final_persona.append(persona + "\n")
                final_pattern.append(pattern + "\n")
                final_negative_thoughts.append(thought + "\n")
                final_reframed_thoughts.append(reframe_thought + "\n")

    #full table first, then the two-column table the models strictly need
    all_data = pd.DataFrame({
        "persona": final_persona,
        "pattern": final_pattern,
        "negative_thought": final_negative_thoughts,
        "reframed_thought": final_reframed_thoughts
    })
    required_data = pd.DataFrame({
        "negative_thought": final_negative_thoughts,
        "reframed_thought": final_reframed_thoughts
    })

    #record row count for vis of data split - every column has the same length
    lengths.append(len(final_reframed_thoughts))

    #save new datasets
    required_data.to_csv(f"{dataset}_data.csv", index=False, encoding="utf-8")
    all_data.to_csv(f"all_{dataset}_data.csv", index=False, encoding="utf-8")

    #report the longest input and output, measured in words
    print(f"Longest input (with persona and pattern, without keywords): {_word_count(longest_input)} words.")
    print(f"Longest input (just negative_thought): {_word_count(longest_negative)} words.")
    print(f"Longest output (reframed_thought): {_word_count(longest_reframe)} words.")

    print(f"{dataset} data done!\n")

Longest input (with persona and pattern, without keywords): 108 words.
Longest input (just negative_thought): 59 words.
Longest output (reframed_thought): 70 words.
train data done!

Longest input (with persona and pattern, without keywords): 113 words.
Longest input (just negative_thought): 56 words.
Longest output (reframed_thought): 79 words.
valid data done!

Longest input (with persona and pattern, without keywords): 120 words.
Longest input (just negative_thought): 45 words.
Longest output (reframed_thought): 68 words.
test data done!



In [6]:
#label each split's row count (lengths is in train, valid, test order)
split_sizes = {
    "Training": lengths[0],
    "Validation": lengths[1],
    "Testing": lengths[2],
}
total_length = sum(split_sizes.values())

print(f"Total Dataset Size: {total_length}\n")

#one line per split: absolute size and its share of the whole dataset
for split_label, split_size in split_sizes.items():
    print(f"{split_label} Data Size: {split_size} samples - {(split_size / total_length * 100):.2f}% of data")

Total Dataset Size: 26507

Training Data Size: 18635 samples - 70.30% of data
Validation Data Size: 5249 samples - 19.80% of data
Testing Data Size: 2623 samples - 9.90% of data
