In [1]:
import csv
import re
import pandas as pd

# Placeholder stored in the "prompt" column by filter_prompt_response_pairs
# when a row has no usable previous line; replace_no_prompt_sequence later
# swaps it for caller-chosen text.
NO_PROMPT_SEQUENCE = "<<NO PROMPT>>"


In [2]:
def remove_bracketed_comments(old_file, new_file):
    """Copy a two-column CSV, stripping square-bracketed text from column two.

    The first row of ``old_file`` is treated as a header and copied unchanged.
    For every following row, each ``[...]`` span is removed from the second
    field; any ``[`` still left afterwards (an opening bracket with no closing
    ``]``) is dropped, while the text that followed it is kept.

    Args:
        old_file: Path of the CSV to read (a header row, then two-field rows).
        new_file: Path of the CSV to write; overwritten if it already exists.

    Raises:
        ValueError: If a non-blank data row does not have exactly two fields.
    """
    # newline='' is what the csv module expects: it prevents "\r\r\n" line
    # endings on Windows and keeps newlines embedded in quoted fields intact.
    with open(old_file, 'r', newline='') as source, \
            open(new_file, 'w', newline='') as target:
        reader = csv.reader(source)
        writer = csv.writer(target)

        # Copy the header row; an empty input simply yields an empty output.
        header = next(reader, None)
        if header is None:
            return
        writer.writerow(header)

        for row in reader:
            # csv.reader yields [] for blank lines; there is nothing to clean.
            if not row:
                continue
            person, comment = row

            # Remove every complete "[...]" span (non-greedy, so several
            # spans on one line are removed independently).
            cleaned = re.sub(r"\[(.*?)\]", '', comment)
            # The lazy group matches the empty string here, so this only
            # deletes leftover "[" characters, not the text after them.
            cleaned = re.sub(r"\[(.*?)", '', cleaned)

            writer.writerow([person, cleaned])

In [3]:
def filter_prompt_response_pairs(old_file, new_file, log=False):
    """Turn a (name, line) CSV into (name, response, prompt) rows.

    The header row of ``old_file`` is skipped and a fresh header
    ``name,response,prompt`` is written. For each data row, the prompt is the
    line of the row before it. The first data row, and any row whose speaker
    equals the previous row's speaker (taken to mean the scene changed), gets
    ``NO_PROMPT_SEQUENCE`` as its prompt instead.

    Args:
        old_file: Path of the CSV to read; column 0 is the speaker name and
            column 1 is the spoken line.
        new_file: Path of the CSV to write; overwritten if it already exists.
        log: When true, print a notice for every row written without a prompt.

    Raises:
        IndexError: If a non-blank data row has fewer than two fields.
    """
    # newline='' is what the csv module expects: it prevents "\r\r\n" line
    # endings on Windows and keeps newlines embedded in quoted fields intact.
    with open(old_file, 'r', newline='') as source, \
            open(new_file, 'w', newline='') as target:
        reader = csv.reader(source)
        writer = csv.writer(target)
        writer.writerow(["name", "response", "prompt"])

        # Discard the input header (tolerating a completely empty file).
        next(reader, None)

        prev_name = None
        prev_line = None
        # line_number counts data rows from 1 and is only used for logging.
        for line_number, row in enumerate(reader, start=1):
            # csv.reader yields [] for blank lines; skip them rather than
            # crash on row[0].
            if not row:
                continue

            name = row[0]
            response = row[1]

            if name != prev_name and prev_line is not None:
                prompt = prev_line
            else:
                # Either the very first row or the same speaker twice in a row.
                prompt = NO_PROMPT_SEQUENCE
                if log:
                    print(f"NO PROMPT FOUND FOR LINE {line_number}")

            writer.writerow([name, response, prompt])
            prev_name = name
            prev_line = response

In [4]:
def filter_by_name(old_file, new_file, name):
    """Write only the rows whose ``name`` column equals ``name`` to a new CSV.

    Args:
        old_file: Path of the CSV to read; must contain a ``name`` column.
        new_file: Path of the CSV to write (no index column is written).
        name: Value the ``name`` column must equal for a row to be kept.
    """
    all_rows = pd.read_csv(old_file)
    # Boolean mask selecting the rows spoken by the requested name.
    is_requested_speaker = all_rows["name"] == name
    all_rows[is_requested_speaker].to_csv(new_file, index=False)

In [5]:
def replace_no_prompt_sequence(old_file, new_file, replacement):
    """Copy a CSV, swapping every ``NO_PROMPT_SEQUENCE`` cell for ``replacement``.

    The substitution is applied to the whole table, not just one column.

    Args:
        old_file: Path of the CSV to read.
        new_file: Path of the CSV to write (no index column is written).
        replacement: Value to store wherever a cell equals ``NO_PROMPT_SEQUENCE``.
    """
    table = pd.read_csv(old_file)
    substituted = table.replace(to_replace=NO_PROMPT_SEQUENCE, value=replacement)
    substituted.to_csv(new_file, index=False)

In [6]:
def get_final_dataset(old_file, new_file):
    """Write just the ``prompt`` and ``response`` columns, in that order, to a new CSV.

    Args:
        old_file: Path of the CSV to read; must contain ``prompt`` and
            ``response`` columns.
        new_file: Path of the CSV to write (no index column is written).
    """
    wanted_columns = ["prompt", "response"]
    full_table = pd.read_csv(old_file)
    full_table[wanted_columns].to_csv(new_file, index=False)

In [7]:
# Step 1: remove bracketed text from Tony.csv.
# new_file is left bound so the next cell can pick it up as its input.
file = "Tony.csv"
new_file = "removed_bracketed_comments.csv"
remove_bracketed_comments(file, new_file)

In [8]:
# Step 2: pair each line with the preceding line as its prompt.
# The previous cell's output path (new_file) becomes this cell's input.
file = new_file
new_file = "prompt_response.csv"

filter_prompt_response_pairs(file, new_file)

In [9]:
# Step 3: keep only the rows whose "name" column equals "Tony".
# The previous cell's output path (new_file) becomes this cell's input.
file = new_file
new_file = "tony_stark_prompt_response.csv"
person = "Tony"

filter_by_name(file, new_file, person)

In [10]:
# Step 4: replace the NO_PROMPT_SEQUENCE placeholder with "iron man".
# The previous cell's output path (new_file) becomes this cell's input.
file = new_file
new_file = "tony_stark_prompt_response_with_replacement.csv"
replacement = "iron man"

replace_no_prompt_sequence(file, new_file, replacement)

In [11]:
# Step 5: write the final dataset containing only the prompt and response columns.
# The previous cell's output path (new_file) becomes this cell's input.
file = new_file
new_file = "FINAL_DATA.csv"

get_final_dataset(file, new_file)