In [2]:
import pandas as pd
import numpy as np
from typing import List

In [3]:
def read_transcript(file_path:str,strip_newlines:bool=True,encoding=None):
    """Read a transcript file and return its lines as a list of strings.

    Args:
        file_path: Path of the text file to read.
        strip_newlines: When True (default), leading/trailing whitespace
            (including the trailing newline) is stripped from every line.
            When False, the lines are returned exactly as read.
        encoding: Text encoding passed to ``open``. ``None`` (default) keeps
            the platform default encoding.

    Returns:
        A list with one string per line of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        transcript = f.readlines()
    if strip_newlines:
        return [line.strip() for line in transcript]
    # Return the raw lines when stripping is disabled; previously this path
    # hit an unassigned local and raised UnboundLocalError.
    return transcript

def create_dataframe(lines:List[str]):
    """Turn transcript lines of the form ``"<speaker>: <text>"`` into a DataFrame.

    Every line is split on its FIRST colon into a ``speaker`` and a ``text``
    column, so utterances that themselves contain a colon are kept whole.
    The text of rows 0 and 1 is copied into constant ``title`` and
    ``description`` columns, and those two rows are then removed.

    Args:
        lines: Transcript lines; rows 0 and 1 supply the title and description.

    Returns:
        DataFrame with columns ``text``, ``speaker``, ``title``,
        ``description``; the index keeps the original line numbers (2, 3, ...).

    Raises:
        IndexError: If any line contains no colon.
        KeyError: If fewer than two lines are supplied.
    """
    df = pd.DataFrame(lines, columns=['text'])
    # maxsplit=1: only the first colon separates the speaker from the utterance.
    parts = df['text'].apply(lambda x: x.split(':', 1))
    df['speaker'] = parts.apply(lambda p: p[0])
    df['text'] = parts.apply(lambda p: p[1].strip())
    df["title"] = df.loc[0, "text"]
    df["description"] = df.loc[1, "text"]
    # Drop the two header rows without mutating in place.
    return df.drop(index=[0, 1])

def create_embeddings_dataframe(lines:List[str]):
    """Wrap raw transcript lines in a one-column (``text``) DataFrame.

    Rows labelled 0 and 1 are removed; the remaining rows keep their
    original labels. Raises KeyError if those labels are absent.
    """
    frame = pd.DataFrame({'text': lines}, columns=['text'])
    frame = frame.drop(index=[0, 1])
    return frame

def create_finetuning_dataframe(lines:List[str], window:int=3):
    """Build (prompt, response) fine-tuning pairs from one transcript.

    ``lines[0]`` is treated as the title (unused) and ``lines[1]`` as the
    description; the dialogue starts at ``lines[2]``. If the dialogue opens
    with a ``THERAPIST:`` line, that opening line is skipped so that turns
    pair up as (even index -> prompt line, odd index -> response line).
    NOTE(review): this assumes the turns alternate strictly CLIENT/THERAPIST
    after that point; the speaker labels are not checked.

    Each prompt contains the description, up to ``window`` preceding turns of
    history, the current prompt line and a trailing ``"THERAPIST: "`` cue.
    Each response is the text after the first colon of the following line.

    Args:
        lines: Transcript lines (title, description, then dialogue turns).
        window: Maximum number of preceding turns included as history.

    Returns:
        DataFrame with columns ``prompt`` and ``response``.

    Raises:
        IndexError: If fewer than two lines are given, or a response line
            contains no colon.
    """
    description = lines[1]
    # Always skip the title/description header; additionally skip an opening
    # THERAPIST line so the first turn is the one being answered.
    first_turn = 3 if len(lines) > 2 and lines[2].startswith("THERAPIST:") else 2
    turns = lines[first_turn:]

    prompts = []
    responses = []
    for i in range(0, len(turns)-1, 2):
        w = max(0, min(window, i))
        # Slice end is exclusive, so [i-w:i] is exactly the w turns before turn i
        # (or the whole chat so far when i < window).
        history = turns[i-w:i]
        prompt = "prompt: " + description + "\n\n###\n\n" + "\n".join(history) + "\n"+ turns[i] +"\n" + "THERAPIST: "
        prompts.append(prompt)
        # Split on the first colon only so colons inside the reply are kept.
        responses.append(turns[i+1].split(":", 1)[1].strip())

    return pd.DataFrame({"prompt": prompts, "response": responses})


In [4]:
# Load transcript 0 as a list of whitespace-stripped lines.
transcript = read_transcript(file_path='./Transcripts/0.txt')

In [5]:
# Build the prompt/response pairs from the currently loaded transcript
# and write them to disk without the index column.
df_fine_tuning = create_finetuning_dataframe(lines=transcript)

df_fine_tuning.to_csv(path_or_buf='./finetuning_dataset_2.csv', index=False)


In [6]:
# Parse every transcript file into a DataFrame and collect the results.
# Each entry is (directory prefix, file indices stored in that directory).
# NOTE(review): index 311 is covered by neither range — confirm that is intended.
transcript_sources = [
    ('./Transcripts/', range(311)),
    ('./Transcripts2/', range(312, 1100)),
]

dfs = []
for transcript_dir, file_ids in transcript_sources:
    for i in file_ids:
        try:
            transcript = read_transcript(transcript_dir+str(i)+'.txt')
            df = create_dataframe(transcript)
            dfs.append(df)
        except Exception:
            # Best-effort load: report the index of any file that is missing or
            # cannot be parsed and move on. Catching Exception (not a bare
            # except) lets KeyboardInterrupt/SystemExit stop the loop.
            print(i)
            continue


742
743
744
745
746
747
748
749
750
868


In [None]:
# Stack the per-transcript frames into a single DataFrame. Row labels are
# kept as-is (no ignore_index), so labels repeat across transcripts.
df = pd.concat(objs=dfs)

In [27]:
# Build the embeddings frame from whatever `transcript` currently holds
# (the value left behind by the most recently executed loading cell).
embeddings_df = create_embeddings_dataframe(lines=transcript)
embeddings_df.head(5)

Unnamed: 0,text
2,THERAPIST: Hi! Come on in!
3,CLIENT: Good morning!
4,THERAPIST: I’m going to just grab some water.
5,"CLIENT: Okay, sounds good. (pause) How are you?"
6,"THERAPIST: Good, thank you!"
