In [13]:
import os
from dotenv import load_dotenv
load_dotenv()
from concurrent.futures import ThreadPoolExecutor
from openai import OpenAI
import time

Open guideline descriptions dir and read into a dictionary

In [14]:
# Folder holding one description file per proficiency level
dirPath = "../data/guideline_desc"

descFiles = os.listdir(dirPath)

# Maps level name (file name without its extension) -> description text
guidelines = {}

for filename in descFiles:
    filePath = os.path.join(dirPath, filename)

    # Skip sub-directories and anything else that is not a regular file
    if os.path.isfile(filePath):
        with open(filePath, "r") as file:
            # read() must be *called*: storing the bare `file.read` keeps a bound
            # method of a file that is about to be closed, and that method's repr
            # (not the description text) would end up inside the prompt.
            fileContent = file.read()
        # No extension filename
        strippedFilename = os.path.splitext(filename)[0]
        guidelines[strippedFilename] = fileContent

Create directories for training data, if not already created

In [15]:
# One output folder per guideline level. exist_ok=True keeps the cell safe to
# re-run without a separate existence check (and the check-then-create race that
# such a check leaves open); it still raises if a plain file occupies the path.
for key in guidelines:
    os.makedirs(f"../data/guideline_data/{key}", exist_ok=True)

Prompt for generating the statements

In [16]:
# Template for the user message of each request. The "{level}" and "{description}"
# placeholders are filled in with str.format() when generateLevel builds the request.
prompt = """
Your role is to generate a potential statement from a Spanish speaker that would be categorized as "{level}".

Here is the description for "{level}":

"{description}"

Using this context, generate 5 statements that would be classified as coming from an "{level}" Spanish speaker. The statements should be rather brief, as if they were part of a conversation, but need not only be one sentence. Each statement should vary in length, from 1-3 sentences. Only provide the statements - do NOT provide English translations.
"""

In [17]:
# Single client instance used by generateLevel for every request. The API key is
# read from the OPENAI_KEY environment variable (load_dotenv() runs in the import cell).
openAI_client = OpenAI(api_key=os.environ.get("OPENAI_KEY"))

def generateLevel(level, description, i):
    """Request one batch of statements for a level and save the reply to disk.

    Args:
        level: Name of the guideline level; used in the prompt and as the
            output sub-folder name.
        description: Description text substituted into the prompt.
        i: Batch index; the reply is written to "<i>.txt" in the level's folder.
    """
    print(f"Generating level {level} | {i}")

    # Build the single user message first so the API call itself stays short
    userMessage = {
        "role": "user",
        "content": prompt.format(level=level, description=description),
    }

    completion = openAI_client.chat.completions.create(
        messages=[userMessage],
        model="gpt-3.5-turbo",
        temperature=1.3,
        frequency_penalty=0.7
    )

    # Only the first choice of the completion is kept
    generatedText = completion.choices[0].message.content

    outputPath = f"../data/guideline_data/{level}/{i}.txt"
    with open(outputPath, "w") as outFile:
        outFile.write(generatedText)

I was going to benchmark between single thread and multithread, but single threaded speed almost killed me, so let's just stick to multithreaded :)

In [18]:
# Single-threaded variant, kept for reference only (it was far too slow):
# for level, description in guidelines.items():
#     for i in range(20):
#         generateLevel(level, description, i)

with ThreadPoolExecutor() as executor:
    for level, description in guidelines.items():
        pending = []
        for batchIndex in range(20):
            task = executor.submit(generateLevel, level, description, batchIndex)
            pending.append(task)
            # Pause between submissions to stay under the queries-per-second limit
            time.sleep(.5)

        # Block until every batch of this level has finished before starting
        # the next level; result() also re-raises any error from a worker.
        for task in pending:
            task.result()

Generating level Advanced Mid | 0
Generating level Advanced Mid | 1
Generating level Advanced Mid | 2
Generating level Advanced Mid | 3
Generating level Advanced Mid | 4
Generating level Advanced Mid | 5
Generating level Advanced Mid | 6
Generating level Advanced Mid | 7
Generating level Advanced Mid | 8
Generating level Advanced Mid | 9
Generating level Advanced Mid | 10
Generating level Advanced Mid | 11
Generating level Advanced Mid | 12
Generating level Advanced Mid | 13
Generating level Advanced Mid | 14
Generating level Advanced Mid | 15
Generating level Advanced Mid | 16
Generating level Advanced Mid | 17
Generating level Advanced Mid | 18
Generating level Advanced Mid | 19
Generating level Distinguished | 0
Generating level Distinguished | 1
Generating level Distinguished | 2
Generating level Distinguished | 3
Generating level Distinguished | 4
Generating level Distinguished | 5
Generating level Distinguished | 6
Generating level Distinguished | 7
Generating level Distinguished

Some of the generated files contain blank lines between the statements. The next cell removes them.

In [19]:
def removeBlankLines(path):
    """Rewrite the file at `path` in place without its blank lines.

    Every kept line is also stripped of leading and trailing whitespace, and
    the result is joined with single newlines (no trailing newline).
    """
    with open(path, 'r') as source:
        trimmed = (rawLine.strip() for rawLine in source)
        # Whitespace-only lines become empty strings and are dropped here
        keptLines = [text for text in trimmed if text]

    with open(path, 'w') as target:
        target.write('\n'.join(keptLines))

def processDirBlankLines(path):
    """Remove blank lines from every .txt file found under `path`, recursively."""
    for folder, _subdirs, names in os.walk(path):
        for name in names:
            # Anything that is not a .txt file is left untouched
            if not name.endswith('.txt'):
                continue
            filePath = os.path.join(folder, name)
            removeBlankLines(filePath)
            print(f"Processed: {filePath}")

# Clean every generated .txt file in place. Note: this rebinds dirPath, which
# earlier in the notebook pointed at the guideline descriptions folder.
dirPath = '../data/guideline_data'
processDirBlankLines(dirPath)

Processed: ../data/guideline_data/Advanced High/15.txt
Processed: ../data/guideline_data/Advanced High/14.txt
Processed: ../data/guideline_data/Advanced High/16.txt
Processed: ../data/guideline_data/Advanced High/17.txt
Processed: ../data/guideline_data/Advanced High/13.txt
Processed: ../data/guideline_data/Advanced High/12.txt
Processed: ../data/guideline_data/Advanced High/10.txt
Processed: ../data/guideline_data/Advanced High/11.txt
Processed: ../data/guideline_data/Advanced High/9.txt
Processed: ../data/guideline_data/Advanced High/8.txt
Processed: ../data/guideline_data/Advanced High/5.txt
Processed: ../data/guideline_data/Advanced High/4.txt
Processed: ../data/guideline_data/Advanced High/6.txt
Processed: ../data/guideline_data/Advanced High/7.txt
Processed: ../data/guideline_data/Advanced High/3.txt
Processed: ../data/guideline_data/Advanced High/2.txt
Processed: ../data/guideline_data/Advanced High/0.txt
Processed: ../data/guideline_data/Advanced High/1.txt
Processed: ../data/g

In [20]:
def splitToSingleLines(filePath):
    """Split one generated batch file into one file per statement.

    The batch file must be named "<number>.<ext>" and hold one statement per
    line, optionally prefixed with list numbering such as "1. ". Line number
    `i` (0-based) is written to "item<number * 5 + i>.txt" in the same folder.

    Args:
        filePath: Path to the batch file.

    Raises:
        ValueError: If the part of the file name before the first "." is not
            an integer.
    """
    with open(filePath, 'r') as file:
        content = file.read()

    # Use basename() rather than splitting on '/': paths assembled with
    # os.path.join use the platform separator, which is not always '/'.
    fileNum = int(os.path.basename(filePath).split('.')[0])
    outputDir = os.path.dirname(filePath)

    for i, line in enumerate(content.split('\n')):
        line = line.strip()

        # Remove leading list numbering ("1. ", "10. ", "2.") only when it is
        # really there. A fixed three-character slice would chop real text from
        # an unnumbered line and leave residue after a two-digit number.
        number, dot, rest = line.partition('.')
        if dot and number.isdigit():
            line = rest.lstrip()

        # NOTE: the numbering assumes at most 5 lines per batch file; extra
        # lines would reuse item numbers belonging to the next batch.
        itemFileName = f"item{fileNum * 5 + i}.txt"

        itemFilePath = os.path.join(outputDir, itemFileName)
        with open(itemFilePath, 'w') as itemFile:
            itemFile.write(line)

        print(f"Created: {itemFilePath}")

def processDir(dirPath):
    """Split every numbered batch file under `dirPath` into per-statement files.

    Only files named "<digits>.txt" (or "<digits>.<anything>.txt") are treated
    as batch files. The item*.txt files written by splitToSingleLines also end
    in ".txt"; without this filter a second run would pick them up and fail
    when their name is parsed as an integer.

    Args:
        dirPath: Root folder to walk recursively.
    """
    for root, dirs, files in os.walk(dirPath):
        for filename in files:
            if not filename.endswith('.txt'):
                continue
            # Same name parsing as splitToSingleLines: text before the first '.'
            if not filename.split('.')[0].isdecimal():
                continue
            file_path = os.path.join(root, filename)
            splitToSingleLines(file_path)
            print(f"Processed: {file_path}")

# Walk the generated-data folder and split each batch file into item files
folder_path = '../data/guideline_data'
processDir(folder_path)

Created: ../data/guideline_data/Advanced High/item75.txt
Created: ../data/guideline_data/Advanced High/item76.txt
Created: ../data/guideline_data/Advanced High/item77.txt
Created: ../data/guideline_data/Advanced High/item78.txt
Created: ../data/guideline_data/Advanced High/item79.txt
Processed: ../data/guideline_data/Advanced High/15.txt
Created: ../data/guideline_data/Advanced High/item70.txt
Created: ../data/guideline_data/Advanced High/item71.txt
Created: ../data/guideline_data/Advanced High/item72.txt
Created: ../data/guideline_data/Advanced High/item73.txt
Created: ../data/guideline_data/Advanced High/item74.txt
Processed: ../data/guideline_data/Advanced High/14.txt
Created: ../data/guideline_data/Advanced High/item80.txt
Created: ../data/guideline_data/Advanced High/item81.txt
Created: ../data/guideline_data/Advanced High/item82.txt
Created: ../data/guideline_data/Advanced High/item83.txt
Created: ../data/guideline_data/Advanced High/item84.txt
Processed: ../data/guideline_data/A