# Python Script for Generating an OCR Text Dataset using the OpenAI API
---
[OpenAI API documentation](https://platform.openai.com/docs/api-reference/introduction)

In [None]:
from dotenv import load_dotenv # load .env file for obtaining api key
from openai import OpenAI      # text generation
import pandas as pd            # save generated text as csv
import re                      # regex for cleaning text
import random      

load_dotenv()  # read the local .env file; the OpenAI API key is expected to be defined there

In [None]:
# Shared API client used by every generate_OCR_text_* function below.
# NOTE(review): presumably picks up the API key from the environment populated
# by load_dotenv() -- confirm against the OpenAI client documentation.
client = OpenAI()

def generate_OCR_text_shopping(given_text, max_tokens=6):
    """Ask the chat model for one store item name from a shopping category.

    Args:
        given_text: Broad shopping category inserted into the user prompt.
        max_tokens: Token limit forwarded to the completion request.

    Returns:
        The message content of the first completion choice.
    """
    # The embedded newline and indentation are part of the prompt text and are
    # kept exactly as they were sent before.
    system_prompt = (
        "Output something that is found in a store from the given broad category. I just want the item name, \n"
        "            be thorough with your output."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Given shopping category: {given_text}"},
    ]

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        temperature=1,
        max_tokens=max_tokens,
        top_p=1,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def generate_OCR_text_food_dishes(given_text, max_tokens=6):
    """Ask the chat model for one food or dish name from a food category.

    Args:
        given_text: Broad food/dish category inserted into the user prompt.
        max_tokens: Token limit forwarded to the completion request.

    Returns:
        The message content of the first completion choice.
    """
    system_prompt = (
        "Your job is just to output me a some food or dish name from a broad "
        "food/dish category that I will provide."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Given food/dish category: {given_text}"},
    ]

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        temperature=1,
        max_tokens=max_tokens,
        top_p=1,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def generate_OCR_text_titles(given_text, max_tokens=12):
    """Ask the chat model for one title belonging to a type of medium.

    Args:
        given_text: Broad medium category inserted into the user prompt.
        max_tokens: Token limit forwarded to the completion request.

    Returns:
        The message content of the first completion choice.
    """
    # The leading space, embedded newlines and indentation are part of the
    # prompt text and are kept exactly as they were sent before.
    system_prompt = (
        " Your job is just to output me a specific title from a broad type of medium category that I will provide.\n"
        "            This title can either be some famous title or a made up one pertaining to the given category. Try not to pick or make long \n"
        "            titles."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Given medium category: {given_text}"},
    ]

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        temperature=1,
        max_tokens=max_tokens,
        top_p=1,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# functions used to clean up the generated text (aka output)
def remove_quotation_marks(response_output):
    """Return *response_output* with every double-quote character removed."""
    pieces = response_output.split('"')
    return ''.join(pieces)

def remove_delimiters(response_output):
    """Return *response_output* without newline and backslash characters."""
    # Mapping a code point to None makes str.translate drop that character.
    dropped = {ord('\n'): None, ord('\\'): None}
    return response_output.translate(dropped)

def clean_text(response_output, title=False):
    """Strip symbols from generated text and optionally lower-case it.

    Every character that is not a word character (letter, digit,
    underscore), whitespace, or a comma is removed. That covers emojis as
    well as punctuation such as periods, apostrophes and hyphens, so no
    separate period-removal step is needed.

    Args:
        response_output: Text to clean.
        title: When true, the original letter case is kept; otherwise the
            text is converted to lowercase.

    Returns:
        The cleaned text. Whitespace is left untouched.
    """
    stripped = re.sub(r'[^\w\s,]', '', response_output)
    return stripped if title else stripped.lower()

# containers for generated text, one list per dataset category
shopping_goods_entries = []
food_dishes_entries = []
medium_entries = []

# Broad categories handed to the text generators: shopping goods,
# food and dishes, and types of medium (for titles).
shopping_goods_categories = [
    # food and drink
    "Groceries", "Fresh Produce", "Dairy Products", "Bakery",
    "Meat and Poultry", "Seafood", "Frozen Foods", "Canned Goods",
    "Snacks", "Sweets", "Beverages",
    # household
    "Household Items", "Cleaning Supplies", "Laundry Supplies",
    "Kitchen Essentials",
    # personal care
    "Skincare", "Haircare", "Oral Care", "Bath and Body",
    "Health and Wellness Products",
    # apparel
    "Clothing and Accessories", "Men's Clothing", "Women's Clothing",
    "Children's Clothing", "Shoes", "Jewelry and Accessories",
    # electronics
    "Electronics", "Televisions", "Computers and Laptops", "Mobile Phones",
    "Cameras", "Audio Equipment",
    # home and furniture
    "Home and Furniture", "Living Room Furniture", "Bedroom Furniture",
    "Kitchen and Dining Furniture", "Home Décor", "Bedding and Linens",
    # toys and games
    "Toys", "Action Figures", "Board Games", "Puzzles", "Outdoor Toys",
    # automotive and tools
    "Automotive", "Car Maintenance Products", "Tires", "Car Accessories",
    "Tools and Equipment", "Outdoor Equipment",
    # sports and outdoors
    "Sporting Goods", "Fitness Equipment", "Outdoor Gear", "Bicycles",
    # garden and patio
    "Garden and Patio", "Gardening Tools", "Outdoor Furniture",
    "Plants and Seeds", "Grills and Outdoor Cooking",
    # pharmacy
    "Pharmacy", "Prescription Medications", "Over-the-Counter Medications",
    "Vitamins and Supplements",
    # office and school
    "Office Supplies", "Office Furniture", "School Supplies",
    # pets
    "Pet Supplies", "Pet Food", "Pet Toys", "Pet Health Products",
    "Pet Accessories",
    # gaming
    "Consoles",
]

food_and_dishes_categories = [
    # ingredients and grocery staples
    "Fruits", "Vegetables", "Dairy Products", "Meat and Poultry",
    "Seafood", "Bakery", "Frozen Foods", "Canned Goods",
    "Beverages", "Grains and Pasta", "Cereals", "Condiments and Sauces",
    "Spices and Herbs", "Nuts and Seeds", "Soups and Broths",
    "Oils and Vinegars", "Health Foods", "Baby Food", "Breakfast Foods",
    # regional cuisines
    "Mexican Dishes", "American Dishes", "Argentinian Dishes",
    "Indian Dishes", "Chinese Dishes", "Japanese Dishes",
    "Italian Dishes", "French Dishes", "German Dishes",
    "Greek Dishes", "Korean Dishes", "Thai Dishes",
    "Vietnamese Dishes", "Spanish Dishes", "Brazilian Dishes",
    "Middle Eastern Dishes", "African Dishes", "Caribbean Dishes",
    # deli and dessert
    "Deli Meats", "Cheese", "Ice Cream and Desserts",
]

medium_types = [
    # screen and audio
    "Books", "Movies", "TV Shows", "Songs", "Albums",
    # arts and interactive
    "Paintings", "Poems", "Plays", "Video Games", "Podcasts",
    # publications
    "Websites", "Magazines", "Academic Journals", "Newspapers", "Comics",
]

def gen_clean_OCR_output(output_size, categories, generator=None, title=False):
    """Generate `output_size` unique, cleaned text entries.

    A category is drawn at random for every request, the generated text is
    cleaned, and the result is kept only if it has not been seen before.

    Args:
        output_size: Number of unique entries to collect.
        categories: Non-empty sequence of category names to sample from.
        generator: Callable taking a category and returning generated text.
            Defaults to generate_OCR_text_shopping.
        title: Forwarded to clean_text; when true the letter case is kept.

    Returns:
        A list of `output_size` distinct cleaned strings, in generation order.

    Raises:
        IndexError: If `categories` is empty (raised by random.choice).
    """
    if generator is None:
        generator = generate_OCR_text_shopping

    output_text_list = []
    # Set mirror of the list so the duplicate check stays O(1) as it grows.
    seen = set()

    # NOTE: the loop only ends once enough *unique* entries exist; if the
    # generator cannot produce that many distinct outputs it will not stop.
    while len(output_text_list) < output_size:
        # randomly choose a category from given category list
        category = random.choice(categories)
        response = generator(category)

        # Nothing to clean if the generator produced no text
        # (assumes the API may return empty/None content -- confirm).
        if not response:
            continue

        # clean up the output
        response = remove_quotation_marks(response)
        response = remove_delimiters(response)
        response = clean_text(response, title=title)

        # keep only non-empty entries that have not been collected yet
        if response and response not in seen:
            seen.add(response)
            output_text_list.append(response)

    return output_text_list


# Each dataset uses the generator whose prompt matches its category list;
# titles keep their letter case.
shopping_goods_texts = gen_clean_OCR_output(
    5000, shopping_goods_categories, generator=generate_OCR_text_shopping
)
food_dishes_texts = gen_clean_OCR_output(
    2500, food_and_dishes_categories, generator=generate_OCR_text_food_dishes
)
title_texts = gen_clean_OCR_output(
    2500, medium_types, generator=generate_OCR_text_titles, title=True
)

In [None]:
# Preview each generated dataset: heading with its size, the entries, then
# a blank gap (the trailing '\n' argument plus print's own line ending).
print(f"Shopping Good Data Entries (size: {len(shopping_goods_texts)}):",
      shopping_goods_texts, '\n', sep='\n')

print(f"Food and Dishes Data Entries (size: {len(food_dishes_texts)}):",
      food_dishes_texts, '\n', sep='\n')

print(f"Title Data Entries (size: {len(title_texts)}):",
      title_texts, '\n', sep='\n')

In [None]:
conglomerated_data = []

# Pair each generated text list with the label written to the "Category"
# column. The *_texts lists hold the generated output; the *_entries lists
# defined earlier are never filled, so reading them yields an empty CSV.
labelled_texts = [
    ("Shopping Goods", shopping_goods_texts),
    ("Food/Dishes", food_dishes_texts),
    ("Title Types", title_texts),
]

# join all the generated text entries along with their respective categories
for label, texts in labelled_texts:
    conglomerated_data.extend(
        {"OCR Text": entry, "Category": label} for entry in texts
    )

# create pandas df (explicit columns keep the header even with no rows)
df = pd.DataFrame(conglomerated_data, columns=["OCR Text", "Category"])

# save as a csv file
df.to_csv('OCR_text_dataset.csv', index=False)