In [21]:
API_KEY = 'REDACTED'

# Imports

In [3]:
import pandas as pd
import os
import google.generativeai as genai
from datetime import datetime
import csv
import re

# Prompt

In [34]:
def get_prompt(df):
    return f"""
            rewrite this table so that the examples:
            * include reference to a specific app to use to complete the Example Task
            * are self-contained and do not refer to any context that is outside of the task
            * are specific tasks that a person can complete on a computer in under 3 minutes
            * all the detail should be given in the task example for a person to execute the task
            * a task like "Track recruiting department budget in a spreadsheet" is bad because it's not specific - make all examples specific so a person can execute the task themselves without needing to ask questions or get specifics
            * all examples should be under 15 words and be as concise as possible
            * delete any examples that are too vague to be done by a random stranger based on the example
            * combine examples so that the resulting table has fewer rows than the input table
            * where multiple rows do the same task, pick the job most commonly associated with the example
            * return just a rewritten table

        {df}
        

    """

In [6]:
df = pd.read_csv('tasks_filtered_clean.csv')

In [10]:
categories = list(df['Category'].unique())

In [12]:
len(categories)

496

# Setup Gemini

In [22]:
genai.configure(api_key=API_KEY)

generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

safety_settings = [
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
]

# Use gemini to get tasks for each job

In [24]:
index = 0

In [26]:
dft = df[df['Category'] == category]

PROMPT = get_prompt(dft.to_csv(index=False))

# output csv of tasks for this job
FILENAME = f'TASKS_REWRITTEN/{category}.csv'

In [27]:
model = None

# get jobs CSV using gemini
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    safety_settings=safety_settings,
    generation_config=generation_config,
)

start_time = datetime.now()
print(f"Start time: {start_time}")

chat_session = model.start_chat(
    history=[
        {
            "role": "user",
            "parts": [
                PROMPT,
            ],
        }
    ]
)

response = chat_session.send_message("Produce new TABLE as per the instructions above.")
end_time = datetime.now()
duration = end_time - start_time

print(f"End time: {end_time}")
print(f"Gemini API request duration: {duration}")

Start time: 2024-06-01 23:15:54.606136
End time: 2024-06-01 23:16:00.320331
Gemini API request duration: 0:00:05.714195


['| Job | Category | Example |',
 '|---|---|---|',
 '| Barista | Accounting | Record a purchase of milk and coffee beans in QuickBooks. |',
 '| Chef (Executive) | Accounting | View a QuickBooks report for restaurant income and expenses this month. |',
 '| Chemical Engineer | Accounting | Record chemical purchase in Xero. |',
 '| Chemist | Accounting | Record lab supply expenses in Xero. |',
 '| Chief Executive Officer (CEO) | Accounting | Generate a Xero financial report for a board meeting. |',
 '| Chief Information Officer (CIO) | Accounting | Track IT asset depreciation in Excel. |',
 '| Cook | Accounting | Record a payment to "John\'s Produce" in QuickBooks. | ',
 '| Director (Technology) | Accounting | Manage technology service invoices in QuickBooks. |',
 '| Doctor (Anesthesiologist) | Accounting | Manage anesthesia service invoices in QuickBooks. |',
 '| Graphic Designer (Print) | Accounting | Record Adobe Creative Cloud subscription cost in your accounting software. |',
 '| Hum

In [33]:
# Remove extra newlines and split the response text into lines
lines = response.text.strip().split("\n")

# Extract the header and rows
header = re.split(r'\s*\|\s*', lines[0].strip('|'))
rows = [re.split(r'\s*\|\s*', line.strip('|')) for line in lines[2:]]

# Write to CSV
with open(FILENAME, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(header)
    csvwriter.writerows(rows)

In [41]:
from time import sleep

In [45]:
for index, category in enumerate(categories):

    dft = df[df['Category'] == category]
    
    PROMPT = get_prompt(dft.to_csv(index=False))
    
    # output csv of tasks for this job
    FILENAME = f'TASKS_REWRITTEN/{category}.csv'

    #print(PROMPT)

    # skip if already done
    if os.path.exists(FILENAME):
        print(f"[loading saved data] {index+1}/{len(categories)} Category: {category}")
    else:
        print(f"{index+1}/{len(categories)} Category: {category}")

        try:
            model = None
            
            # get jobs CSV using gemini
            model = genai.GenerativeModel(
                model_name="gemini-1.5-flash",
                safety_settings=safety_settings,
                generation_config=generation_config,
            )
            
            start_time = datetime.now()
            print(f"Start time: {start_time}")
            
            chat_session = model.start_chat(
                history=[
                    {
                        "role": "user",
                        "parts": [
                            PROMPT,
                        ],
                    }
                ]
            )
            
            response = chat_session.send_message("Produce new TABLE as per the instructions above.")
            end_time = datetime.now()
            duration = end_time - start_time
            
            print(f"End time: {end_time}")
            print(f"Gemini API request duration: {duration}")
        
            # Remove extra newlines and split the response text into lines
            lines = response.text.strip().split('\n')
            
            # Extract the header and rows
            header = re.split(r'\s*\|\s*', lines[0].strip('|'))
            rows = [re.split(r'\s*\|\s*', line.strip('|')) for line in lines[2:]]
            
            # Write to CSV
            with open(FILENAME, 'w', newline='') as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow(header)
                csvwriter.writerows(rows)
        
            print("Saving file")
            sleep(2)
        except Exception as e:
            print(f"[ERROR] {index+1}/{len(categories)} Category: {category}")
            print(e)

[loading saved data] 1/496 Category: 2D Art
[loading saved data] 2/496 Category: 3D Modeling
[loading saved data] 3/496 Category: 3D Printing
[loading saved data] 4/496 Category: Abstract Algebra
[loading saved data] 5/496 Category: Accessibility
[loading saved data] 6/496 Category: Accounting
[loading saved data] 7/496 Category: Acting
[loading saved data] 8/496 Category: Actuarial Science
[loading saved data] 9/496 Category: Administration
[loading saved data] 10/496 Category: Administrative Tasks
[loading saved data] 11/496 Category: Agile
[loading saved data] 12/496 Category: Agile Development
[loading saved data] 13/496 Category: Agricultural Genetics
[loading saved data] 14/496 Category: Agriculture
[loading saved data] 15/496 Category: AI & Machine Learning
[loading saved data] 16/496 Category: Algebra
[loading saved data] 17/496 Category: Analysis
[loading saved data] 18/496 Category: Analytics
[loading saved data] 19/496 Category: Animation
[loading saved data] 20/496 Category