In [16]:
# Gemini API key. Prefer the GOOGLE_API_KEY environment variable so the secret
# never has to be committed with the notebook; the literal below is only a
# placeholder fallback and must not be replaced with a real key.
import os  # imported here because this cell runs before the notebook's import cell

API_KEY = os.environ.get('GOOGLE_API_KEY') or 'REDACTED'

# Imports

In [17]:
import pandas as pd
import os
import google.generativeai as genai
from datetime import datetime
import csv
import re

# Prompt

In [18]:
def get_prompt(df):
    """Build the rewrite-instructions prompt around one table of tasks.

    Args:
        df: The table to embed under the ``# TABLE`` heading. It is
            interpolated with normal f-string formatting, so any object
            with a useful string form works; the batch loop in this
            notebook passes the CSV text of a DataFrame slice.

    Returns:
        str: The full prompt text (table, rewriting rules, examples and
        the required output format).
    """
    prompt_text = f"""
        # TABLE
        {df}
        
        # INSTRUCTION
        Create a table called NEW_TABLE which has the same rows as TABLE but the Task is rewritten to be semantically the same but with improved wording for each Task.
        
        Change the Task column only in NEW_TABLE, the other columns from TABLE should remain the same
        
        make the examples specific - i need to be able to give this table to an employee who will do screen recordings of themselves doing each task. it's fine if the subject of a Task is left vague e.g. "Subscribe to a YouTuber" is better than "Subscribe to [random youtuber]".
        
        the task definition must be:
        * precise
        * straightforward
        * specific
        * self-contained
        * action-oriented
        * include the name of the App if there's any ambiguity
        * 15 words or fewer
        * not require prior context about a project / individual / project
        * it should be clear what needs to be done from the Task definition in context with the App attached to the Task
        * if I gave this task to 5 random people, they should all follow approximately the same actions to complete it
        * not open to interpretation
        * concrete e.g. do not say "create a report", rather say "create a short report outline with 5 items"
        * do not use commas or quotes in task definitions
        * there can be no "," in TASK for each row e.g. 'filter by image, type and color' should be replaced with 'filter by image and type and color'
        
        Rewrite the app column so that:
        * it is semantically the same as it is currently
        * uses the correct capitalization and standard name for each app
        * replace "Example Forum for Python (e.g. python.org)" with the example so app="python.org" in that case
        * websites app valus should contain only the website url, not https or other text
        * do not return "Example Slack for a Design community", instead say "Slack" to reference the app 
        * defer to native apps e.g. Slack rather than web versions e.g. Slack.com where applicable
        
        EXAMPLES
        Bad example task from TABLE:
        'Add a dependency to the "Respond to Customer Feedback" task so that it cannot be started until the "Customer Support" project is completed in Zoho Projects'
        Why it's bad:
        It's too verbose and assumes a task and project which might not exist (too specific)
        Should be replaced with:
        'Add a task dependency so it cannot be started until a project is completed in Zoho'
        
        Bad example task from TABLE:
        'Add a due date of 2024-04-22 to the "Develop Content Strategy" task in Wrike'
        Why it's bad:
        It assumes that a 'Develop Content Strategy' task already exists which we cannot assume.
        Should be replaced with:
        'Add a due date of tomorrow to a task in Wrike'
        
        Bad example task from TABLE:
        'Use the Animation tools to create a 3D animation'
        Why it's bad:
        It's vague and it's not clear what animation to create.
        It should be replaced with:
        'Use the Animation tools to create a simple animation of a cube rotating'
        
        A reasonable average person should be able to read a row of TABLE and understand how to do the Task in a way that another person would agree that the Task was done if they watched a screen recording of the first person doing the task.
        
        NEW_TABLE should have the following columns in order only: 
        Index, Category, Subcategory, App, Task
        
        return nothing else but NEW_TABLE in the same format as TABLE (CSV) with "," delimiters and nothing else.
    """
    return prompt_text

# Setup Gemini

In [19]:
# Register the API key with the google.generativeai client before any model is created.
genai.configure(api_key=API_KEY)

# Sampling and output settings handed to the Gemini model for every request.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# One entry per harm category, all using the same blocking threshold.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# Load Tasks

In [20]:
# Read the full task list from disk into a DataFrame
file_path = 'all_tasks.csv'
tasks = pd.read_csv(file_path)

# Plain-text preview: first rows, an ellipsis line, then the last rows
print(tasks.head(), "...", tasks.tail(), sep="\n")

   Index            Category       Subcategory        App  \
0      1  Security & Privacy   Data Encryption  1Password   
1      2  Security & Privacy   Data Encryption  1Password   
2      3  Security & Privacy  Password Manager  1Password   
3      4  Security & Privacy  Password Manager  1Password   
4      5  Security & Privacy  Password Manager  1Password   

                                                Task  
0        Change your password on a 1Password account  
1  Enable and then disable two-factor authenticat...  
2  Change the master password for the password ma...  
3  Configure the password manager to automaticall...  
4              Create a new password manager account  
...
       Index                      Category         Subcategory   App  \
32179  32180  Productivity & Collaboration  Team Communication  Zoom   
32180  32181  Productivity & Collaboration  Team Communication  Zoom   
32181  32182            Security & Privacy     Data Encryption  Zoom   
32182  3218

# Use Gemini to rewrite tasks

In [21]:
# Show the first rows of the loaded task table as this cell's output.
tasks.head()

Unnamed: 0,Index,Category,Subcategory,App,Task
0,1,Security & Privacy,Data Encryption,1Password,Change your password on a 1Password account
1,2,Security & Privacy,Data Encryption,1Password,Enable and then disable two-factor authenticat...
2,3,Security & Privacy,Password Manager,1Password,Change the master password for the password ma...
3,4,Security & Privacy,Password Manager,1Password,Configure the password manager to automaticall...
4,5,Security & Privacy,Password Manager,1Password,Create a new password manager account


In [22]:
# Number of task rows sent to Gemini in a single request
chunk_size = 100

# First chunk index to process; lower chunks are skipped outright (this was
# previously the hard-coded `if index > 150` guard). Because chunks whose
# output file already exists are skipped anyway, setting this to 0 makes the
# cell process every chunk that is still missing.
FIRST_CHUNK = 151

# Directory that receives one cleaned CSV per chunk
OUTPUT_DIR = 'TASKS_CLEANED'

# Number of chunks needed to cover every row (ceiling division)
num_chunks = (len(tasks) // chunk_size) + (1 if len(tasks) % chunk_size != 0 else 0)


def _parse_model_table(text):
    """Convert the model's reply into a list of rows (lists of cell strings).

    The prompt asks for comma-delimited CSV, but a reply may also arrive as a
    pipe-delimited markdown table and/or wrapped in a ``` code fence. Both
    layouts are accepted; fence lines, blank lines and markdown separator
    rows (|---|---|) are dropped.

    Args:
        text: Raw reply text.

    Returns:
        list[list[str]]: Header row first, then data rows. Empty if the reply
        contains no table lines.
    """
    lines = [line.strip() for line in text.strip().splitlines()]
    lines = [line for line in lines if line and not line.startswith("```")]
    if not lines:
        return []

    if "|" in lines[0]:
        # Markdown-style table: split on pipes and trim each cell
        table = [[cell.strip() for cell in line.strip("|").split("|")] for line in lines]
        return [
            row for row in table
            if not all(re.fullmatch(r":?-+:?", cell) for cell in row)
        ]

    # Comma-delimited reply, the format requested by the prompt
    return list(csv.reader(lines, skipinitialspace=True))


# Without the output directory every chunk would fail at write time,
# after the API call has already been made.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The model object does not change between chunks, so build it once
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    safety_settings=safety_settings,
    generation_config=generation_config,
)

# Loop over the DataFrame in chunks of `chunk_size` rows
for index in range(FIRST_CHUNK, num_chunks):
    # Define the start and end of the slice
    start = index * chunk_size
    end = start + chunk_size

    # Slice the DataFrame
    dft = tasks.iloc[start:end]
    print(f"PROCESSING CHUNK {index}/{num_chunks}")

    # output csv of tasks for this job
    FILENAME = f'{OUTPUT_DIR}/{index}.csv'

    # skip if already done
    if os.path.exists(FILENAME):
        print(f"[loading saved data] {index+1}/{num_chunks} slice: {index}")
        continue

    print(f" {index+1}/{num_chunks} slice: {index}")

    try:
        # Build the prompt only for chunks that actually need a request
        PROMPT = get_prompt(dft.to_csv(index=False))

        start_time = datetime.now()
        print(f"Start time: {start_time}")

        chat_session = model.start_chat(
            history=[
                {
                    "role": "user",
                    "parts": [
                        PROMPT,
                    ],
                }
            ]
        )

        response = chat_session.send_message("Produce NEW_TABLE as per the instructions above.")
        end_time = datetime.now()
        duration = end_time - start_time

        print(f"End time: {end_time}")
        print(f"Gemini API request duration: {duration}")

        # Extract the header and rows
        table = _parse_model_table(response.text)
        if len(table) < 2:
            # Do not save an unusable reply: a saved file marks the chunk
            # as done and it would never be retried.
            raise ValueError("model reply did not contain a table with data rows")
        header, rows = table[0], table[1:]

        if len(rows) != len(dft):
            print(f"[WARNING] expected {len(dft)} rows but the reply has {len(rows)}")

        # Write to a temporary file first and rename it afterwards, so an
        # interrupted write can never leave a partial file that later runs
        # would treat as finished.
        tmp_filename = FILENAME + '.tmp'
        with open(tmp_filename, 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(header)
            csvwriter.writerows(rows)
        os.replace(tmp_filename, FILENAME)

        print("Saving file")
    except Exception as e:
        # Best-effort batch job: report the failing chunk and continue with
        # the next one; the missing file causes a retry on the next run.
        print(f"[ERROR] {index+1}/{num_chunks} slice: {index}")
        print(e)

PROCESSING CHUNK 151/322
 152/322 slice: 151
Start time: 2024-06-02 10:18:24.223588
End time: 2024-06-02 10:18:45.617034
Gemini API request duration: 0:00:21.393446
Saving file
PROCESSING CHUNK 152/322
 153/322 slice: 152
Start time: 2024-06-02 10:18:45.631396
End time: 2024-06-02 10:19:04.244553
Gemini API request duration: 0:00:18.613157
Saving file
PROCESSING CHUNK 153/322
 154/322 slice: 153
Start time: 2024-06-02 10:19:04.256720
End time: 2024-06-02 10:19:23.550115
Gemini API request duration: 0:00:19.293395
Saving file
PROCESSING CHUNK 154/322
 155/322 slice: 154
Start time: 2024-06-02 10:19:23.569053
End time: 2024-06-02 10:19:42.645258
Gemini API request duration: 0:00:19.076205
Saving file
PROCESSING CHUNK 155/322
 156/322 slice: 155
Start time: 2024-06-02 10:19:42.673804
End time: 2024-06-02 10:20:00.679532
Gemini API request duration: 0:00:18.005728
Saving file
PROCESSING CHUNK 156/322
 157/322 slice: 156
Start time: 2024-06-02 10:20:00.695908
End time: 2024-06-02 10:20:18.7

In [24]:

print("Done")

Done
