In [22]:
from threading import Thread, Lock

import json

import requests

import win32api
import win32con

In [23]:
from dotenv import load_dotenv

load_dotenv()

True

In [24]:
from flask import Flask, request, jsonify

from PIL import Image, ImageGrab

import cv2 as cv

In [25]:
from Perplexity import Perplexity

In [26]:
# Path (relative to the working directory) of the skills JSON file.
JSON_SKILLS_FILEPATH = "./Skills.json"

In [27]:
def load_json(filepath):
    """Read the JSON document stored at ``filepath`` and return the parsed object.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's default text encoding, so non-ASCII content is read the same
    way on every operating system.

    Parameters
    ----------
    filepath : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the document (dict, list, ...).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    with open(filepath, "rt", encoding="utf-8") as file:
        return json.load(file)

In [28]:
def flatten_skills(skills_dict):
    """Flatten a two-level skills mapping into a list of path-like strings.

    ``skills_dict`` maps a primary category to a mapping of secondary
    categories, each of which holds an iterable of items. Every item becomes
    one ``"primary::secondary::item"`` string, in iteration order.
    """
    return [
        f"{category}::{subcategory}::{entry}"
        for category, subcategories in skills_dict.items()
        for subcategory, entries in subcategories.items()
        for entry in entries
    ]

In [29]:
# Load the skills file through the shared path constant instead of repeating
# the literal, so the location only has to be changed in one place.
skills = load_json(JSON_SKILLS_FILEPATH)

# Skills recorded for the current user, flattened to
# "primary::secondary::item" strings for use in the instruction prompt.
user_skills = load_json("./UserSkills.json")
flattened_user_skills = flatten_skills(user_skills)

In [30]:
def retreive_and_process_screen_image(windows_target_coordinate = None):
    """Capture the screen and draw a marker around a target position.

    Parameters
    ----------
    windows_target_coordinate : tuple[int, int] or None
        ``(x, y)`` position in Windows coordinates. When omitted, the cursor
        position is read at call time (a call-expression default would be
        evaluated only once, when the function is defined).

    Returns
    -------
    tuple
        ``(screen_image, image_target_coordinate)``: the PIL screenshot with
        the marker drawn on it, and the marker centre expressed in pixel
        coordinates of that screenshot.
    """
    # Imported locally: the notebook's import cell only brings in Image and ImageGrab.
    from PIL import ImageDraw

    if windows_target_coordinate is None:
        windows_target_coordinate = win32api.GetCursorPos()

    # Take a screenshot of the user's screen.
    screen_image = ImageGrab.grab()
    screen_w, screen_h = screen_image.size

    # Convert the Windows coordinates to coordinates on the captured image using simple proportions.
    # TODO: Determine how Windows determines screen size.
    # On a 1080 x 1920 Screen
    # Width : win32api.GetSystemMetrics(0) = 1536
    # Height : win32api.GetSystemMetrics(1) = 864
    reported_w = win32api.GetSystemMetrics(win32con.SM_CXSCREEN)
    reported_h = win32api.GetSystemMetrics(win32con.SM_CYSCREEN)
    win_x, win_y = windows_target_coordinate
    image_target_coordinate = (
        int(win_x * 1.0 * screen_w / reported_w),
        int(win_y * 1.0 * screen_h / reported_h),
    )

    # Draw a square outline centred on the target. The screenshot is a PIL
    # image, so it is annotated with ImageDraw (cv.rectangle needs a numpy
    # array plus two corner points). Blue is used because the
    # "Action Verification" prompt tells the model to look for a blue marker.
    marker_half_size = 25
    marker_thickness = 10
    target_x, target_y = image_target_coordinate
    ImageDraw.Draw(screen_image).rectangle(
        (
            target_x - marker_half_size,
            target_y - marker_half_size,
            target_x + marker_half_size,
            target_y + marker_half_size,
        ),
        outline=(0, 0, 255),
        width=marker_thickness,
    )

    # Return the annotated screenshot along with the marker position on it.
    return screen_image, image_target_coordinate

In [31]:
# Shared client object; state_handler sends every prompt through perplex.infer(...).
perplex = Perplexity()

In [32]:
# Prompt templates keyed by conversation state. Text in {Braces} is a named
# str.format field and must be supplied by keyword; text in [square brackets]
# is not a format field and is sent to the model literally.
prompts = {
    # Opening message. Not referenced by the state handler visible in this notebook.
    "Greetings" : """Hello! What would you like help doing on your computer?""",
    
    # No format fields: the state handler appends the user's text in triple
    # quotes. The model is asked to start its reply with "✅✅✅" (goal
    # understood) or "???" (clarification needed).
    "Determine Goal" : """
        A beginner-level computer user needs help preforming a task on their computer. If you can confidently determine what the user wants to do using the triple quotation mark enclosed query (\"\"\"), which may provide clarification questions and answers along with the provided screenshot of the user's screen content, then respond "✅✅✅" followed by a clear and concise summary of what they want to do in a way that can be understood by beginner-level computer users. If it is not clear what the user needs help with, or if a follow-up question would be beneficial for clarity, respond "???" followed by the clarification statements or clarification question(s) you would like to ask the user. Clarification questions should be short, direct, and accessible to beginner-level users. Clarification questions should also be kept to a minimum to make the assistance process as seamless as possible. Always ask clarification questions, however, if the user's intention is ambiguous.
        Make sure to never begin your response without a "✅✅✅" or "???". Thank you!
        """,
    
    # Fields: {Task}, {Issues}, {Skills}, {BEGIN_SUMMARY}. The reply is expected
    # to be semicolon-separated instructions, then the summary code, then a summary.
    "Instruction Generation" : """Create step-by-step instructions to complete the following task enclosed in triple quotation marks: \"\"\"{Task}\"\"\" {Issues} Your instructions should not contain line breaks and take the following format:
        Instruction1 Information;Instruction 2 Information;Information for Instruction 3;...
        You should not use semicolons for instructions and should instead only use them to separate different instructions.
        Your instructions should be clear, understandable by a beginner level computer user, and avoid launching specialized programs or the command line as much as possible. Your instructions must contain explanations of why we are doing the things we are doing. You should enumerate every click you can. The current computer user is familiar with the following set of skills:
        {Skills} Try to cater your instructions to give users more experience with the skills they have experience with. Use the provided skill codes as appropriate in your response. To the greatest extent possible, avoid using more than one or two skills that the user does not have experience with before. After you have finished writing your instructions, end your response with the following code: {BEGIN_SUMMARY}
        After this code, write a short, clear and concise summary of what they want to do in a way that can be understood by beginner-level computer users. Your summary of your instructions should include the starting point and the ending point as well as crucial parts or shifts of frame. An example summary for instructions on bookmarking a website might talk about one or more of the following: So web browsers navigate to/download webpages from other people's servers in the world. A bookmark, like bookmarks used in your books help you remember websites you'd like to refer back to often. Make sure to bookmark websites you trust; you can also organize your bookmarks.
        Make sure not to be too verbose in your summary""",
    
    # Alright, does this sound good and are you ready to get started?
    # Field: {issue} (the user's reply). Expected answer: "✅", or "xx" plus a summary of the objection.
    "Confirmation" : """If the user is satisfied with the current set of instructions, respond with ✅, otherwise respond with xx, followed by a summary of their disagreement of issue. User Response: {issue}""",

    # Fields: {CurrentInstructionNumber}, {InstructionList}. The bracketed
    # phrase in the first sentence is still draft wording and is sent as-is.
    "Action Verification" : """The provided image has marked the [user's most recent click location/the user's current mouse position] with a large blue dot.
        The user is currently following instruction {CurrentInstructionNumber} from the following list of instructions: {InstructionList} Identify the appropriate response code given the following criteria:
        If the user successfully completed the current instruction, respond with ✅. If the user hasn't done anything yet or are confused, don't use any special symbols and explain what they can do to get back on track.
        If the user made a mistake, or the next instructions need to be revised, respond with xx followed immediately by a description of the error.
    """,

    # Field: {instructionList} (note the lower-case initial letter).
    "Congratulations" : """Summarize what the user just finished after following this list of steps:
        {instructionList}
        Use the following format for your response:
        Congratulations, you just finished [Very Brief Summary of Completed Steps]! Do you have any questions about how that worked, or have something you would like to work on next?
    """,

    # No format fields; the bracketed parts are draft wording. Not referenced
    # by the state handler visible in this notebook.
    "Clarification Question" : """The user is currently trying to [most recent query summary] following the instruction [Instruction Number] out of the following list of instructions. They have the following question:"""

}

In [33]:
# The state-handler loop keeps running while this is False.
should_exit_program = False

# Lock shared by the Flask endpoint and the state-handler thread. It has to be
# a real Lock object: it is used with `with`, .acquire() and .release(), none
# of which exist on a bool.
proceed_to_next_state_mutex = Lock()

# Latest text received from the user / latest text to send back to the user.
most_recent_user_input = None
most_recent_model_output = None


In [44]:
def _text_after_marker(text, marker):
    """Return the trimmed part of ``text`` that follows the first ``marker``.

    ``str.strip(marker)`` is not suitable for this: it removes any of the
    marker's *characters* from both ends, so stripping "???" also eats the
    question mark that ends a question. When the marker is absent the whole
    text is returned, trimmed.
    """
    _, found, remainder = text.partition(marker)
    return (remainder if found else text).strip()


def state_handler():
    """Run the assistant's conversation state machine until asked to exit.

    Each loop iteration takes ``proceed_to_next_state_mutex``, handles exactly
    one state and publishes the reply in ``most_recent_model_output``.

    States and transitions:
      * "Determine Goal"         -> "Instruction Generation" once the model
                                    answers with "✅✅✅"; otherwise stays and
                                    records a clarification question.
      * "Instruction Generation" -> "Confirmation".
      * "Confirmation"           -> "Action Verification" on "✅", else back
                                    to "Instruction Generation".
      * "Action Verification"    -> next instruction / "Congratulations" on
                                    "✅", "Instruction Generation" on "xx",
                                    otherwise stays.
      * anything else            -> treated as "Congratulations", then back
                                    to "Determine Goal".

    NOTE(review): a single Lock does not by itself guarantee that this thread
    and the request handler take strict turns; confirm the intended hand-off.
    """
    global most_recent_user_input
    global most_recent_model_output

    summary_marker = "BEGIN_SUMMARY"

    next_state_to_handle = "Determine Goal"

    current_goal = None
    clarification_questions = []
    clarification_answers = []

    issues = ""

    current_instruction_string = None
    current_instruction_list = None
    current_instruction_number = None
    current_instruction_summary = None

    while not should_exit_program:

        with proceed_to_next_state_mutex:

            if next_state_to_handle == "Determine Goal":

                assert most_recent_user_input is not None

                current_prompt = prompts["Determine Goal"] + f'\n"""{most_recent_user_input}"""'
                # If handling clarification questions, save their response now.
                if len(clarification_questions) != len(clarification_answers):
                    clarification_answers.append(most_recent_user_input)

                most_recent_user_input = None

                screen_image = ImageGrab.grab()
                screen_image.save("./CurrentImage.png")
                raw_model_output = perplex.infer(current_prompt, "./CurrentImage.png")

                # Check if the model was able to determine intention
                if "✅✅✅" in raw_model_output:

                    joined_clarifications = "".join([f"Clarification Question {i + 1}: {clarification}\nClarification Question {i + 1}, Response: {response}\n" for i, (clarification, response) in enumerate(zip(clarification_questions, clarification_answers))])

                    processed_model_output = _text_after_marker(raw_model_output, "✅✅✅")
                    if joined_clarifications:
                        # Keep the clarification log on its own lines instead of
                        # gluing it to the last word of the summary.
                        processed_model_output += "\n" + joined_clarifications

                    most_recent_model_output = processed_model_output
                    current_goal = processed_model_output

                    next_state_to_handle = "Instruction Generation"

                else:
                    # "???" reply, or a reply without any marker: either way it
                    # is shown to the user and recorded as a pending
                    # clarification so the next input is stored as its answer.
                    processed_model_output = _text_after_marker(raw_model_output, "???")
                    most_recent_model_output = processed_model_output

                    clarification_questions.append(processed_model_output)

            elif next_state_to_handle == "Instruction Generation":
                # The template uses named fields, so they are filled by keyword.
                formatted_prompt = prompts["Instruction Generation"].format(
                    Task=current_goal,
                    Issues=issues,
                    Skills="\n".join(flattened_user_skills),
                    BEGIN_SUMMARY=summary_marker,
                )

                model_response = perplex.infer(formatted_prompt)

                # Split at the first summary marker. A reply without the marker
                # is still usable: the instructions double as the summary.
                instruction_text, _, summary_text = model_response.partition(summary_marker)
                current_instruction_string = instruction_text.strip()
                current_instruction_summary = summary_text.strip() or current_instruction_string

                # Drop empty entries (e.g. from a trailing ";") so no blank step is verified.
                current_instruction_list = [
                    step.strip() for step in current_instruction_string.split(";") if step.strip()
                ] or [current_instruction_string]
                current_instruction_number = 0

                most_recent_model_output = f"Alright, so here is what we'll be doing: {current_instruction_summary}\nHow does that sound? Do you have any questions or changes you'd like to make?"

                next_state_to_handle = "Confirmation"

            elif next_state_to_handle == "Confirmation":

                formatted_prompt = prompts["Confirmation"].format(issue=most_recent_user_input)

                model_response = perplex.infer(formatted_prompt)

                if "✅" in model_response:
                    most_recent_model_output = "Ok! Let's get started!"
                    next_state_to_handle = "Action Verification"

                else:
                    most_recent_model_output = "Ok! Let me think for a minute..."
                    issues += f"\n {_text_after_marker(model_response, 'xx')}"

                    next_state_to_handle = "Instruction Generation"

            elif next_state_to_handle == "Action Verification":
                # The helper returns (image, marker position); only the image is saved.
                screen_image, _ = retreive_and_process_screen_image()
                screen_image.save("./CurrentImage.png")

                formatted_prompt = prompts["Action Verification"].format(
                    CurrentInstructionNumber=current_instruction_list[current_instruction_number],
                    InstructionList=current_instruction_string,
                )

                # The prompt refers to "the provided image", so send the screenshot along.
                model_response = perplex.infer(formatted_prompt, "./CurrentImage.png")

                if "✅" in model_response:
                    most_recent_model_output = perplex.infer("Craft a response like 'Nice!', or 'Great!', or 'You're on the right track!'")

                    if current_instruction_number < len(current_instruction_list) - 1:

                        next_state_to_handle = "Action Verification"
                        current_instruction_number += 1

                    else:
                        next_state_to_handle = "Congratulations"

                elif "xx" in model_response:
                    most_recent_model_output = "Ok! I think we might need to try something different. Let me think for a minute."
                    issues += f"\n {_text_after_marker(model_response, 'xx')}"

                    next_state_to_handle = "Instruction Generation"

                else:
                    most_recent_model_output = model_response
            else:
                formatted_prompt = prompts["Congratulations"].format(instructionList=current_instruction_string)

                most_recent_model_output = perplex.infer(formatted_prompt)

                # Start the next task from a clean slate so earlier
                # clarifications and issues do not leak into it.
                current_goal = None
                clarification_questions = []
                clarification_answers = []
                issues = ""

                next_state_to_handle = "Determine Goal"





            


In [45]:
# Worker thread that runs the conversation state machine. It is a daemon
# thread so that it cannot keep the process alive once the Flask server has
# stopped (nothing in this notebook sets should_exit_program to True).
state_handler_thread = Thread(target=state_handler, daemon=True)

# Set to True by the /search endpoint once the worker thread has been started.
thread_started = False

In [46]:
# Flask application object; the routes below are registered on it.
app = Flask(__name__)


@app.route("/")
def home():
    """Answer requests to the root URL with a fixed greeting."""
    greeting = "Hello!"
    return greeting

@app.route("/search", methods=["POST"])
def api_endpoint():
    """Pass one user message to the state handler and return its reply.

    Expects a JSON body with a non-empty string ``"query"`` field. The first
    valid request also starts the state-handler thread.

    Returns
    -------
    str
        The latest model output (empty string when none is available), or a
        ``(message, 400)`` pair when the request body is not usable.

    NOTE(review): two simultaneous first requests could both try to start the
    worker thread; confirm whether concurrent clients need to be supported.
    """
    global most_recent_user_input
    global thread_started

    # Validate before touching the lock: an empty query would trip the state
    # handler's assertion and leave this request waiting on the lock forever.
    payload = request.get_json(silent=True)
    query = payload.get("query") if isinstance(payload, dict) else None
    if not isinstance(query, str) or not query.strip():
        return "Request body must be JSON with a non-empty string 'query' field.", 400

    if not thread_started:
        # Hold the lock before the worker starts so it waits for the first query.
        proceed_to_next_state_mutex.acquire()
        state_handler_thread.start()
        thread_started = True

    most_recent_user_input = query

    # Let the state handler run, then wait to get the lock back.
    proceed_to_next_state_mutex.release()

    proceed_to_next_state_mutex.acquire()

    # TODO: If in Determine Goal/goal determined make sure to check with the user to see if that is really what they want

    # A view must not return None, so fall back to an empty body.
    return most_recent_model_output if most_recent_model_output is not None else ""

In [47]:
# Start the Flask server; this call blocks until it is interrupted.
# NOTE(review): host="0.0.0.0" listens on every network interface, and the
# /search route (which leads to screenshots being taken) has no
# authentication — confirm this exposure is intended.
app.run(host="0.0.0.0")

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
