In [315]:
import os


class dataloader(object):
    """Pair captcha images with the labels stored in matching text files.

    Each ``.jpg`` file in ``input_dir`` is matched to a ``.txt`` file in
    ``output_dir`` whose name is the image's base name with ``"input"``
    replaced by ``"output"`` (e.g. ``input00.jpg`` -> ``output00.txt``).
    Images that have no matching text file are skipped.

    Attributes are documented on ``__init__``.
    """

    def __init__(self, input_dir, output_dir, training_size = 99):
        """Collect up to ``training_size`` (image path, label) pairs.

        args:
            input_dir: directory that is scanned for ``.jpg`` files
            output_dir: directory that holds the label ``.txt`` files
            training_size: stop once this many pairs have been collected;
                a value below 1 never matches the count, so every pair is kept

        The result is stored in ``self.data_dict`` (image path -> stripped
        label text), filled in sorted filename order.
        """
        data_dict = {}
        # os.listdir() gives no ordering guarantee; sorting makes the selected
        # subset the same on every run and every machine.
        for filename in sorted(os.listdir(input_dir)):
            if not filename.endswith(".jpg"):
                continue

            img_path = os.path.join(input_dir, filename)

            # Swap only the real extension; a plain str.replace would also
            # rewrite ".jpg" occurring in the middle of the name.
            stem, _extension = os.path.splitext(filename)
            txt_filename = stem.replace("input", "output") + ".txt"
            txt_path = os.path.join(output_dir, txt_filename)

            if not os.path.exists(txt_path):
                continue

            with open(txt_path, 'r') as txt_file:
                data_dict[img_path] = txt_file.read().strip()

            # Keys are unique per file, so the dict size is the number loaded.
            if len(data_dict) == training_size:
                break

        self.data_dict = data_dict

In [272]:
import base64
import json
import os

import langchain
from langchain.chains import TransformChain
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.runnables import chain
from langchain_openai import ChatOpenAI

# Read the key from the OPENAI_API_KEY environment variable so the real secret
# never has to be typed into (and saved with) the notebook. The original
# placeholder string is kept as the fallback when the variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY", "OpenAI API Key")

class llm_model(object):
    """Bundle the chat model client with the output-format instruction text."""

    def __init__(self):
        """Store the format instruction and create the ``gpt-4o`` chat client.

        The client is configured with the module-level ``openai_api_key``,
        temperature 0.0 and a 1024-token response limit.
        """
        # Instruction text telling the model to answer with a dict-style
        # string only, so the reply can be parsed without clean-up.
        self.format_instruction = """Infer the 5 characters (Only capital letters and numbers) shown in the capcha image provide. 
             Provide your output in the format of a python dict object code {"InferredCharacters": "XXXXX"}.
             Do not provide any other information. 
             Do not provide the backticks or any characters not related to the dict string such that I can convert to dict object directly by json.load(result)"""
        self.llm = ChatOpenAI(
            openai_api_key=openai_api_key,
            temperature=0.0,
            model="gpt-4o",
            max_tokens=1024,
        )
        

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode; the base64 bytes are decoded as UTF-8
    so the result is a ``str``.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')
            
def load_image(inputs: dict) -> dict:
    """Base64-encode the image file named by ``inputs["image_path"]``.

    Returns a new dict whose only key, ``"image"``, holds the encoded text
    produced by ``encode_image``.
    """
    return {"image": encode_image(inputs["image_path"])}

# First step of the vision chain: takes the "image_path" input and produces an
# "image" output by running load_image.
load_image_chain = TransformChain(
    transform=load_image,
    input_variables=["image_path"],
    output_variables=["image"],
)

In [312]:
from langchain.tools import tool
# Shared model wrapper: image_model reads `llm` and `format_instruction` from
# this module-level instance.
llm_agent = llm_model()
@chain
def image_model(inputs: dict) -> str:
    """Send the prompt and the base64 image to the chat model; return its reply.

    Reads ``inputs["prompt"]`` (text) and ``inputs["image"]`` (base64 text,
    sent as a JPEG data URL). The model and the system instruction are taken
    from the module-level ``llm_agent``.
    """
    model = llm_agent.llm
    prompt_text = inputs["prompt"]
    print("Final vision prompt\n", prompt_text)

    system_message = SystemMessage(content=llm_agent.format_instruction)
    text_part = {"type": "text", "text": prompt_text}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{inputs['image']}"},
    }
    human_message = HumanMessage(content=[text_part, image_part])

    reply = model.invoke([system_message, human_message])
    return reply.content

@tool
def convert_output_to_dict(llm_output:str) -> dict:
    """Convert the output of the LLM to a simple dict with only the predicted output as the key.

    Returns the parsed JSON object, or {"error": "Invalid JSON response"} when
    the text is not valid JSON or does not decode to a JSON object.
    """
    invalid_response = {"error": "Invalid JSON response"}
    try:
        response_dict = json.loads(llm_output)
    except (TypeError, json.JSONDecodeError):
        # TypeError covers non-text input (e.g. None), which json.loads rejects.
        return invalid_response
    # Valid JSON can still be a list, string or number; callers expect a dict.
    if not isinstance(response_dict, dict):
        return invalid_response
    return response_dict
    


def get_image_information(image_path: str, llm_model, finetune = False, training_data = None) -> dict:
    """Run the vision chain on one captcha image and return the parsed reply.

    args:
        image_path: path of the image to infer
        llm_model: accepted for interface compatibility but not used here;
            the image_model chain step reads the module-level ``llm_agent``
        finetune: when True, labelled samples from ``training_data`` are
            appended to the prompt
        training_data: object exposing ``data_dict`` (image path -> label);
            required when ``finetune`` is True

    returns:
        whatever the final chain step (convert_output_to_dict) produces

    raises:
        ValueError: ``finetune`` is True but ``training_data`` is None
    """
    # Fail early with a clear message instead of an AttributeError on None.
    if finetune and training_data is None:
        raise ValueError("training_data is required when finetune=True")

    vision_prompt = """Given the Captcha image, where 
    1. the number of characters remains the same each time.  
    2. the font and spacing is the same each time.  
    3. the background and foreground colors and texture, remain largely the same.
    4. there is no skew in the structure of the characters.
    5. the captcha generator, creates strictly 5-character captchas, and each of the characters is either an upper-case character (A-Z) or a numeral (0-9).
    
    Infer the relevant image characters (Letters and Numbers)
    
    """
    if finetune:
        few_shot_prompt = """A list of sample data are as per below to fine-tune your performance for the given task: 
        """
        # Collect the pieces and join once rather than re-copying the growing
        # prompt (which contains large base64 payloads) on every iteration.
        prompt_parts = [vision_prompt, few_shot_prompt]
        for data_path, label in training_data.data_dict.items():
            # NOTE: each sample image is embedded in the prompt as base64 text.
            encoded_image_data = encode_image(data_path)
            prompt_parts.append(f"""Image data: {encoded_image_data} -> True label: {label} \n\n""")
        vision_prompt = "".join(prompt_parts)

    vision_chain = load_image_chain | image_model | convert_output_to_dict
    return vision_chain.invoke({'image_path': f'{image_path}',
                               'prompt': vision_prompt})

In [313]:
# Paths used by the cells that follow. input_dir / output_dir are defined here,
# ahead of their first use in the dataloader(...) call, so the notebook runs
# top to bottom on a fresh kernel.
# NOTE(review): these are absolute, machine-specific locations - adjust them
# for your environment.
input_dir = r"C:\Users\guang\OneDrive\Desktop\IMDA\sampleCaptchas\input"
output_dir = r"C:\Users\guang\OneDrive\Desktop\IMDA\sampleCaptchas\output"
sample_image = "C:/Users/guang/OneDrive/Desktop/IMDA/sampleCaptchas/input/input00.jpg"

In [316]:
# training_size sets how many labelled samples are added to the prompt
# (choose an integer between 1 and 25).
training_data = dataloader(input_dir, output_dir, training_size=3)
result = get_image_information(
    sample_image,
    llm_agent,
    finetune=True,
    training_data=training_data,
)

Final vision prompt
 Given the Captcha image, where 
    1. the number of characters remains the same each time.  
    2. the font and spacing is the same each time.  
    3. the background and foreground colors and texture, remain largely the same.
    4. there is no skew in the structure of the characters.
    5. the captcha generator, creates strictly 5-character captchas, and each of the characters is either an upper-case character (A-Z) or a numeral (0-9).
    
    Infer the relevant image characters (Letters and Numbers)
    
    A list of sample data are as per below to fine-tune your performance for the given task: 
        Image data: /9j/4AAQSkZJRgABAQAAAQABAAD//gA+Q1JFQVRPUjogZ2QtanBlZyB2MS4wICh1c2luZyBJSkcgSlBFRyB2NjIpLCBkZWZhdWx0IHF1YWxpdHkK/9sAQwAIBgYHBgUIBwcHCQkICgwUDQwLCwwZEhMPFB0aHx4dGhwcICQuJyAiLCMcHCg3KSwwMTQ0NB8nOT04MjwuMzQy/9sAQwEJCQkMCwwYDQ0YMiEcITIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIy/8AAEQgAHgA8AwEiAAIRAQMRAf/EAB8AAAEFAQEBAQEBAAAAAAAA

In [311]:
result

{'InferredCharacters': 'EGYK4'}

In [317]:
# Sample captcha images live under <root>\input and their label files under
# <root>\output.
_captcha_root = r"C:\Users\guang\OneDrive\Desktop\IMDA\sampleCaptchas"
input_dir = _captcha_root + "\\input"
output_dir = _captcha_root + "\\output"


class Captcha(object):
    """Callable that infers the characters of a captcha image via the LLM chain."""

    def __init__(self, finetune = False, llm_agent = None, input_dir="", output_dir=""):
        """
        args:
            finetune: when True, labelled samples are loaded and added to the prompt
            llm_agent: model wrapper forwarded to get_image_information
            input_dir: directory with the sample .jpg images (read only when finetune is True)
            output_dir: directory with the matching label .txt files (read only when finetune is True)
        """
        self.finetune = finetune
        self.llm_agent = llm_agent
        # Samples are only needed for the few-shot prompt; skipping the load
        # otherwise lets the default empty directories work with finetune=False.
        # training_size: select an integer between 1 to 25 to set the amount of
        # data used to fine tune.
        self.training_data = (
            dataloader(input_dir, output_dir, training_size = 3) if finetune else None
        )

    def __call__(self, im_path, save_path):
        """
        Algo for inference
        args:
            im_path: .jpg image path to load and to infer
            save_path: output file path to save the one-line outcome;
                nothing is written when it is None/empty or when the result
                carries no "InferredCharacters" entry
        returns:
            the dict produced by get_image_information (also printed)
        """
        result = get_image_information(
            im_path,
            self.llm_agent,
            finetune = self.finetune,
            training_data = self.training_data,
        )
        print(result)

        if save_path and isinstance(result, dict) and "InferredCharacters" in result:
            with open(save_path, "w") as out_file:
                out_file.write(f"{result['InferredCharacters']}\n")
        return result

In [318]:
# Build the inferencer with finetune switched off and run it on the sample
# image; None is passed as save_path.
captcha_inferencer = Captcha(
    finetune=False,
    llm_agent=llm_agent,
    input_dir=input_dir,
    output_dir=output_dir,
)
captcha_inferencer(sample_image, None)

Final vision prompt
 Given the Captcha image, where 
    1. the number of characters remains the same each time.  
    2. the font and spacing is the same each time.  
    3. the background and foreground colors and texture, remain largely the same.
    4. there is no skew in the structure of the characters.
    5. the captcha generator, creates strictly 5-character captchas, and each of the characters is either an upper-case character (A-Z) or a numeral (0-9).
    
    Infer the relevant image characters (Letters and Numbers)
    
    
{'InferredCharacters': 'EGYK4'}
