In [1]:
import os


class dataloader(object):
    """Pair captcha images with their ground-truth label files.

    Attributes:
        data_dict: maps each image path (``input_dir`` joined with the image
            file name) to the stripped text content of its label file.
    """

    def __init__(self, input_dir, output_dir, training_size = 99):
        """Scan ``input_dir`` for ``.jpg`` files and load the matching labels.

        The label file of an image is looked up in ``output_dir`` under the
        image's file name with the trailing ``.jpg`` swapped for ``.txt`` and
        every occurrence of "input" replaced by "output". Images that have no
        label file are skipped.

        Args:
            input_dir: directory containing the ``.jpg`` images.
            output_dir: directory containing the ``.txt`` label files.
            training_size: loading stops once this many pairs were collected.
        """
        data_dict = {}
        # os.listdir() yields entries in arbitrary order; sorting makes the
        # subset selected through training_size reproducible across runs.
        for filename in sorted(os.listdir(input_dir)):
            if not filename.endswith(".jpg"):
                continue

            img_path = os.path.join(input_dir, filename)

            # Swap only the trailing extension (str.replace would also rewrite
            # a ".jpg" that appears in the middle of the name).
            stem = filename[:-len(".jpg")]
            txt_filename = (stem + ".txt").replace("input", "output")
            txt_path = os.path.join(output_dir, txt_filename)

            if not os.path.exists(txt_path):
                continue

            # Keep the file open only for the read itself.
            with open(txt_path, 'r') as txt_file:
                label = txt_file.read().strip()

            data_dict[img_path] = label
            print(f"Pairing {filename} with {txt_filename}")
            if len(data_dict) == training_size:
                break

        self.data_dict = data_dict

In [2]:
import os
import langchain
from langchain.chains import TransformChain
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain_core.runnables import chain
import base64
import json
from langchain.tools import tool


# Read the OpenAI credential from the environment instead of embedding it in
# the notebook. The key that used to be committed on this line should be
# treated as leaked and revoked.
# NOTE(review): when OPENAI_API_KEY is unset this is None; ChatOpenAI is
# expected to fail at construction in that case — confirm with the installed
# langchain_openai version.
openai_api_key = os.environ.get("OPENAI_API_KEY")

class llm_model(object):
    """Bundle the chat model with the prompts used for captcha inference.

    Attributes:
        llm: ChatOpenAI client for model "gpt-4o" with temperature 0.0 and
            max_tokens 1024.
        format_instruction: text describing the required output format.
        vision_prompt: the prompt text; when ``finetune`` is true it is
            extended with ``few_shot_prompt`` and one example line per entry
            of ``training_data.data_dict``.
    """

    def __init__(self, vision_prompt, few_shot_prompt, finetune = True, training_data = None):
        """Create the chat client and assemble the prompt.

        Args:
            vision_prompt: base prompt text.
            few_shot_prompt: text appended to the base prompt when
                ``finetune`` is true.
            finetune: whether to append ``few_shot_prompt`` and the encoded
                examples to the prompt.
            training_data: object exposing ``data_dict`` (image path -> label);
                required when ``finetune`` is true.

        Raises:
            ValueError: if ``finetune`` is true but ``training_data`` is None.
        """
        self.llm = ChatOpenAI(openai_api_key=openai_api_key, temperature=0.0, model="gpt-4o", max_tokens=1024)
        self.format_instruction = """
             Provide your output in the format of a python dict object code {"InferredCharacters": "XXXXX"}.
             Do not provide any other information. 
             Do not provide the backticks or any characters not related to the dict string such that I can convert to dict object directly by json.load(result)"""
        self.vision_prompt = vision_prompt
        if finetune:
            if training_data is None:
                # Previously surfaced as an AttributeError on None.data_dict.
                raise ValueError("training_data is required when finetune is True")
            # One example line per labelled image: base64 text of the image
            # followed by its label, delimited by triple backticks.
            examples = [
                f"""```Encoded image data: {encode_image(data_path)} -> True label: {label}``` \n"""
                for data_path, label in training_data.data_dict.items()
            ]
            self.vision_prompt = vision_prompt + few_shot_prompt + "".join(examples)



def encode_image(image_path):
    """Return the content of the file at ``image_path`` as base64 text."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
        encoded_bytes = base64.b64encode(raw_bytes)
        return str(encoded_bytes, 'utf-8')
            
def load_image(inputs: dict) -> dict:
    """Read the file named by ``inputs["image_path"]`` and return it base64-encoded.

    Returns a dict with the single key ``"image"`` holding the encoded text.
    """
    return {"image": encode_image(inputs["image_path"])}

# Chain step wrapping load_image: consumes "image_path", produces "image".
load_image_chain = TransformChain(
    transform=load_image,
    input_variables=["image_path"],
    output_variables=["image"],
)

# Base prompt text. It is handed to llm_model, whose vision_prompt attribute
# is later sent as the text part of the human message next to the captcha image.
# The wording is part of the model input: edit it deliberately, not cosmetically.
vision_prompt = """Given the Captcha image, where 
1. the font and spacing is the same each time.  
2. the background and foreground colors and texture, remain largely the same.
3. there is no skew in the structure of the characters.
4. the captcha generator, creates strictly 5-character captchas, and each of the characters is either an upper-case character (A-Z) or a numeral (0-9).
5. They resemble each other very much where the identify that the texture, nature of the font, spacing of the font, morphological characteristic of the letters and numerals arev very consistent.
Infer the image's characters (Only capital letters and numbers)

"""

# Extra prompt text that llm_model appends to the base prompt only when its
# finetune flag is true; the encoded example lines are appended right after it.
few_shot_prompt = """Few shot prompting fine tunning: it is very important to take note of the difference between:
0 and O  

and

1 and I

Here are some samples describing the challenging characters for you to tell the difference between 0, O, 1 and I are provided delimited by triple backticks: 
"""


# Folders holding the few-shot sample images and their labels. The defaults
# are the original author's local folders; set CAPTCHA_FINETUNE_INPUT_DIR /
# CAPTCHA_FINETUNE_OUTPUT_DIR to run the notebook on another machine.
finetune_input_dir = os.environ.get(
    "CAPTCHA_FINETUNE_INPUT_DIR",
    r"C:\Users\guang\OneDrive\Desktop\IMDA\fine-tune-data\input",
)
finetune_output_dir = os.environ.get(
    "CAPTCHA_FINETUNE_OUTPUT_DIR",
    r"C:\Users\guang\OneDrive\Desktop\IMDA\fine-tune-data\output",
)
training_data = dataloader(input_dir = finetune_input_dir, output_dir = finetune_output_dir)
# finetune = False: the prompt is used as-is; training_data is passed but not
# read by llm_model in that mode.
llm_agent = llm_model(vision_prompt, few_shot_prompt, finetune = False, training_data = training_data)

@chain
def image_model(inputs: dict) -> str:
    """Send the prompt and the base64 image to the chat model; return the reply text.

    Reads ``inputs["prompt"]`` and ``inputs["image"]``; the system message
    carries ``llm_agent.format_instruction``.
    """
    chat_model = llm_agent.llm
    system_message = SystemMessage(content=llm_agent.format_instruction)
    text_part = {"type": "text", "text": inputs["prompt"]}
    # The image travels inline as a JPEG data URL.
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{inputs['image']}"},
    }
    human_message = HumanMessage(content=[text_part, image_part])
    reply = chat_model.invoke([system_message, human_message])
    return reply.content

@tool
def convert_output_to_dict(llm_output:str) -> dict:
    """Convert the output of the LLM to a simple dict with only the predicted output as the key"""
    # NOTE(review): the docstring is kept verbatim because the @tool decorator
    # presumably exposes it as the tool description — confirm before editing.
    #
    # Parsing strategy:
    #   1. try the reply as-is;
    #   2. if that fails, retry on the outermost "{...}" span so replies that
    #      are wrapped in code fences or surrounded by extra text still parse;
    #   3. anything that does not end up as a dict yields the error dict, so
    #      callers can rely on the declared return type.
    text = llm_output.strip()
    try:
        response_dict = json.loads(text)
    except json.JSONDecodeError:
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end <= start:
            return {"error": "Invalid JSON response"}
        try:
            response_dict = json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            return {"error": "Invalid JSON response"}
    if not isinstance(response_dict, dict):
        return {"error": "Invalid JSON response"}
    return response_dict
    


# Disabled earlier version of get_image_information, kept as a bare string
# literal so it is never executed. The active definition follows this block.
'''def get_image_information(image_path: str, llm_model, finetune = False, training_data = None) -> dict:
    vision_prompt = """Given the Captcha image, where 
    1. the number of characters remains the same each time.  
    2. the font and spacing is the same each time.  
    3. the background and foreground colors and texture, remain largely the same.
    4. there is no skew in the structure of the characters.
    5. the captcha generator, creates strictly 5-character captchas, and each of the characters is either an upper-case character (A-Z) or a numeral (0-9).
    6. They resemble each other very much where the identify that the texture, nature of the font, spacing of the font, morphological characteristic of the letters and numerals arev very consistent.
    Infer the image's characters (Only capital letters and numbers)

    """
    
    if finetune:
        few_shot_prompt = """Given the data, it is very important to take note of the difference between:
        ```0 and O```  
        and
        ```1 and I```
        In our dataset, their difference are consistent and you must fine-tune yourself to adapt to the font format
        A list of sample data describing the difference between 0, O, 1 and I are provided to allow you to fine-tune your performance for the given task: 
        """
        vision_prompt = vision_prompt + few_shot_prompt
        for data_path in training_data.data_dict.keys():
            encoded_image_data = encode_image(data_path)
            fine_tune_data = f"""Image data: {encoded_image_data} -> True label: {training_data.data_dict[data_path]} \n\n"""
            vision_prompt = vision_prompt + fine_tune_data
    
            
    vision_chain = load_image_chain | image_model | convert_output_to_dict
    return vision_chain.invoke({'image_path': f'{image_path}', 
                               'prompt': vision_prompt})'''

def get_image_information(image_path: str, llm_model) -> dict:
    """Run the load -> model -> parse pipeline on one image.

    Args:
        image_path: path of the image file to read.
        llm_model: object whose ``vision_prompt`` attribute supplies the prompt text.

    Returns:
        Whatever the final pipeline step (convert_output_to_dict) produces.
    """
    prompt_text = llm_model.vision_prompt
    pipeline = load_image_chain | image_model | convert_output_to_dict
    payload = {'image_path': f'{image_path}', 'prompt': prompt_text}
    return pipeline.invoke(payload)
    

Pairing input02B.jpg with output02B.txt
Pairing input04B.jpg with output04B.txt
Pairing input17B.jpg with output17B.txt
Pairing input22B.jpg with output22B.txt
Pairing input24B.jpg with output24B.txt


In [3]:
# Disabled sample cell, kept as a bare string literal so nothing here runs.
# NOTE(review): it is stale — it passes finetune/training_data keywords that
# the active get_image_information(image_path, llm_model) does not accept.
'''# Sample inferencing
sample_image = "C:/Users/guang/OneDrive/Desktop/IMDA/sampleCaptchas/input/input00.jpg"
training_data = dataloader(input_dir, output_dir, training_size = 3) # Select an integer between 1 to 25 to set the amount of data used to fine tune 
result = get_image_information(sample_image, llm_agent, finetune = True, training_data = training_data)
print(result)'''

'# Sample inferencing\nsample_image = "C:/Users/guang/OneDrive/Desktop/IMDA/sampleCaptchas/input/input00.jpg"\ntraining_data = dataloader(input_dir, output_dir, training_size = 3) # Select an integer between 1 to 25 to set the amount of data used to fine tune \nresult = get_image_information(sample_image, llm_agent, finetune = True, training_data = training_data)\nprint(result)'

In [4]:


class Captcha(object):
    """Callable wrapper that infers the characters of a captcha image."""

    def __init__(self, finetune = False, llm_agent = None):
        """
        args:
            finetune: stored on the instance; not consulted by __call__
            llm_agent: object forwarded to get_image_information as its
                llm_model argument
        """
        self.finetune = finetune
        self.llm_agent = llm_agent


    def __call__(self, im_path, save_path):
        """
        Algo for inference
        args:
            im_path: .jpg image path to load and to infer
            save_path: output file path to save the one-line outcome;
                pass None to skip writing
        returns:
            the value returned by get_image_information for im_path
        """
        result = get_image_information(im_path, self.llm_agent)
        if save_path is not None:
            # Persist only the inferred characters; an empty line is written
            # when the result carries no "InferredCharacters" entry.
            if isinstance(result, dict):
                inferred = result.get("InferredCharacters", "")
            else:
                inferred = ""
            with open(save_path, "w") as out_file:
                out_file.write(f"{inferred}\n")
        return result


In [5]:
# Build the inferencer around the shared llm_agent and run it once on a sample image
sample_image = "C:/Users/guang/OneDrive/Desktop/IMDA/sampleCaptchas/input/input00.jpg"

captcha_inferencer = Captcha(finetune = False, llm_agent = llm_agent)
captcha_inferencer(im_path = sample_image, save_path = None)

{'InferredCharacters': 'EGYK4'}

In [6]:
# Evaluate the inferencer on every labelled sample captcha and report accuracy.
input_dir = r"C:\Users\guang\OneDrive\Desktop\IMDA\sampleCaptchas\input"
output_dir = r"C:\Users\guang\OneDrive\Desktop\IMDA\sampleCaptchas\output"

inference_data = dataloader(input_dir, output_dir)

corrects = 0
attempts = 0
for data_path, true_label in inference_data.data_dict.items():
    inferred_output = captcha_inferencer(data_path, None)
    # The parser returns {"error": ...} when the reply is not valid JSON;
    # treat a missing prediction as a miss instead of raising KeyError.
    if isinstance(inferred_output, dict):
        predict = inferred_output.get('InferredCharacters')
    else:
        predict = None
    print(f"Predict result: {predict} versus True Label: {true_label}")
    if predict == true_label:
        corrects += 1
    else:
        print("incorrect! Predicting", data_path)
    attempts += 1

# Guard the ratio: with no labelled images the division would raise.
if attempts:
    print(f"Final remarks: corrects {corrects} out of {attempts} attempts. accuracy of {corrects/attempts:.2f}")
else:
    print("Final remarks: no labelled images found, nothing to evaluate")


# Common errors observed: O mistaken for 0, I mistaken for 1, 0 mistaken for O

Pairing input00.jpg with output00.txt
Pairing input01.jpg with output01.txt
Pairing input02.jpg with output02.txt
Pairing input03.jpg with output03.txt
Pairing input04.jpg with output04.txt
Pairing input05.jpg with output05.txt
Pairing input06.jpg with output06.txt
Pairing input07.jpg with output07.txt
Pairing input08.jpg with output08.txt
Pairing input09.jpg with output09.txt
Pairing input10.jpg with output10.txt
Pairing input11.jpg with output11.txt
Pairing input12.jpg with output12.txt
Pairing input13.jpg with output13.txt
Pairing input14.jpg with output14.txt
Pairing input15.jpg with output15.txt
Pairing input16.jpg with output16.txt
Pairing input17.jpg with output17.txt
Pairing input18.jpg with output18.txt
Pairing input19.jpg with output19.txt
Pairing input20.jpg with output20.txt
Pairing input22.jpg with output22.txt
Pairing input23.jpg with output23.txt
Pairing input24.jpg with output24.txt
Predict result: EGYK4 versus True Label: EGYK4
Predict result: GRC35 versus True Label: 