# SFO AI Hackathon - ReCaptcha Hacking

In [1]:
!pip install langchain -q

In [2]:
!pip install transformers -q

In [3]:
!pip install chromadb -q

In [4]:
#!pip3 install torch torchvision torchaudio

In [5]:
from langchain.document_loaders import ImageCaptionLoader

In [6]:
import os

# Get the current working directory
current_dir = os.getcwd()

# List all files and directories in the current directory
files = os.listdir(current_dir)

# Filter out any non-image files
images = []
for file in files:
    if file.endswith(".jpg") or file.endswith(".jpeg"):
        images.append(file)

print("Images found:", len(images))

Images found: 9


In [7]:
images

['output_image_0_2.jpg',
 'output_image_2_1.jpg',
 'output_image_0_1.jpg',
 'output_image_1_0.jpg',
 'output_image_2_0.jpg',
 'output_image_1_1.jpg',
 'output_image_2_2.jpg',
 'output_image_1_2.jpg',
 'output_image_0_0.jpg']

## Image Captioning using LangChain Image captions
# Salesforce BLIP image captioning model

In [9]:
loader = ImageCaptionLoader(images=images)
list_docs = loader.load()
list_docs

[Document(page_content='an image of a person riding a motorcycle [SEP]', metadata={'image_path': 'output_image_0_2.jpg'}),
 Document(page_content='an image of a street in the middle of a town [SEP]', metadata={'image_path': 'output_image_2_1.jpg'}),
 Document(page_content='an image of a house with a driveway [SEP]', metadata={'image_path': 'output_image_0_1.jpg'}),
 Document(page_content='an image of a building with a blue door [SEP]', metadata={'image_path': 'output_image_1_0.jpg'}),
 Document(page_content='an image of a building with a car parked in front [SEP]', metadata={'image_path': 'output_image_2_0.jpg'}),
 Document(page_content='an image of a street with cars parked on it [SEP]', metadata={'image_path': 'output_image_1_1.jpg'}),
 Document(page_content='an image of a parking lot with cars parked in it [SEP]', metadata={'image_path': 'output_image_2_2.jpg'}),
 Document(page_content='an image of a person riding a motorcycle [SEP]', metadata={'image_path': 'output_image_1_2.jpg'})

## Amazon Bedrock LLM and Embeddings

In [12]:
# Amazon Bedrock - boto3
import boto3

# Setup bedrock
bedrock_runtime = boto3.client(
    service_name="bedrock-runtime",
    region_name="us-east-1",
)
# LLM - Amazon Bedrock LLM using LangChain
from langchain.llms import Bedrock
model_id = "anthropic.claude-v2"
model_kwargs =  { 
    "max_tokens_to_sample": 4096,
    "temperature": 0.6,
    "top_k": 250,
    "top_p": 1,
    "stop_sequences": [
      "\n\nHuman:"
    ],
}
llm = Bedrock(
    client=bedrock_runtime,
    model_id=model_id,
    model_kwargs=model_kwargs
)
# Embeddings Model - Amazon Titan Embeddings Model using LangChain
from langchain.embeddings import BedrockEmbeddings
# create embeddings
bedrock_embedding = BedrockEmbeddings(
    client=bedrock_runtime,
    model_id="amazon.titan-embed-text-v1",
)

## Print image captions

In [13]:
for item in list_docs:
    print(item.page_content)
    print(item.metadata)

an image of a person riding a motorcycle [SEP]
{'image_path': 'output_image_0_2.jpg'}
an image of a street in the middle of a town [SEP]
{'image_path': 'output_image_2_1.jpg'}
an image of a house with a driveway [SEP]
{'image_path': 'output_image_0_1.jpg'}
an image of a building with a blue door [SEP]
{'image_path': 'output_image_1_0.jpg'}
an image of a building with a car parked in front [SEP]
{'image_path': 'output_image_2_0.jpg'}
an image of a street with cars parked on it [SEP]
{'image_path': 'output_image_1_1.jpg'}
an image of a parking lot with cars parked in it [SEP]
{'image_path': 'output_image_2_2.jpg'}
an image of a person riding a motorcycle [SEP]
{'image_path': 'output_image_1_2.jpg'}
an image of a motorcycle parked in front of a building [SEP]
{'image_path': 'output_image_0_0.jpg'}


## LangChain VectorStore using Chroma

In [14]:
from langchain.vectorstores import Chroma

vectorstore = Chroma.from_documents(
documents=list_docs,
embedding=bedrock_embedding
)

## Similarity Search with Score

In [15]:
question = "car"
docs = vectorstore.similarity_search_with_score(question, k=9)

In [16]:
docs

[(Document(page_content='an image of a building with a car parked in front [SEP]', metadata={'image_path': 'output_image_2_0.jpg'}),
  476.55633544921875),
 (Document(page_content='an image of a street with cars parked on it [SEP]', metadata={'image_path': 'output_image_1_1.jpg'}),
  477.7770080566406),
 (Document(page_content='an image of a parking lot with cars parked in it [SEP]', metadata={'image_path': 'output_image_2_2.jpg'}),
  522.1278076171875),
 (Document(page_content='an image of a house with a driveway [SEP]', metadata={'image_path': 'output_image_0_1.jpg'}),
  528.5362548828125),
 (Document(page_content='an image of a motorcycle parked in front of a building [SEP]', metadata={'image_path': 'output_image_0_0.jpg'}),
  530.2296752929688),
 (Document(page_content='an image of a person riding a motorcycle [SEP]', metadata={'image_path': 'output_image_0_2.jpg'}),
  546.64599609375),
 (Document(page_content='an image of a person riding a motorcycle [SEP]', metadata={'image_path'

## Using LLM (Claude V2) to validate Similarity Search output

In [23]:
question = "car"

query = f"""
\n\nHuman:
You are a Capcha expert.

Review the following context.
<context>
{docs}
</context>

Iterate each lines and provide output using the following template in json format:

- image_path: image_path , match: say Yes when data contains {question}, if not say No. 

Assistant:
"""
response = llm.invoke(query)
print(response)

 Here are the results in JSON format:

[
  {
    "image_path": "output_image_2_0.jpg", 
    "match": "Yes"
  },
  {
    "image_path": "output_image_1_1.jpg",
    "match": "Yes"  
  },
  {
    "image_path": "output_image_2_2.jpg",
    "match": "Yes"
  },
  {  
    "image_path": "output_image_0_1.jpg",
    "match": "No"
  },
  {
    "image_path": "output_image_0_0.jpg", 
    "match": "No"
  },
  {
    "image_path": "output_image_0_2.jpg",
    "match": "No"
  },
  {
    "image_path": "output_image_1_2.jpg", 
    "match": "No"
  },
  {
    "image_path": "output_image_2_1.jpg",
    "match": "No"
  },
  {
    "image_path": "output_image_1_0.jpg",
    "match": "No"
  }
]
