In [1]:
# Installs detectron2 straight from its GitHub repository (no version or commit pin).
# NOTE(review): none of the cells visible in this notebook import detectron2 — confirm this install is still needed.
pip install git+https://github.com/facebookresearch/detectron2.git

Collecting git+https://github.com/facebookresearch/detectron2.git
  Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-jz9o1vdd
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-jz9o1vdd
  Resolved https://github.com/facebookresearch/detectron2.git to commit 31bebdea147c96f8a00a0d55931858bf727ae370
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yacs>=0.1.8 (from detectron2==0.6)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting fvcore<0.1.6,>=0.1.5 (from detectron2==0.6)
  Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting iopath<0.1.10,>=0.1.7 (from detectron2==0.6)
  Downloading iopath-0.1.9-py3-none-any.whl.metadata (370 bytes)
Collecting omegaconf<2.

In [1]:
import torch
import torchvision
from torchvision.models.detection import maskrcnn_resnet50_fpn
from torchvision.transforms import functional as F
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np

def load_model():
    """Build torchvision's Mask R-CNN (ResNet-50 FPN) with pretrained weights.

    Returns:
        The model, already switched to evaluation mode.
    """
    segmenter = maskrcnn_resnet50_fpn(pretrained=True)
    segmenter.eval()
    return segmenter

def preprocess_image(image_path):
    """Read an image file and turn it into a single-item batch tensor.

    Args:
        image_path: Path of the image to open with PIL.

    Returns:
        The RGB image converted with torchvision's ``to_tensor`` and given
        one extra leading dimension via ``unsqueeze(0)``.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    return F.to_tensor(rgb_image).unsqueeze(0)

def segment_image(model, image_tensor):
    """Run ``model`` on ``image_tensor`` with gradient tracking disabled.

    Args:
        model: Callable returning an indexable sequence of results.
        image_tensor: Input handed to ``model`` unchanged.

    Returns:
        The first element of the model output.
    """
    with torch.no_grad():
        outputs = model(image_tensor)
        first_result = outputs[0]
    return first_result

def visualize_segmentation(image, masks, scores, threshold=0.5):
    """Overlay red mask outlines on the image and save the figure.

    Args:
        image: Image tensor; it is squeezed and permuted from channel-first to
            channel-last before plotting (assumes a (1, C, H, W) batch — TODO confirm).
        masks: Iterable of mask tensors, one per detection.
        scores: Iterable of detection scores, paired with ``masks``.
        threshold: Only detections whose score is greater than this are drawn.

    Side effects:
        Writes 'segmented_image.png' in the working directory and closes the
        current matplotlib figure.
    """
    channel_last = image.squeeze().permute(1, 2, 0).numpy()

    plt.figure(figsize=(12, 8))
    plt.imshow(channel_last)

    for mask, score in zip(masks, scores):
        if not score > threshold:
            continue
        # Binarise the soft mask at 0.5 so the contour follows the object outline.
        outline_source = (mask.squeeze().numpy() > 0.5).astype(int)
        plt.contour(outline_source, colors=['red'], alpha=0.5, linewidths=2)

    plt.axis('off')
    plt.tight_layout()
    plt.savefig('segmented_image.png')
    plt.close()

def main(image_path):
    """Segment one image end to end and save the outlined visualisation.

    Args:
        image_path: Path of the image to segment.

    Returns:
        Tuple ``(masks, scores)`` taken from the model prediction.
    """
    segmenter = load_model()
    batch = preprocess_image(image_path)
    result = segment_image(segmenter, batch)

    masks, scores = result['masks'], result['scores']

    visualize_segmentation(batch, masks, scores)
    print(f"Segmentation complete. Output saved as 'segmented_image.png'")
    return masks, scores

# Entry point for this cell: runs the segmentation pipeline on one hard-coded image.
# `image_path` is assigned at module level, so it stays bound after the cell runs.
if __name__ == "__main__":
    # Absolute path under /content; the file must exist there for the cell to run.
    image_path = "/content/WhatsApp Image 2024-02-27 at 3.07.59 PM (2).jpeg"
    main(image_path)



Segmentation complete. Output saved as 'segmented_image.png'


In [3]:
import cv2
import numpy as np
from PIL import Image
import sqlite3
import os
import uuid
import torch

def extract_objects(image_path, masks, scores, threshold=0.5):
    """Cut every confidently detected object out of the image.

    Args:
        image_path: Path of the original image; it is re-read here with OpenCV.
        masks: Iterable of per-detection mask tensors (assumes each squeezes to
            an (H, W) map matching the image size — TODO confirm against caller).
        scores: Iterable of detection scores, paired with ``masks``.
        threshold: Only detections whose score is greater than this are kept.

    Returns:
        List of RGB ``numpy`` arrays, one per kept detection. Each is the
        masked object (background pixels zeroed) cropped to the bounding box
        of its mask.

    Raises:
        OSError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"Could not read image: {image_path}")
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    extracted_objects = []
    for mask, score in zip(masks, scores):
        if not score > threshold:
            continue

        # Binarise the soft mask at 0.5 into a 0/255 uint8 image.
        binary_mask = (mask.squeeze().detach().cpu().numpy() > 0.5).astype(np.uint8) * 255
        if not binary_mask.any():
            # Nothing above 0.5: the crop would be zero-sized and unusable downstream.
            continue

        # Redraw all outer contours filled, which also fills holes inside the mask.
        contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        object_mask = np.zeros(binary_mask.shape, dtype=np.uint8)
        cv2.drawContours(object_mask, contours, -1, (255), thickness=cv2.FILLED)

        # Keep only the pixels under the mask, then crop to the mask's bounding box.
        extracted_object = cv2.bitwise_and(image_rgb, image_rgb, mask=object_mask)
        x, y, w, h = cv2.boundingRect(object_mask)
        extracted_objects.append(extracted_object[y:y+h, x:x+w])

    return extracted_objects

def save_objects(extracted_objects, output_folder):
    """Save each extracted object as ``<uuid>.png`` and return the generated IDs.

    Args:
        extracted_objects: Iterable of RGB image arrays, as produced by
            ``extract_objects`` in this notebook (which crops from an RGB image).
        output_folder: Destination directory; created if it does not exist.

    Returns:
        List of the UUID strings used as file names, in input order.
    """
    os.makedirs(output_folder, exist_ok=True)

    object_ids = []
    for obj in extracted_objects:
        obj_id = str(uuid.uuid4())  # Generate a unique ID

        # The crops are already RGB, so they go to PIL unchanged. A further
        # BGR->RGB conversion here would swap red and blue in the saved file.
        img = Image.fromarray(np.ascontiguousarray(obj))
        img.save(os.path.join(output_folder, f"{obj_id}.png"))
        object_ids.append(obj_id)

    return object_ids

def create_database():
    """Open 'objects_database.db' and make sure the ``objects`` table exists.

    Returns:
        The open ``sqlite3`` connection; the caller is responsible for closing it.
    """
    connection = sqlite3.connect('objects_database.db')
    connection.cursor().execute('''CREATE TABLE IF NOT EXISTS objects
                 (id TEXT PRIMARY KEY, master_id TEXT)''')
    connection.commit()
    return connection

def store_metadata(conn, object_ids, master_id):
    """Insert one ``objects`` row per object ID, all sharing ``master_id``.

    Args:
        conn: Open ``sqlite3`` connection with an ``objects(id, master_id)`` table.
        object_ids: Iterable of object ID strings.
        master_id: ID linking every object to its source image.
    """
    rows = [(obj_id, master_id) for obj_id in object_ids]
    conn.cursor().executemany("INSERT INTO objects (id, master_id) VALUES (?, ?)", rows)
    conn.commit()

def main(image_path, model):
    """Segment an image, save each object crop, and record the IDs in SQLite.

    Args:
        image_path: Path of the image to process.
        model: Segmentation model; called on a (1, C, H, W) float tensor scaled
            to [0, 1], and expected to return a sequence whose first item has
            'masks' and 'scores' entries.

    Returns:
        Tuple ``(object_ids, master_id)``: the UUIDs of the saved crops and the
        UUID generated for the source image.
    """
    # Use the model to get masks and scores
    image = Image.open(image_path).convert("RGB")
    image_tensor = torch.from_numpy(np.array(image).transpose((2, 0, 1))).float().unsqueeze(0) / 255.0

    with torch.no_grad():
        prediction = model(image_tensor)[0]

    masks = prediction['masks']
    scores = prediction['scores']

    # Extract objects
    extracted_objects = extract_objects(image_path, masks, scores)

    # Save objects and get their IDs
    output_folder = "extracted_objects"
    object_ids = save_objects(extracted_objects, output_folder)

    # Generate a master ID for the original image
    master_id = str(uuid.uuid4())

    # Store metadata in the database; always release the connection, even if
    # the insert fails.
    conn = create_database()
    try:
        store_metadata(conn, object_ids, master_id)
    finally:
        conn.close()

    print(f"Extracted {len(object_ids)} objects. Master ID: {master_id}")
    return object_ids, master_id

# Entry point for this cell: builds a fresh pretrained Mask R-CNN and runs the
# extraction pipeline on one hard-coded image. `image_path` and `model` are
# assigned at module level, so they stay bound after the cell runs.
if __name__ == "__main__":
    # Imported here rather than with the other imports at the top of the cell.
    from torchvision.models.detection import maskrcnn_resnet50_fpn

    image_path = "/content/WhatsApp Image 2024-02-27 at 3.07.59 PM (2).jpeg"
    model = maskrcnn_resnet50_fpn(pretrained=True)
    # Switch to evaluation mode before inference.
    model.eval()

    main(image_path, model)



Extracted 11 objects. Master ID: 8f07c123-bbf2-4630-b9ea-951ef1958f02


In [5]:
import torch
from PIL import Image
import clip
import os
import json

def load_clip_model():
    """Load the CLIP "ViT-B/32" model on the GPU when available, else on CPU.

    Returns:
        Tuple ``(model, preprocess, device)`` where ``device`` is the string
        "cuda" or "cpu" that was passed to ``clip.load``.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    clip_model, preprocess = clip.load("ViT-B/32", device=device)
    return clip_model, preprocess, device

def identify_object(model, preprocess, image_path, device):
    """Classify one image against a fixed category list with CLIP.

    Args:
        model: CLIP model exposing ``encode_image`` and ``encode_text``.
        preprocess: Callable turning a PIL image into a model-ready tensor.
        image_path: Path of the image to classify.
        device: Device the input tensors are moved to.

    Returns:
        Tuple ``(results, description)``. ``results`` is a list of the five
        best ``{"category", "confidence"}`` dicts, best first; ``description``
        is a short sentence naming the top category and up to two runners-up.
    """
    # Close the file handle as soon as the tensor has been built.
    with Image.open(image_path) as pil_image:
        image = preprocess(pil_image).unsqueeze(0).to(device)

    # List of potential object categories
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
        "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
        "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
        "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
        "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote",
        "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book",
        "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
    ]

    text = clip.tokenize(categories).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)

        # L2-normalise both embeddings so the dot product is a cosine
        # similarity. Without this, raw feature magnitudes distort the
        # softmax and the reported confidences.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        values, indices = similarity[0].topk(5)

    results = [
        {"category": categories[idx], "confidence": value.item()}
        for value, idx in zip(values, indices)
    ]

    # Generate a description
    top_category = results[0]["category"]
    description = f"This image appears to contain a {top_category}. "
    # Only mention as many runners-up as actually exist.
    if len(results) > 2:
        description += f"It might also be a {results[1]['category']} or a {results[2]['category']}."
    elif len(results) == 2:
        description += f"It might also be a {results[1]['category']}."

    return results, description

def process_objects(input_folder, model, preprocess, device):
    """Identify every ``.png`` file in ``input_folder``.

    Args:
        input_folder: Directory holding the object images.
        model, preprocess, device: Passed through to ``identify_object``.

    Returns:
        Dict keyed by file name without the ``.png`` suffix; each value holds
        the "top_categories" list and the "description" string for that image.
    """
    png_names = [name for name in os.listdir(input_folder) if name.endswith(".png")]

    object_descriptions = {}
    for name in png_names:
        results, description = identify_object(
            model, preprocess, os.path.join(input_folder, name), device
        )
        # Strip the 4-character ".png" suffix to recover the object ID.
        object_descriptions[name[:-4]] = {
            "top_categories": results,
            "description": description,
        }

    return object_descriptions

def save_descriptions(object_descriptions, output_file):
    """Write ``object_descriptions`` to ``output_file`` as 2-space-indented JSON."""
    with open(output_file, mode='w') as out_handle:
        json.dump(object_descriptions, out_handle, indent=2)

def main(input_folder):
    """Label every object image in ``input_folder`` with CLIP and save the result.

    The descriptions are written to 'object_descriptions.json' in the working
    directory.
    """
    clip_model, preprocess, device = load_clip_model()
    descriptions = process_objects(input_folder, clip_model, preprocess, device)

    output_file = "object_descriptions.json"
    save_descriptions(descriptions, output_file)

    print(f"Object identification complete. Descriptions saved to {output_file}")

# Entry point for this cell: runs CLIP identification over the saved object crops.
# `input_folder` is assigned at module level, so it stays bound after the cell runs.
if __name__ == "__main__":
    input_folder = "extracted_objects"  # Folder containing extracted object images from Step 2
    main(input_folder)

100%|███████████████████████████████████████| 338M/338M [00:03<00:00, 92.8MiB/s]


Object identification complete. Descriptions saved to object_descriptions.json


In [8]:
import easyocr
import os
import json
from PIL import Image
import numpy as np

def load_ocr_reader(languages=['en']):
    """Create an EasyOCR reader for the given language codes.

    Args:
        languages: List of language codes handed to ``easyocr.Reader``;
            defaults to English only.

    Returns:
        The constructed ``easyocr.Reader``.
    """
    ocr_reader = easyocr.Reader(languages)
    return ocr_reader

def extract_text(reader, image_path):
    """Run OCR on one image and return JSON-serialisable detections.

    Args:
        reader: OCR reader exposing ``readtext(image_array)``; each result is
            indexed as ``(bounding_box, text, confidence)``.
        image_path: Path of the image to read.

    Returns:
        List of dicts with "text", "confidence" and "bounding_box" keys, one
        per detection, containing only built-in Python types.
    """
    # Read the image and release the file handle once the pixels are copied.
    with Image.open(image_path) as image:
        image_np = np.array(image)

    # Perform OCR
    results = reader.readtext(image_np)

    # The output of this function is later written with json.dump, which
    # rejects NumPy scalars and arrays, so convert everything to native types.
    return [
        {
            "text": _to_native(result[1]),
            "confidence": _to_native(result[2]),
            "bounding_box": _to_native(result[0]),
        }
        for result in results
    ]


def _to_native(value):
    """Recursively replace NumPy scalars/arrays with built-in Python values."""
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, (list, tuple)):
        return [_to_native(item) for item in value]
    return value

def process_objects(input_folder, reader):
    """Run OCR on every ``.png``/``.jpg``/``.jpeg`` file in ``input_folder``.

    Args:
        input_folder: Directory holding the object images.
        reader: OCR reader passed through to ``extract_text``.

    Returns:
        Dict mapping each file name (extension removed) to its OCR results.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')

    object_text_data = {}
    for filename in os.listdir(input_folder):
        if not filename.endswith(image_suffixes):
            continue
        stem, _extension = os.path.splitext(filename)
        object_text_data[stem] = extract_text(reader, os.path.join(input_folder, filename))

    return object_text_data

def save_text_data(object_text_data, output_file):
    """Dump the per-object OCR results to ``output_file`` as indented JSON."""
    with open(output_file, 'w') as json_file:
        json.dump(object_text_data, fp=json_file, indent=2)

def main(input_folder):
    """OCR every object image in ``input_folder`` and save the results.

    The results are written to 'object_text_data.json' in the working directory.

    Returns:
        Dict mapping object IDs to their OCR detections.
    """
    output_file = "object_text_data.json"

    ocr_reader = load_ocr_reader()
    object_text_data = process_objects(input_folder, ocr_reader)
    save_text_data(object_text_data, output_file)

    print(f"Text extraction complete. Data saved to {output_file}")

    return object_text_data

# Entry point for this cell: runs OCR over the saved object crops.
# `input_folder` is assigned at module level, so it stays bound after the cell runs.
if __name__ == "__main__":
    input_folder = "extracted_objects"  # Folder containing extracted object images from Step 2
    main(input_folder)



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% CompleteText extraction complete. Data saved to object_text_data.json


In [15]:
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Download necessary NLTK data
# These run every time the cell executes. Presumably 'punkt' serves
# word_tokenize and 'stopwords' serves stopwords.words('english'), both of
# which extract_key_terms calls.
# NOTE(review): newer NLTK releases reportedly look up 'punkt_tab' rather than
# 'punkt' when tokenizing — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

def load_previous_data(identification_file, text_data_file):
    """Load the identification and OCR JSON files written by earlier steps.

    Returns:
        Tuple ``(identification_data, text_data)`` with the parsed contents of
        the two files, in that order.
    """
    loaded = []
    for path in (identification_file, text_data_file):
        with open(path, 'r') as handle:
            loaded.append(json.load(handle))

    identification_data, text_data = loaded
    return identification_data, text_data

def extract_key_terms(text, n=5):
    """Return the ``n`` most frequent meaningful words in ``text``.

    The text is lower-cased and tokenized; tokens that are not purely
    alphanumeric or that are English stopwords are discarded.
    """
    english_stopwords = set(stopwords.words('english'))
    term_counts = Counter(
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in english_stopwords
    )
    return [term for term, _count in term_counts.most_common(n)]

def generate_summary(object_id, identification, text_data):
    """Compose a one-paragraph text summary for a single object.

    Args:
        object_id: Identifier quoted in the summary.
        identification: Dict whose 'top_categories' list holds
            ``{"category", "confidence"}`` entries, best first.
        text_data: List of OCR items with a 'text' key; may be empty.

    Returns:
        The summary sentence(s) with surrounding whitespace stripped.
    """
    ranked = identification['top_categories']
    top_category = ranked[0]['category']
    confidence = ranked[0]['confidence']

    fragments = [f"Object {object_id} is identified as a {top_category} with {confidence:.2f} confidence. "]

    if len(ranked) > 1:
        second_category = ranked[1]['category']
        fragments.append(f"It might also be a {second_category}. ")

    if not text_data:
        fragments.append("No text was extracted from this object. ")
    else:
        extracted_text = ' '.join(item['text'] for item in text_data)
        key_terms = extract_key_terms(extracted_text)
        # Text was found but may reduce to nothing once stopwords are dropped.
        if key_terms:
            fragments.append(f"Key terms associated with this object are: {', '.join(key_terms)}. ")

    return ''.join(fragments).strip()

def process_objects(identification_data, text_data):
    """Build a summary string for every identified object.

    Objects with no entry in ``text_data`` are summarised with an empty OCR list.

    Returns:
        Dict mapping object ID to its summary.
    """
    return {
        object_id: generate_summary(object_id, identification, text_data.get(object_id, []))
        for object_id, identification in identification_data.items()
    }

def save_summaries(object_summaries, output_file):
    """Persist the object summaries to ``output_file`` as indented JSON."""
    with open(output_file, 'w') as summary_file:
        json.dump(object_summaries, summary_file, indent=2)

def main(identification_file, text_data_file):
    """Summarise every object from the identification and OCR outputs.

    The summaries are written to 'object_summaries.json' in the working directory.

    Returns:
        Dict mapping object ID to its summary string.
    """
    output_file = "object_summaries.json"

    identification_data, text_data = load_previous_data(identification_file, text_data_file)
    object_summaries = process_objects(identification_data, text_data)
    save_summaries(object_summaries, output_file)

    print(f"Object summarization complete. Summaries saved to {output_file}")

    return object_summaries

# Entry point for this cell: summarises objects from the two JSON files written earlier.
# Both file-name variables are assigned at module level and stay bound afterwards.
if __name__ == "__main__":
    identification_file = "object_descriptions.json"  # Output from Step 3
    text_data_file = "object_text_data.json"  # Output from Step 4
    main(identification_file, text_data_file)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Object summarization complete. Summaries saved to object_summaries.json


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [16]:
import json
import sqlite3
import os

def load_data_from_previous_steps():
    """Read the three JSON artefacts of steps 3-5 from the working directory.

    Returns:
        Tuple ``(object_descriptions, object_text_data, object_summaries)``.
    """
    def _read_json(path):
        with open(path, 'r') as handle:
            return json.load(handle)

    object_descriptions = _read_json('object_descriptions.json')  # Step 3
    object_text_data = _read_json('object_text_data.json')  # Step 4
    object_summaries = _read_json('object_summaries.json')  # Step 5

    return object_descriptions, object_text_data, object_summaries

def get_object_metadata_from_db():
    """Read every row of the ``objects`` table in 'objects_database.db'.

    Returns:
        Dict mapping object ID to ``{"master_id": <master id>}``.

    Raises:
        sqlite3.Error: If the query fails (for example, the table is missing).
            The connection is closed before the error propagates.
    """
    conn = sqlite3.connect('objects_database.db')
    try:
        c = conn.cursor()
        c.execute("SELECT id, master_id FROM objects")
        object_metadata = {row[0]: {"master_id": row[1]} for row in c.fetchall()}
    finally:
        # Release the database handle even when the SELECT raises.
        conn.close()
    return object_metadata

def map_data(object_metadata, object_descriptions, object_text_data, object_summaries):
    """Group per-object results under the master image they belong to.

    Args:
        object_metadata: Dict of object ID -> ``{"master_id": ...}``.
        object_descriptions: Dict of object ID -> identification data.
        object_text_data: Dict of object ID -> OCR results.
        object_summaries: Dict of object ID -> summary string.

    Returns:
        Dict of master ID -> ``{"objects": {object_id: {...}}}`` where each
        object entry has "identification", "extracted_text" and "summary".
        Missing pieces default to ``{}``, ``[]`` and ``""`` respectively.
    """
    mapped_data = {}

    for object_id, metadata in object_metadata.items():
        master_entry = mapped_data.setdefault(metadata['master_id'], {"objects": {}})
        master_entry["objects"][object_id] = {
            "identification": object_descriptions.get(object_id, {}),
            "extracted_text": object_text_data.get(object_id, []),
            "summary": object_summaries.get(object_id, ""),
        }

    return mapped_data

def save_mapped_data(mapped_data, output_file):
    """Serialise the mapped data to ``output_file`` as indented JSON."""
    with open(output_file, mode='w') as mapped_file:
        json.dump(mapped_data, mapped_file, indent=2)

def main():
    """Join the database metadata with the JSON outputs and save the result.

    The combined structure is written to 'mapped_data.json' in the working
    directory.

    Returns:
        The mapped data dict, keyed by master ID.
    """
    output_file = "mapped_data.json"

    descriptions, text_data, summaries = load_data_from_previous_steps()
    metadata = get_object_metadata_from_db()

    mapped_data = map_data(metadata, descriptions, text_data, summaries)
    save_mapped_data(mapped_data, output_file)

    print(f"Data mapping complete. Mapped data saved to {output_file}")

    return mapped_data

# Entry point for this cell: runs the mapping step; all inputs are read from
# fixed file names in the working directory.
if __name__ == "__main__":
    main()

Data mapping complete. Mapped data saved to mapped_data.json


In [17]:
import json
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageFont

def load_mapped_data(mapped_data_file):
    """Parse the JSON file at ``mapped_data_file`` and return its contents."""
    with open(mapped_data_file, 'r') as handle:
        mapped_data = json.load(handle)
    return mapped_data

def load_image(image_path):
    """Read ``image_path`` with ``cv2.imread`` and return whatever it yields."""
    loaded = cv2.imread(image_path)
    return loaded

def annotate_image(image, objects_data):
    """Draw a red box and label for every object onto a copy of the image.

    Args:
        image: BGR image array, as loaded by OpenCV.
        objects_data: Dict with an 'objects' mapping of object ID -> object
            data. Each object may carry 'bounding_box' as ``[x, y, w, h]`` and
            an 'identification' dict with a 'top_categories' list.

    Returns:
        RGB ``numpy`` array of the annotated image.
    """
    # Convert BGR to RGB
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(image_rgb)
    draw = ImageDraw.Draw(pil_image)

    # Use a default font
    font = ImageFont.load_default()

    for obj_id, obj_data in objects_data['objects'].items():
        # NOTE(review): the mapping step visible in this notebook never writes
        # 'bounding_box', so every object falls back to this default box and
        # the boxes overlap — confirm where real boxes should come from.
        bbox = obj_data.get('bounding_box', [0, 0, 100, 100])  # default values if not available
        x, y, w, h = bbox

        # Draw bounding box
        draw.rectangle([x, y, x+w, y+h], outline="red", width=2)

        # Objects without identification results (empty dict) get a placeholder
        # label instead of raising KeyError/IndexError.
        top_categories = (obj_data.get('identification') or {}).get('top_categories') or []
        label = top_categories[0]['category'] if top_categories else "unknown"

        # Place the label above the box, clamped so it never leaves the canvas
        # when the box touches the top edge.
        label_y = max(y - 20, 0)
        draw.text((x, label_y), f"{label} ({obj_id})", font=font, fill="red")

    return np.array(pil_image)

def create_summary_table(objects_data):
    """Build a one-row-per-object summary table.

    Args:
        objects_data: Dict with an 'objects' mapping of object ID -> dict
            holding 'identification', 'extracted_text' and 'summary'.

    Returns:
        ``pandas.DataFrame`` with columns 'Object ID', 'Category',
        'Confidence', 'Extracted Text' and 'Summary'. The columns are present
        even when there are no objects. Objects without identification
        results get missing values for category and confidence.
    """
    columns = ['Object ID', 'Category', 'Confidence', 'Extracted Text', 'Summary']

    rows = []
    for obj_id, obj_data in objects_data['objects'].items():
        # 'identification' can be an empty dict when no description was stored.
        top_categories = (obj_data.get('identification') or {}).get('top_categories') or []
        best = top_categories[0] if top_categories else {}
        rows.append({
            'Object ID': obj_id,
            'Category': best.get('category'),
            'Confidence': best.get('confidence'),
            'Extracted Text': '; '.join(text['text'] for text in (obj_data.get('extracted_text') or [])),
            'Summary': obj_data.get('summary', ''),
        })

    # Explicit columns keep the header stable for an empty object set.
    return pd.DataFrame(rows, columns=columns)

def save_output(annotated_image, summary_table, output_image_path, output_table_path):
    """Write the annotated image as a figure and the summary table as CSV.

    Args:
        annotated_image: Image array to render.
        summary_table: Table object exposing ``to_csv``.
        output_image_path: Destination of the rendered figure.
        output_table_path: Destination of the CSV (written without the index).
    """
    # Render the image on a 12x8 figure with the axes hidden.
    fig = plt.figure(figsize=(12, 8))
    ax = fig.gca()
    ax.imshow(annotated_image)
    ax.axis('off')
    fig.tight_layout()
    fig.savefig(output_image_path)
    plt.close(fig)

    summary_table.to_csv(output_table_path, index=False)

def main(mapped_data_file, original_image_path):
    """Produce an annotated image and a CSV summary for each master entry.

    Args:
        mapped_data_file: Path of the mapped-data JSON file.
        original_image_path: Path of the source image. The same image is used
            for every master ID in the file, because the mapped data does not
            record which image a master ID came from.

    Side effects:
        Writes 'annotated_image_<master_id>.png' and
        'summary_table_<master_id>.csv' in the working directory for each
        master ID, and prints the generated file names.
    """
    # Load mapped data
    mapped_data = load_mapped_data(mapped_data_file)

    # The source image is identical for every master ID, so read it once.
    # annotate_image works on a converted copy and leaves this array untouched.
    original_image = load_image(original_image_path)

    # Process each master image
    for master_id, master_data in mapped_data.items():
        # Annotate image
        annotated_image = annotate_image(original_image, master_data)

        # Create summary table
        summary_table = create_summary_table(master_data)

        # Save outputs
        output_image_path = f"annotated_image_{master_id}.png"
        output_table_path = f"summary_table_{master_id}.csv"
        save_output(annotated_image, summary_table, output_image_path, output_table_path)

        print(f"Output generated for master image {master_id}:")
        print(f"- Annotated image saved as {output_image_path}")
        print(f"- Summary table saved as {output_table_path}")

# Entry point for this cell: generates the annotated image and CSV for every
# master ID in the mapped data. Both variables are assigned at module level
# and stay bound after the cell runs.
if __name__ == "__main__":
    mapped_data_file = "mapped_data.json"
    original_image_path = "/content/WhatsApp Image 2024-02-27 at 3.07.59 PM (2).jpeg"  # Replace with actual path
    main(mapped_data_file, original_image_path)

Output generated for master image 8f07c123-bbf2-4630-b9ea-951ef1958f02:
- Annotated image saved as annotated_image_8f07c123-bbf2-4630-b9ea-951ef1958f02.png
- Summary table saved as summary_table_8f07c123-bbf2-4630-b9ea-951ef1958f02.csv
