<a href="https://colab.research.google.com/github/meghanaaggadi-1/projects/blob/masterr/amazonML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install pytesseract opencv-python

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
import os
import requests
from PIL import Image
from io import BytesIO

def download_image(image_url, save_dir='images'):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    image_name = os.path.join(save_dir, image_url.split('/')[-1])
    response = requests.get(image_url)

    if response.status_code == 200:
        img = Image.open(BytesIO(response.content))
        img.save(image_name)
        return image_name
    else:
        print(f"Failed to download image: {image_url}")
        return None


In [None]:
ALLOWED_UNITS = {
    'item_weight': ['gram', 'kilogram', 'ounce', 'pound'],
    'item_volume': ['litre', 'millilitre'],
    'dimensions': ['centimetre', 'metre', 'millimetre', 'inch'],
    'power': ['watt', 'kilowatt'],
    # Add more entity types and units as needed
}


In [None]:
import cv2
import numpy as np

def preprocess_image(image_path, target_size=(224, 224)):
    img = cv2.imread(image_path)
    img = cv2.resize(img, target_size)
    img = img.astype('float32') / 255.0
    img = np.expand_dims(img, axis=0)  # Expand dimensions for model input
    return img


In [None]:
import pytesseract

def extract_text_from_image(image_path):
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray)
    return text


In [None]:
pip install pytesseract



In [13]:
import os
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from concurrent.futures import ThreadPoolExecutor
import urllib.request
import numpy as np
import csv
import subprocess
import time

# Start timing
start_time = time.time()

# 1. Limit the number of images for fast downloading
def download_image(image_url, save_dir):
    image_name = os.path.basename(image_url)
    image_path = os.path.join(save_dir, image_name)

    if not os.path.exists(image_path):
        urllib.request.urlretrieve(image_url, image_path)
    return image_path

def download_images(df, save_dir, limit=50):  # Limit the number of images for fast execution
    with ThreadPoolExecutor() as executor:
        for index, row in df.iterrows():
            if index >= limit:  # Only download a limited number of images for testing
                break
            executor.submit(download_image, row['image_link'], save_dir)

# 2. Load Data (limit dataset for speed)
train_df = pd.read_csv('/content/drive/MyDrive/dataset/train.csv').head(50)  # Limit to 50 rows for faster processing
test_df = pd.read_csv('/content/drive/MyDrive/dataset/test.csv').head(20)

# Adjust image paths if necessary
train_df['image_link'] = train_df['image_link'].apply(lambda x: os.path.join(image_save_dir, os.path.basename(x)))
test_df['image_link'] = test_df['image_link'].apply(lambda x: os.path.join(image_save_dir, os.path.basename(x)))

# Create image directory
image_save_dir = '/content/images/'
os.makedirs(image_save_dir, exist_ok=True)

# Download images for train and test data (limited)
download_images(train_df, image_save_dir, limit=50)
download_images(test_df, image_save_dir, limit=20)

# Verify the existence of images
print("Training Data Sample:")
print(train_df.head())
print("Test Data Sample:")
print(test_df.head())

# 3. Image Data Generator (smaller image size for faster processing)
datagen = ImageDataGenerator(rescale=1./255)

train_generator = datagen.flow_from_dataframe(
    train_df,
    directory=None,  # Local paths are in the image_link column
    x_col='image_link',
    y_col='entity_value',
    target_size=(64, 64),  # Use smaller image size for quicker processing
    batch_size=16,
    class_mode='categorical'
)

# Check the number of classes found
print("Classes found:", train_generator.class_indices)
print("Number of classes:", len(train_generator.class_indices))

# 4. Build a simpler model with ResNet50 (or use MobileNet for even faster results)
def build_model(num_classes):
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(64, 64, 3))  # Smaller input size
    model = Sequential([
        base_model,
        GlobalAveragePooling2D(),
        Dense(1024, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 5. Train the Model (reduce epochs for fast execution)
num_classes = len(train_generator.class_indices)
model = build_model(num_classes)

# Check if there are any batches available for training
if train_generator.samples > 0:
    model.fit(train_generator, epochs=1)  # Only 1 epoch for quicker training
else:
    print("No images found for training.")

# 6. Prediction on Test Data (limit the number of predictions for testing)
def preprocess_image(image_path, target_size):
    from tensorflow.keras.preprocessing import image
    img = image.load_img(image_path, target_size=target_size)
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    return img_array / 255.0

def predict_on_test_data(model, test_df, target_size=(64, 64)):
    predictions = []
    for index, row in test_df.iterrows():
        image_path = row['image_link']
        if os.path.exists(image_path):
            img = preprocess_image(image_path, target_size)
            prediction = model.predict(img)
            entity_value = np.argmax(prediction)  # Simple decoding for testing purposes
            predictions.append((row['index'], entity_value))
        else:
            predictions.append((row['index'], ""))
    return predictions

# 7. Save Predictions to CSV
def save_predictions_to_csv(predictions, output_file='output.csv'):
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["index", "prediction"])
        for index, prediction in predictions:
            writer.writerow([index, prediction])

# Make predictions and save them
if train_generator.samples > 0:
    predictions = predict_on_test_data(model, test_df)
    save_predictions_to_csv(predictions, '/content/output.csv')

# 8. Sanity Check
def run_sanity_check(output_file):
    result = subprocess.run(['python', 'src/sanity.py', output_file], capture_output=True, text=True)
    print(result.stdout)

if train_generator.samples > 0:
    run_sanity_check('/content/output.csv')

# End timing
print(f"Time taken: {time.time() - start_time} seconds")


Training Data Sample:
                        image_link  group_id  entity_name    entity_value
0  /content/images/61I9XdN6OFL.jpg    748919  item_weight      500.0 gram
1  /content/images/71gSRbyXmoL.jpg    916768  item_volume         1.0 cup
2  /content/images/61BZ4zrjZXL.jpg    459516  item_weight      0.709 gram
3  /content/images/612mrlqiI4L.jpg    459516  item_weight      0.709 gram
4  /content/images/617Tl40LOXL.jpg    731432  item_weight  1400 milligram
Test Data Sample:
   index                       image_link  group_id entity_name
0      0  /content/images/110EibNyclL.jpg    156839      height
1      1  /content/images/11TU2clswzL.jpg    792578       width
2      2  /content/images/11TU2clswzL.jpg    792578      height
3      3  /content/images/11TU2clswzL.jpg    792578       depth
4      4  /content/images/11gHj8dhhrL.jpg    792578       depth
Found 50 validated image filenames belonging to 33 classes.
Classes found: {'0.35 ounce': 0, '0.709 gram': 1, '1 kilogram': 2, '1.0 

  self._warn_if_super_not_called()


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 2s/step - accuracy: 0.1352 - loss: 4.2728
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━