In [1]:
from PIL import Image
from io import BytesIO
from collections import Counter
from transformers import CLIPProcessor, CLIPModel, SegformerImageProcessor, AutoModelForSemanticSegmentation , AutoFeatureExtractor
from urllib.parse import urlparse, urlunparse
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import os
import weaviate
import numpy as np
import json
import cv2
import base64
import re
import ast

  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [2]:
# Root folder that the later cells read the category CSVs and the
# "flipkart_images" tree from. Set the FLIPKART_DATA_DIR environment variable
# to run on another machine; the default is the original hard-coded location.
PATH = os.environ.get("FLIPKART_DATA_DIR", "D:\\Codes\\ziggy-ai\\scraping\\Flipkart")

In [11]:
# Weaviate endpoint used by the schema-reset and batch-import cells.
WEAVIATE_URL = "http://localhost:8080"
client = weaviate.Client(url=WEAVIATE_URL)

In [17]:
# Recreate the "FlipkartCleanProducts" class from the definition stored in
# flipkart_clean.json.
# WARNING: the class is deleted before it is created again, so re-running this
# cell discards whatever was previously imported under it.
# NOTE(review): assumes the JSON file defines a class with that same name - confirm.
with open("flipkart_clean.json", "r") as flipkart:
    # The context manager closes the file even if the JSON fails to parse.
    flipkart_class = json.load(flipkart)
client.schema.delete_class("FlipkartCleanProducts")
client.schema.create_class(flipkart_class)

In [None]:
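# Disabled variant of the schema-reset cell above, pointed at
# "flipkart_products.json" and the "FlipkartProducts" class.
# flipkart = open("flipkart_products.json", "r")
# flipkart_class = json.load(flipkart)
# flipkart.close()
# client.schema.delete_class("FlipkartProducts")
# client.schema.create_class(flipkart_class)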


In [3]:
# Two pretrained checkpoints are loaded once and shared by the helper functions:
# a CLIP model/processor pair for embeddings and a SegFormer model/processor
# pair for semantic segmentation.
checkpoint = "patrickjohncyh/fashion-clip"
seg_checkpoint = "mattmdjaga/segformer_b2_clothes"

processor = CLIPProcessor.from_pretrained(checkpoint)
model = CLIPModel.from_pretrained(checkpoint)

seg_model = AutoModelForSemanticSegmentation.from_pretrained(seg_checkpoint)
seg_processor = SegformerImageProcessor.from_pretrained(seg_checkpoint)



In [4]:
def getImageEmbeddingsFromPath(image_path):
	"""Return the CLIP image embedding of the image file at ``image_path``.

	Args:
		image_path: Path of an image file that PIL can open.

	Returns:
		The ``image_embeds`` tensor produced by the module-level CLIP ``model``.
		It is computed without gradient tracking.
	"""
	# The context manager releases the file handle once the processor has
	# read the pixels; the original left the file open.
	with Image.open(image_path) as image:
		# A placeholder prompt is passed alongside the image; only the image
		# branch of the output is returned.
		inputs = processor(text=["dummy"], images=image, return_tensors="pt", padding=True)
	# Inference only: do not build an autograd graph for every embedded image.
	with torch.no_grad():
		outputs = model(**inputs, return_dict=True)
	return outputs["image_embeds"]

def getImageEmbeddings(image):
	"""Return the CLIP image embedding of an already-loaded image.

	Args:
		image: Image in a form the CLIP processor accepts; callers in this
			notebook pass both PIL images and numpy arrays.

	Returns:
		The ``image_embeds`` tensor produced by the module-level CLIP ``model``.
		It is computed without gradient tracking.
	"""
	# A placeholder prompt is passed alongside the image; only the image
	# branch of the output is returned.
	inputs = processor(text=["dummy"], images=image, return_tensors="pt", padding=True)
	# Inference only: do not build an autograd graph for every embedded image.
	with torch.no_grad():
		outputs = model(**inputs, return_dict=True)
	return outputs["image_embeds"]

def applyMask(image, mask):
	"""Keep the pixels selected by ``mask`` and paint everything else white.

	Args:
		image: Three-channel image (anything ``np.array`` can convert).
		mask: Two-dimensional mask with the same height and width; entries
			equal to 0 mark the pixels to blank out.

	Returns:
		A new array holding ``image * mask`` with every masked-out position
		set to 255 in all three channels.
	"""
	pixels = np.array(image)
	# Repeat the 2-D mask along a new trailing axis so it lines up with the
	# three colour channels.
	mask_3ch = np.repeat(np.array(mask)[..., np.newaxis], 3, axis=-1)
	masked = pixels * mask_3ch
	masked[mask_3ch == 0] = 255
	return masked

def cropImage(image):
	"""Crop ``image`` to the bounding box of its largest external contour.

	The image is converted to grayscale, thresholded with inverted Otsu
	binarisation, and the contour with the largest area decides the crop.

	Args:
		image: Three-channel image with the channels in RGB order (they are
			reversed before the BGR-to-gray conversion). PIL images are
			accepted and converted to an array first.

	Returns:
		The cropped region of the input array, or the whole array when no
		contour is found (for example a completely uniform image).
	"""
	image = np.asarray(image)
	# OpenCV needs a contiguous uint8 buffer, hence the copy after the
	# channel reversal.
	bgr = image[:, :, ::-1].copy().astype('uint8')
	gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
	thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
	# findContours returns (contours, hierarchy) on OpenCV 4 and
	# (image, contours, hierarchy) on OpenCV 3; index -2 is the contour list
	# in both layouts.
	contours = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
	if len(contours) == 0:
		# Nothing to crop to; the original raised IndexError here.
		return image
	largest = max(contours, key=cv2.contourArea)
	x, y, w, h = cv2.boundingRect(largest)
	return image[y:y+h, x:x+w]

def segment(image, to_mask):
	"""Cut one white-background image per requested segmentation label.

	Args:
		image: PIL image (its ``size`` attribute is used to upsample the
			prediction back to the input resolution).
		to_mask: Iterable of integer class labels of the segmentation model.

	Returns:
		A list with one array per label, in the order of ``to_mask``. Each
		array is the result of ``applyMask`` for the pixels predicted as
		that label.
	"""
	inputs = seg_processor(images=image, return_tensors="pt")
	# Inference only: no autograd graph is needed.
	with torch.no_grad():
		outputs = seg_model(**inputs)
	logits = outputs.logits.cpu()
	upsampled_logits = nn.functional.interpolate(
		logits,
		# PIL reports (width, height); interpolate expects (height, width).
		size=image.size[::-1],
		mode="bilinear",
		align_corners=False,
	)
	# Convert once instead of once per label.
	pred_seg = upsampled_logits.argmax(dim=1)[0].numpy()
	result = []
	for label in to_mask:
		# Build the 0/1 mask with a comparison. The original two-step in-place
		# rewrite ("!= label -> 0", then "== label -> 1") turned the whole mask
		# into ones when label was 0.
		mask = (pred_seg == label).astype(pred_seg.dtype)
		result.append(applyMask(image, mask))
	return result

def _encodePngBase64(image):
	"""Serialise a PIL image as PNG and return the bytes as a base64 string."""
	buffered = BytesIO()
	image.save(buffered, format="PNG")
	return base64.b64encode(buffered.getvalue()).decode()

def segmentAndEmbed(image_path, to_mask):
	"""Embed an image and each of its requested segments.

	Args:
		image_path: Path of the image file to process.
		to_mask: Segmentation labels to cut out (see ``segment``).

	Returns:
		A dict with:
		  - ``fullImageBase64`` / ``fullImageEmbedding`` for the whole image;
		  - ``segmentBase64_<i>`` / ``segmentEmbedding_<i>`` for the i-th
		    entry of ``to_mask``.
		Base64 values are PNG-encoded strings; embeddings are whatever
		``getImageEmbeddings`` returns.
	"""
	result = {}
	# Close the file as soon as the pixels are loaded. Converting to RGB keeps
	# grayscale / palette / RGBA files from failing in applyMask, which
	# assumes exactly three channels.
	with Image.open(image_path) as source:
		image = source.convert("RGB")
	result["fullImageBase64"] = _encodePngBase64(image)
	result["fullImageEmbedding"] = getImageEmbeddings(image)
	for i, segmentArray in enumerate(segment(image, to_mask)):
		# The embedding is computed from the raw array, the stored picture
		# from its uint8 conversion - same as before.
		segmentEmbedding = getImageEmbeddings(segmentArray)
		segmentImage = Image.fromarray(np.uint8(segmentArray))
		result[f"segmentBase64_{i}"] = _encodePngBase64(segmentImage)
		result[f"segmentEmbedding_{i}"] = segmentEmbedding
	return result

In [19]:
def clean_url(url):
    """Return ``url`` without its query string.

    The URL is parsed and reassembled from its scheme, netloc, path, params
    and fragment, with the query component left empty.
    """
    parts = urlparse(url)
    without_query = (parts.scheme, parts.netloc, parts.path, parts.params, '', parts.fragment)
    return urlunparse(without_query)

def extract_first_amount(text):
    """Return the first rupee amount found in ``text`` as an int.

    Thousands separators are accepted, so "₹1,299" yields 1299 (the previous
    pattern stopped at the comma and produced 1).

    Args:
        text: String that may contain one or more "₹<digits>" amounts.

    Returns:
        The first amount as an int, or None when ``text`` has no amount.
        An amount of exactly 1 is still reported as 499, as before.
        NOTE(review): that substitution looks like a workaround for the
        comma truncation fixed here - confirm whether it is still wanted.
    """
    amount_match = re.search(r'₹(\d[\d,]*)', text)
    if amount_match is None:
        return None
    first_amount = int(amount_match.group(1).replace(',', ''))
    if first_amount == 1:
        first_amount = 499
    return first_amount

def get_rating_and_number(string1, string2):
    """Split a rating prefix off ``string1`` and read the rating count from ``string2``.

    Args:
        string1: Text whose leading part is the rating; the last
            ``len(string2)`` characters are dropped.
            NOTE(review): presumably ``string1`` ends with ``string2`` - this
            is not checked.
        string2: Text searched for "<number> ratings"; commas in the number
            are ignored.

    Returns:
        ``(rating, num_ratings)`` where ``rating`` is the remaining prefix of
        ``string1`` (a string) and ``num_ratings`` is an int, 0 when
        ``string2`` has no "<number> ratings" part.
    """
    prefix_length = len(string1) - len(string2)
    rating = string1[:prefix_length]
    found = re.search(r'([\d,.]+) ratings', string2)
    num_ratings = int(found.group(1).replace(',', '')) if found else 0
    return rating, num_ratings

def get_product_details(dict_string):
    """Pick the known specification fields out of a dict written as text.

    Args:
        dict_string: Python-literal representation of a dict, parsed with
            ``ast.literal_eval``.

    Returns:
        A dict with exactly the keys 'Color', 'Pattern', 'Fabric', 'Fit' and
        'Type'; fields missing from the input map to None.
    """
    specs = ast.literal_eval(dict_string)
    details = {}
    for field in ('Color', 'Pattern', 'Fabric', 'Fit', 'Type'):
        details[field] = specs.get(field)
    return details


In [None]:
def edit_file_names(folder_path):
    """Rename every ``<prefix>_<anything>.jpg`` in ``folder_path`` to ``<prefix>.jpg``.

    ``<prefix>`` is the part of the file name before the first underscore.
    Files that already have the target name are left alone, so the function
    can be run repeatedly (the original turned "12.jpg" into "12.jpg.jpg" on
    a second run). A file is skipped, with a message, when the target name
    is already taken or the rename fails.

    Args:
        folder_path: Directory whose ".jpg" files are renamed in place.
    """
    suffix = ".jpg"
    for filename in os.listdir(folder_path):
        if not filename.endswith(suffix):
            continue
        # Take the prefix from the name without its extension so names that
        # contain no underscore map onto themselves.
        stem = filename[:-len(suffix)]
        new_filename = stem.split("_")[0] + suffix
        if new_filename == filename:
            continue
        original_path = os.path.join(folder_path, filename)
        new_path = os.path.join(folder_path, new_filename)
        if os.path.exists(new_path):
            # Never overwrite an existing file with a second one that shares
            # the same prefix.
            print(f"Skipping {filename}: {new_filename} already exists")
            continue
        try:
            os.rename(original_path, new_path)
        except OSError as e:
            # Best effort as before, but report the failure instead of hiding it.
            print(f"Could not rename {filename}: {e}")

In [None]:
# Shorten the image file names of this category folder to "<prefix>.jpg".
# os.path.join builds the same path as before on Windows and keeps the cell
# usable on other platforms.
edit_file_names(os.path.join(PATH, "flipkart_images", "Formal Pants Weaviate"))

In [30]:
# Product table of the category being imported. The later loops look up an
# image named "<row index>.jpg" for every row of this frame.
# os.path.join builds the same path as before on Windows and keeps the cell
# usable on other platforms.
df = pd.read_csv(os.path.join(PATH, "Girls Bottoms.csv"))

In [29]:
# Import every row of `df` that has a matching image into the
# "FlipkartCleanProducts" class.
# The category name selects the image folder and is stored on every object,
# so it is defined once here instead of being repeated in string literals.
CATEGORY = "Girls Bottoms"
IMAGE_DIR = os.path.join(PATH, "flipkart_images", f"{CATEGORY} Weaviate")
# Segmentation label(s) to cut out of each image.
# NOTE(review): label 6 is presumably "Pants" in the segformer_b2_clothes
# label map - confirm against the model card before reusing for other categories.
SEGMENT_LABELS = [6]

with client.batch(batch_size=100, num_workers=2) as batch:
    for index, row in df.iterrows():
        image_path = os.path.join(IMAGE_DIR, f"{index}.jpg")
        if not os.path.exists(image_path):
            continue
        try:
            ratings, num_ratings = get_rating_and_number(row["Rating"], row["Reviews"])
            specification_dict = get_product_details(row["Specifications"])
            output = segmentAndEmbed(image_path, SEGMENT_LABELS)
            flipkart_obj = {
                # The stored picture is the segmented cut-out while the vector
                # below comes from the full image.
                # NOTE(review): confirm that this combination is intended.
                "Image" : output["segmentBase64_0"],
                # "Image" : output["fullImageBase64"],
                "URL" : clean_url(row["URL"]),
                "Category" : CATEGORY,
                "Brand" : row["Brand"],
                "Product" : row["Name"],
                "Price" : extract_first_amount(row["Price"]),
                "Rating" : ratings,
                "NumberRatings" : num_ratings,
                "Colour" : specification_dict["Color"],
                "Pattern" : specification_dict["Pattern"],
                "Fabric" : specification_dict["Fabric"],
                "Fit" : specification_dict["Fit"],
                "Type" : specification_dict["Type"],
                "Specification" : row["Specifications"]
            }
            batch.add_data_object(flipkart_obj, "FlipkartCleanProducts", vector=output["fullImageEmbedding"])
            print(f"Added {index} to Weaviate")
        except Exception as e:
            # Best effort: a row that cannot be parsed or embedded is reported
            # and skipped so the rest of the category still gets imported.
            print(f"Error while adding {index} to Weaviate: {e}")

Added 69 to Weaviate
Added 70 to Weaviate
Added 74 to Weaviate
Added 75 to Weaviate
Added 80 to Weaviate
Added 81 to Weaviate


In [32]:
# Save the segmented cut-out of every available image of the category as
# "<row index>.jpg" in the "<category> Segment" folder.
CATEGORY = "Girls Bottoms"
IMAGE_DIR = os.path.join(PATH, "flipkart_images", f"{CATEGORY} Weaviate")
SEGMENT_DIR = os.path.join(PATH, "flipkart_images", f"{CATEGORY} Segment")
# Create the output folder up front instead of failing on the first save.
os.makedirs(SEGMENT_DIR, exist_ok=True)

for index in df.index:
    source_path = os.path.join(IMAGE_DIR, f"{index}.jpg")
    if not os.path.exists(source_path):
        continue
    output = segmentAndEmbed(source_path, [6])
    # The value is plain base64 of a PNG (no "data:...," prefix), so it can be
    # decoded directly; base64 text never contains a comma, which made the
    # earlier split on "," a no-op.
    segment_png = base64.b64decode(output["segmentBase64_0"])
    segment_image = Image.open(BytesIO(segment_png))
    segment_image.save(os.path.join(SEGMENT_DIR, f"{index}.jpg"))
    
