# Set up the dataset and images

Run the first code cell, then skip ahead to the next markdown cell.

In [None]:
# Read Luxury_Products_Apparel_Data.csv
import os
import csv

# Location of the raw dataset, relative to the current working directory.
path = os.path.join(".", "Luxury_Products_Apparel_Data.csv")
print(path)

header: list[str] = []
array: list[list[str]] = []

# newline="" hands line-ending handling to the csv module, so quoted cells
# that contain embedded line breaks are parsed as a single field.
with open(path, "r", encoding="UTF-8", newline="") as csvfile:
    csvreader = csv.reader(csvfile, delimiter=",")
    header = next(csvreader)
    # Overwrite the first header cell so the leading column is called "id".
    header[0] = "id"
    # Everything after the header is a data row.
    array.extend(csvreader)

# Sort the rows numerically by their first column.
array.sort(key=lambda row: int(row[0]))

In [None]:
# Report how many rows there are before cleaning.
print("Before: ", len(array))

# Drop the third column from the header.
header = [title for position, title in enumerate(header) if position != 2]

# Clean the rows in a single pass.
cleaned_rows = []
for raw_row in array:
    # Discard rows with an empty string in any column; this check runs on the
    # full row, i.e. before the third column is dropped.
    if not all(raw_row):
        continue
    # Drop the third column, then strip every '"' from the remaining cells.
    kept_cells = raw_row[:2] + raw_row[3:]
    cleaned_rows.append([cell.replace('"', '') for cell in kept_cells])
array = cleaned_rows

# Report how many rows are left.
print("After: ", len(array))

In [None]:
# Show header
print(header)

# Show the first 5 rows. Slicing copes with fewer than 5 rows, where indexing
# array[i] for i in range(5) would raise IndexError.
for row in array[:5]:
    print(row)

In [None]:
# Filter similarity
from difflib import SequenceMatcher


# Similarity threshold used by the duplicate filter.
percent = 0.4


def similarity(a: str, b: str) -> float:
    """Return the difflib ``SequenceMatcher`` ratio between ``a`` and ``b``.

    The result is a float from 0.0 (nothing in common) to 1.0 (identical).
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()


# Remove rows whose fourth column is too similar to that of an earlier row.
# Every remaining pair of rows is compared, so this is very slow (the author
# reported 29m 32s for the original version).
i = 0
j = 0
while i < len(array):
    j = i + 1
    while j < len(array):
        # Compute the score once per pair; it is the expensive part of the
        # loop and was previously recomputed just to print it.
        score = similarity(array[i][3], array[j][3])
        if score > percent:
            print(array[i][3], "\n", array[j][3], "\n", score)
            print("=> Remove: ", array[j][3])
            print()
            # Do not advance j: the next row has shifted into position j.
            array.pop(j)
        else:
            j += 1
    i += 1

# print count of rows
print("After: ", len(array))

In [None]:
# Build array2: each row of array with an image URL prepended as a new first
# column. The URL is the base below followed by the row's second column.
# The size segment must use an ASCII "x" ("900x900"); the previous literal
# contained the multiplication sign "×" (U+00D7).
source = "https://source.unsplash.com/random/900x900/?"
array2 = [[source + row[1]] + row for row in array]
print(len(array2))
for url_row in array2:
    print(url_row)

In [None]:
# Checkpoint: export array2 to CSV so the time-consuming similarity filter
# does not have to be run again.
path2 = os.path.join(".", "modified_Luxury_Products_Apparel_Data.csv")
# NOTE(review): the export itself is commented out, presumably so an existing
# checkpoint file is not overwritten; un-comment it to write the file — confirm.
# with open(path2, 'w', newline='', encoding="UTF-8") as csvfile:
#     csvwriter = csv.writer(csvfile)
#     csvwriter.writerow(["url"] + header)
#     for row in array2:
#         csvwriter.writerow(row)

# This message is printed even while the export above is commented out.
print("Exported to ", path2)

# Ok stop scrolling.

Run the blocks below.

In [None]:
# Load the checkpoint CSV into array2 (header row goes into header).
# path2 is defined here as well, so this cell also works when the cell that
# originally defined it was skipped.
path2 = os.path.join(".", "modified_Luxury_Products_Apparel_Data.csv")
array2 = []
# newline="" hands line-ending handling to the csv module, so quoted cells
# that contain embedded line breaks are parsed as a single field.
with open(path2, "r", encoding="UTF-8", newline="") as csvfile:
    csvreader = csv.reader(csvfile, delimiter=",")
    header = next(csvreader)
    array2.extend(csvreader)

In [None]:
# Create the "images" folder. The images are downloaded from each row's URL
# and saved there as <id>.jpg.
# import requests
# import shutil

os.makedirs("images", exist_ok=True)

# Sequential download, one request at a time, kept commented out for
# reference. The author preferred it to avoid being banned by the image host.
# for row in array2:
#     url = row[0]
#     response = requests.get(url, stream=True)
#     with open(f"images" + "/" + row[1] + ".jpg", 'wb') as out_file:
#         shutil.copyfileobj(response.raw, out_file)
#     print("Downloaded: ", row[1])

# async multi-threading download version (not recommended)
import asyncio
import aiohttp

# Allow at most 50 downloads to be in flight at the same time.
semaphore = asyncio.Semaphore(50)


async def download_image(session, url: str, id: str):
    """Download ``url`` and save the response body as ``images/<id>.jpg``.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding the response (an ``aiohttp.ClientSession`` at the call
            site in this file).
        url: Address of the image to fetch.
        id: Identifier used as the file name. The name shadows the builtin
            but is kept so existing callers are unaffected.

    Responses with a status other than 200 are reported and skipped, so an
    error page is never written to disk as a ``.jpg`` file.
    """
    async with semaphore:
        async with session.get(url) as response:
            if response.status != 200:
                print("Failed: ", id, response.status)
                return
            # Read the whole body before touching the file system, so a
            # failed transfer does not leave an empty file behind.
            payload = await response.read()
        with open(f"images/{id}.jpg", "wb") as out_file:
            out_file.write(payload)
        print("Downloaded: ", id)


async def main():
    """Download the image of every row in ``array2`` concurrently.

    A single ``aiohttp.ClientSession`` is shared by all downloads.
    """
    async with aiohttp.ClientSession() as http_session:
        # One coroutine per row: row[0] is passed as the URL and row[1] as
        # the id.
        pending = []
        for record in array2:
            pending.append(download_image(http_session, record[0], record[1]))

        # Run all downloads concurrently and wait for them to finish.
        await asyncio.gather(*pending)


# Run the main function.
# NOTE(review): a bare top-level `await` presumably relies on a notebook /
# IPython kernel that already runs an event loop — confirm before running
# this as a plain Python script.
await main()

In [None]:
# Create modified_images folder, crop image to square in the center both horizontally and vertically
from PIL import Image

def _center_square_box(width: int, height: int) -> tuple[int, int, int, int]:
    """Return the (left, top, right, bottom) box of the centered square crop.

    Integer arithmetic keeps the box exactly ``min(width, height)`` pixels on
    each side. With float coordinates each edge is rounded on its own, which
    can make the crop one pixel wider than it is tall (or vice versa) when
    the difference between width and height is odd.
    """
    side = min(width, height)
    left = (width - side) // 2
    top = (height - side) // 2
    return left, top, left + side, top + side


os.makedirs("modified_images", exist_ok=True)

for row in array2:
    # The context manager closes each source file once its crop is saved.
    with Image.open(f"images/{row[1]}.jpg") as img:
        box = _center_square_box(*img.size)
        img.crop(box).save(f"modified_images/{row[1]}.jpg")

In [None]:
# write as json
import json
import random

# Build product_data.json: a JSON array with one object per row of array2:
# {
#     name         -> row[3]
#     description  -> row[4]
#     price        -> the last entry of priceHistory
#     image        -> "/static/img/<row[1]>.jpg"
#     priceHistory -> 10 to 100 random integers, each from 1 to 100
# }
# The category (row[2]) is not included in this file.

json_array = []

for row in array2:
    # Draw the length first, then the values.
    history_length = random.randint(10, 100)
    price_history = [random.randint(1, 100) for _ in range(history_length)]
    product = {
        "name": row[3],
        "description": row[4],
        "price": price_history[-1],
        "image": f"/static/img/{row[1]}.jpg",
        "priceHistory": price_history,
    }
    json_array.append(product)

with open("product_data.json", "w") as product_file:
    json.dump(json_array, product_file)

In [None]:
# Build category_data.json: a JSON array of {"name": <row[2]>} objects, one
# per distinct value of row[2].

json_category_set = {row[2] for row in array2}

# A set of strings can iterate in a different order on every interpreter
# run; sorting makes the generated file reproducible.
json_category = [{"name": category} for category in sorted(json_category_set)]

with open("category_data.json", "w") as f:
    json.dump(json_category, f)

In [None]:
# Build provider_data.json: a JSON array of {"name": <provider>} objects, one
# per entry of the fixed provider list below.
provider = [
    "Gucci", "Chanel", "Louis Vuitton", "Hermes", "Rolex",
    "Cartier", "Prada", "Burberry", "Tiffany & Co.", "Dior",
]

json_provider = []
for brand in provider:
    json_provider.append({"name": brand})

# Serialize first, then write the text in one go.
provider_text = json.dumps(json_provider)
with open("provider_data.json", "w") as provider_file:
    provider_file.write(provider_text)

In [None]:
# Build map_data.json: a JSON array that pairs row[1] with row[2] for every
# row of array2:
# {
#     id: row[1]
#     category: row[2]
# }

json_map = []
for row in array2:
    json_map.append({"id": row[1], "category": row[2]})

with open("map_data.json", "w") as map_file:
    json.dump(json_map, map_file)

# Now copy the images to nginx's static folder and you are good to go.

Bye bye