In [None]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_amazon_data():
    base_url = 'https://www.amazon.com'
    clothing_items = []

    # Iterate over multiple pages of clothing items on Amazon
    for page in range(1, 6):  # Adjust the range based on the number of pages you want to scrape
        url = f'{base_url}/s?k=clothing&page={page}'  # Adjust the URL structure based on the target website's pagination
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        item_tags = soup.find_all('div', {'data-component-type': 's-search-result'})  # Adjust the tag and attributes based on the website's structure

        for item in item_tags:
            item_description = item.find('span', class_='a-size-base-plus').text.strip()
            item_url = base_url + item.find('a', class_='a-link-normal', href=True)['href']

            clothing_items.append({
                'description': item_description,
                'url': item_url
            })

    return clothing_items

def scrape_flipkart_data():
    base_url = 'https://www.flipkart.com'
    clothing_items = []

    # Iterate over multiple pages of clothing items on Flipkart
    for page in range(1, 6):  # Adjust the range based on the number of pages you want to scrape
        url = f'{base_url}/search?q=clothing&page={page}'  # Adjust the URL structure based on the target website's pagination
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        item_tags = soup.find_all('a', {'class': '_1fQZEK'})  # Adjust the tag and attributes based on the website's structure

        for item in item_tags:
            item_description = item.find('a', {'class': 'IRpwTa'}).text.strip()
            item_url = base_url + item['href']

            clothing_items.append({
                'description': item_description,
                'url': item_url
            })

    return clothing_items

# Scrape the data from Amazon and Flipkart
amazon_data = scrape_amazon_data()
flipkart_data = scrape_flipkart_data()

# Combine the data from both sources
dataset = amazon_data + flipkart_data

# Save the data to a CSV file
csv_file = 'clothing_dataset.csv'
fieldnames = ['description', 'url']

with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(dataset)

print('Scraping completed and dataset saved to', csv_file)



Scraping completed and dataset saved to clothing_dataset.csv


In [None]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_clothing_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    clothing_items = []
    item_tags = soup.find_all('a', class_='styles__link--3QJ5N')  # Adjust the tag and class based on the website's structure
    for item in item_tags:
        item_description = item.find('span', class_='styles__box--2Ufmy styles__text--23E5U styles__display6--3wsBG styles__nowrap--33UtL styles__display-block--3kWC4').text.strip()
        clothing_items.append({
            'description': item_description,
            'url': item['href']
        })

    return clothing_items

def scrape_multiple_websites(websites):
    all_clothing_items = []

    for website in websites:
        for page in range(1, 6):  # Adjust the range based on the number of pages you want to scrape
            url = website + '?page=' + str(page)  # Adjust the URL structure based on the target website's pagination
            clothing_items = scrape_clothing_data(url)
            print(clothing_items)
            all_clothing_items.extend(clothing_items)

    return all_clothing_items

# List of e-commerce websites to scrape
websites = [
    
    'https://www.redbubble.com/shop/mens-clothing',
    #'https://www.bewakoof.com/women-joggers-sweatpants'
    #'https://www.redbubble.com/shop'
    #'https://www.redbubble.com/g/clothing'
    
]

# Scrape the data
dataset = scrape_multiple_websites(websites)
print(dataset)
print("Hello World")

# Save the data to a CSV file
csv_file = 'clothing_dataset.csv'
fieldnames = ['description', 'url']

with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(dataset)

print('Scraping completed and dataset saved to', csv_file)


[{'description': 'Ghost Of Disapproval Classic T-Shirt', 'url': 'https://www.redbubble.com/i/t-shirt/Ghost-Of-Disapproval-by-obinsun/17844852.IJ6L0'}, {'description': 'Street Cats Essential T-Shirt', 'url': 'https://www.redbubble.com/i/t-shirt/Street-Cats-by-wytrab8/37992310.G22WK'}, {'description': 'Stay Positive Classic T-Shirt', 'url': 'https://www.redbubble.com/i/t-shirt/Stay-Positive-by-stevenrhodes/79299073.7H7A9'}, {'description': "Sorry I'm late. I didn't want to come. Classic T-Shirt", 'url': 'https://www.redbubble.com/i/t-shirt/Sorry-I-m-late-I-didn-t-want-to-come-by-chestify/29664349.IJ6L0'}, {'description': 'Visit Arrakis - Vintage Distressed Surf - Dune - Sci Fi Classic T-Shirt', 'url': 'https://www.redbubble.com/i/t-shirt/Visit-Arrakis-Vintage-Distressed-Surf-Dune-Sci-Fi-by-Nemons/39719380.S7RYU'}, {'description': 'The Bodacious Period Essential T-Shirt', 'url': 'https://www.redbubble.com/i/t-shirt/The-Bodacious-Period-by-wytrab8/26255784.G22WK'}, {'description': 'Many La

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_flipkart(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    items = soup.find_all('div', {'class': '_1xHGtK _373qXS'}) 
    data = []
    for item in items:
        description = item.find('a', {'class': 'IRpwTa'})['title']
        product_url = 'https://www.flipkart.com' + item.find('a', {'class': '_2UzuFa'})['href']
        data.append({'description': description, 'url': product_url})

    print(data)

    return data

def save_to_csv(data, filename):
    keys = data[0].keys() if data else []
    
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, keys)
        writer.writeheader()
        writer.writerows(data)

def scrape_e_commerce_websites():
    e_commerce_websites = [
        {
            'name': 'Flipkart',
            'url': 'https://www.flipkart.com/search?q=clothing',
            'scrape_function': scrape_flipkart
        },
        # Add more e-commerce websites here
    ]

    all_data = []
    for website in e_commerce_websites:
        print(f"Scraping {website['name']}...")
        data = website['scrape_function'](website['url'])
        all_data.extend(data)

    save_to_csv(all_data, 'clothing_dataset.csv')
    print("Scraping completed successfully.")

if __name__ == '__main__':
    scrape_e_commerce_websites()


Scraping Flipkart...
[{'description': 'Women Regular Fit Black Cotton Blend Trousers', 'url': 'https://www.flipkart.com/lee-tex-regular-fit-women-black-trousers/p/itme21873a951c47?pid=TROGHKK6SJVGGNQB&lid=LSTTROGHKK6SJVGGNQBAGCVBI&marketplace=FLIPKART&q=clothing&store=clo&srno=s_1_1&otracker=search&fm=organic&iid=en_fqGbglqncEMinJxjhiEPidFQLkwV7vC89OCS1SIG9rSvb1hMl1DDoN2FWyhnn%2By1MlNgG7TJlusF%2FoQP0bWTRA%3D%3D&ppt=None&ppn=None&ssid=yrkrmveq340000001684690400939&qH=50cf19912960f654'}, {'description': 'Pack of 2 Men Solid Black, Grey Track Pants', 'url': 'https://www.flipkart.com/vebnor-solid-men-black-grey-track-pants/p/itmea1098f6f1b58?pid=TKPGCG42S6GU8PB4&lid=LSTTKPGCG42S6GU8PB4XWPDGC&marketplace=FLIPKART&q=clothing&store=clo&srno=s_1_2&otracker=search&fm=organic&iid=en_fqGbglqncEMinJxjhiEPidFQLkwV7vC89OCS1SIG9rReAWSS5cSgV6yzR3BD27deWOGuQ8i0WY8AhaK5FFW0fA%3D%3D&ppt=None&ppn=None&ssid=yrkrmveq340000001684690400939&qH=50cf19912960f654'}, {'description': 'Unstitched Polyester Shirt Fab

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_clothing_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    clothing_items = []
    item_tags = soup.find_all('a', class_='styles__link--3QJ5N')  # Adjust the tag and class based on the website's structure
    for item in item_tags:
        item_description = item.find('span', class_='styles__box--2Ufmy styles__text--23E5U styles__display6--3wsBG styles__nowrap--33UtL styles__display-block--3kWC4').text.strip()
        clothing_items.append({
            'description': item_description,
            'url': item['href']
        })

    return clothing_items

def scrape_multiple_websites(websites):
    all_clothing_items = []

    for website in websites:
        for page in range(1, 6):  # Adjust the range based on the number of pages you want to scrape
            url = website + '?page=' + str(page)  # Adjust the URL structure based on the target website's pagination
            clothing_items = scrape_clothing_data(url)
            print(clothing_items)
            all_clothing_items.extend(clothing_items)

    return all_clothing_items

# List of e-commerce websites to scrape
websites = [
    
    'https://www.redbubble.com/shop/womens-clothing',
    #'https://www.bewakoof.com/women-joggers-sweatpants'
    #'https://www.redbubble.com/shop'
    #'https://www.redbubble.com/g/clothing'
    
]

# Scrape the data
dataset = scrape_multiple_websites(websites)
print(dataset)
print("Hello World")

# Save the data to a CSV file
csv_file = 'clothing_dataset.csv'
fieldnames = ['description', 'url']

with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(dataset)

print('Scraping completed and dataset saved to', csv_file)


[{'description': 'Ghost Of Disapproval Classic T-Shirt', 'url': 'https://www.redbubble.com/i/t-shirt/Ghost-Of-Disapproval-by-obinsun/17844852.IJ6L0.XYZ'}, {'description': 'Cute Galaxy Girl and Forest Wood Cabin Classic T-Shirt', 'url': 'https://www.redbubble.com/i/t-shirt/Cute-Galaxy-Girl-and-Forest-Wood-Cabin-by-SpringNguyen/145327459.QUQES.XYZ'}, {'description': 'Street Cats Essential T-Shirt', 'url': 'https://www.redbubble.com/i/t-shirt/Street-Cats-by-wytrab8/37992310.G22WK.XYZ'}, {'description': 'Stay Positive Classic T-Shirt', 'url': 'https://www.redbubble.com/i/t-shirt/Stay-Positive-by-stevenrhodes/79299073.7H7A9.XYZ'}, {'description': 'mahout Classic T-Shirt', 'url': 'https://www.redbubble.com/i/t-shirt/mahout-by-Kayordesign/145511677.WFLAH.XYZ'}, {'description': "Sorry I'm late. I didn't want to come. Classic T-Shirt", 'url': 'https://www.redbubble.com/i/t-shirt/Sorry-I-m-late-I-didn-t-want-to-come-by-chestify/29664349.IJ6L0.XYZ'}, {'description': 'Visit Arrakis - Vintage Distr

In [1]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
df = pd.read_csv('Combined_dataset.csv')

In [3]:
def clean_text(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Convert to lowercase
    text = text.lower()
    return text

def normalize_text(text):
    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stopwords.words('english')]
    # Lemmatization using WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Join the words back into a string
    normalized_text = ' '.join(words)
    return normalized_text

def preprocess_text(text):
    cleaned_text = clean_text(text)
    normalized_text = normalize_text(cleaned_text)
    return normalized_text


In [4]:
nltk.download('stopwords')
nltk.download('wordnet')

df['description'] = df['description'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
df.to_csv('normalised_dataset.csv', index=False)

In [7]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [8]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
import pandas as pd

# Assuming you have a dataset in a pandas DataFrame with columns 'description' and 'url'
data = pd.read_csv('normalised_dataset.csv')

# Extract the descriptions and URLs
descriptions = data['description'].tolist()
urls = data['url'].tolist()

# Load the pre-trained transformer model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertModel.from_pretrained(model_name, from_pt=True)

# Tokenize the descriptions
tokenized_inputs = tokenizer(str(descriptions), padding=True, truncation=True, return_tensors='tf')

# Generate embeddings using the transformer model
outputs = model(**tokenized_inputs)

# Access the embeddings
embeddings = outputs.last_hidden_state

# Print the extracted features
for doc_index, doc_embedding in enumerate(embeddings):
    print(f"Features for Document {doc_index + 1} (URL: {urls[doc_index]}):")
    print(doc_embedding)
    print()


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

Features for Document 1 (URL: https://www.redbubble.com/i/t-shirt/Ghost-Of-Disapproval-by-obinsun/17844852.IJ6L0):
tf.Tensor(
[[ 0.16651796  0.1343925   0.3281678  ...  0.53296465  0.5528433
   0.2956033 ]
 [ 1.2062598   0.58638656 -0.5141406  ...  0.7866383   0.4551862
  -0.43499044]
 [-0.4346164   0.37482083  0.26867533 ...  0.76647     0.43390837
  -0.24965292]
 ...
 [ 0.9243228   1.0141428   0.5911616  ...  0.27782536 -0.24440522
  -1.5358912 ]
 [ 0.07750858  0.14886549  0.254744   ... -0.12839878 -0.04510181
  -0.1685324 ]
 [ 0.7484343   0.3540758  -0.24561197 ...  0.41793588 -0.23155019
   0.01935341]], shape=(512, 768), dtype=float32)



In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Assuming you have a dataset in a pandas DataFrame with columns 'description' and 'url'
data = pd.read_csv('normalised_dataset.csv')

# Fill missing values with an empty string
data['description'] = data['description'].fillna('')

# Extract the descriptions and URLs from the dataset
database_texts = data['description'].tolist()
database_urls = data['url'].tolist()

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the database texts
database_features = vectorizer.fit_transform(database_texts)

# Example input text
input_text = "This is a clothing item."

# Transform the input text
input_feature = vectorizer.transform([input_text])

# Compute cosine similarity between input text and database texts
similarity_scores = cosine_similarity(input_feature, database_features)

# Get the index of the most similar text in the database
most_similar_index = np.argmax(similarity_scores)

# Get the corresponding text and URL from the database
most_similar_text = database_texts[most_similar_index]
most_similar_url = database_urls[most_similar_index]

# Print the similarity score, the most similar text, and the corresponding URL
print("Similarity Score:", similarity_scores[0, most_similar_index])
print("Most Similar Text:", most_similar_text)
print("URL:", most_similar_url)


Similarity Score: 0.8933837200165705
Most Similar Text: shirt clothing essential shirt
URL: https://www.redbubble.com/i/t-shirt/Shirt-Just-Some-Clothing-by-designeclipse/32177401.FB110


Top-1: https://www.redbubble.com/i/t-shirt/Shirt-Just-Some-Clothing-by-designeclipse/32177401.FB110.XYZ
Top-2: https://www.redbubble.com/i/t-shirt/Shirt-Just-Some-Clothing-by-designeclipse/32177401.FB110
Top-3: nan
Top-4: https://www.redbubble.com/i/t-shirt/This-is-Why-We-Can-t-Have-Nice-Things-by-obinsun/21875858.IJ6L0
Top-5: https://www.redbubble.com/i/sweatshirt/Save-The-Clock-Tower-by-DeepFriedArt/22058284.0LCRC


In [10]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def find_similar_items(input_text, database_texts, database_urls, top_n=30):
    # Initialize the TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Fit and transform the database texts
    database_features = vectorizer.fit_transform(database_texts)

    # Transform the input text
    input_feature = vectorizer.transform([input_text])

    # Compute cosine similarity between input text and database texts
    similarity_scores = cosine_similarity(input_feature, database_features)

    # Get the indices of the top-N most similar items
    top_indices = np.argsort(similarity_scores)[0, ::-1][:top_n]

    # Get the URLs of the top-N most similar items
    top_urls = [database_urls[idx] for idx in top_indices]

    return top_urls

# Assuming you have a dataset in a pandas DataFrame with columns 'description' and 'url'
data = pd.read_csv('normalised_dataset.csv')

# Fill missing values with an empty string
data['description'] = data['description'].fillna('')

# Extract the descriptions and URLs from the dataset
database_texts = data['description'].tolist()
database_urls = data['url'].tolist()

# Example input text
input_text = "This is a clothing item."

# Find the top-700 most similar items
top_urls = find_similar_items(input_text, database_texts, database_urls, top_n=700)

# Print the URLs of the top-700 most similar items
for i, url in enumerate(top_urls, 1):
    print(f"Top-{i}: {url}")


Top-1: https://www.redbubble.com/i/t-shirt/Shirt-Just-Some-Clothing-by-designeclipse/32177401.FB110.XYZ
Top-2: https://www.redbubble.com/i/t-shirt/Shirt-Just-Some-Clothing-by-designeclipse/32177401.FB110
Top-3: nan
Top-4: https://www.redbubble.com/i/t-shirt/This-is-Why-We-Can-t-Have-Nice-Things-by-obinsun/21875858.IJ6L0
Top-5: https://www.redbubble.com/i/sweatshirt/Save-The-Clock-Tower-by-DeepFriedArt/22058284.0LCRC
Top-6: https://www.redbubble.com/i/hoodie/Space-Heartbeat-by-carbine/28995330.O6XP1
Top-7: https://www.redbubble.com/i/t-shirt/I-do-not-think-therefore-i-do-not-am-possum-by-BattleGoat/114300605.IJ6L0
Top-8: https://www.redbubble.com/i/t-shirt/Psychedelic-Hippy-Retro-Peace-Art-by-Alondra/22474765.VL7OD
Top-9: https://www.redbubble.com/i/t-shirt/Skateboarding-Frog-by-SaradaBoru/24705507.QUQES
Top-10: https://www.redbubble.com/i/t-shirt/Blue-Sun-Logo-Firefly-by-RennHarper/11547543.UHLBD
Top-11: https://www.redbubble.com/i/t-shirt/SR388-by-tchuk/11328416.9V5H1
Top-12: https://