In [1]:
import pandas as pd
import nltk
import re

In [2]:
# Load the raw product listings from a CSV file in the working directory
df = pd.read_csv('clothing_data.csv')

In [3]:
# (rows, columns) of the freshly loaded frame
df.shape

(26960, 4)

In [4]:
# Preview 10 random rows; the fixed random_state makes the preview identical
# on every Restart & Run All instead of changing with each execution
df.sample(10, random_state=42)

Unnamed: 0,brand,description,url,img
11654,Peter England,Men Jeans,https://www.amazon.in/Peter-England-Mens-Jeans...,https://m.media-amazon.com/images/I/617sCUR90o...
12785,Puma,Mens Jacket,https://www.amazon.in/Puma-Mens-A-Line-Coat-84...,https://m.media-amazon.com/images/I/412O5U6Tja...
20244,SATYAM KRAFT,Women's Elastic Stretchy Handmade Resin Round ...,https://www.amazon.in/SATYAM-KRAFT-Elastic-kan...,https://m.media-amazon.com/images/I/41eZTfANSN...
19867,Pragati Trends,Pragati Trends Women's Wrinkle Resistant Poly ...,https://www.amazon.in/Womens-Wrinkle-Resistant...,https://m.media-amazon.com/images/I/61jK0j4x6b...
19936,Miss Mayra,Ankle Leggings Bio Wash 200GSM Ultra Premium 4...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/31tHPfAE3p...
10138,REYA,Men's Regular Fit T-Shirt,https://www.amazon.in/REYA-Regular-T-Shirt-TSR...,https://m.media-amazon.com/images/I/61d2QLrSZ2...
5560,Cute Fellow,Embroidered Semi Stitched Lehenga Choli,https://www.flipkart.com/cute-fellow-embroider...,https://rukminim1.flixcart.com/image/612/612/x...
5424,BHAVNATH FASHION,Embroidered Semi Stitched Lehenga Choli,https://www.flipkart.com/bhavnath-fashion-embr...,https://rukminim1.flixcart.com/image/612/612/k...
21010,LEOTUDE,Regular Fit Cotton-Blend Half Sleeve Men's T-S...,https://www.amazon.in/LEOTUDE-Cotton-Sleeve-Bo...,https://m.media-amazon.com/images/I/71foOIdlxc...
701,CAMPUS SUTRA,Men Regular Fit Checkered Spread Collar Casual...,https://www.flipkart.com/campus-sutra-men-chec...,https://rukminim1.flixcart.com/image/612/612/l...


In [5]:
# Collapse rows that share the same 'description' text so each distinct
# description appears once (drop_duplicates keeps the first occurrence by default).
# Note: the original index labels are kept, so they are no longer contiguous.
df = df.drop_duplicates(subset='description')

In [6]:
# Keep only the columns the recommender needs. The explicit .copy() makes
# clothing_df an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning (it would otherwise be a slice of df).
clothing_df = df[['description','url','img']].copy()
# Separate snapshot of the untouched three-column frame
clothing_df_copy = clothing_df.copy()

In [7]:
# (rows, columns) of the de-duplicated, three-column frame
clothing_df_copy.shape

(1944, 3)

In [8]:
from nltk.corpus import stopwords    # Corpus reader that provides per-language stopword lists
from nltk.stem import WordNetLemmatizer   # WordNet-based lemmatizer class
nltk.download('stopwords')    # Fetch the stopwords corpus into the local nltk_data directory
nltk.download('wordnet')    # Fetch the WordNet corpus, which the lemmatizer relies on
lemmatizer = WordNetLemmatizer()    # Shared lemmatizer instance used by the text-cleaning code below

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...


In [9]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the list only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text:str):
    """Normalise a product description for bag-of-words comparison.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    split on whitespace, drop English stopwords, lemmatize each remaining
    word with the module-level `lemmatizer`, and re-join with single spaces.

    Args:
        text: Raw description. Anything that is not a `str` (e.g. None or the
            float NaN pandas uses for missing values) is treated as empty.

    Returns:
        The cleaned, space-separated string; '' when nothing survives cleaning.
    """
    # Missing values would otherwise crash inside re.sub with a TypeError
    if not isinstance(text, str):
        return ''

    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()

    # The stopword list used to be rebuilt for every single word; fetch the
    # cached set once and use O(1) membership tests instead.
    stop_words = _english_stopwords()
    return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)

In [10]:
# Run 'clean_text' on every value of the 'description' column
# (the function is passed to .apply directly; no lambda wrapper is needed)
# and store the result in a new 'cleaned_description' column
clothing_df['cleaned_description'] = clothing_df['description'].apply(clean_text)

In [11]:
# Inspect the first five rows, including the new cleaned_description column
clothing_df.head(5)

Unnamed: 0,description,url,img,cleaned_description
0,Men Regular Fit Printed Casual Shirt,https://www.flipkart.com/solbiza-men-printed-c...,https://rukminim1.flixcart.com/image/612/612/x...,men regular fit printed casual shirt
1,Men Regular Fit Checkered Spread Collar Casual...,https://www.flipkart.com/roadster-men-checkere...,https://rukminim1.flixcart.com/image/612/612/k...,men regular fit checkered spread collar casual...
2,Men Solid Polo Neck Green T-Shirt,https://www.flipkart.com/3bros-solid-men-polo-...,https://rukminim1.flixcart.com/image/612/612/x...,men solid polo neck green shirt
3,"Pack of 2 Men Striped Round Neck Dark Blue, Re...",https://www.flipkart.com/blive-striped-men-rou...,https://rukminim1.flixcart.com/image/612/612/x...,pack men striped round neck dark blue red shirt
4,Men Printed Round Neck Light Blue T-Shirt,https://www.flipkart.com/nb-nicky-boy-printed-...,https://rukminim1.flixcart.com/image/612/612/x...,men printed round neck light blue shirt


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped at 5000 terms, with
# scikit-learn's built-in English stop-word list removed
cv = CountVectorizer(max_features=5000,stop_words='english')

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
def recommend_clothes(text:str,top_num:int):
    """Return the catalog items whose descriptions best match a free-text query.

    The query is normalised with `clean_text`, appended to
    `clothing_df['cleaned_description']`, and the combined corpus is
    vectorised with the notebook-level `cv` (CountVectorizer). Catalog rows
    are then ranked by cosine similarity to the query vector.

    Args:
        text: Free-text query, e.g. 'shirt'.
        top_num: Maximum number of items to return. Values <= 0 yield [].

    Returns:
        A list of dicts, one per recommended row of `clothing_df` (produced by
        `Series.to_dict()`), ordered from most to least similar. Rows with
        equal scores keep their catalog order.
    """
    if top_num <= 0:
        return []

    # Reuse the same normalisation that produced 'cleaned_description'
    cleaned_text = clean_text(text)

    descriptions = clothing_df['cleaned_description']
    corpus = pd.concat([descriptions, pd.Series([cleaned_text])], ignore_index=True)

    # Fit on catalog + query so the vocabulary matches the previous behaviour.
    # The matrix is kept sparse; the query is always the last row, so it is
    # addressed by position rather than searched for by value (a value search
    # picks the wrong row when the query equals an existing description).
    vectors = cv.fit_transform(corpus)
    catalog_vectors = vectors[:-1]
    query_vector = vectors[-1:]  # slice (not index) so the result stays 2-D

    # Only the query-vs-catalog similarities are needed, not the full N x N matrix
    scores = cosine_similarity(query_vector, catalog_vectors)[0]

    # Stable descending sort over catalog rows only, so the query itself can
    # never appear in the results and no positional index is out of range
    ranked_positions = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)[:top_num]

    return [clothing_df.iloc[pos].to_dict() for pos in ranked_positions]


In [16]:
# Example query: ask for the five catalog items most similar to 'shirt'
cl = recommend_clothes('shirt',5)
cl

[{'description': 'Men T-Shirt',
  'url': 'https://www.amazon.in/EYEBOGLER-Regular-Colorblocked-Sleeves-T-Shirt/dp/B0BRSS8LVS/ref=sr_1_72?content-id=amzn1.sym.f5e83e00-a666-492b-b882-5fa6fba3548e&pd_rd_r=21025b0c-ec69-41d2-96e6-ec699afe0fee&pd_rd_w=lDWwe&pd_rd_wg=IQSQx&pf_rd_p=f5e83e00-a666-492b-b882-5fa6fba3548e&pf_rd_r=4SSBTYVCG98DM53DR4PQ&qid=1684619140&refinements=p_36%3A4595084031&s=apparel&sr=1-72',
  'img': 'https://m.media-amazon.com/images/I/515PNKYCDcL._AC_UL400_.jpg',
  'cleaned_description': 'men shirt'},
 {'description': "Men's T-Shirt",
  'url': 'https://www.amazon.in/The-Blazze-Mens-Stringers-Gym-Tank-Top-Vest-Vests-for-Men-Muscle-Tee-M-Gray/dp/B07M8Q56YB/ref=sr_1_81?qid=1684619326&s=apparel&sr=1-81',
  'img': 'https://m.media-amazon.com/images/I/71ee+OixN5L._AC_UL400_.jpg',
  'cleaned_description': 'men shirt'},
 {'description': 'Women Shirt',
  'url': 'https://www.amazon.in/Sanisa-Womens-Regular-322KN772-M_Black-Medium/dp/B08VG3DCDS/ref=sr_1_68?qid=1684619389&s=apparel&

In [17]:
import pickle
# Save the 'clothing_df' DataFrame using pickle. The context manager ensures
# the file is flushed and closed even if dump() raises.
with open('clothing.pkl', 'wb') as pickle_file:
    pickle.dump(clothing_df, pickle_file)