In [1]:
#Import Required Libraries
import os
import re
import warnings

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [2]:
#Funtion for fetching item category from the description
items = {'Blazer', 'Bottomwear', 'Capri', 'Cargos', 'Coat', 'Harem Pants', 'Jeans', 'Jeggings', 'Shirt', 'Shorts', 'Skirts', 'Suit', 'T-Shirt', 'Three-Fourth', 'Tights', 'Top', 'Track Pant', 'Trouser', 'Trousers', 'Waistcoat'}
def get_category(desc):
    for i in items:
        if i in desc or i.lower() in desc or i+'s' in desc:
            return i.lower()
    return None

#Funtion for fetching gender from the description
def get_gender(desc):
    if 'women' in desc or 'girl' in desc:
        return 'Women'
    elif ('women' not in desc and 'girl' not in desc):
        if ('men' in desc or 'boy' in desc):
            return 'Men'
    else:
        return None

In [6]:
#Extracting products data from Mytntra Website
myntra_clothing_items = ['wallets', 'flip-flops', 'kurtas', 'tshirts', 'kurta-sets',
   'dresses', 'tops', 'sports-shoes', 'sweaters', 'caps',
   'lounge-pants', 'handbags', 'heels', 'casual-shoes',
   'jeans', 'shirts', 'jackets', 'formal-shoes', 'sandals', 'shorts',
   'sarees', 'sweatshirts', 'saree-blouse', 'belts', 'track-pants',
   'bra', 'sunglasses', 'trousers', 'boxers', 'flats', 'night-suits',
   'thermal-set', 'palazzos', 'skirts', 'swimwear', 'suits',
   'leggings', 'shapewear', 'nightdress', 'blazers', 'capris', 'kurtis',
   'lounge-tshirts', 'tights', 'tracksuits', 'ties', 'pyjamas',
   'waistcoat', 'churidar', 'coats', 'swim-bottoms', 'sleepsuit',
   'harem-pants', 'rain-jacket', 'rain-suit', 'shawl', 'hanger',
   'salwar', 'swim-tops']

def filter_items(desc):
    for myntra_clothing_item in myntra_clothing_items:
        if myntra_clothing_item in desc:
            return myntra_clothing_item
    return None



myntra = pd.read_csv('/kaggle/input/myntra-168k-products/data.csv')
myntra.columns = [c.lstrip().rstrip().replace(' ', '') for c in myntra.columns]

myntra['product_name'] = myntra['product_link'].apply(lambda x: x.split('/')[-3].replace('-', ' ').lstrip().rstrip())

myntra_pre = myntra[myntra['product_tag'].isin(myntra_clothing_items)][['product_name', 'product_tag', 'product_link']]
myntra_pre.rename(columns={'product_name': 'description', 'product_tag': 'category', 'product_link': 'product_url'}, inplace=True)

myntra_pre['description'] = myntra_pre['description'].apply(lambda x: x.lower().lstrip().rstrip())
myntra_pre['product_url'] = 'https://www.myntra.com/' + myntra_pre['product_url']
myntra_pre['gender'] = myntra_pre['description'].apply(get_gender)


myntra_pre.reset_index(drop=True, inplace=True)

In [None]:
#Extracting products data from flipkart website

flipkart = set()
urls = {
    'https://www.flipkart.com/clothing-and-accessories/topwear/pr?sid=clo%2Cash&otracker=categorytree&p%5B%5D=facets.ideal_for%255B%255D%3DMen&otracker=nmenu_sub_Men_0_Top%20wear&' : {'categories': ['Shirt', 'T-Shirt']}, 
    'https://www.flipkart.com/clothing-and-accessories/bottomwear/pr?sid=clo%2Cvua&otracker=categorytree&p%5B%5D=facets.ideal_for%255B%255D%3DMen&otracker=nmenu_sub_Men_0_Bottom%20wear&' : {'categories':['Bottomwear', 'Short', 'Harem Pant', 'Three-Fourth', 'Jean', 'Tight', 'Track Pant', 'Cargo', 'Trouser']},
    'https://www.flipkart.com/clothing-and-accessories/blazers-suits-waistcoat-coat/pr?sid=clo%2Cupk&otracker=categorytree&p%5B%5D=facets.ideal_for%255B%255D%3DMen&otracker=nmenu_sub_Men_0_Suits%2C%20Blazers%20%26%20Waistcoats&' : {'categories': ['Coat', 'Suit', 'Blazer', 'Waistcoat']},
    'https://www.flipkart.com/clothing-and-accessories/topwear/pr?sid=clo,ash&p[]=facets.ideal_for%255B%255D%3DWomen&p[]=facets.ideal_for%255B%255D%3Dwomen&otracker=categorytree&otracker=nmenu_sub_Women_0_Topwear&': {'categories': ['T-Shirt', 'Shirt', 'Top']},
    'https://www.flipkart.com/clothing-and-accessories/bottomwear/jeans/women-jeans/pr?sid=clo,vua,k58,4hp&otracker=categorytree&otracker=nmenu_sub_Women_0_Jeans&': {'categories': ['Jeans']},
    'https://www.flipkart.com/clothing-and-accessories/bottomwear/shorts/women-shorts/pr?sid=clo,vua,e8g,hbd&otracker=categorytree&otracker=nmenu_sub_Women_0_Shorts&': {'categories': ['Short']},
    'https://www.flipkart.com/clothing-and-accessories/bottomwear/skirts/women-skirts/pr?sid=clo,vua,iku,w5t&otracker=categorytree&otracker=nmenu_sub_Women_0_Skirts&': {'categories': ['Skirt']},
    'https://www.flipkart.com/clothing-and-accessories/bottomwear/~cs-hdgz8fjfgm/pr?sid=clo%2Cvua&collection-tab-name=Tights%20And%20%20Legging&otracker=nmenu_sub_Women_0_Jeggings%20%26%20Tights&': {'categories': ['Jegging', 'Tight']},
    'https://www.flipkart.com/clothing-and-accessories/bottomwear/~cs-g50m8job1g/pr?sid=clo%2Cvua&collection-tab-name=Trousers%20and%20Capris&otracker=nmenu_sub_Women_0_Trousers%20%26%20Capris&': {'categories': ['Capri', 'Trouser']},
       }
for url, val in urls.items():
    page = 1
    while True:
        if page != 1:
            req_url = f'{url}&page={page}'
            res = requests.get(req_url)
        else:
            res = requests.get(url)
        soup = bs(res.text, 'html.parser')
        elements = soup.find_all('a', {'class': 'IRpwTa'})
        
        if len(elements) > 0:
            for i, v in enumerate(elements):
                description = v.get_text()
                item_category = None
                for category in val['categories']:
                    if category in description or category.lower() in description or category +'s' in description or category.lower() +'s' in description:
                        item_category = category.lower()
                flipkart.add((description, item_category, 'www.flipkart.com' + v['href']))
            print(f'Number of items : {len(flipkart)}')
            page += 1
        else:
            print(f'URL Fetch Done; Number of items : {len(flipkart)}')
            break

flipkart = pd.DataFrame(flipkart)
flipkart.columns = ['description', 'category', 'product_url']
flipkart['description'] = flipkart['description'].apply(lambda x: x.lower().lstrip().rstrip())
flipkart['gender'] = flipkart['description'].apply(get_gender)

In [7]:
# Preview the first rows of the prepared Myntra frame.
myntra_pre.head()

Unnamed: 0,description,category,product_url,gender
0,lino perros women peach coloured croc textured...,wallets,https://www.myntra.com/wallets/lino-perros/lin...,Women
1,mast harbour men navy blue grey striped sliders,flip-flops,https://www.myntra.com/flip-flops/mast--harbou...,Men
2,biba women off white black printed pure cotto...,kurtas,https://www.myntra.com/kurtas/biba/biba-women-...,Women
3,anthrilo girls white floral printed t shirt,tshirts,https://www.myntra.com/tshirts/anthrilo/anthri...,Women
4,fashion dwar women multicoloured ethnic motifs...,kurta-sets,https://www.myntra.com/kurta-sets/fashion-dwar...,Women


In [8]:
# Preview the first rows of the scraped Flipkart frame.
flipkart.head()

Unnamed: 0,description,category,product_url,gender
0,men graphic print round neck red t-shirt,t-shirt,www.flipkart.com/ducati-graphic-print-men-roun...,Men
1,casual regular sleeves printed women pink top,top,www.flipkart.com/tokyo-talkies-casual-printed-...,Women
2,women flared mid rise blue jeans,jeans,www.flipkart.com/leetos-flared-women-blue-jean...,Women
3,men slim fit green cotton blend trousers,trouser,www.flipkart.com/highlander-slim-fit-men-green...,Men
4,3butterflies white jegging,jegging,www.flipkart.com/3butterflies-white-jegging/p/...,


In [9]:
#Appending both data
data = myntra_pre.append(flipkart, ignore_index=True)
data = data.append(flipkart, ignore_index=True)
data = data.sample(frac = 1)

In [10]:
#Saving the final dataset
# Writes the combined frame as CSV to a path relative to the working directory,
# without the index column.
data.to_csv('mercor_clothing_data.csv', index=False)