In [57]:
import os
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

In [51]:
MENU_URI = "https://mymall.se/media/mymall/megamenu/9826648.html"
MENU_TIMEOUT_S = 30  # seconds to wait for the menu page before giving up
CATEGORY_COLUMNS = ['mega_category', 'main_category', 'sub_category',
                    'mega_category_url', 'main_category_url', 'sub_category_url']

# Download the mega-menu HTML fragment. Every later step depends on this page,
# so fail loudly on a hung connection or an HTTP error status instead of
# silently parsing an error page into an empty table.
menu_response = requests.get(url=MENU_URI, timeout=MENU_TIMEOUT_S)
menu_response.raise_for_status()
uri_response = menu_response.content

# `features` is the argument that selects the parser. The extra
# `parser='html.parser'` keyword named a different parser than
# `features='lxml'` and has been dropped.
bs_obj = BeautifulSoup(markup=uri_response, features='lxml')

# One dict per (mega category, main category, sub-category) combination.
data = []
for mega_item in bs_obj.select('body > li'):
    # get the mega category info first
    # The header node is picked by position (second child), exactly as before.
    # NOTE(review): presumably the first child is a whitespace text node --
    # confirm against the live markup.
    mega_header = mega_item.contents[1]
    mega_cat_name = mega_header.text.strip()
    mega_cat_uri = mega_header.find('a', href=True).get('href')

    # get main categories from the mega category
    for main_item in mega_item.select('div > ul > li'):
        # Fall back to the first child when the item has only one child.
        try:
            content = main_item.contents[1]
        except IndexError:
            content = main_item.contents[0]
        main_cat_name = content.text.strip()

        # The link is either the second child of `content` or `content` itself.
        try:
            main_cat_uri = content.contents[1].get('href')
        except IndexError:
            main_cat_uri = content.get('href')

        # get sub-categories from main category
        for sub_item in main_item.select('div > ul > li'):
            data.append({
                'mega_category': mega_cat_name,
                'main_category': main_cat_name,
                # stripped for consistency with the mega/main category names
                'sub_category': sub_item.text.strip(),
                'mega_category_url': mega_cat_uri,
                'main_category_url': main_cat_uri,
                'sub_category_url': sub_item.contents[0].get('href'),
            })

# add data into a pandas DataFrame
cat_df = pd.DataFrame(data=data, columns=CATEGORY_COLUMNS)

In [52]:
# Show the category DataFrame (one row per sub-category) as this cell's output.
cat_df

Unnamed: 0,mega_category,main_category,sub_category,mega_category_url,main_category_url,sub_category_url
0,Inredning & Möbler,Barnrummet,Belysning,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...
1,Inredning & Möbler,Barnrummet,Förvaring,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...
2,Inredning & Möbler,Barnrummet,Hemsäkerhet,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...
3,Inredning & Möbler,Barnrummet,Inredningsdetaljer,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...
4,Inredning & Möbler,Barnrummet,Junior- & barnsängar,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...
...,...,...,...,...,...,...
697,Hemelektronik,Smarta hemmet,Övervakningskameror,https://mymall.se/hemelektronik,https://mymall.se/hemelektronik/smarta-hemmet,https://mymall.se/hemelektronik/smarta-hemmet/...
698,Hemelektronik,Vitvaror,Bänkspisar & Kokplattor,https://mymall.se/hemelektronik,https://mymall.se/hemelektronik/vitvaror,https://mymall.se/hemelektronik/vitvaror/kokpl...
699,Hemelektronik,Vitvaror,Kyl & Frys,https://mymall.se/hemelektronik,https://mymall.se/hemelektronik/vitvaror,https://mymall.se/hemelektronik/vitvaror/kyl-frys
700,Hemelektronik,Vitvaror,Mikrovågsugnar,https://mymall.se/hemelektronik,https://mymall.se/hemelektronik/vitvaror,https://mymall.se/hemelektronik/vitvaror/mikro...


In [53]:
# traverse each sub-category and fetch all the product's price, name, and images
PAGE_TIMEOUT_S = 30  # seconds to wait for a single sub-category page
CATEGORY_KEYS = ['mega_category', 'main_category', 'sub_category',
                 'mega_category_url', 'main_category_url']
PRODUCT_COLUMNS = ['product_name', 'product_price', 'image_url',
                   *CATEGORY_KEYS, 'sub_category_url']

# Strips everything except digits and dots from the price text.
# Raw string: "\d" inside a normal string literal is an invalid escape sequence.
# NOTE(review): a decimal comma (e.g. "1 299,50") would be dropped rather than
# converted -- confirm how the site formats prices.
noisy_str = re.compile(r"[^\d\.]")

product_info = []
failed_urls = []      # (url, error) for every page that could not be fetched
skipped_products = 0  # product tiles that did not have the expected layout

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for index, sub_cat_url in cat_df['sub_category_url'].items():
    # A single unreachable or failing page must not abort the whole crawl:
    # record it and move on to the next sub-category.
    try:
        page_response = requests.get(url=sub_cat_url, timeout=PAGE_TIMEOUT_S)
        page_response.raise_for_status()
    except requests.RequestException as exc:
        failed_urls.append((sub_cat_url, repr(exc)))
        continue

    # `features` selects the parser; the conflicting `parser=` keyword is dropped.
    bs_obj = BeautifulSoup(markup=page_response.content, features='lxml')

    # The category columns are identical for every product on this page, so
    # look them up once per sub-category instead of once per product.
    category_info = {key: cat_df.at[index, key] for key in CATEGORY_KEYS}
    category_info['sub_category_url'] = sub_cat_url

    for product_tile in bs_obj.select('div.product-item'):
        try:
            product_info.append({
                'product_name': product_tile.contents[1].text.strip(),
                'product_price': float(noisy_str.sub('', product_tile.contents[2].text)),
                # NOTE(review): this is the href of the first link in the tile;
                # the sample output suggests it is the product page URL rather
                # than an image file -- confirm before relying on the name.
                'image_url': product_tile.contents[0].find('a', href=True).get('href'),
                **category_info,
            })
        except (IndexError, ValueError, AttributeError):
            # Missing child node, unparsable price, or no link in the tile:
            # skip the tile as before, but keep a count so the loss is visible.
            skipped_products += 1

# add data into a pandas DataFrame
prod_df = pd.DataFrame(data=product_info, columns=PRODUCT_COLUMNS)

In [54]:
# Show the product DataFrame as this cell's output: one row per product with its
# name, price, link and the category columns of the sub-category it was found in.
prod_df

Unnamed: 0,product_name,product_price,image_url,mega_category,main_category,sub_category,mega_category_url,main_category_url,sub_category_url
0,Frost LED nattlampa,189.0,https://mymall.se/inredning-mobler/barnrummet/...,Inredning & Möbler,Barnrummet,Belysning,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...
1,Paw Patrol LED Nattlampa,199.0,https://mymall.se/inredning-mobler/barnrummet/...,Inredning & Möbler,Barnrummet,Belysning,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...
2,Lampa - Karamell björn,329.0,https://mymall.se/inredning-mobler/barnrummet/...,Inredning & Möbler,Barnrummet,Belysning,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...
3,Lampa - Blå björn,329.0,https://mymall.se/inredning-mobler/barnrummet/...,Inredning & Möbler,Barnrummet,Belysning,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...
4,Lampa - Rosa björn,329.0,https://mymall.se/inredning-mobler/barnrummet/...,Inredning & Möbler,Barnrummet,Belysning,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...
...,...,...,...,...,...,...,...,...,...
172,Badrumsmatta - Vildängel,249.0,https://mymall.se/catalog/product/view/id/4463...,Inredning & Möbler,Barnrummet,Inredningsdetaljer,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...
173,Dekoration - Stjärnor,95.0,https://mymall.se/catalog/product/view/id/4463...,Inredning & Möbler,Barnrummet,Inredningsdetaljer,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...
174,Smyckeskrin - Ballerina,229.0,https://mymall.se/catalog/product/view/id/4175...,Inredning & Möbler,Barnrummet,Inredningsdetaljer,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...
175,Väggklistermärken - Enhörningar,69.0,https://mymall.se/catalog/product/view/id/3856...,Inredning & Möbler,Barnrummet,Inredningsdetaljer,https://mymall.se/inredning-mobler,https://mymall.se/inredning-mobler/barnrummet,https://mymall.se/inredning-mobler/barnrummet/...


In [55]:
# Export the product table to a CSV file without the index column. The target
# folder is created on demand so the cell also works on a fresh checkout.
filepath = Path('file/products.csv')
csv_dir = filepath.parent
csv_dir.mkdir(exist_ok=True, parents=True)
prod_df.to_csv(path_or_buf=filepath, index=False)

In [58]:
# save into postgresql db
# The connection URL is taken from the DATABASE_URL environment variable so that
# real credentials never have to be written into (or committed with) the
# notebook. When the variable is unset, the placeholder template below is used,
# which must be edited before the cell can connect.
db_url = os.environ.get(
    'DATABASE_URL',
    'postgresql://username:password@host:port/your-db?gssencmode=disable',
)
engine = create_engine(db_url)
# if_exists='replace' drops an existing table of the same name before writing.
prod_df.to_sql('table-name', engine, if_exists='replace', index=False)

177