# Web Scraping from [Othoba.com](https://www.othoba.com "https://www.othoba.com")

In [1]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup
import json

import math

In [2]:
# Root URL of the site; hrefs scraped from its pages are appended to it.
baseUrl = "https://www.othoba.com"

# Headers sent with every request: a desktop Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/89.0.4389.82 Safari/537.36"
    ),
}

#### utility function-1: `find_content()`

In [3]:
def find_content(url, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error (default 30). Without it a stalled connection would block the
        scraping loops indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the "lxml" parser. The HTTP status code
        is not checked, so an error page is parsed like any other page.
    """
    # `headers` is the notebook-level dict holding the User-Agent string.
    r = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(r.content, "lxml")

### 1. Collecting categories from the home page

In [4]:
# Collect the absolute URL of every category anchor (class "cl-1") on the home page.
soup = find_content(baseUrl)

# href=True skips anchors without an href attribute; previously such an anchor
# made `baseUrl + None` raise a TypeError. The other link-collecting loops in
# this notebook filter the same way.
catLinks = [
    baseUrl + tag["href"]
    for tag in soup.find_all("a", {"class": "cl-1"}, href=True)
]

In [5]:
# Number of category links collected from the home page.
len(catLinks)

11

### 2. Collecting sub-categories

If a category has no sub-categories, the category link itself is added to the sub-category list.

In [6]:
%%time

subCategoryLinks = []   # absolute URLs of every sub-category found
noSubCategory = []      # category URLs whose page shows an item box without a sub-category title

for catLink in catLinks:
    print(catLink)  # progress trace: confirms every category has been visited
    soup = find_content(catLink)

    for item in soup.find_all("div", class_="item-box"):
        info = item.find("h2", {"class": "title"})
        if info is not None:
            # The anchor loop uses its own name. It used to rebind `link`,
            # so the branch below could test and store a Tag object instead
            # of the category URL.
            for anchor in info.find_all("a", href=True):
                subCategoryLinks.append(baseUrl + anchor["href"])
        elif catLink not in noSubCategory:
            noSubCategory.append(catLink)

# Categories without sub-categories are scraped directly, so they join the list too.
subCategoryLinks.extend(noSubCategory)

https://www.othoba.com/men
https://www.othoba.com/womens-fashion
https://www.othoba.com/baby-kids
https://www.othoba.com/food-grocery
https://www.othoba.com/mobile
https://www.othoba.com/home-living
https://www.othoba.com/electronics
https://www.othoba.com/books-stationery
https://www.othoba.com/chocolate-candy
https://www.othoba.com/o-biz
https://www.othoba.com/miscellaneous
Wall time: 7.17 s


In [7]:
# Total links to crawl, and how many of them are categories without sub-categories.
len(subCategoryLinks), len(noSubCategory)

(77, 1)

### 3. Collecting all the product links using the `subCategoryLinks`

In [12]:
%%time
productLinks = []

# Number of products per listing page that the page-count arithmetic assumes.
productsPerPage = 48

for link in subCategoryLinks:
    soup1 = find_content(link)

    # Pages without a product counter are skipped entirely.
    countTag = soup1.find("span", {"id": "product-count"})
    if countTag is None:
        continue

    itemNumber = int(countTag.text.strip())
    pageCount = math.ceil(itemNumber / productsPerPage)

    # Listing pages are addressed as ?pagenumber=1 .. pageCount.
    for page in range(1, pageCount + 1):
        soup2 = find_content(link + f"?pagenumber={page}")

        for item in soup2.find_all("div", class_="product-item"):
            info = item.find("h2", {"class": "product-title"})
            if info is None:
                # A product tile without a title heading used to raise
                # AttributeError and abort the whole crawl.
                continue
            for tag in info.find_all("a", href=True):
                productLinks.append(baseUrl + tag["href"])

Wall time: 8min 3s


In [15]:
# Number of product page links gathered across all sub-categories.
len(productLinks)

30881

In [32]:
# NOTE(review): this list is not referenced by any other cell visible in this
# notebook (the batches use finalProductData1of6 ... 6of6) — confirm it can go.
finalProductData = []

#### utility function-2: `product_data()`

Using the product link, it will collect 7 features:
1. Product Category
2. Product Sub-Category
3. Product Name
4. Product Brand
5. Product Seller
6. Product Price
7. Shipping Price

If a feature is unavailable for a product, its value is set to `None`.

In [16]:
def _text_or_none(tag):
    """Return the whitespace-stripped text of `tag`, or None when `tag` is None."""
    return None if tag is None else tag.text.strip()


def _parse_product(soup):
    """Build the feature dict for one parsed product page.

    Each field falls back to None when its element is missing from `soup`.
    """
    # The seller name is the "value" span nested inside the vendor block.
    # Either level may be absent; a missing span used to raise AttributeError.
    vendorBlock = soup.find("div", {"class": "product-vendor"})
    sellerTag = None if vendorBlock is None else vendorBlock.find("span", {"class": "value"})

    productCategory, productSubCategory = None, None
    breadcrumb = soup.find("div", {"class": "breadcrumb"})
    if breadcrumb is not None:
        crumbs = breadcrumb.find_all("a", href=True)
        # The first breadcrumb link is skipped; the second and third are read
        # as category and sub-category. [1:] drops the first character of the
        # href (presumably the leading "/"). A breadcrumb with fewer than two
        # links used to raise IndexError.
        if len(crumbs) > 1:
            productCategory = crumbs[1]["href"][1:]
        if len(crumbs) > 2:
            productSubCategory = crumbs[2]["href"][1:]

    return {
        "Category": productCategory,
        "subCategory": productSubCategory,
        "Name": _text_or_none(soup.find("div", {"class": "product-name"})),
        "Brand": _text_or_none(soup.find("span", {"itemprop": "name", "class": "value"})),
        "Seller": _text_or_none(sellerTag),
        "Price": _text_or_none(soup.find("span", {"itemprop": "price"})),
        "shippingPrice": _text_or_none(soup.find("p", {"class": "delivery-charge no-free-ship"})),
    }


def product_data(pLinks):
    """Scrape the product pages in `pLinks` and return one dict per page.

    Parameters
    ----------
    pLinks : iterable of str
        Product page URLs; each one is fetched with `find_content`.

    Returns
    -------
    list of dict
        One dict per link, in input order, with the keys "Category",
        "subCategory", "Name", "Brand", "Seller", "Price" and
        "shippingPrice". A value is None when the matching element is not
        found on the page.
    """
    return [_parse_product(find_content(link)) for link in pLinks]

### 4. Data Collection

As there are about 30,000+ product links, the data is collected by dividing the product links into 6 parts, because the website returns errors when too many requests are sent in one run.

**Data Collection Date:  15-Oct-2021**

In [34]:
%%time
# Batch 1 of 6: scrape the first 5,000 product links.
finalProductData1of6 = product_data(productLinks[:5000])

Wall time: 1h 47min 14s


In [35]:
# Turn the list of product dicts into a DataFrame and preview three rows.
df1of6 = pd.DataFrame(finalProductData1of6)
df1of6.head(3)

Unnamed: 0,Category,subCategory,Name,Brand,Seller,Price,shippingPrice
0,men,mens-clothing,Men's Button Polo Shirt Red,,Ecohutt,Tk 250,Tk 50
1,men,mens-clothing,Men's Button Polo Shirt Olive,,Ecohutt,Tk 250,Tk 50
2,men,mens-clothing,Men's Button Polo Shirt Blue,,Ecohutt,Tk 250,Tk 50


In [36]:
# Persist batch 1 without the index column.
# NOTE(review): the merge section reads this file from "data/df1of6.csv",
# while this call writes to the working directory — confirm the file is moved
# or align the two paths.
df1of6.to_csv("df1of6.csv", index=False)

In [47]:
# Batch 2 of 6: product links 5000-9999.
finalProductData2of6 = product_data(productLinks[5000:10000])

# Number of records returned for this batch.
len(finalProductData2of6)

5000

In [48]:
# Turn batch 2 into a DataFrame and preview three rows.
df2of6 = pd.DataFrame(finalProductData2of6)
df2of6.head(3)

Unnamed: 0,Category,subCategory,Name,Brand,Seller,Price,shippingPrice
0,womens-fashion,beauty-care,Rasasi Junoon Satin Pour Femme EDP 50 Ml For W...,Rasasi,Perfume Bangladesh,Tk 8000,Tk 50
1,womens-fashion,beauty-care,Rasasi Afshana EDP 100ml For Women,Rasasi,Perfume Bangladesh,Tk 1400,Tk 50
2,womens-fashion,beauty-care,Parachute SkinPure Beauty Olive Oil 200ml,Marico Bangladesh,Marico Bangladesh Ltd.,Tk 250,Tk 50


In [49]:
# Persist batch 2 without the index column.
# NOTE(review): read back later from "data/df2of6.csv" — path differs from this write target.
df2of6.to_csv("df2of6.csv", index=False)

**Data Collection Date: 16-Oct-2021**

In [50]:
%%time
# Batch 3 of 6 is scraped in five sub-batches of 1,000 links; this is sub-batch 1 (links 10000-10999).
productData1of5of3of6 = product_data(productLinks[10000:11000])

Wall time: 29min 18s


In [51]:
%%time
# Batch 3, sub-batch 2 of 5: links 11000-11999.
productData2of5of3of6 = product_data(productLinks[11000:12000])

Wall time: 19min 26s


In [53]:
# Batch 3, sub-batch 3 of 5: links 12000-12999.
productData3of5of3of6 = product_data(productLinks[12000:13000])

In [1]:
# Batch 3, sub-batch 4 of 5: links 13000-13999.
productData4of5of3of6 = product_data(productLinks[13000:14000])

In [2]:
# Batch 3, sub-batch 5 of 5: links 14000-14999.
productData5of5of3of6 = product_data(productLinks[14000:15000])

In [70]:
# Stitch the five sub-batch lists back together, in order, to form batch 3 of 6.
finalProductData3of6 = [
    *productData1of5of3of6,
    *productData2of5of3of6,
    *productData3of5of3of6,
    *productData4of5of3of6,
    *productData5of5of3of6,
]

In [71]:
# Turn batch 3 into a DataFrame and preview the first rows.
df3of6 = pd.DataFrame(finalProductData3of6)
df3of6.head()

Unnamed: 0,Category,subCategory,Name,Brand,Seller,Price,shippingPrice
0,home-living,furniture,Decorate Chair Tube Rose T Red,RFL Furniture,Best Buy,Tk 510,Free Shipping
1,home-living,furniture,Rodo Casual Chair -Orange BB88720,RFL Furniture,Best Buy,Tk 1350,Free Shipping
2,home-living,furniture,Amass Closet 4 Drawer Sunflower,RFL Houseware,Best Buy,Tk 2875,Free Shipping
3,home-living,furniture,Royal Kitchen Shelf Trendy 2 Door Blue,RFL Furniture,Best Buy,Tk 4200,Free Shipping
4,home-living,furniture,Chair Dining Super Tree Rose Wood,RFL Furniture,Best Buy,Tk 475,Free Shipping


In [72]:
# Persist batch 3 without the index column.
# NOTE(review): read back later from "data/df3of6.csv" — path differs from this write target.
df3of6.to_csv("df3of6.csv", index=False)

In [None]:
# Batch 4 of 6: product links 15000-19999.
finalProductData4of6 = product_data(productLinks[15000:20000])

# Convert the records to a DataFrame.
df4of6 = pd.DataFrame(finalProductData4of6)

# Persist the batch without the index column.
df4of6.to_csv("df4of6.csv", index=False)

In [None]:
# Batch 5 of 6: product links 20000-24999.
finalProductData5of6 = product_data(productLinks[20000:25000])

# Convert the records to a DataFrame.
df5of6 = pd.DataFrame(finalProductData5of6)

# Persist the batch without the index column.
df5of6.to_csv("df5of6.csv", index=False)

In [None]:
# Batch 6 of 6: every remaining product link from index 25000 onward.
finalProductData6of6 = product_data(productLinks[25000:])

# Convert the records to a DataFrame.
df6of6 = pd.DataFrame(finalProductData6of6)

# Persist the batch without the index column.
df6of6.to_csv("df6of6.csv", index=False)

### 4.2. Merge all the collected data in a single dataframe

**Date: 17-Oct-2021**

In [35]:
# Reload the per-batch CSV files so the merge does not depend on variables
# left over from the scraping session.
df1of6 = pd.read_csv("data/df1of6.csv")
df2of6 = pd.read_csv("data/df2of6.csv")
df3of6 = pd.read_csv("data/df3of6.csv")
df4of6 = pd.read_csv("data/df4of6.csv")
df5of6 = pd.read_csv("data/df5of6.csv")
# Batch 6 is previewed and concatenated below, so it has to be loaded here as
# well; it was previously missing from this cell.
df6of6 = pd.read_csv("data/df6of6.csv")

len(df1of6), len(df2of6), len(df3of6), len(df4of6), len(df5of6)

(5000, 5000, 5000, 5000, 5000)

In [36]:
# Preview the first two rows of batch 1.
df1of6.head(2)

Unnamed: 0,Category,subCategory,Name,Brand,Seller,Price,shippingPrice
0,men,mens-clothing,Men's Button Polo Shirt Red,,Ecohutt,Tk 250,Tk 50
1,men,mens-clothing,Men's Button Polo Shirt Olive,,Ecohutt,Tk 250,Tk 50


In [37]:
# Preview the first two rows of batch 2.
df2of6.head(2)

Unnamed: 0,Category,subCategory,Name,Brand,Seller,Price,shippingPrice
0,womens-fashion,beauty-care,Rasasi Junoon Satin Pour Femme EDP 50 Ml For W...,Rasasi,Perfume Bangladesh,Tk 8000,Tk 50
1,womens-fashion,beauty-care,Rasasi Afshana EDP 100ml For Women,Rasasi,Perfume Bangladesh,Tk 1400,Tk 50


In [38]:
# Preview the first two rows of batch 3.
df3of6.head(2)

Unnamed: 0,Category,subCategory,Name,Brand,Seller,Price,shippingPrice
0,home-living,furniture,Decorate Chair Tube Rose T Red,RFL Furniture,Best Buy,Tk 510,Free Shipping
1,home-living,furniture,Rodo Casual Chair -Orange BB88720,RFL Furniture,Best Buy,Tk 1350,Free Shipping


In [39]:
# Preview the first two rows of batch 4.
df4of6.head(2)

Unnamed: 0,Category,subCategory,Name,Brand,Seller,Price,shippingPrice
0,books-stationery,ekushey-boimela,প্লেয়িং ইট মাই ওয়ে (হার্ডকভার),Sachin Tendulkar,Anyadhara,Tk 500,Tk 30
1,books-stationery,translations,অরিজিন,Dan Brown,Anyadhara,Tk 550,Tk 30


In [40]:
# Preview the first two rows of batch 5.
df5of6.head(2)

Unnamed: 0,Category,subCategory,Name,Brand,Seller,Price,shippingPrice
0,books-stationery,local-books,আলতাফ মাহমুদ : রক্ত দিয়ে লিখে গেল জীবনের গান (...,আসাদুল হক,Shahitya Prokash,Tk 100,Tk 30
1,books-stationery,local-books,চার্লস ডারউইন (হার্ডকভার),জহুরুল আলম সিদ্দিকী,Shahitya Prokash,Tk 500,Tk 30


In [41]:
# Preview the first two rows of batch 6.
df6of6.head(2)

Unnamed: 0,Category,subCategory,Name,Brand,Seller,Price,shippingPrice
0,books-stationery,best-seller,লালসালু,সৈয়দ ওয়ালীউল্লাহ্‌,Adorn Publication,Tk 150,Tk 30
1,books-stationery,best-seller,ঝিলাম নদীর দেশ,বুলবুল সরওয়ার,Adorn Publication,Tk 250,Tk 30


In [42]:
# Stack the six batches row-wise into one frame with a fresh 0..n-1 index.
final_df = pd.concat(
    objs=[df1of6, df2of6, df3of6, df4of6, df5of6, df6of6],
    axis=0,
    ignore_index=True,
)

# Row count of the merged frame.
final_df.shape[0]

30881

In [43]:
# Inspect the last rows of the merged frame.
final_df.tail()

Unnamed: 0,Category,subCategory,Name,Brand,Seller,Price,shippingPrice
30876,chocolate-candy,,Chocobean Ball Toffee 23 gm,PRAN Agro,Daily Shopping,Tk 50,Tk 50
30877,chocolate-candy,,Chocolord Bar 12.5gm,PRAN Agro,Daily Shopping,Tk 10,Tk 50
30878,chocolate-candy,,Cadbury Dairy Milk Lickables 20gm,,Daily Shopping,Tk 112,Tk 50
30879,chocolate-candy,,Sesame Bar Star 20 gm,PRAN Agro,Daily Shopping,Tk 10,Tk 50
30880,chocolate-candy,,Rapinda Hazelnut Diamond Box 300gm,Rapinda,Daily Shopping,Tk 575,Free Shipping


In [44]:
# Write the merged dataset to a single CSV file without the index column.
final_df.to_csv("Othoba_productData.csv", index=False)

In [46]:
# Reload the merged CSV and count its rows.
df = pd.read_csv("Othoba_productData.csv")
len(df)

30881

In [47]:
# Preview the first rows of the reloaded dataset.
df.head()

Unnamed: 0,Category,subCategory,Name,Brand,Seller,Price,shippingPrice
0,men,mens-clothing,Men's Button Polo Shirt Red,,Ecohutt,Tk 250,Tk 50
1,men,mens-clothing,Men's Button Polo Shirt Olive,,Ecohutt,Tk 250,Tk 50
2,men,mens-clothing,Men's Button Polo Shirt Blue,,Ecohutt,Tk 250,Tk 50
3,men,mens-clothing,Men's Button Polo Shirt Black,,Ecohutt,Tk 250,Tk 50
4,men,mens-clothing,Men's Boxer Mixed 3 Pcs,,Living Tex,Tk 240,Tk 50
