# Scraping Jumia Ecommerce Site: DSE HACKATHON

## Import libraries

In [105]:
import requests as req
from bs4 import BeautifulSoup
import pandas as pd
import time

## Define function to fetch website html data

In [106]:
def fetch_html_data(web_address, timeout=30):
    """Download a web page and return the HTTP response.

    Parameters
    ----------
    web_address : str
        URL of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the whole scrape indefinitely.

    Returns
    -------
    requests.Response or None
        The response object, or None when the request could not be made
        (network error, timeout, invalid argument). Callers must check
        for None before using the result.
    """
    try:
        print(f"\nFetching data from {web_address}...")
        res = req.get(web_address, timeout=timeout)
    except (req.exceptions.RequestException, TypeError) as e:
        print('Stopped:', e)
        return None

    # The response is still returned for non-2xx replies (as before), but the
    # problem is surfaced so an error page is not mistaken for product data.
    if not res.ok:
        print(f"Warning: {web_address} returned HTTP status {res.status_code}")
    return res

## Define function to convert HTML data to a BeautifulSoup object

In [107]:
def convert_web_data_to_beautiful_soup_obj(web_data):
    """Parse a fetched HTTP response into a BeautifulSoup object.

    Parameters
    ----------
    web_data : object with a ``.text`` attribute, or None
        The response returned by the fetch step; None means the fetch failed.

    Returns
    -------
    BeautifulSoup or None
        The parsed document, or None when there was nothing to parse or
        parsing failed.
    """
    # A failed fetch yields None; report that plainly instead of letting it
    # surface as an opaque attribute error on `.text`.
    if web_data is None:
        print("Stopped: no web data to parse (the fetch step returned nothing)")
        return None

    try:
        print("Creating BeautifulSoup object...")
        soup_obj = BeautifulSoup(web_data.text, "html.parser")
    except Exception as e:
        # Best-effort by design: report the problem and let the caller decide.
        print("Stopped:", e)
        return None

    print("Success! Object created!")
    return soup_obj

### Extract each product on a page into a details dictionary and append it to the products list

In [108]:
# Accumulates one details dictionary per scraped product, across all pages.
all_products_list = []


def _text_or_none(parent, tag_name, css_class):
    """Return the stripped text of the first matching child tag, or None if absent."""
    node = parent.find(tag_name, class_=css_class)
    return node.text.strip() if node is not None else None


def append_one_product_details_dictionary_to_list(page, soup_obj=None):
    """Extract every product card on a parsed page and append it to ``all_products_list``.

    Parameters
    ----------
    page : str or int
        Page number, used only for progress messages.
    soup_obj : parsed page exposing ``find_all``, optional
        The parsed page to read. When omitted, the module-level ``soup``
        variable is used, which is how existing callers supply the page.

    Returns
    -------
    None
        Results are appended to ``all_products_list`` in place. Optional
        fields (old price, discount, rating, votes) are None when the card
        does not show them. Text is stored raw; e.g. the ``votes`` text
        still carries the rating prefix, as in '4.5 out of 5(2216)'.
    """
    if soup_obj is None:
        # Backward compatibility with callers that set a global `soup`.
        soup_obj = globals().get("soup")
    if soup_obj is None:
        print(f"Skipping page {page}: no parsed HTML available")
        return

    print(f"Appending page {page} products' details to array")

    for detail in soup_obj.find_all("article", class_="prd _fb col c-prd"):
        name = _text_or_none(detail, "h3", "name")
        new_price = _text_or_none(detail, "div", "prc")
        # A card without a name or a price is not a usable product record;
        # skip it instead of aborting the whole scrape.
        if name is None or new_price is None:
            continue

        all_products_list.append({
            "name": name,
            "new_price": new_price,
            "old_price": _text_or_none(detail, "div", "old"),
            "discount(percent)": _text_or_none(detail, "div", "bdg _dsct _sm"),
            "rating": _text_or_none(detail, "div", "stars _s"),
            "votes": _text_or_none(detail, "div", "rev"),
        })


# len(product_details_clean)

### Declare the products' url & the total webpage count

In [109]:
# Number of listing pages to scrape.
webpage_num_total = 50
# Listing URL without the page number; the page number is appended per request.
other_pages_url = "https://www.jumia.co.ke/all-products/?page="

### Fetch webdata, convert to BeautifulSoup Object, add product details dictionary to list

In [110]:
# Start from an empty list so re-running this cell does not duplicate products.
all_products_list.clear()

# Pause between requests so the site is not hit with back-to-back calls;
# set to 0 to disable.
request_delay_seconds = 1.0

for page_number in range(1, webpage_num_total + 1):
    page = str(page_number)
    # Build the page URL in its own variable so the base URL is never overwritten.
    page_url = f"https://www.jumia.co.ke/all-products/?page={page}"
    response = fetch_html_data(page_url)
    # NOTE: this must stay named `soup`; the extraction function reads the
    # module-level `soup` variable.
    soup = convert_web_data_to_beautiful_soup_obj(response)
    if soup is None:
        # A single failed page should not abort the whole scrape.
        print(f"Skipping page {page}: could not fetch or parse it")
        continue
    append_one_product_details_dictionary_to_list(page)
    time.sleep(request_delay_seconds)

# Show a small sample instead of dumping every scraped product into the output.
all_products_list[:5]


Fetching data from https://www.jumia.co.ke/all-products/?page=1...
Creating BeautifulSoup object...
Success! Object created!
Appending page 1 products' details to array

Fetching data from https://www.jumia.co.ke/all-products/?page=2...
Creating BeautifulSoup object...
Success! Object created!
Appending page 2 products' details to array

Fetching data from https://www.jumia.co.ke/all-products/?page=3...
Creating BeautifulSoup object...
Success! Object created!
Appending page 3 products' details to array

Fetching data from https://www.jumia.co.ke/all-products/?page=4...
Creating BeautifulSoup object...
Success! Object created!
Appending page 4 products' details to array

Fetching data from https://www.jumia.co.ke/all-products/?page=5...
Creating BeautifulSoup object...
Success! Object created!
Appending page 5 products' details to array

Fetching data from https://www.jumia.co.ke/all-products/?page=6...
Creating BeautifulSoup object...
Success! Object created!
Appending page 6 product

[{'name': 'NIVEA Perfect & Radiant Even Tone Day And Night Cream For Women - 50ml',
  'new_price': 'KSh 999',
  'old_price': 'KSh 1,560',
  'discount(percent)': '36%',
  'rating': '4.5 out of 5',
  'votes': '4.5 out of 5(2216)'},
 {'name': 'NIVEA Radiant & Beauty Advanced Care Lotion For Women - 400ml (Pack Of 2)',
  'new_price': 'KSh 949',
  'old_price': 'KSh 1,460',
  'discount(percent)': '35%',
  'rating': '4.7 out of 5',
  'votes': '4.7 out of 5(606)'},
 {'name': 'NIVEA Nourishing Cocoa Body Lotion With Cocoa Butter 400ml (Pack Of 2)',
  'new_price': 'KSh 1,174',
  'old_price': 'KSh 1,302',
  'discount(percent)': '10%',
  'rating': '4.7 out of 5',
  'votes': '4.7 out of 5(1114)'},
 {'name': 'NIVEA Pearl & Beauty Anti-Perspirant Rollon, 48h - 50ml (Pack Of 2)',
  'new_price': 'KSh 728',
  'old_price': 'KSh 1,040',
  'discount(percent)': '30%',
  'rating': '4.6 out of 5',
  'votes': '4.6 out of 5(829)'},
 {'name': 'NIVEA MEN Deep Antibacterial Anti-Perspirant Rollon,48h - 50ml (Pack 

### Check product count

In [111]:
# Sanity check: number of product dictionaries collected across all scraped pages.
len(all_products_list)

2000

### Convert products list to pandas dataframe

In [112]:
# One row per product dictionary; the dictionary keys become the columns.
products_df = pd.DataFrame(data=all_products_list)
products_df

Unnamed: 0,name,new_price,old_price,discount(percent),rating,votes
0,NIVEA Perfect & Radiant Even Tone Day And Nigh...,KSh 999,"KSh 1,560",36%,4.5 out of 5,4.5 out of 5(2216)
1,NIVEA Radiant & Beauty Advanced Care Lotion Fo...,KSh 949,"KSh 1,460",35%,4.7 out of 5,4.7 out of 5(606)
2,NIVEA Nourishing Cocoa Body Lotion With Cocoa ...,"KSh 1,174","KSh 1,302",10%,4.7 out of 5,4.7 out of 5(1114)
3,"NIVEA Pearl & Beauty Anti-Perspirant Rollon, 4...",KSh 728,"KSh 1,040",30%,4.6 out of 5,4.6 out of 5(829)
4,NIVEA MEN Deep Antibacterial Anti-Perspirant R...,KSh 728,"KSh 1,040",30%,4.6 out of 5,4.6 out of 5(619)
...,...,...,...,...,...,...
1995,Qwen 40 Colors Eyeshadow Palette Nude Matte Gl...,KSh 429,KSh 600,29%,3.5 out of 5,3.5 out of 5(4)
1996,ST64 Amber Edison Bulb E27,KSh 499,,,5 out of 5,5 out of 5(8)
1997,Fashion Ladies Shoulder Length Black Wigs Bob ...,KSh 790,"KSh 1,770",55%,3.8 out of 5,3.8 out of 5(20)
1998,L'Oréal Paris Revitalift Filler Replumping Nig...,"KSh 2,513","KSh 3,350",25%,4.3 out of 5,4.3 out of 5(7)


In [113]:
# (rows, columns) of the products DataFrame.
products_df.shape

(2000, 6)

# Data Cleaning Section

### Clean new_price column

In [114]:
# Remove the currency name 'KSh' (the float conversion happens in a later cell)
def remove_currency_name_in_new_price(val):
    """Strip the leading currency name from a price string.

    Parameters
    ----------
    val : str
        A price such as 'KSh 1,174'.

    Returns
    -------
    str
        The second space-separated token (e.g. '1,174'). For a range such as
        'KSh 2,695 - KSh 3,576' that is the lower bound only. A value with no
        space (already cleaned) is returned unchanged, so re-running the
        cleaning cell is harmless.
    """
    tokens = val.split(' ')
    # No space means there is no currency prefix left to remove.
    if len(tokens) < 2:
        return val
    return tokens[1]
# Apply the currency-name removal to every value of the new_price column.
products_df['new_price'] = products_df['new_price'].map(remove_currency_name_in_new_price)

In [115]:
# Spot-check the element type of one value, then preview the first rows.
print(type(products_df.loc[1, 'new_price']))
print(products_df['new_price'].head())

<class 'str'>
0      999
1      949
2    1,174
3      728
4      728
Name: new_price, dtype: object


In [116]:
# Drop the thousands separator (e.g. '1,174' -> '1174'); values without a comma are unchanged.
products_df['new_price'] = products_df['new_price'].str.replace(',', '', regex=False)
products_df['new_price']

0        999
1        949
2       1174
3        728
4        728
        ... 
1995     429
1996     499
1997     790
1998    2513
1999    1170
Name: new_price, Length: 2000, dtype: object

In [117]:
# Values are still strings at this point; confirm on one row.
print(type(products_df.loc[1, 'new_price']))

<class 'str'>


In [118]:
# Cast the cleaned numeric strings in new_price to floats.
products_df['new_price'] = products_df['new_price'].astype(float)

In [119]:
# Confirm on one row that the value is now a float.
print(type(products_df.loc[1, 'new_price']))

<class 'numpy.float64'>


In [120]:
# Preview the first rows after the new_price clean-up.
products_df.head()

Unnamed: 0,name,new_price,old_price,discount(percent),rating,votes
0,NIVEA Perfect & Radiant Even Tone Day And Nigh...,999.0,"KSh 1,560",36%,4.5 out of 5,4.5 out of 5(2216)
1,NIVEA Radiant & Beauty Advanced Care Lotion Fo...,949.0,"KSh 1,460",35%,4.7 out of 5,4.7 out of 5(606)
2,NIVEA Nourishing Cocoa Body Lotion With Cocoa ...,1174.0,"KSh 1,302",10%,4.7 out of 5,4.7 out of 5(1114)
3,"NIVEA Pearl & Beauty Anti-Perspirant Rollon, 4...",728.0,"KSh 1,040",30%,4.6 out of 5,4.6 out of 5(829)
4,NIVEA MEN Deep Antibacterial Anti-Perspirant R...,728.0,"KSh 1,040",30%,4.6 out of 5,4.6 out of 5(619)


### Clean old_price column

In [121]:
# List the distinct raw old_price values to see which formats need cleaning.
pd.unique(products_df['old_price'])

array(['KSh 1,560', 'KSh 1,460', 'KSh 1,302', 'KSh 1,040', 'KSh 5,610',
       'KSh 780', 'KSh 3,840', 'KSh 17,999', 'KSh 38,000', 'KSh 410',
       'KSh 3,500', 'KSh 688', 'KSh 2,300', 'KSh 1,750', 'KSh 1,200',
       'KSh 1,399', None, 'KSh 23,999', 'KSh 4,000', 'KSh 815',
       'KSh 3,827', 'KSh 1,299', 'KSh 345', 'KSh 2,100', 'KSh 18,999',
       'KSh 7,599', 'KSh 1,599', 'KSh 1,700', 'KSh 14,999', 'KSh 1,295',
       'KSh 1,250', 'KSh 350', 'KSh 2,871', 'KSh 1,850', 'KSh 3,045',
       'KSh 31,354', 'KSh 899', 'KSh 1,670', 'KSh 210', 'KSh 2,599',
       'KSh 1,014', 'KSh 250', 'KSh 180', 'KSh 1,500', 'KSh 900',
       'KSh 1,400', 'KSh 1,050', 'KSh 44,999', 'KSh 4,245', 'KSh 4,099',
       'KSh 1,000', 'KSh 2,695 - KSh 3,576', 'KSh 920', 'KSh 1,638',
       'KSh 4,500', 'KSh 20,000', 'KSh 1,999', 'KSh 1,258 - KSh 1,938',
       'KSh 7,220 - KSh 7,980', 'KSh 1,800', 'KSh 25,999',
       'KSh 2,300 - KSh 2,700', 'KSh 14,947', 'KSh 29,999', 'KSh 5,195',
       'KSh 314', 'KSh 499', 

In [122]:
# Remove the currency name 'KSh', keeping the first amount that follows it.
# For a range such as 'KSh 2,695 - KSh 3,576' only the lower bound ('2,695') is kept.
# Missing values (None/NaN) and already-cleaned values pass through unchanged,
# so re-running this cell is harmless.
products_df['old_price'] = products_df['old_price'].apply(
    lambda x: x.split(' ')[1] if isinstance(x, str) and ' ' in x else x
)
products_df['old_price']

0       1,560
1       1,460
2       1,302
3       1,040
4       1,040
        ...  
1995      600
1996     None
1997    1,770
1998    3,350
1999    2,137
Name: old_price, Length: 2000, dtype: object

In [124]:
# Remove the thousands separator from string values (e.g. '1,560' -> '1560').
# Anything that is not a string (None or NaN for products without an old
# price) is left as it is.
products_df['old_price'] = products_df['old_price'].apply(
    lambda x: x.replace(',', '') if isinstance(x, str) else x
)
products_df['old_price']

0       1560
1       1460
2       1302
3       1040
4       1040
        ... 
1995     600
1996    None
1997    1770
1998    3350
1999    2137
Name: old_price, Length: 2000, dtype: object

In [126]:
# Convert the string values into floats.
# float(None) raises TypeError, so a plain float() over the column fails on a
# fresh run for products without an old price. pd.to_numeric maps missing
# values to NaN; astype(float) guarantees a float64 column even if none are missing.
products_df['old_price'] = pd.to_numeric(products_df['old_price']).astype(float)
products_df['old_price']

0       1560.0
1       1460.0
2       1302.0
3       1040.0
4       1040.0
         ...  
1995     600.0
1996       NaN
1997    1770.0
1998    3350.0
1999    2137.0
Name: old_price, Length: 2000, dtype: float64

In [128]:
# Spot-check the element type of one old_price value after the conversion.
type(products_df.loc[18, 'old_price'])

numpy.float64