# Scraping Jumia Ecommerce Site: DSE HACKATHON

## Import libraries

In [154]:
import requests as req
from bs4 import BeautifulSoup
import pandas as pd
import time

## Define function to fetch website html data

In [155]:
def fetch_html_data(web_address, timeout=30):
    """Download a web page and return the HTTP response.

    Parameters
    ----------
    web_address : str
        URL of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up. ``requests`` does not
        time out by default, so without this a stalled connection would hang
        the whole scraping run.

    Returns
    -------
    requests.Response or None
        The response when the request completed (whatever its status code),
        or ``None`` when the request failed; the error is printed, not raised.
    """
    try:
        print(f"\nFetching data from {web_address}...")
        res = req.get(web_address, timeout=timeout)
    except (req.exceptions.RequestException, TypeError) as e:
        print('Stopped:', e)
        return None
    if not res.ok:
        # Still hand the response back (callers relied on that), but make it
        # visible that the body is probably an error page, not a product list.
        print(f"Warning: HTTP status {res.status_code} for {web_address}")
    return res

## Define function to convert HTML data to a BeautifulSoup object

In [156]:
def convert_web_data_to_beautiful_soup_obj(web_data):
    """Parse the text of an HTTP response into a BeautifulSoup tree.

    Parameters
    ----------
    web_data : object with a ``.text`` attribute, or None
        The fetched response whose ``.text`` holds the HTML. ``None`` (what a
        failed fetch produces) is accepted and reported explicitly.

    Returns
    -------
    BeautifulSoup or None
        The parsed document, or ``None`` when there was nothing to parse or
        parsing failed; the reason is printed, not raised.
    """
    if web_data is None:
        # Report the real cause instead of letting `.text` on None surface as
        # a confusing AttributeError message.
        print("Stopped: no web data to convert (the fetch returned nothing).")
        return None
    try:
        print("Creating BeautifulSoup object...")
        soup_obj = BeautifulSoup(web_data.text, "html.parser")
        print("Success! Object created!")
        return soup_obj
    except Exception as e:
        # Deliberately best-effort: one unparsable page should be reported,
        # not abort the whole run.
        print("Stopped:", e)
        return None

### Collect the products on each page: build a details dictionary for each product and append it to a shared list

In [157]:
all_products_list = []


def _text_or_none(parent, tag, css_class):
    """Return the stripped text of the first matching child tag, or None if there is none."""
    node = parent.find(tag, class_=css_class)
    return node.text.strip() if node is not None else None


def append_one_product_details_dictionary_to_list(page, page_soup=None):
    """Extract every product card from one listing page into ``all_products_list``.

    Parameters
    ----------
    page : int or str
        Page number; used only in the progress message.
    page_soup : BeautifulSoup, optional
        Parsed HTML of the page. When omitted, the notebook-level ``soup``
        variable is used, which keeps existing one-argument calls working.

    Each appended dictionary has the keys ``name``, ``new_price``,
    ``old_price``, ``discount``, ``rating`` and ``votes``; the last four are
    ``None`` when the card does not show them. Cards without a name or a
    current price are skipped, and a missing page is skipped entirely, so one
    bad card or page does not abort the scrape.
    """
    print(f"Appending page {page} products' details to array")

    if page_soup is None:
        page_soup = soup  # fall back to the notebook-level variable
    if page_soup is None:
        print(f"Skipping page {page}: there is no parsed HTML to read.")
        return

    page_products_details_soup = page_soup.find_all("article", class_="prd _fb col c-prd")

    skipped = 0
    for detail in page_products_details_soup:
        name = _text_or_none(detail, "h3", "name")
        new_price = _text_or_none(detail, "div", "prc")
        if name is None or new_price is None:
            # Without these two fields the record is unusable downstream.
            skipped += 1
            continue
        all_products_list.append({
            "name": name,
            "new_price": new_price,
            "old_price": _text_or_none(detail, "div", "old"),
            "discount": _text_or_none(detail, "div", "bdg _dsct _sm"),
            "rating": _text_or_none(detail, "div", "stars _s"),
            "votes": _text_or_none(detail, "div", "rev"),
        })

    if skipped:
        print(f"Skipped {skipped} incomplete product card(s) on page {page}")


# len(product_details_clean)

### Declare the products' url & the total webpage count

In [158]:
# Base listing URL; a page number is appended to it to address each page.
other_pages_url = "https://www.jumia.co.ke/all-products/?page="
# Number of listing pages to scrape (pages 1 through this value, inclusive).
webpage_num_total = 50

### Fetch webdata, convert to BeautifulSoup Object, add product details dictionary to list

In [159]:
# Pause between requests so the site is not hit with a burst of page loads.
request_delay_seconds = 1.0

for page in range(1, webpage_num_total + 1):
    # Build the page address in its own variable so the base URL stays intact.
    page_url = f"{other_pages_url}{page}"
    response = fetch_html_data(page_url)
    # `soup` stays a notebook-level name because the append helper reads it.
    soup = convert_web_data_to_beautiful_soup_obj(response)
    if soup is None:
        # A failed fetch or parse yields None; skip this page and carry on
        # rather than crashing and losing the pages already collected.
        print(f"Skipping page {page}: no parsable HTML was retrieved.")
    else:
        append_one_product_details_dictionary_to_list(page)
    if page < webpage_num_total:
        time.sleep(request_delay_seconds)

# Preview a few records only; the full list is far too long to display.
all_products_list[:5]


Fetching data from https://www.jumia.co.ke/all-products/?page=1...
Creating BeautifulSoup object...
Success! Object created!
Appending page 1 products' details to array

Fetching data from https://www.jumia.co.ke/all-products/?page=2...
Creating BeautifulSoup object...
Success! Object created!
Appending page 2 products' details to array

Fetching data from https://www.jumia.co.ke/all-products/?page=3...
Creating BeautifulSoup object...
Success! Object created!
Appending page 3 products' details to array

Fetching data from https://www.jumia.co.ke/all-products/?page=4...
Creating BeautifulSoup object...
Success! Object created!
Appending page 4 products' details to array

Fetching data from https://www.jumia.co.ke/all-products/?page=5...
Creating BeautifulSoup object...
Success! Object created!
Appending page 5 products' details to array

Fetching data from https://www.jumia.co.ke/all-products/?page=6...
Creating BeautifulSoup object...
Success! Object created!
Appending page 6 product

[{'name': 'NIVEA Perfect & Radiant Even Tone Day And Night Cream For Women - 50ml',
  'new_price': 'KSh 999',
  'old_price': 'KSh 1,560',
  'discount': '36%',
  'rating': '4.5 out of 5',
  'votes': '4.5 out of 5(2216)'},
 {'name': 'NIVEA Radiant & Beauty Advanced Care Lotion For Women - 400ml (Pack Of 2)',
  'new_price': 'KSh 949',
  'old_price': 'KSh 1,460',
  'discount': '35%',
  'rating': '4.7 out of 5',
  'votes': '4.7 out of 5(607)'},
 {'name': 'NIVEA Nourishing Cocoa Body Lotion With Cocoa Butter 400ml (Pack Of 2)',
  'new_price': 'KSh 1,174',
  'old_price': 'KSh 1,302',
  'discount': '10%',
  'rating': '4.7 out of 5',
  'votes': '4.7 out of 5(1113)'},
 {'name': 'NIVEA Pearl & Beauty Anti-Perspirant Rollon, 48h - 50ml (Pack Of 2)',
  'new_price': 'KSh 728',
  'old_price': 'KSh 1,040',
  'discount': '30%',
  'rating': '4.6 out of 5',
  'votes': '4.6 out of 5(829)'},
 {'name': 'NIVEA MEN Deep Antibacterial Anti-Perspirant Rollon,48h - 50ml (Pack Of 2)',
  'new_price': 'KSh 728',
  

### Check product count

In [160]:
len(all_products_list)

2000

### Convert products list to pandas dataframe

In [161]:
# Pin the column set and order so the frame has the expected schema even when
# the scrape returned no products (an empty list would otherwise give a frame
# with no columns, and every cleaning cell would fail with a KeyError).
product_columns = ["name", "new_price", "old_price", "discount", "rating", "votes"]
products_df = pd.DataFrame(all_products_list, columns=product_columns)
products_df.head()

Unnamed: 0,name,new_price,old_price,discount,rating,votes
0,NIVEA Perfect & Radiant Even Tone Day And Nigh...,KSh 999,"KSh 1,560",36%,4.5 out of 5,4.5 out of 5(2216)
1,NIVEA Radiant & Beauty Advanced Care Lotion Fo...,KSh 949,"KSh 1,460",35%,4.7 out of 5,4.7 out of 5(607)
2,NIVEA Nourishing Cocoa Body Lotion With Cocoa ...,"KSh 1,174","KSh 1,302",10%,4.7 out of 5,4.7 out of 5(1113)
3,"NIVEA Pearl & Beauty Anti-Perspirant Rollon, 4...",KSh 728,"KSh 1,040",30%,4.6 out of 5,4.6 out of 5(829)
4,NIVEA MEN Deep Antibacterial Anti-Perspirant R...,KSh 728,"KSh 1,040",30%,4.6 out of 5,4.6 out of 5(619)


In [162]:
products_df.shape

(2000, 6)

# Data Cleaning Section

### Clean new_price column

In [163]:
# Strip the leading currency label ('KSh') from new_price. The values stay
# strings here; comma removal and float conversion happen in later cells.
def remove_currency_name_in_new_price(val):
    """Return the amount that follows the currency label in a price string.

    'KSh 1,174' becomes '1,174'. Only the second space-separated token is
    kept, so a range such as 'KSh 100 - KSh 200' yields its first amount.
    Values that are not strings, or that contain no space (for example a
    price this cell has already cleaned), are returned unchanged so the cell
    can be re-run safely.
    """
    if not isinstance(val, str):
        return val
    parts = val.split(' ')
    if len(parts) < 2:
        return val
    return parts[1]
products_df['new_price'] = products_df['new_price'].apply(remove_currency_name_in_new_price)

In [164]:
print(type(products_df['new_price'][1]))
print(products_df['new_price'].head())

<class 'str'>
0      999
1      949
2    1,174
3      728
4      728
Name: new_price, dtype: object


In [165]:
# Drop the thousands separator from new_price so the text can be parsed as a number later.
def _strip_thousands_separator(price_text):
    """Return the price text without commas; text that has none is returned as is."""
    if ',' not in price_text:
        return price_text
    return price_text.replace(',', '')

products_df['new_price'] = products_df['new_price'].apply(_strip_thousands_separator)
products_df['new_price'].head()

0     999
1     949
2    1174
3     728
4     728
Name: new_price, dtype: object

In [166]:
print(type(products_df['new_price'][1]))

<class 'str'>


In [167]:
# Parse every cleaned new_price string as a float.
products_df['new_price'] = products_df['new_price'].apply(float)

In [168]:
print(type(products_df['new_price'][1]))

<class 'numpy.float64'>


In [169]:
products_df.head()

Unnamed: 0,name,new_price,old_price,discount,rating,votes
0,NIVEA Perfect & Radiant Even Tone Day And Nigh...,999.0,"KSh 1,560",36%,4.5 out of 5,4.5 out of 5(2216)
1,NIVEA Radiant & Beauty Advanced Care Lotion Fo...,949.0,"KSh 1,460",35%,4.7 out of 5,4.7 out of 5(607)
2,NIVEA Nourishing Cocoa Body Lotion With Cocoa ...,1174.0,"KSh 1,302",10%,4.7 out of 5,4.7 out of 5(1113)
3,"NIVEA Pearl & Beauty Anti-Perspirant Rollon, 4...",728.0,"KSh 1,040",30%,4.6 out of 5,4.6 out of 5(829)
4,NIVEA MEN Deep Antibacterial Anti-Perspirant R...,728.0,"KSh 1,040",30%,4.6 out of 5,4.6 out of 5(619)


### Clean old_price column

In [170]:
products_df['old_price'].unique()

array(['KSh 1,560', 'KSh 1,460', 'KSh 1,302', 'KSh 1,040', 'KSh 5,610',
       'KSh 780', 'KSh 3,840', 'KSh 17,999', 'KSh 38,000', 'KSh 410',
       'KSh 3,500', 'KSh 688', 'KSh 2,300', 'KSh 1,750', 'KSh 1,200',
       'KSh 1,399', None, 'KSh 23,999', 'KSh 4,000', 'KSh 815',
       'KSh 3,827', 'KSh 1,299', 'KSh 345', 'KSh 2,100', 'KSh 18,999',
       'KSh 7,599', 'KSh 1,599', 'KSh 1,700', 'KSh 14,999', 'KSh 1,295',
       'KSh 1,250', 'KSh 350', 'KSh 2,871', 'KSh 1,850', 'KSh 3,045',
       'KSh 31,354', 'KSh 899', 'KSh 1,670', 'KSh 2,599', 'KSh 210',
       'KSh 1,014', 'KSh 250', 'KSh 180', 'KSh 1,500', 'KSh 900',
       'KSh 1,400', 'KSh 1,050', 'KSh 4,245', 'KSh 4,099', 'KSh 8,499',
       'KSh 1,000', 'KSh 44,999', 'KSh 21,500', 'KSh 2,695 - KSh 3,576',
       'KSh 920', 'KSh 1,638', 'KSh 4,500', 'KSh 20,000', 'KSh 1,999',
       'KSh 800', 'KSh 1,553', 'KSh 940', 'KSh 599', 'KSh 500',
       'KSh 12,999', 'KSh 2,373 - KSh 3,000', 'KSh 1,371 - KSh 1,913',
       'KSh 1,148', 'KSh

In [171]:
# Strip the leading currency label from old_price; missing values (None) pass through.
# Only the second space-separated token is kept, so a range such as
# 'KSh 2,695 - KSh 3,576' is reduced to its first amount ('2,695').
def _drop_currency_label(price_text):
    """Return the token after the currency label, or None for a missing price."""
    if price_text is None:
        return price_text
    return price_text.split(' ')[1]

products_df['old_price'] = products_df['old_price'].apply(_drop_currency_label)
products_df['old_price'].head()

0    1,560
1    1,460
2    1,302
3    1,040
4    1,040
Name: old_price, dtype: object

In [172]:
# Drop the thousands separator from old_price; missing values (None) pass through.
def _old_price_without_commas(price_text):
    """Return the price text with commas removed, or None for a missing price."""
    if price_text is None:
        return price_text
    return price_text.replace(',', '')

products_df['old_price'] = products_df['old_price'].apply(_old_price_without_commas)
products_df['old_price'].head(10)

0     1560
1     1460
2     1302
3     1040
4     1040
5     1040
6     5610
7      780
8     3840
9    17999
Name: old_price, dtype: object

In [173]:
# Parse old_price strings as floats; missing values (None) are passed through untouched.
def _old_price_as_float(price_text):
    """Return the price as a float, or None for a missing price."""
    if price_text is None:
        return price_text
    return float(price_text)

products_df['old_price'] = products_df['old_price'].apply(_old_price_as_float)
products_df['old_price'].head(10)

0     1560.0
1     1460.0
2     1302.0
3     1040.0
4     1040.0
5     1040.0
6     5610.0
7      780.0
8     3840.0
9    17999.0
Name: old_price, dtype: float64

In [174]:
products_df['old_price']

0       1560.0
1       1460.0
2       1302.0
3       1040.0
4       1040.0
         ...  
1995    7200.0
1996    5500.0
1997     800.0
1998     599.0
1999    1699.0
Name: old_price, Length: 2000, dtype: float64

In [175]:
# Index 18 from above was null. So checking its type: 
type(products_df['old_price'][18])

numpy.float64

In [176]:
products_df.head(20)

Unnamed: 0,name,new_price,old_price,discount,rating,votes
0,NIVEA Perfect & Radiant Even Tone Day And Nigh...,999.0,1560.0,36%,4.5 out of 5,4.5 out of 5(2216)
1,NIVEA Radiant & Beauty Advanced Care Lotion Fo...,949.0,1460.0,35%,4.7 out of 5,4.7 out of 5(607)
2,NIVEA Nourishing Cocoa Body Lotion With Cocoa ...,1174.0,1302.0,10%,4.7 out of 5,4.7 out of 5(1113)
3,"NIVEA Pearl & Beauty Anti-Perspirant Rollon, 4...",728.0,1040.0,30%,4.6 out of 5,4.6 out of 5(829)
4,NIVEA MEN Deep Antibacterial Anti-Perspirant R...,728.0,1040.0,30%,4.6 out of 5,4.6 out of 5(619)
5,NIVEA Pearl & Beauty Black Pearl Fine Fragranc...,728.0,1040.0,30%,4.7 out of 5,4.7 out of 5(219)
6,NIVEA Perfect & Radiant Luminous630 Anti Dark ...,3647.0,5610.0,35%,4.4 out of 5,4.4 out of 5(350)
7,NIVEA Perfect & Radiant Even Tone Day Cream SP...,585.0,780.0,25%,4.4 out of 5,4.4 out of 5(560)
8,NIVEA Q10 Power Anti-Wrinkle Day Cream 50ml & ...,2496.0,3840.0,35%,4.6 out of 5,4.6 out of 5(413)
9,"Vitron Smart 32"" Frameless Tv Htc3200s Netflix...",14999.0,17999.0,17%,4.2 out of 5,4.2 out of 5(425)


### Cleaning 'discount' column

In [177]:
# Check unique values if any
products_df['discount'].unique()

array(['36%', '35%', '10%', '30%', '25%', '17%', '26%', '12%', '20%',
       '15%', '43%', None, '48%', '11%', '32%', '38%', '34%', '42%',
       '28%', '33%', '31%', '5%', '16%', '37%', '27%', '24%', '49%',
       '39%', '22%', '46%', '41%', '18%', '40%', '44%', '47%', '19%',
       '14%', '8%', '29%', '56%', '50%', '4%', '1%', '13%', '45%', '23%',
       '9%', '21%', '57%', '51%', '6%', '79%', '54%', '7%', '58%', '53%',
       '2%', '52%', '60%', '64%', '3%', '67%', '80%', '55%', '91%'],
      dtype=object)

In [178]:
# Strip the '%' sign from discount; missing values (None) pass through.
def _drop_percent_sign(discount_text):
    """Return the discount text without '%', or None for a missing discount."""
    if discount_text is None:
        return discount_text
    return discount_text.replace('%', '')

products_df['discount'] = products_df['discount'].apply(_drop_percent_sign)
products_df['discount'].unique()

array(['36', '35', '10', '30', '25', '17', '26', '12', '20', '15', '43',
       None, '48', '11', '32', '38', '34', '42', '28', '33', '31', '5',
       '16', '37', '27', '24', '49', '39', '22', '46', '41', '18', '40',
       '44', '47', '19', '14', '8', '29', '56', '50', '4', '1', '13',
       '45', '23', '9', '21', '57', '51', '6', '79', '54', '7', '58',
       '53', '2', '52', '60', '64', '3', '67', '80', '55', '91'],
      dtype=object)

In [179]:
# Turn each percentage string into a decimal fraction (e.g. '36' -> 0.36);
# missing values (None) pass through.
def _percent_to_fraction(discount_text):
    """Return the discount as a fraction of 1, or None for a missing discount."""
    if discount_text is None:
        return discount_text
    return float(discount_text) / 100

products_df['discount'] = products_df['discount'].apply(_percent_to_fraction)
products_df['discount'].unique()  # All numbers are now floats and None is replaced with NaN

array([0.36, 0.35, 0.1 , 0.3 , 0.25, 0.17, 0.26, 0.12, 0.2 , 0.15, 0.43,
        nan, 0.48, 0.11, 0.32, 0.38, 0.34, 0.42, 0.28, 0.33, 0.31, 0.05,
       0.16, 0.37, 0.27, 0.24, 0.49, 0.39, 0.22, 0.46, 0.41, 0.18, 0.4 ,
       0.44, 0.47, 0.19, 0.14, 0.08, 0.29, 0.56, 0.5 , 0.04, 0.01, 0.13,
       0.45, 0.23, 0.09, 0.21, 0.57, 0.51, 0.06, 0.79, 0.54, 0.07, 0.58,
       0.53, 0.02, 0.52, 0.6 , 0.64, 0.03, 0.67, 0.8 , 0.55, 0.91])

In [180]:
# Check head once more
products_df.head()

Unnamed: 0,name,new_price,old_price,discount,rating,votes
0,NIVEA Perfect & Radiant Even Tone Day And Nigh...,999.0,1560.0,0.36,4.5 out of 5,4.5 out of 5(2216)
1,NIVEA Radiant & Beauty Advanced Care Lotion Fo...,949.0,1460.0,0.35,4.7 out of 5,4.7 out of 5(607)
2,NIVEA Nourishing Cocoa Body Lotion With Cocoa ...,1174.0,1302.0,0.1,4.7 out of 5,4.7 out of 5(1113)
3,"NIVEA Pearl & Beauty Anti-Perspirant Rollon, 4...",728.0,1040.0,0.3,4.6 out of 5,4.6 out of 5(829)
4,NIVEA MEN Deep Antibacterial Anti-Perspirant R...,728.0,1040.0,0.3,4.6 out of 5,4.6 out of 5(619)


### Cleaning the rating column

In [181]:
# Check for unique values first
products_df['rating'].unique()

array(['4.5 out of 5', '4.7 out of 5', '4.6 out of 5', '4.4 out of 5',
       '4.2 out of 5', '3.7 out of 5', '4 out of 5', '4.3 out of 5',
       '4.1 out of 5', '3.8 out of 5', '3.6 out of 5', '3.9 out of 5',
       '3.4 out of 5', '3.3 out of 5', '4.8 out of 5', '3 out of 5',
       '2 out of 5', '4.9 out of 5', '5 out of 5', None, '3.2 out of 5',
       '3.5 out of 5', '1 out of 5', '2.5 out of 5', '3.1 out of 5',
       '1.7 out of 5', '2.3 out of 5', '2.8 out of 5', '1.5 out of 5',
       '1.8 out of 5', '1.3 out of 5'], dtype=object)

In [182]:
# Keep only the score at the start of each rating string
# (e.g. '4.5 out of 5' -> '4.5'); missing values (None) pass through.
def _leading_score(rating_text):
    """Return the first space-separated token of the rating, or None if missing."""
    if rating_text is None:
        return rating_text
    return rating_text.split(' ')[0]

products_df['rating'] = products_df['rating'].apply(_leading_score)
products_df['rating'].unique()

array(['4.5', '4.7', '4.6', '4.4', '4.2', '3.7', '4', '4.3', '4.1', '3.8',
       '3.6', '3.9', '3.4', '3.3', '4.8', '3', '2', '4.9', '5', None,
       '3.2', '3.5', '1', '2.5', '3.1', '1.7', '2.3', '2.8', '1.5', '1.8',
       '1.3'], dtype=object)

In [183]:
# Parse the rating strings as floats; missing values (None) are passed through untouched.
def _rating_as_float(rating_text):
    """Return the rating as a float, or None for a missing rating."""
    if rating_text is None:
        return rating_text
    return float(rating_text)

products_df['rating'] = products_df['rating'].apply(_rating_as_float)
type(products_df['rating'][1])

numpy.float64

In [184]:
products_df.head(20)

Unnamed: 0,name,new_price,old_price,discount,rating,votes
0,NIVEA Perfect & Radiant Even Tone Day And Nigh...,999.0,1560.0,0.36,4.5,4.5 out of 5(2216)
1,NIVEA Radiant & Beauty Advanced Care Lotion Fo...,949.0,1460.0,0.35,4.7,4.7 out of 5(607)
2,NIVEA Nourishing Cocoa Body Lotion With Cocoa ...,1174.0,1302.0,0.1,4.7,4.7 out of 5(1113)
3,"NIVEA Pearl & Beauty Anti-Perspirant Rollon, 4...",728.0,1040.0,0.3,4.6,4.6 out of 5(829)
4,NIVEA MEN Deep Antibacterial Anti-Perspirant R...,728.0,1040.0,0.3,4.6,4.6 out of 5(619)
5,NIVEA Pearl & Beauty Black Pearl Fine Fragranc...,728.0,1040.0,0.3,4.7,4.7 out of 5(219)
6,NIVEA Perfect & Radiant Luminous630 Anti Dark ...,3647.0,5610.0,0.35,4.4,4.4 out of 5(350)
7,NIVEA Perfect & Radiant Even Tone Day Cream SP...,585.0,780.0,0.25,4.4,4.4 out of 5(560)
8,NIVEA Q10 Power Anti-Wrinkle Day Cream 50ml & ...,2496.0,3840.0,0.35,4.6,4.6 out of 5(413)
9,"Vitron Smart 32"" Frameless Tv Htc3200s Netflix...",14999.0,17999.0,0.17,4.2,4.2 out of 5(425)


### Cleaning the 'votes' column

In [185]:
# Explore unique values
products_df['votes'].unique()

array(['4.5 out of 5(2216)', '4.7 out of 5(607)', '4.7 out of 5(1113)',
       '4.6 out of 5(829)', '4.6 out of 5(619)', '4.7 out of 5(219)',
       '4.4 out of 5(350)', '4.4 out of 5(560)', '4.6 out of 5(413)',
       '4.2 out of 5(425)', '4.5 out of 5(26)', '4.4 out of 5(1399)',
       '4.6 out of 5(438)', '3.7 out of 5(1385)', '4 out of 5(2716)',
       '4.3 out of 5(1426)', '4.4 out of 5(960)', '4.4 out of 5(10)',
       '4.2 out of 5(718)', '4.1 out of 5(1613)', '4.2 out of 5(304)',
       '4.4 out of 5(3741)', '4.5 out of 5(471)', '4.4 out of 5(2012)',
       '4.6 out of 5(240)', '4 out of 5(878)', '4.7 out of 5(446)',
       '4 out of 5(2218)', '4.5 out of 5(1527)', '4.2 out of 5(672)',
       '4.2 out of 5(208)', '4.2 out of 5(633)', '4.2 out of 5(380)',
       '4.6 out of 5(981)', '4.5 out of 5(1067)', '4.6 out of 5(133)',
       '4.4 out of 5(216)', '4.4 out of 5(4704)', '4.4 out of 5(1215)',
       '4.5 out of 5(227)', '4.5 out of 5(49)', '4.5 out of 5(445)',
       '4.2 out

In [186]:
# Pull the vote count out of the parentheses (e.g. '4.5 out of 5(2216)' -> 2216.0).
# The count is converted with float(), so the column holds floats, not integers;
# missing values (None) pass through.
def _vote_count(votes_text):
    """Return the number between '(' and ')' as a float, or None if missing."""
    if votes_text is None:
        return votes_text
    after_open_paren = votes_text.split('(')[1]
    count_text = after_open_paren.split(')')[0]
    return float(count_text)

products_df['votes'] = products_df['votes'].apply(_vote_count)
products_df['votes'].unique()

array([2.216e+03, 6.070e+02, 1.113e+03, 8.290e+02, 6.190e+02, 2.190e+02,
       3.500e+02, 5.600e+02, 4.130e+02, 4.250e+02, 2.600e+01, 1.399e+03,
       4.380e+02, 1.385e+03, 2.716e+03, 1.426e+03, 9.600e+02, 1.000e+01,
       7.180e+02, 1.613e+03, 3.040e+02, 3.741e+03, 4.710e+02, 2.012e+03,
       2.400e+02, 8.780e+02, 4.460e+02, 2.218e+03, 1.527e+03, 6.720e+02,
       2.080e+02, 6.330e+02, 3.800e+02, 9.810e+02, 1.067e+03, 1.330e+02,
       2.160e+02, 4.704e+03, 1.215e+03, 2.270e+02, 4.900e+01, 4.450e+02,
       7.500e+01, 1.240e+02, 9.900e+01, 4.420e+02, 8.700e+01, 1.350e+02,
       4.590e+02, 1.427e+03, 2.130e+02, 7.740e+02, 1.104e+03, 7.000e+01,
       3.480e+02, 7.310e+02, 1.495e+03, 3.960e+02, 1.119e+03, 3.200e+02,
       1.888e+03, 6.750e+02, 8.500e+01, 1.800e+02, 6.520e+02, 5.800e+01,
       2.730e+02, 1.502e+03, 3.920e+02, 6.300e+01, 6.200e+01, 1.700e+02,
       1.490e+02, 4.760e+02, 2.011e+03, 1.530e+02, 5.410e+02, 4.910e+02,
       8.400e+02, 9.800e+01, 5.630e+02, 4.440e+02, 

In [187]:
products_df.head(20)

Unnamed: 0,name,new_price,old_price,discount,rating,votes
0,NIVEA Perfect & Radiant Even Tone Day And Nigh...,999.0,1560.0,0.36,4.5,2216.0
1,NIVEA Radiant & Beauty Advanced Care Lotion Fo...,949.0,1460.0,0.35,4.7,607.0
2,NIVEA Nourishing Cocoa Body Lotion With Cocoa ...,1174.0,1302.0,0.1,4.7,1113.0
3,"NIVEA Pearl & Beauty Anti-Perspirant Rollon, 4...",728.0,1040.0,0.3,4.6,829.0
4,NIVEA MEN Deep Antibacterial Anti-Perspirant R...,728.0,1040.0,0.3,4.6,619.0
5,NIVEA Pearl & Beauty Black Pearl Fine Fragranc...,728.0,1040.0,0.3,4.7,219.0
6,NIVEA Perfect & Radiant Luminous630 Anti Dark ...,3647.0,5610.0,0.35,4.4,350.0
7,NIVEA Perfect & Radiant Even Tone Day Cream SP...,585.0,780.0,0.25,4.4,560.0
8,NIVEA Q10 Power Anti-Wrinkle Day Cream 50ml & ...,2496.0,3840.0,0.35,4.6,413.0
9,"Vitron Smart 32"" Frameless Tv Htc3200s Netflix...",14999.0,17999.0,0.17,4.2,425.0
