#### Installing Necessary Libraries

In [None]:

%pip install beautifulsoup4 sqlalchemy

In [2]:
import requests
import pandas as pd
import re #importing Regex (Regular expression)
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
import psycopg2 as psy


#### Test The Response

In [4]:
#Connecting to the url for scraping

resp=requests.get('https://www.jumia.com.ng')
resp.status_code

200

In [5]:
#Putting response in text
jumia_data=resp.text
print(jumia_data)

<!DOCTYPE html><html lang="en" dir="ltr"><head><meta charset="utf-8"/><title>Jumia Nigeria | Online Shopping for Electronics, Fashion, Home, Beauty &amp; Sport</title><meta property="og:type" content="product"/><meta property="og:site_name" content="Jumia Nigeria"/><meta property="og:title" content="Jumia Nigeria | Online Shopping for Electronics, Fashion, Home, Beauty &amp; Sport"/><meta property="og:description" content="Jumia Nigeria the #1 of Online Shopping in Nigeria - Shop Online All Products : Smartphones, Appliances, Clothing... ✓ Top Brands :  Samsung, Xiaomi, Adidas... ✓ Best prices in Nigeria ✓ Order now and enjoy pay on delivery ! "/><meta property="og:url" content="/"/><meta property="og:image" content="https://ng.jumia.is/cms/jumialogonew.png"/><meta property="og:locale" content="en_NG"/><meta name="title" content="Jumia Nigeria | Online Shopping for Electronics, Fashion, Home, Beauty &amp; Sport"/><meta name="robots" content="index,follow"/><meta name="description" cont

#### Integrate Request With BeautifulSoup
###### To access the different tags and attributes asscociated with the website. In summary, parsing with BeautifulSoup involves analyzing the structure of an HTML document to create a parse tree, which can then be navigated to extract and manipulate content efficiently.

In [6]:

#'lxml' is an extensive library for parsing XML and HTML documents very quickly, and it's one of the most commonly used parsers with BeautifulSoup due to its speed and flexibility.
resp=requests.get('https://www.jumia.com.ng')

soup= BeautifulSoup(jumia_data, "lxml")

print(soup)

<!DOCTYPE html>
<html dir="ltr" lang="en"><head><meta charset="utf-8"/><title>Jumia Nigeria | Online Shopping for Electronics, Fashion, Home, Beauty &amp; Sport</title><meta content="product" property="og:type"/><meta content="Jumia Nigeria" property="og:site_name"/><meta content="Jumia Nigeria | Online Shopping for Electronics, Fashion, Home, Beauty &amp; Sport" property="og:title"/><meta content="Jumia Nigeria the #1 of Online Shopping in Nigeria - Shop Online All Products : Smartphones, Appliances, Clothing... ✓ Top Brands :  Samsung, Xiaomi, Adidas... ✓ Best prices in Nigeria ✓ Order now and enjoy pay on delivery ! " property="og:description"/><meta content="/" property="og:url"/><meta content="https://ng.jumia.is/cms/jumialogonew.png" property="og:image"/><meta content="en_NG" property="og:locale"/><meta content="Jumia Nigeria | Online Shopping for Electronics, Fashion, Home, Beauty &amp; Sport" name="title"/><meta content="index,follow" name="robots"/><meta content="Jumia Nigeria

#### Tags
##### HTML tags mark the beginning and end of HTML elements in web documents.
##### Role in Web Scraping:Identifying specific data points (e.g., headings, paragraphs, links)


In [7]:
#####Soup can be used to find out about the Tags on the website
print(soup.head)

<head><meta charset="utf-8"/><title>Jumia Nigeria | Online Shopping for Electronics, Fashion, Home, Beauty &amp; Sport</title><meta content="product" property="og:type"/><meta content="Jumia Nigeria" property="og:site_name"/><meta content="Jumia Nigeria | Online Shopping for Electronics, Fashion, Home, Beauty &amp; Sport" property="og:title"/><meta content="Jumia Nigeria the #1 of Online Shopping in Nigeria - Shop Online All Products : Smartphones, Appliances, Clothing... ✓ Top Brands :  Samsung, Xiaomi, Adidas... ✓ Best prices in Nigeria ✓ Order now and enjoy pay on delivery ! " property="og:description"/><meta content="/" property="og:url"/><meta content="https://ng.jumia.is/cms/jumialogonew.png" property="og:image"/><meta content="en_NG" property="og:locale"/><meta content="Jumia Nigeria | Online Shopping for Electronics, Fashion, Home, Beauty &amp; Sport" name="title"/><meta content="index,follow" name="robots"/><meta content="Jumia Nigeria the #1 of Online Shopping in Nigeria - Sh

In [8]:
print(soup.body.div.header)

<header class="header"><section class="row -i-ctr -fw-nw -pvm"><div class="col3 -df -i-ctr"><a class="-df -i-ctr -mra" href="/"><svg aria-label="Jumia Nigeria: Online Shopping for Electronics, Phones &amp; Fashion" class="ic" height="30" role="img" viewbox="0 0 172 30" width="134"><use xlink:href="https://www.jumia.com.ng/assets_he/images/i-shop-jumia.9f5451c7.svg#logo"></use></svg></a></div><form action="/catalog/" class="cola -df" data-track-onsubmit="search" id="search" method="get"><div class="find"><svg class="ic" height="24" viewbox="0 0 24 24" width="24"><use xlink:href="https://www.jumia.com.ng/assets_he/images/i-icons.a66628fd.svg#search"></use></svg><input aria-label="Search" autocomplete="off" id="fi-q" name="q" placeholder="Search products, brands and categories" required="" type="text" value=""/><button aria-label="Reset" class="rst" type="button"><svg class="ic" height="24" viewbox="0 0 24 24" width="24"><use xlink:href="https://www.jumia.com.ng/assets_he/images/i-icons.a

#### Attributes
##### Attributes provide additional information about HTML elements 

In [9]:
tags=soup.body.div.article
print(tags.attrs)

{'class': ['banner', '_pp'], 'data-ppb-rev': 'v1.0'}


In [10]:
print(tags.attrs['class'])

['banner', '_pp']


#### Navigable Strings
##### A "navigable string" in BeautifulSoup refers to a string that is not part of a tag, but rather a standalone string within the document. These strings can still be navigated and manipulated within the parse tree like tags, hence the term "navigable."

In [11]:
#Printing the sring within the head tag
tags=soup.head.title
print(tags.string)

Jumia Nigeria | Online Shopping for Electronics, Fashion, Home, Beauty & Sport


In [12]:
print(tags.text)

Jumia Nigeria | Online Shopping for Electronics, Fashion, Home, Beauty & Sport


#### Find() Functions: find(), find_all(), find() with Regex

##### Find():
###### The find() method is used to search for the first occurrence of a particular tag or set of tags that match the specified criteria.
###### It returns the first matching element found in the parse tree.
###### If no matching element is found, it returns None.

In [13]:
#Connecting to the URL for requests
url= 'https://www.jumia.com.ng/peak-power-5.5kva-remote-control-and-key-starter-generator-100-copper-80342298.html'
resp= requests.get(url)

soup= BeautifulSoup(resp.text, 'lxml' )
print(soup.find('h1'))

<h1 class="-fs20 -pts -pbxs">Peak Power 5.5kva Remote Control And Key Starter Generator- 100% Copper</h1>


In [14]:
print(soup.find('h1').string)

Peak Power 5.5kva Remote Control And Key Starter Generator- 100% Copper


In [15]:
print(soup.find('div', {'class': 'col10'}))

<div class="col10 -df -j-ctr -fs0"><a class="vent-link" title="Jumia"><svg class="ic" height="24" viewbox="0 0 67 24" width="67"><use xlink:href="https://www.jumia.com.ng/assets_he/images/i-global.6dd5a6c0.svg#venture-jumia"></use></svg></a><a class="vent-link" href="https://pay.jumia.com.ng/?utm_source=jumia&amp;utm_medium=mall&amp;utm_campaign=venturebar" rel="nofollow noopener" target="_blank" title="JumiaPay"><svg class="ic" height="24" viewbox="0 0 60 24" width="60"><use xlink:href="https://www.jumia.com.ng/assets_he/images/i-global.6dd5a6c0.svg#venture-pay"></use></svg></a></div>


In [16]:
print(soup.find('div', {'class': 'col10'}))

<div class="col10 -df -j-ctr -fs0"><a class="vent-link" title="Jumia"><svg class="ic" height="24" viewbox="0 0 67 24" width="67"><use xlink:href="https://www.jumia.com.ng/assets_he/images/i-global.6dd5a6c0.svg#venture-jumia"></use></svg></a><a class="vent-link" href="https://pay.jumia.com.ng/?utm_source=jumia&amp;utm_medium=mall&amp;utm_campaign=venturebar" rel="nofollow noopener" target="_blank" title="JumiaPay"><svg class="ic" height="24" viewbox="0 0 60 24" width="60"><use xlink:href="https://www.jumia.com.ng/assets_he/images/i-global.6dd5a6c0.svg#venture-pay"></use></svg></a></div>


In [17]:
print(soup.find('h1', {'class': '-fs20 -pts -pbxs'}).string)

Peak Power 5.5kva Remote Control And Key Starter Generator- 100% Copper


#### find_all()
###### The find_all() method is used to search for all occurrences of a particular tag or set of tags that match the specified criteria.
###### It returns a list containing all matching elements found in the parse tree.
###### If no matching elements are found, it returns an empty list.

In [18]:
#Connecting to the URL for requests
url='https://www.jumia.com.ng/catalog/?q=generators'
resp= requests.get(url)

soup=BeautifulSoup(resp.text, 'lxml')

In [19]:
print(soup.find_all('h3', class_='name'))

[<h3 class="name">BM2 Real Time Car Tester 12V Bluetooth 4.0 Diagnostic Tool</h3>, <h3 class="name">6 Digital Finger Tally Counter 8 Channels With LED</h3>, <h3 class="name">Senwei SV6200E2 2.8KVA Key Starter Low Noise Quality Gen</h3>, <h3 class="name">4 Digit Number Mini Tally Stainless Steel Metal Shell</h3>, <h3 class="name">Mini Manual Electronic Counter Stitch Marker And Row Finger</h3>, <h3 class="name">6.5KW Automatic Voltage Regulator AVR Rectifier for 5KW-6.5KW Generators Halfmoon Style 450V 680UF</h3>, <h3 class="name">Mini Digital Finger Hand Ring Portable Tally Electronic</h3>, <h3 class="name">HT107D 90-250V Outlet Socket Tester Automatic Electric Circuit Polarity Voltage Detector Breaker Finder</h3>, <h3 class="name">Firman 1600/2000 Watt Gasoline Powered Inverter Generator.</h3>, <h3 class="name">Mini 5 Digit LCD Digital Display Finger Hand Ring Tally</h3>, <h3 class="name">High Precision Digital DDS Dual-channel Function Signal</h3>, <h3 class="name">Digital LCD Finger

In [20]:
#To print a string list of the above under generators 
products= soup.find_all('h3', class_='name')

for i in products:
    print(i.string)

BM2 Real Time Car Tester 12V Bluetooth 4.0 Diagnostic Tool
6 Digital Finger Tally Counter 8 Channels With LED
Senwei SV6200E2 2.8KVA Key Starter Low Noise Quality Gen
4 Digit Number Mini Tally Stainless Steel Metal Shell
Mini Manual Electronic Counter Stitch Marker And Row Finger
6.5KW Automatic Voltage Regulator AVR Rectifier for 5KW-6.5KW Generators Halfmoon Style 450V 680UF
Mini Digital Finger Hand Ring Portable Tally Electronic
HT107D 90-250V Outlet Socket Tester Automatic Electric Circuit Polarity Voltage Detector Breaker Finder
Firman 1600/2000 Watt Gasoline Powered Inverter Generator.
Mini 5 Digit LCD Digital Display Finger Hand Ring Tally
High Precision Digital DDS Dual-channel Function Signal
Digital LCD Finger Ring Tally Counter 6Digit Buddha Beads
Dual Mode LCD PWM Signal Generator PWM Pulse Frequency Duty
NEMA 17 Motor For Lead 8mm Pitch Screw 300mm M8 Z Axis 3D Printer RepRap
168 Recoil Pull Starter Start For Honda GX120 GX140 GX160 GX200 5.5-6.5H Type 168
BLUETTI EB70 716

In [21]:
#Checking the number of products we have under generators

len(products)

40

##### find_all() with Regex

###### The find_all() method with regex allows you to search for elements using regular expressions (regex) to specify the search criteria.
###### It behaves similarly to the regular find() method but allows for more flexible and complex matching patterns.

In [22]:
##Connecting to the URL for requests
url= 'https://www.jumia.com.ng/generators/'

resp=requests.get(url)

soup= BeautifulSoup(resp.text, 'lxml')

In [23]:
data= soup.find_all(string=re.compile('Haier Thermocool'))
print(data)

['Haier Thermocool 3.75kVA/3.0kW Petrol Generator (HSTL4000ES)', 'Haier Thermocool 3.75kVA/3.0kW Petrol Generator (HSTL4000ES)', 'Haier Thermocool 1.25kVA/1kW Single Phase Petrol Generator (1500MS)', 'Haier Thermocool 2.5kVA/2.0kW Single Phase Petrol Electric start Generator (BOBO 2800ES)', 'Haier Thermocool', 'Haier Thermocool 3.75kVA/3.0kW Petrol Generator (HSTL4000ES)', 'Haier Thermocool 1.25kVA/1kW Single Phase Petrol Generator (1500MS)', "Discover the best of power generating devices from top brands such as Hyundai, Firman, Sumec, Mikano, Honda, Elemax, Binatone, Haier Thermocool, Tiger and many more. Our generators provide you with effective alternative options of stable light when there is blackout from your main power source. Generators are one of the effective alternative options to provide the benefits of electricity especially in regions with high concentration of population like Abuja and Lagos, but also in the more rural regions, where the power supply isn't always guarant

In [24]:
data= soup.find_all(string=re.compile('Haier Thermocool'))
for i in data:
    print(i.text)

Haier Thermocool 3.75kVA/3.0kW Petrol Generator (HSTL4000ES)
Haier Thermocool 3.75kVA/3.0kW Petrol Generator (HSTL4000ES)
Haier Thermocool 1.25kVA/1kW Single Phase Petrol Generator (1500MS)
Haier Thermocool 2.5kVA/2.0kW Single Phase Petrol Electric start Generator (BOBO 2800ES)
Haier Thermocool
Haier Thermocool 3.75kVA/3.0kW Petrol Generator (HSTL4000ES)
Haier Thermocool 1.25kVA/1kW Single Phase Petrol Generator (1500MS)
Discover the best of power generating devices from top brands such as Hyundai, Firman, Sumec, Mikano, Honda, Elemax, Binatone, Haier Thermocool, Tiger and many more. Our generators provide you with effective alternative options of stable light when there is blackout from your main power source. Generators are one of the effective alternative options to provide the benefits of electricity especially in regions with high concentration of population like Abuja and Lagos, but also in the more rural regions, where the power supply isn't always guaranteed. A 10 KVA generator

In [25]:
len(data)

9

In [26]:
#Listing just the generators with details method 1
haier= data[0:5] + data[6:9]
haier

['Haier Thermocool 3.75kVA/3.0kW Petrol Generator (HSTL4000ES)',
 'Haier Thermocool 3.75kVA/3.0kW Petrol Generator (HSTL4000ES)',
 'Haier Thermocool 1.25kVA/1kW Single Phase Petrol Generator (1500MS)',
 'Haier Thermocool 2.5kVA/2.0kW Single Phase Petrol Electric start Generator (BOBO 2800ES)',
 'Haier Thermocool',
 'Haier Thermocool 1.25kVA/1kW Single Phase Petrol Generator (1500MS)',
 "Discover the best of power generating devices from top brands such as Hyundai, Firman, Sumec, Mikano, Honda, Elemax, Binatone, Haier Thermocool, Tiger and many more. Our generators provide you with effective alternative options of stable light when there is blackout from your main power source. Generators are one of the effective alternative options to provide the benefits of electricity especially in regions with high concentration of population like Abuja and Lagos, but also in the more rural regions, where the power supply isn't always guaranteed. A 10 KVA generator will power all your ",
 'window.__

In [27]:
#Listing just the generators with details method 2
haier= data[0:5]
p = data[6:9]

for i in p:
    haier.append(i)

print(haier)

['Haier Thermocool 3.75kVA/3.0kW Petrol Generator (HSTL4000ES)', 'Haier Thermocool 3.75kVA/3.0kW Petrol Generator (HSTL4000ES)', 'Haier Thermocool 1.25kVA/1kW Single Phase Petrol Generator (1500MS)', 'Haier Thermocool 2.5kVA/2.0kW Single Phase Petrol Electric start Generator (BOBO 2800ES)', 'Haier Thermocool', 'Haier Thermocool 1.25kVA/1kW Single Phase Petrol Generator (1500MS)', "Discover the best of power generating devices from top brands such as Hyundai, Firman, Sumec, Mikano, Honda, Elemax, Binatone, Haier Thermocool, Tiger and many more. Our generators provide you with effective alternative options of stable light when there is blackout from your main power source. Generators are one of the effective alternative options to provide the benefits of electricity especially in regions with high concentration of population like Abuja and Lagos, but also in the more rural regions, where the power supply isn't always guaranteed. A 10 KVA generator will power all your ", 'window.__STORE__

### Data Extraction

In [28]:
#Connecting to the URL for requests
url= 'https://www.jumia.com.ng/catalog/?q=laptops'

resp= requests.get(url)

soup=BeautifulSoup(resp.text, 'lxml')

In [29]:
#Product

product_laptop=soup.find_all("h3", class_="name")
print(product_laptop)

[<h3 class="name">Hp ProBook 11 X360- TOUCH- 512GB SSD/4GB RAM-Intel CELERON QUAD CORE WIN10 Pro +Mouse &amp;USB Light</h3>, <h3 class="name">Hp EliteBook 840 G6 Intel Core I5 Touchscreen 16GB RAM/1TB SSD/Backlit Keyboard/FP Win 11 Pro</h3>, <h3 class="name">Hp Stream11intel Celeron D/C 64GB HDD+4GB RAM+Win10 Pro</h3>, <h3 class="name">Hp EliteBook 840 G6 Intel Core I5-16GB RAM/1TB SSD/Backlit Keyboard/FP Reader Wins 11 Pro Laptop +BAG</h3>, <h3 class="name">Hp EliteBook 840 G6 Touchscreen Intel Core I5 16GB RAM/512GB SSD/Backlit Keyboard/FP Win 11 Pro</h3>, <h3 class="name">Ace Elec ACE 14.1'' Intel Celeron J4105 4Core CPU 16GB+128GB Laptop</h3>, <h3 class="name">Hp Stream 11-Intel Celeron 64gb Ssd/4gb Ram+pouch</h3>, <h3 class="name">Hp Stream 11 Pro- Intel Celeron - 4GB RAM - 64GB SSD Windows 10 Pro+ BAG AND LIGHT FOR Keyboard</h3>, <h3 class="name">Hp Stream 11 Pro- Intel Celeron - 4GB RAM - 64GB SSD Windows 10 Pro+ Mouse &amp; USB LIGHT FOR Keyboard</h3>, <h3 class="name">Hp Elite

In [30]:
#Product Names method 1 (list comprehension)
product_names= [x.string for x in product_laptop]
product_names

['Hp ProBook 11 X360- TOUCH- 512GB SSD/4GB RAM-Intel CELERON QUAD CORE WIN10 Pro +Mouse &USB Light',
 'Hp EliteBook 840 G6 Intel Core I5 Touchscreen 16GB RAM/1TB SSD/Backlit Keyboard/FP Win 11 Pro',
 'Hp Stream11intel Celeron D/C 64GB HDD+4GB RAM+Win10 Pro',
 'Hp EliteBook 840 G6 Intel Core I5-16GB RAM/1TB SSD/Backlit Keyboard/FP Reader Wins 11 Pro Laptop +BAG',
 'Hp EliteBook 840 G6 Touchscreen Intel Core I5 16GB RAM/512GB SSD/Backlit Keyboard/FP Win 11 Pro',
 "Ace Elec ACE 14.1'' Intel Celeron J4105 4Core CPU 16GB+128GB Laptop",
 'Hp Stream 11-Intel Celeron 64gb Ssd/4gb Ram+pouch',
 'Hp Stream 11 Pro- Intel Celeron - 4GB RAM - 64GB SSD Windows 10 Pro+ BAG AND LIGHT FOR Keyboard',
 'Hp Stream 11 Pro- Intel Celeron - 4GB RAM - 64GB SSD Windows 10 Pro+ Mouse & USB LIGHT FOR Keyboard',
 'Hp EliteBook 840 G6 Touchscreen Intel Core I5 Backlit Keyboard 16GB RAM/256 SSD Win 11 Pro',
 'Hp Stream 11 Intel Celeron Quad Core - 64GB SSD 4GB RAM Windows 10 PRO+ Mouse &USB Light For Keyboard',
 'Hp

In [31]:
# product names using method 2 (using for loop)
product_names=[]
product_laptop=soup.find_all("h3", class_="name")

for laptop in product_laptop:
    product_names.append(laptop.string)

product_names


['Hp ProBook 11 X360- TOUCH- 512GB SSD/4GB RAM-Intel CELERON QUAD CORE WIN10 Pro +Mouse &USB Light',
 'Hp EliteBook 840 G6 Intel Core I5 Touchscreen 16GB RAM/1TB SSD/Backlit Keyboard/FP Win 11 Pro',
 'Hp Stream11intel Celeron D/C 64GB HDD+4GB RAM+Win10 Pro',
 'Hp EliteBook 840 G6 Intel Core I5-16GB RAM/1TB SSD/Backlit Keyboard/FP Reader Wins 11 Pro Laptop +BAG',
 'Hp EliteBook 840 G6 Touchscreen Intel Core I5 16GB RAM/512GB SSD/Backlit Keyboard/FP Win 11 Pro',
 "Ace Elec ACE 14.1'' Intel Celeron J4105 4Core CPU 16GB+128GB Laptop",
 'Hp Stream 11-Intel Celeron 64gb Ssd/4gb Ram+pouch',
 'Hp Stream 11 Pro- Intel Celeron - 4GB RAM - 64GB SSD Windows 10 Pro+ BAG AND LIGHT FOR Keyboard',
 'Hp Stream 11 Pro- Intel Celeron - 4GB RAM - 64GB SSD Windows 10 Pro+ Mouse & USB LIGHT FOR Keyboard',
 'Hp EliteBook 840 G6 Touchscreen Intel Core I5 Backlit Keyboard 16GB RAM/256 SSD Win 11 Pro',
 'Hp Stream 11 Intel Celeron Quad Core - 64GB SSD 4GB RAM Windows 10 PRO+ Mouse &USB Light For Keyboard',
 'Hp

In [32]:
#Current prices method 1 (using list comprehension)

price_laptop=soup.find_all('div', class_='prc')

current_prices= [x.string for x in price_laptop]
current_prices

['₦ 240,000',
 '₦ 505,950',
 '₦ 125,000',
 '₦ 505,000',
 '₦ 484,000',
 '₦ 235,000',
 '₦ 125,000',
 '₦ 160,000',
 '₦ 150,000',
 '₦ 455,000',
 '₦ 129,000',
 '₦ 515,000',
 '₦ 200,000',
 '₦ 150,000',
 '₦ 215,000',
 '₦ 193,000',
 '₦ 200,000',
 '₦ 275,000',
 '₦ 130,000',
 '₦ 290,000',
 '₦ 515,000',
 '₦ 385,000',
 '₦ 206,979',
 '₦ 200,000',
 '₦ 339,000',
 '₦ 120,000',
 '₦ 505,950',
 '₦ 185,000',
 '₦ 505,000',
 '₦ 430,000',
 '₦ 124,000',
 '₦ 465,000',
 '₦ 465,000',
 '₦ 485,800',
 '₦ 130,000',
 '₦ 240,000',
 '₦ 290,000',
 '₦ 325,000',
 '₦ 525,000',
 '₦ 185,000']

In [33]:
#Current prices method 2 (using for loop)
current_prices=[]

price_laptop=soup.find_all('div', class_='prc')

for price in price_laptop:
    current_prices.append(price.string)

current_prices


['₦ 240,000',
 '₦ 505,950',
 '₦ 125,000',
 '₦ 505,000',
 '₦ 484,000',
 '₦ 235,000',
 '₦ 125,000',
 '₦ 160,000',
 '₦ 150,000',
 '₦ 455,000',
 '₦ 129,000',
 '₦ 515,000',
 '₦ 200,000',
 '₦ 150,000',
 '₦ 215,000',
 '₦ 193,000',
 '₦ 200,000',
 '₦ 275,000',
 '₦ 130,000',
 '₦ 290,000',
 '₦ 515,000',
 '₦ 385,000',
 '₦ 206,979',
 '₦ 200,000',
 '₦ 339,000',
 '₦ 120,000',
 '₦ 505,950',
 '₦ 185,000',
 '₦ 505,000',
 '₦ 430,000',
 '₦ 124,000',
 '₦ 465,000',
 '₦ 465,000',
 '₦ 485,800',
 '₦ 130,000',
 '₦ 240,000',
 '₦ 290,000',
 '₦ 325,000',
 '₦ 525,000',
 '₦ 185,000']

In [34]:
#Previous price (using for loop)

previous_price=[]

fmr_laptop_price= soup.find_all('div', class_="old")

for price in fmr_laptop_price:
    previous_price.append(price.string)

previous_price

['₦ 800,000',
 '₦ 770,900',
 '₦ 800,000',
 '₦ 770,900',
 '₦ 799,000',
 '₦ 150,000',
 '₦ 180,000',
 '₦ 650,000',
 '₦ 270,000',
 '₦ 850,000',
 '₦ 250,000',
 '₦ 200,000',
 '₦ 600,000',
 '₦ 220,000',
 '₦ 245,000',
 '₦ 1,399,000',
 '₦ 500,000',
 '₦ 570,000',
 '₦ 398,450',
 '₦ 279,422',
 '₦ 270,000',
 '₦ 1,699,000',
 '₦ 770,900',
 '₦ 295,000',
 '₦ 900,000',
 '₦ 900,000',
 '₦ 550,000',
 '₦ 965,100',
 '₦ 200,000',
 '₦ 300,000',
 '₦ 300,000',
 '₦ 680,000']

In [35]:
#Discount percentages

discount_percs=[]
discounts= soup.find_all('div', class_="bdg _dsct _sm")

for discount in discounts:
    discount_percs.append(discount.string)

discount_percs

['70%',
 '34%',
 '37%',
 '37%',
 '71%',
 '17%',
 '17%',
 '30%',
 '52%',
 '39%',
 '20%',
 '25%',
 '64%',
 '12%',
 '18%',
 '80%',
 '42%',
 '10%',
 '3%',
 '26%',
 '26%',
 '80%',
 '34%',
 '37%',
 '44%',
 '52%',
 '15%',
 '52%',
 '35%',
 '20%',
 '3%',
 '23%']

#### Merge Data

In [36]:
data={'title': product_names, 'current_price': current_prices, 'former_price': previous_price, 'discount': discount_percs}

laptop_df=pd.DataFrame.from_dict(data,orient='index')

In [37]:
laptop_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
title,Hp ProBook 11 X360- TOUCH- 512GB SSD/4GB RAM-I...,Hp EliteBook 840 G6 Intel Core I5 Touchscreen ...,Hp Stream11intel Celeron D/C 64GB HDD+4GB RAM+...,Hp EliteBook 840 G6 Intel Core I5-16GB RAM/1TB...,Hp EliteBook 840 G6 Touchscreen Intel Core I5 ...,Ace Elec ACE 14.1'' Intel Celeron J4105 4Core ...,Hp Stream 11-Intel Celeron 64gb Ssd/4gb Ram+pouch,Hp Stream 11 Pro- Intel Celeron - 4GB RAM - 64...,Hp Stream 11 Pro- Intel Celeron - 4GB RAM - 64...,Hp EliteBook 840 G6 Touchscreen Intel Core I5 ...,...,Hp Stream11intel Celeron D/C 64GB HDD+4GB RAM+...,Hp EliteBook 840 G6 Intel Core I5-8GB RAM/512G...,Hp EliteBook 840 G6 Intel Core I5-8GB RAM/512G...,Hp EliteBook 840 G6 Intel Core I5 Touchscreen...,Hp Stream 11 Intel Celeron 4GB RAM-64GB SSD WI...,Hp ProBook 11 X360- TOUCH- 512GB SSD/4GB RAM-I...,Hp Stream 11 Intel Celeron 4gb Ram 64gb HDD Wi...,Hp Notebook 14 AMD RYZEN 3- 12GB RAM - 1TB HDD...,Hp ELITEBOOK X360 1030 G2 CORE I5 8GB RAM/ 512...,"Hp 14 Intel Celeron, 64GB, 4GB, Windows 11 In ..."
current_price,"₦ 240,000","₦ 505,950","₦ 125,000","₦ 505,000","₦ 484,000","₦ 235,000","₦ 125,000","₦ 160,000","₦ 150,000","₦ 455,000",...,"₦ 124,000","₦ 465,000","₦ 465,000","₦ 485,800","₦ 130,000","₦ 240,000","₦ 290,000","₦ 325,000","₦ 525,000","₦ 185,000"
former_price,"₦ 800,000","₦ 770,900","₦ 800,000","₦ 770,900","₦ 799,000","₦ 150,000","₦ 180,000","₦ 650,000","₦ 270,000","₦ 850,000",...,"₦ 300,000","₦ 680,000",,,,,,,,
discount,70%,34%,37%,37%,71%,17%,17%,30%,52%,39%,...,3%,23%,,,,,,,,


In [38]:
#transpose data frame
laptop_df=laptop_df.transpose()
laptop_df


Unnamed: 0,title,current_price,former_price,discount
0,Hp ProBook 11 X360- TOUCH- 512GB SSD/4GB RAM-I...,"₦ 240,000","₦ 800,000",70%
1,Hp EliteBook 840 G6 Intel Core I5 Touchscreen ...,"₦ 505,950","₦ 770,900",34%
2,Hp Stream11intel Celeron D/C 64GB HDD+4GB RAM+...,"₦ 125,000","₦ 800,000",37%
3,Hp EliteBook 840 G6 Intel Core I5-16GB RAM/1TB...,"₦ 505,000","₦ 770,900",37%
4,Hp EliteBook 840 G6 Touchscreen Intel Core I5 ...,"₦ 484,000","₦ 799,000",71%
5,Ace Elec ACE 14.1'' Intel Celeron J4105 4Core ...,"₦ 235,000","₦ 150,000",17%
6,Hp Stream 11-Intel Celeron 64gb Ssd/4gb Ram+pouch,"₦ 125,000","₦ 180,000",17%
7,Hp Stream 11 Pro- Intel Celeron - 4GB RAM - 64...,"₦ 160,000","₦ 650,000",30%
8,Hp Stream 11 Pro- Intel Celeron - 4GB RAM - 64...,"₦ 150,000","₦ 270,000",52%
9,Hp EliteBook 840 G6 Touchscreen Intel Core I5 ...,"₦ 455,000","₦ 850,000",39%


In [39]:
#For all the pages

pages=[ x for x in range(1,51)]

product_names=[]
current_prices=[]
previous_price=[]
discount_percs=[]

for page in pages:
    url= f"https://www.jumia.com.ng/catalog/?q=laptops&page={page}#catalog-listing"

    resp= requests.get(url)

    soup=BeautifulSoup(resp.text, "lxml")

    #product name
    product_laptop=soup.find_all("h3", class_="name")
    for laptop in product_laptop:
        product_names.append(laptop.string)

    #curent price
    price_laptop=soup.find_all('div', class_='prc')
    for price in price_laptop:
        current_prices.append(price.string)

    #Previous price (using for loop)
    fmr_laptop_price= soup.find_all('div', class_="old")
    for price in fmr_laptop_price:
        previous_price.append(price.string)

    #Discount percentages
    discounts= soup.find_all('div', class_="bdg _dsct _sm")
    for discount in discounts:
        discount_percs.append(discount.string)

data={'title': product_names, 'current_price': current_prices, 'former_price': previous_price, 'discount': discount_percs}

laptop_df=pd.DataFrame.from_dict(data,orient='index')
laptop_df=laptop_df.transpose()
laptop_df
        



Unnamed: 0,title,current_price,former_price,discount
0,Hp ProBook 11 X360- TOUCH- 512GB SSD/4GB RAM-I...,"₦ 240,000","₦ 800,000",70%
1,Hp EliteBook 840 G6 Intel Core I5 Touchscreen ...,"₦ 505,950","₦ 770,900",34%
2,Hp Stream11intel Celeron D/C 64GB HDD+4GB RAM+...,"₦ 125,000","₦ 800,000",37%
3,Hp EliteBook 840 G6 Intel Core I5-16GB RAM/1TB...,"₦ 505,000","₦ 770,900",37%
4,Hp EliteBook 840 G6 Touchscreen Intel Core I5 ...,"₦ 484,000","₦ 799,000",71%
...,...,...,...,...
1994,"Hp Stream 11 Pro G4 Education Edition, 11.6"", ...","₦ 130,000",,
1995,Hp EliteBook 840 G5 Intel Core I5- 8GB RAM/256...,"₦ 445,000",,
1996,Hp Omen 16 Gaming 13th Gen Intel Core I7 16GB ...,"₦ 1,799,000",,
1997,For Macbook Air 13.6 M2 A2681 Case Laptop Pro ...,"₦ 49,766",,


#### Data Transformation and Loading

In [40]:
#Checking for the data info
laptop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1999 entries, 0 to 1998
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          1999 non-null   object
 1   current_price  1999 non-null   object
 2   former_price   1407 non-null   object
 3   discount       1407 non-null   object
dtypes: object(4)
memory usage: 62.6+ KB


In [41]:
#Checking for the null columns in the data
laptop_df.columns[laptop_df.isnull().any()]

Index(['former_price', 'discount'], dtype='object')

In [42]:
#Replacing nulls with zeros
laptop_df.fillna({'former_price':'0' , 'discount': '0%'} , inplace=True)

In [43]:
# Will be converting the price column into integers
# Removing unwanted characters
laptop_df['current_price']= laptop_df['current_price'].str.replace('₦', '').str.replace(',', '')

In [44]:
# Removing the price ranges in the current price column
laptop_df['current_price']= [(str(price).split('-')[-1]) if '-' in price else price for price in laptop_df['current_price']]

In [45]:
#converting current price column finally to integer
laptop_df['current_price']= laptop_df['current_price'].astype(int)

In [46]:
# Removing unwanted characters and then converting to integer
laptop_df['former_price']=laptop_df['former_price'].str.replace('₦', '').str.replace(',', '')
laptop_df['former_price']= laptop_df['former_price'].astype(int)

In [47]:
#Converting the discount column to a string
laptop_df['discount']= laptop_df['discount'].astype(str)

In [64]:
laptop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1999 entries, 0 to 1998
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          1999 non-null   object
 1   current_price  1999 non-null   int32 
 2   former_price   1999 non-null   int32 
 3   discount       1999 non-null   object
dtypes: int32(2), object(2)
memory usage: 47.0+ KB


In [65]:
#Converting to csv file
laptop_df.to_csv ('juno_product.csv', index=False)

##### Loading Data Into Postgre Database Method 1

In [52]:
import psycopg2 as psy

In [57]:
# Creating database connection function parameters
def get_conn ():
    connection= psy.connect("dbname= juno_ecommerce user= postgres password=romlrd host=localhost port= 5432")
    return connection

conn = get_conn()

In [66]:
# Creating table

conn= get_conn()
cur=conn.cursor()

create_table_query= '''DROP TABLE IF EXISTS juno_product CASCADE;

                        CREATE TABLE IF NOT EXISTS juno_product(
                           title VARCHAR (1000),
                           current_price INT,
                           former_price INT,
                           discount VARCHAR(50) 
                           );
'''
cur.execute(create_table_query)

conn.commit()
cur.close()
conn.close()




In [71]:
#Loading csv file
conn= get_conn()
cur=conn.cursor()

with open ('juno_product.csv', 'r') as csvfile:
   # next (csvfile)
   # cur.copy_from(csvfile, 'juno_product', sep=',')
    cur.copy_expert(f"COPY juno_product FROM STDIN WITH CSV HEADER", csvfile )

conn.commit()
cur.close()
conn.close()