# 1.4 Accessing web data without API

In [50]:
# Import libraries

import pandas as pd
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt
import os
import logging
import requests
import bs4

In [51]:
## Setup chrome options

chrome_options = Options()
chrome_options.add_argument("--headless") # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")

In [52]:
# Install Chrome driver manager

service = Service(executable_path=ChromeDriverManager().install())

In [53]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")   # optional: run without opening window
options.add_argument("--no-sandbox")

driver = webdriver.Chrome(options=options)   # Selenium auto-finds driver
driver.get("https://en.wikipedia.org/wiki/Key_events_of_the_20th_century")
print(driver.title)
driver.quit()

Key events of the 20th century - Wikipedia


In [54]:
service = Service("/usr/local/bin/chromedriver")  
driver = webdriver.Chrome(service=service)

## Scraping Wikipedia with requests and bs4

In [55]:
# Second portion

import requests
from bs4 import BeautifulSoup

In [56]:
# Get URL

page = requests.get("https://en.wikipedia.org/wiki/Key_events_of_the_20th_century")

In [57]:
url = "https://en.wikipedia.org/wiki/Key_events_of_the_20th_century"

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/117.0 Safari/537.36"
}

page = requests.get(url, headers=headers)

print(page.status_code)  

200


In [58]:
soup = BeautifulSoup(page.text, 'html.parser')
print(soup.title)         
print(soup.title.string) 

<title>Key events of the 20th century - Wikipedia</title>
Key events of the 20th century - Wikipedia


In [59]:
text = soup.get_text()

In [60]:
text = text.encode('utf-8')

In [61]:
with open('Key events of the 20th century.txt', 'wb') as f: 
    f.write(text)

## Scraping Countries wiki page using selenium

In [62]:
# Get the page's contents

page_url = "https://simple.m.wikipedia.org/wiki/List_of_countries"
driver.get(page_url)

In [63]:
# Click on Accept cookies

# time.sleep(3)
# driver.find_element(By.XPATH, '//div[text()="ACCEPT"]').click()

In [64]:
content = driver.find_element(By.ID, "content-collapsible-block-0")

In [65]:
# Get all <a> tags inside
links = content.find_elements(By.TAG_NAME, "a")


In [66]:
# Extract their text (remove empty strings)
texts = [link.text for link in links if link.text.strip() != ""]


In [67]:
print(texts[:20])   # quick preview

['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan']


In [68]:
# Create DataFrame
df = pd.DataFrame(texts, columns=["Country"])


In [69]:
df.head()

Unnamed: 0,Country
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola


In [70]:
df.info

<bound method DataFrame.info of          Country
0    Afghanistan
1        Albania
2        Algeria
3        Andorra
4         Angola
..           ...
190    Venezuela
191      Vietnam
192        Yemen
193       Zambia
194     Zimbabwe

[195 rows x 1 columns]>

In [71]:
df.shape

(195, 1)

In [72]:
# Save the dataframe as csv

df.to_csv('List of Countries_2.0.csv')

In [73]:
content_2 = driver.find_element(By.ID, "content-collapsible-block-2")

In [74]:
# Get all <a> tags inside
links_2 = content_2.find_elements(By.TAG_NAME, "a")

In [75]:
# Extract their text (remove empty strings)
texts_2 = [link_2.text for link_2 in links_2 if link_2.text.strip() != ""]

In [76]:
# Create DataFrame
df_2 = pd.DataFrame(texts_2, columns=["Dependant Countries"])

In [77]:
df_2.head()

Unnamed: 0,Dependant Countries
0,Akrotiri
1,Åland Islands
2,American Samoa
3,Anguilla
4,Aruba


In [78]:
countries_test = driver.find_elements(By.XPATH, "//a[contains(@href,'/wiki/') and not(contains(@href, 'File:'))]")

In [83]:
header = driver.find_element(By.XPATH, "//*[normalize-space(text())='Countries']")
countries = header.find_elements(By.XPATH, "following::p[.//a][not(.//span[contains(text(),'Disputed countries')])]")
for p in countries:
    print(p.find_element(By.XPATH, ".//a[1]").text)

Afghanistan
Bahamas
Cabo Verde
Democratic Republic of the Congo
Ecuador
Fiji
Gabon
Haiti
Iceland
Jamaica
Kazakhstan
Laos
Madagascar
Namibia
Oman
Pakistan
Qatar
Republic of the Congo
Saint Kitts and Nevis
Tajikistan
Uganda
Vanuatu
Yemen
Zambia
Abkhazia
Akrotiri
Agaléga
Banc du Geyser
Canary Islands
Desecheo
Glorioso Islands
Heligoland
Mona
Petite Martinique
Rodrigues Island
Tromelin Island
Adélie Land
Balleny Islands
Marie Byrd Land
Adjara
Bangsamoro


In [None]:
for country in countries_test:
    print(country.text)

In [None]:
countries_list = [c.text for c in countries_test]
df_test = pd.DataFrame(countries_list, columns=["Countries"])