# 1.4 Accessing web data without API

In [1]:
# Import libraries

import pandas as pd
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt
import os
import logging
import requests
import bs4

In [2]:
## Setup chrome options

chrome_options = Options()
chrome_options.add_argument("--headless") # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")

In [3]:
# Install Chrome driver manager

service = Service(executable_path=ChromeDriverManager().install())

In [18]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")   # optional: run without opening window
options.add_argument("--no-sandbox")

driver = webdriver.Chrome(options=options)   # Selenium auto-finds driver
driver.get("https://en.wikipedia.org/wiki/Key_events_of_the_20th_century")
print(driver.title)
driver.quit()

Key events of the 20th century - Wikipedia


In [19]:
service = Service("/usr/local/bin/chromedriver")  
driver = webdriver.Chrome(service=service)

## Scraping Wikipedia with requests and bs4

In [6]:
# Second portion

import requests
from bs4 import BeautifulSoup

In [7]:
# Get URL

page = requests.get("https://en.wikipedia.org/wiki/Key_events_of_the_20th_century")

In [8]:
url = "https://en.wikipedia.org/wiki/Key_events_of_the_20th_century"

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/117.0 Safari/537.36"
}

page = requests.get(url, headers=headers)

print(page.status_code)  

200


In [9]:
soup = BeautifulSoup(page.text, 'html.parser')
print(soup.title)         
print(soup.title.string) 

<title>Key events of the 20th century - Wikipedia</title>
Key events of the 20th century - Wikipedia


In [10]:
text = soup.get_text()

In [11]:
text = text.encode('utf-8')

In [13]:
with open('Key events of the 20th century.txt', 'wb') as f: 
    f.write(text)

## Scraping Countries wiki page using selenium

In [20]:
# Get the page's contents

page_url = "https://simple.m.wikipedia.org/wiki/List_of_countries"
driver.get(page_url)

In [21]:
# Click on Accept cookies

# time.sleep(3)
# driver.find_element(By.XPATH, '//div[text()="ACCEPT"]').click()

In [36]:
content = driver.find_element(By.CLASS_NAME, "mw-parser-output")

In [37]:
# Get all <a> tags inside
links = content.find_elements(By.TAG_NAME, "a")


In [38]:
# Extract their text (remove empty strings)
texts = [link.text for link in links if link.text.strip() != ""]


In [39]:
print(texts[:20])   # quick preview

['sovereign states', 'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin']


In [41]:
# Create DataFrame
df = pd.DataFrame(texts, columns=["Country"])


In [42]:
df.head()

Unnamed: 0,Country
0,sovereign states
1,Afghanistan
2,Albania
3,Algeria
4,Andorra


In [43]:
df=df.drop(index=0)

In [44]:
df.head()

Unnamed: 0,Country
1,Afghanistan
2,Albania
3,Algeria
4,Andorra
5,Angola


In [45]:
df.info

<bound method DataFrame.info of            Country
1      Afghanistan
2          Albania
3          Algeria
4          Andorra
5           Angola
..             ...
356         Adjara
357         Adygea
358          Altai
359     Bangsamoro
360  Bashkortostan

[360 rows x 1 columns]>

In [46]:
df.shape

(360, 1)

In [47]:
# Save the dataframe as csv

df.to_csv('List of Countries.csv')