# Ex 1.4 Accessing Webdata by Web Scraping 

In [2]:
# Importing libraries

import pandas as pd
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt
import os
import logging
import requests
import bs4

### Main Exercise

In [3]:
## Setup chrome options

chrome_options = Options()
chrome_options.add_argument("--headless") # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")

In [4]:

# Specify the path to the correct ChromeDriver
service = Service("/Users/andymiller/Documents/ChromeDriver/chromedriver-mac-x64/chromedriver")

# Launch Chrome
driver = webdriver.Chrome(service=service)

# Open a page to test
driver.get("https://www.google.com")


In [5]:
#  It should be mentioned what a huge hassle it was trying to set up ChromeDriver 
# due to version mismatch issues.

In [6]:
# Get the page's contents

page_url = "https://en.wikipedia.org/wiki/Key_events_of_the_20th_century"
driver.get(page_url)

In [7]:
# Get the visible text from the page

page_text = driver.find_element("tag name", "body").text

In [8]:
# Define the pathname of the working directory

working_dir = "/Users/andymiller/20th-Century"

# Set filename
file_path = os.path.join(working_dir, "key_events_20th_century.txt")


In [9]:
# Save the text to the file.

with open(file_path, "w", encoding="utf-8") as f:
    f.write(page_text)

### Bonus Exercise - Scrape a list of countries

In [17]:
# Since all countries will be hyperlinks the first time that they are mentioned on the page we 
# can use "a" after TAG_NAME to create a list of these hyperlinked words.

linked_elements = driver.find_elements(By.TAG_NAME, "a")

In [19]:
# Now we can extract the visible text from each link

link_texts = [link.text for link in linked_elements if link.text]  # skip empty text
print(link_texts)



In [24]:
# Now let's do the same thing for the list of countries page:
# https://simple.wikipedia.org/wiki/List_of_countries

page_url_2 = "https://simple.wikipedia.org/wiki/List_of_countries"
driver.get(page_url_2)

linked_elements_2 = driver.find_elements(By.TAG_NAME, "a")

In [26]:
# Now extract the visible text
link_texts_2 = [link.text for link in linked_elements_2 if link.text]
print(link_texts_2)

['Jump to content', 'Give to Wikipedia', 'Create account', 'Log in', 'Beginning', 'Countries', 'Disputed Countries', 'Dependent Territories', 'Dependencies', 'Territorial Claims', 'Autonomous Republics', 'Page', 'Talk', 'Read', 'View source', 'View history', 'sovereign states', 'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Costa Rica', "Côte d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Democratic Republic of the Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Esw

In [36]:
# Convert both lists to sets and find the common values

common_values = list(set(link_texts) & set(link_texts_2))

print(common_values)

['Ghana', 'Philippines', 'Poland', 'Mongolia', 'United States', 'Statistics', 'Terms of Use', 'Morocco', 'Libya', 'Category', 'Lithuania', 'São Tomé and Príncipe', 'Cuba', 'France', 'Disclaimers', 'Kenya', 'Solomon Islands', 'Germany', 'Finland', 'Jump to content', 'Guam', 'Laos', 'Log in', 'Pakistan', 'India', 'Iran', 'Estonia', 'Rwanda', 'Luxembourg', 'Hong Kong', 'Wake Island', 'Israel', 'Mobile view', 'Romania', 'Cambodia', 'About Wikipedia', 'Thailand', 'Cookie statement', 'Crimea', 'Albania', 'North Korea', 'Privacy policy', 'Hungary', 'China', 'Create account', 'Talk', 'Iraq', 'Bulgaria', 'Belarus', 'Ukraine', 'Egypt', 'Guinea-Bissau', 'United Kingdom', 'Mozambique', 'Singapore', 'Angola', 'South Korea', 'Algeria', 'Code of Conduct', 'Developers', 'View history', 'Papua New Guinea', 'South Africa', 'Vietnam', 'Latvia', 'Read']


In [35]:
# Create list of the unwanted items

unwanted = ['Statistics', 'Terms of Use', 'Category', 'Disclaimers', 'Jump to content', 'Log in', 'About Wikipedia',
           'Cookie statement', 'Privacy policy', 'Create account', 'Talk', 'Code of Conduct', 'Developers', 'View history',
           'Read','Mobile view']

In [37]:
# Remove unwanted items from the common_values list

common_values = [item for item in common_values if item not in unwanted]
print(common_values)

['Ghana', 'Philippines', 'Poland', 'Mongolia', 'United States', 'Morocco', 'Libya', 'Lithuania', 'São Tomé and Príncipe', 'Cuba', 'France', 'Kenya', 'Solomon Islands', 'Germany', 'Finland', 'Guam', 'Laos', 'Pakistan', 'India', 'Iran', 'Estonia', 'Rwanda', 'Luxembourg', 'Hong Kong', 'Wake Island', 'Israel', 'Romania', 'Cambodia', 'Thailand', 'Crimea', 'Albania', 'North Korea', 'Hungary', 'China', 'Iraq', 'Bulgaria', 'Belarus', 'Ukraine', 'Egypt', 'Guinea-Bissau', 'United Kingdom', 'Mozambique', 'Singapore', 'Angola', 'South Korea', 'Algeria', 'Papua New Guinea', 'South Africa', 'Vietnam', 'Latvia']


In [39]:
# Put the countries into a dataframe

df = pd.DataFrame(common_values, columns =  ["country"])

In [40]:
df

Unnamed: 0,country
0,Ghana
1,Philippines
2,Poland
3,Mongolia
4,United States
5,Morocco
6,Libya
7,Lithuania
8,São Tomé and Príncipe
9,Cuba


In [33]:
# Close the Chrome window

driver.quit()

In [41]:
# Export df to csv file

df.to_csv('/Users/andymiller/20th-Century/list_of_countries.csv')