## Web Scraping 20th Century

## 01. Import Libraries

In [5]:
# import libraries


import pandas as pd
import time
import selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt
import os
import logging
import requests
import bs4
from bs4 import BeautifulSoup
import requests

## 02. Set Up ChromeDriver

In [8]:
# Configure the Chrome launch options.
# "--headless" runs Chrome without opening a visible browser window (GUI off).
# "--no-sandbox" turns off Chrome's sandbox; presumably included because some
# restricted environments (containers, CI) need it -- confirm it is required here.

chrome_options = Options()
for chrome_flag in ("--headless", "--no-sandbox"):
    chrome_options.add_argument(chrome_flag)

In [10]:
# Set up the driver.
# ChromeDriverManager().install() returns the path of a chromedriver binary,
# which is wrapped in a Service for Selenium to launch.
# `options=chrome_options` is required: without it the headless / no-sandbox
# flags configured earlier are never applied and a visible browser window opens.

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

## 03. Scrape Key Events of the 20th Century

In [12]:
# project URL: Key Events of the 20th Century

# (https://en.wikipedia.org/wiki/Key_events_of_the_20th_century)

In [13]:
# will use beautifulsoup / requests to scrape
# this is a better option as we want the entire page instead of a small list / segment

# have already imported beautifulsoup/requests

In [33]:
# Get the page's contents.
# - timeout: stops the cell from hanging forever on a stalled connection.
# - raise_for_status(): fails loudly on an HTTP error (4xx/5xx) instead of
#   letting an error page be parsed and saved as if it were the article.
# NOTE: despite its name, `page_url` holds the Response object, not a URL string.

page_url = requests.get(
    "https://en.wikipedia.org/wiki/Key_events_of_the_20th_century",
    timeout=30,
)
page_url.raise_for_status()

In [35]:
# Parse the downloaded HTML with Python's built-in 'html.parser' and print the
# <title> tag as a quick check that the expected page was fetched.

page_html = page_url.text
soup = BeautifulSoup(page_html, features='html.parser')
print(soup.title)

<title>Key events of the 20th century - Wikipedia</title>


In [37]:
# print(soup.text)

# this worked and displayed the full page text
# left commented out (not executed) to prevent unnecessary scrolling through the output

In [39]:
# Store the page's text content (markup stripped) in its own variable.
# `.text` is the property form of `get_text()` called with no arguments.

text = soup.text

In [41]:
# Encode the page text as UTF-8 bytes (the file below is written in binary mode).
# `text` is rebound from str to bytes here, and bytes has no .encode(), so
# re-running this cell used to raise AttributeError. The isinstance guard makes
# the cell safe to run more than once.

if isinstance(text, str):
    text = text.encode('utf-8')

In [43]:
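# a quick search says UTF-8 is now the preferred encoding: it covers every Unicode character and is space-efficient for mostly-ASCII text
# encoding converts text (letters, numbers, other characters) into the bytes a computer stores; decoding turns those bytes back into readable text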

In [45]:
# Write the UTF-8 bytes to '20th-Century.txt' in the current working directory.
# Binary mode ('wb') is used because `text` is already encoded bytes.

with open('20th-Century.txt', mode='wb') as out_file:
    out_file.write(text)

## 04. Scrape List of Countries

In [48]:
# project URL: List of Countries

# url: "https://simple.m.wikipedia.org/wiki/List_of_countries"

In [50]:
# As we only need a list of names from this page, Selenium will suffice here.
# Get the page contents: navigate the Selenium-controlled browser (`driver`)
# to the Simple English Wikipedia "List of countries" page so that later cells
# can query its elements.

list_url = "https://simple.m.wikipedia.org/wiki/List_of_countries"
driver.get(list_url)

In [52]:
# elements of interest will be the countries themselves, 
# after inspecting, the div id appears to be "mw-content-text" class = "mw-body-content"
# another option is "bodyContent"

#div class = "mw-content-ltr mw-parser-output"

# going to try "mw-body-content" first

In [54]:
# Use find_elements to collect every element whose class is 'mw-body-content'.
# The result is a list of elements, so it is indexed (e.g. [0]) further down.

countries_elem = driver.find_elements(By.CLASS_NAME, 'mw-body-content')

In [58]:
# checking to see if it worked:

#countries_elem[0].text

In [60]:
# okay, the intro is still there and i'll need to split these multiple ways

In [62]:
# first splitting at "\n" (newline)

# country_list = countries_elem[0].text.split("\n")

In [64]:
# country_list

In [66]:
# still need to split at ("- ")


# country_list = countries_elem[0].text.split("-")

In [68]:
# country_list

In [76]:
# Pivot to another approach for a better result: instead of one big container
# element, collect every <a> (link) element on the page.
# General form: driver.find_elements(By.TAG_NAME, …)

countries_elem2 = driver.find_elements(By.TAG_NAME, 'a')

# Printing this returns a list of element objects, so the text still needs to
# be extracted from each element.

In [88]:
# This output is a lot better: pull the visible text out of every link element
# and collect it in a plain Python list (one string per link, in page order).

countrieslist = []
for link in countries_elem2:
    countrieslist.append(link.text)

print(countrieslist)

['', '', '', '', '', '', '', '', '', '', '', 'Language', 'Download PDF', 'Watch', 'Change', 'sovereign states', 'Top', '0-9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'change', 'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'change', 'Bahamas, The', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'change', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo, Democratic Republic of the', 'Congo, Republic of the', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'change', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'change', 'East Timor', 'Ecuador', 'Egypt',

In [94]:
# Remove the extra entries: keep only link texts LONGER than one character.
# This drops the empty strings and the single-letter A-Z index links.
# Every `.text` access asks the browser for the element's text, so the inner
# generator reads it once per element instead of twice (once for the filter,
# once for the stored value).

countries_1 = [
    link_text
    for link_text in (element.text for element in countries_elem2)
    if len(link_text) > 1
]

print(countries_1)

['Language', 'Download PDF', 'Watch', 'Change', 'sovereign states', 'Top', '0-9', 'change', 'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'change', 'Bahamas, The', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'change', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo, Democratic Republic of the', 'Congo, Republic of the', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'change', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'change', 'East Timor', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'change', 'Fiji', 'Finland', 'France', 'change', 'Gabon', 'Gambia', 'Georgia', 'Germany', '

In [120]:
# Drop every entry that is exactly 'change' or 'Change' (case-sensitive match).
# These look like the edit links that appear between the letter sections.

unwanted_labels = ('Change', 'change')
countries_2 = [label for label in countries_1 if label not in unwanted_labels]

print(countries_2)


['Language', 'Download PDF', 'Watch', 'sovereign states', 'Top', '0-9', 'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas, The', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo, Democratic Republic of the', 'Congo, Republic of the', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'East Timor', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', '

In [122]:
# Count the entries left after filtering, to gauge how many non-country
# entries are still mixed in.
print(len(countries_2))

293


In [124]:
# still have quite a few unnecessary entries... will continue to work on this.