In [3]:
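# One-time setup (uncomment to run): install Beautiful Soup into the kernel's environment.
# %pip install beautifulsoup4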

In [4]:
# Standard library: CSV writing, filesystem paths, regular expressions, timing
import csv
import re
import time
from pathlib import Path

# Third-party: visuals, numerics, table analysis, HTTP requests, XML & HTML scraping
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [5]:
# URL of the Wikipedia page whose tabular data will be scraped.
# The "#List_of_Winter_Olympic_Games" fragment names the section of interest on that page.
url = "https://en.wikipedia.org/wiki/Winter_Olympic_Games#List_of_Winter_Olympic_Games"

In [6]:
# A Session object persists certain parameters (e.g. cookies) across requests.
# By default, requests waits for a response indefinitely, so always pass an explicit timeout.
s = requests.Session()
response = s.get(url, timeout=10)

# Fail fast on a 4xx/5xx status so an error page is never parsed as if it were the article.
response.raise_for_status()

# A successful request is displayed as <Response [200]>.
response

<Response [200]>

In [7]:
# Build the parse tree from the raw response bytes with Python's built-in HTML parser
soup = BeautifulSoup(markup=response.content, features='html.parser')

# Indented, human-readable rendering of the parsed document
pretty_soup = soup.prettify()

In [8]:
# Text of the page's <title> element
soup.find("title").string

'Winter Olympic Games - Wikipedia'

In [10]:
# Every <table> element found in the parsed HTML
all_tables = soup.find_all('table')

# Pick the table to scrape. A multi-word class string is compared against the class
# attribute's exact value, so this only matches class="sortable wikitable".
right_table = soup.find('table', {"class": 'sortable wikitable'})

# find() returns None when nothing matches; stop here with a clear message instead of
# letting a later cell fail with an AttributeError on None.
if right_table is None:
    raise ValueError(
        "No table with class 'sortable wikitable' was found; the page markup may have changed."
    )

In [17]:
# Collect every data cell (<td>) of the selected table.
# find_all is the current method name; findAll is a deprecated alias of it.
cells = right_table.find_all("td")

# Keep each cell's raw HTML as a string; the following cells parse dates out of this markup.
lst_data = [str(cell) for cell in cells]

# Preview only the first few cells: the full list is long, mostly image markup.
lst_data[:3]

['<td>1924<sup class="reference" id="cite_ref-Games1924_19-2"><a href="#cite_note-Games1924-19">[16]</a></sup>\n</td>',
 '<td><a href="/wiki/1924_Winter_Olympics" title="1924 Winter Olympics">I</a>\n</td>',
 '<td align="left"><span class="flagicon"><a href="/wiki/France" title="France"><img alt="France" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/bc/Flag_of_France_%281794%E2%80%931815%2C_1830%E2%80%931974%2C_2020%E2%80%93present%29.svg/23px-Flag_of_France_%281794%E2%80%931815%2C_1830%E2%80%931974%2C_2020%E2%80%93present%29.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/bc/Flag_of_France_%281794%E2%80%931815%2C_1830%E2%80%931974%2C_2020%E2%80%93present%29.svg/35px-Flag_of_France_%281794%E2%80%931815%2C_1830%E2%80%931974%2C_2020%E2%80%93present%29.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/bc/Flag_of_France_%281794%E2%80%931815%2C_1830%E2%80%93197

In [18]:
# Keep only the cells whose HTML mentions February
lst_dates = [cell_html for cell_html in lst_data if "February" in cell_html]

# From each cell keep the text between the opening <td> and the first <br/>,
# then drop ordinary spaces.
lst_dates_clean = []
for cell_html in lst_dates:
    after_td = cell_html.split('<td>')[1]
    before_br = after_td.split('<br/>')[0]
    lst_dates_clean.append(before_br.replace(" ", ""))

In [19]:
# Strip leftovers from the date strings, in this order: non-breaking spaces,
# opening italic tags, and the closing italic tag plus cell terminator.
for leftover in ('\xa0', '<i>', '</i>\n</td>'):
    lst_dates_clean = [date_text.replace(leftover, '') for date_text in lst_dates_clean]

lst_dates_clean

['25January–5February1924',
 '11–19February1928',
 '4–15February1932',
 '6–16February1936',
 '30January–8February1948',
 '14–25February1952',
 '26January–5February1956',
 '18–28February1960',
 '29January–9February1964',
 '6–18February1968',
 '3–13February1972',
 '4–15February1976',
 '13–24February1980',
 '8–19February1984',
 '13–28February1988',
 '8–23February1992',
 '12–27February1994',
 '7–22February1998',
 '8–24February2002',
 '10–26February2006',
 '12–28February2010',
 '7–23February2014',
 '9–25February2018',
 '4–20February2022',
 '6–22February2026',
 '8–24February2030']

In [25]:
# Pull the start and end day out of each date string,
# e.g. '25January–5February1924' -> [25, 5].
# The pattern requires a non-digit, non-dot character after the digits, so the year
# (which ends the string) is never captured.
lst_date_pairs = [
    [int(day) for day in re.findall(r"(\d+)[^0-9.]", date_text)]
    for date_text in lst_dates_clean
]

# Every entry must be exactly [start_day, end_day]; fail here rather than in a later cell.
assert all(len(pair) == 2 for pair in lst_date_pairs), "expected two day numbers per date string"

lst_date_pairs

[[25, 5],
 [11, 19],
 [4, 15],
 [6, 16],
 [30, 8],
 [14, 25],
 [26, 5],
 [18, 28],
 [29, 9],
 [6, 18],
 [3, 13],
 [4, 15],
 [13, 24],
 [8, 19],
 [13, 28],
 [8, 23],
 [12, 27],
 [7, 22],
 [8, 24],
 [10, 26],
 [12, 28],
 [7, 23],
 [9, 25],
 [4, 20],
 [6, 22],
 [8, 24]]

In [27]:
# The year is the text that follows the month name in each cleaned date string
lst_year = [int(date_text.split("February")[1]) for date_text in lst_dates_clean]
lst_year

[1924,
 1928,
 1932,
 1936,
 1948,
 1952,
 1956,
 1960,
 1964,
 1968,
 1972,
 1976,
 1980,
 1984,
 1988,
 1992,
 1994,
 1998,
 2002,
 2006,
 2010,
 2014,
 2018,
 2022,
 2026,
 2030]

In [28]:
# Assemble one row per Games: the raw day pair, start/end day, start/end month and year.
winter_olympics_df = pd.DataFrame({'date_pairs': lst_date_pairs})

# Split each [start_day, end_day] pair into two columns. Naming the columns here makes
# pandas raise immediately if any entry does not hold exactly two values.
winter_olympics_df[['start_day', 'end_day']] = pd.DataFrame(
    lst_date_pairs,
    columns=['start_day', 'end_day'],
    index=winter_olympics_df.index,
)

# Every date string ends in February; a Games starts in January only when the text says so.
# Reading the month from the text avoids guessing it from a start-day/end-day comparison.
starts_in_january = ["January" in date_text for date_text in lst_dates_clean]
winter_olympics_df['start_month'] = np.where(starts_in_january, 1, 2)
winter_olympics_df['end_month'] = 2
winter_olympics_df['year'] = lst_year

# Persist the table, creating the output directory first so a fresh checkout does not fail.
output_path = Path("./data/winter_olympics.pkl")
output_path.parent.mkdir(parents=True, exist_ok=True)
winter_olympics_df.to_pickle(output_path)