In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from pathlib import Path
# Raise the per-cell display width limit to 400 characters. The fully
# qualified option name is used because abbreviated names are resolved by
# partial matching and may stop working if pandas adds a similar option.
pd.set_option('display.max_colwidth', 400)

### Extract the olympic_hosts.csv Data.

In [2]:
# Location of the raw Olympic hosts CSV (a relative path)
olympic_hosts_to_load = Path("olympic_hosts.csv")

# Load the raw file into a DataFrame and preview its first rows
host_data = pd.read_csv(filepath_or_buffer=olympic_hosts_to_load)
host_data.head()

Unnamed: 0,game_slug,game_end_date,game_start_date,game_location,game_name,game_season,game_year
0,beijing-2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z,China,Beijing 2022,Winter,2022
1,tokyo-2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,Japan,Tokyo 2020,Summer,2020
2,pyeongchang-2018,2018-02-25T08:00:00Z,2018-02-08T23:00:00Z,Republic of Korea,PyeongChang 2018,Winter,2018
3,rio-2016,2016-08-21T21:00:00Z,2016-08-05T12:00:00Z,Brazil,Rio 2016,Summer,2016
4,sochi-2014,2014-02-23T16:00:00Z,2014-02-07T04:00:00Z,Russian Federation,Sochi 2014,Winter,2014


In [3]:
# Parse the timestamp strings (e.g. 2022-02-20T12:00:00Z) in both date
# columns and keep only the calendar date, discarding the time of day.
for date_col in ('game_end_date', 'game_start_date'):
    parsed = pd.to_datetime(host_data[date_col], format='%Y-%m-%dT%H:%M:%SZ')
    host_data[date_col] = parsed.dt.date

# Preview the converted columns
host_data.head()

Unnamed: 0,game_slug,game_end_date,game_start_date,game_location,game_name,game_season,game_year
0,beijing-2022,2022-02-20,2022-02-04,China,Beijing 2022,Winter,2022
1,tokyo-2020,2021-08-08,2021-07-23,Japan,Tokyo 2020,Summer,2020
2,pyeongchang-2018,2018-02-25,2018-02-08,Republic of Korea,PyeongChang 2018,Winter,2018
3,rio-2016,2016-08-21,2016-08-05,Brazil,Rio 2016,Summer,2016
4,sochi-2014,2014-02-23,2014-02-07,Russian Federation,Sochi 2014,Winter,2014


In [4]:
# Seed the 'city' column from 'game_name'.
#
# The previous version split 'game_name' on its first space into temporary
# 'city'/'year' columns, joined the two pieces back together with a space,
# and then dropped 'year'. That round trip reproduces 'game_name' exactly,
# so a direct copy yields the same values without a row-wise apply, and it
# no longer raises TypeError for a name that has no space (or is missing).
#
# NOTE: at this point 'city' still includes the year (e.g. "Beijing 2022");
# the following cell's rsplit is what separates the city from the year.
host_data['city'] = host_data['game_name']

# Display the modified DataFrame
host_data.head(20)

Unnamed: 0,game_slug,game_end_date,game_start_date,game_location,game_name,game_season,game_year,city
0,beijing-2022,2022-02-20,2022-02-04,China,Beijing 2022,Winter,2022,Beijing 2022
1,tokyo-2020,2021-08-08,2021-07-23,Japan,Tokyo 2020,Summer,2020,Tokyo 2020
2,pyeongchang-2018,2018-02-25,2018-02-08,Republic of Korea,PyeongChang 2018,Winter,2018,PyeongChang 2018
3,rio-2016,2016-08-21,2016-08-05,Brazil,Rio 2016,Summer,2016,Rio 2016
4,sochi-2014,2014-02-23,2014-02-07,Russian Federation,Sochi 2014,Winter,2014,Sochi 2014
5,london-2012,2012-08-12,2012-07-27,Great Britain,London 2012,Summer,2012,London 2012
6,vancouver-2010,2010-02-28,2010-02-12,Canada,Vancouver 2010,Winter,2010,Vancouver 2010
7,beijing-2008,2008-08-24,2008-08-08,China,Beijing 2008,Summer,2008,Beijing 2008
8,turin-2006,2006-02-26,2006-02-10,Italy,Turin 2006,Winter,2006,Turin 2006
9,athens-2004,2004-08-29,2004-08-13,Greece,Athens 2004,Summer,2004,Athens 2004


In [5]:
# Split 'game_name' once from the right so the trailing year is separated
# and everything before the last space (including multi-word names) is
# kept together as the city.
name_parts = host_data['game_name'].str.rsplit(' ', n=1, expand=True)
host_data[['city', 'year']] = name_parts

# Preview the result (up to 20 rows)
host_data.head(20)

Unnamed: 0,game_slug,game_end_date,game_start_date,game_location,game_name,game_season,game_year,city,year
0,beijing-2022,2022-02-20,2022-02-04,China,Beijing 2022,Winter,2022,Beijing,2022
1,tokyo-2020,2021-08-08,2021-07-23,Japan,Tokyo 2020,Summer,2020,Tokyo,2020
2,pyeongchang-2018,2018-02-25,2018-02-08,Republic of Korea,PyeongChang 2018,Winter,2018,PyeongChang,2018
3,rio-2016,2016-08-21,2016-08-05,Brazil,Rio 2016,Summer,2016,Rio,2016
4,sochi-2014,2014-02-23,2014-02-07,Russian Federation,Sochi 2014,Winter,2014,Sochi,2014
5,london-2012,2012-08-12,2012-07-27,Great Britain,London 2012,Summer,2012,London,2012
6,vancouver-2010,2010-02-28,2010-02-12,Canada,Vancouver 2010,Winter,2010,Vancouver,2010
7,beijing-2008,2008-08-24,2008-08-08,China,Beijing 2008,Summer,2008,Beijing,2008
8,turin-2006,2006-02-26,2006-02-10,Italy,Turin 2006,Winter,2006,Turin,2006
9,athens-2004,2004-08-29,2004-08-13,Greece,Athens 2004,Summer,2004,Athens,2004


In [6]:
# Remove 'game_name' and the helper 'year' column from the DataFrame
host_data.drop(columns=['game_name', 'year'], inplace=True)

# Preview the trimmed DataFrame (up to 30 rows)
host_data.head(30)

Unnamed: 0,game_slug,game_end_date,game_start_date,game_location,game_season,game_year,city
0,beijing-2022,2022-02-20,2022-02-04,China,Winter,2022,Beijing
1,tokyo-2020,2021-08-08,2021-07-23,Japan,Summer,2020,Tokyo
2,pyeongchang-2018,2018-02-25,2018-02-08,Republic of Korea,Winter,2018,PyeongChang
3,rio-2016,2016-08-21,2016-08-05,Brazil,Summer,2016,Rio
4,sochi-2014,2014-02-23,2014-02-07,Russian Federation,Winter,2014,Sochi
5,london-2012,2012-08-12,2012-07-27,Great Britain,Summer,2012,London
6,vancouver-2010,2010-02-28,2010-02-12,Canada,Winter,2010,Vancouver
7,beijing-2008,2008-08-24,2008-08-08,China,Summer,2008,Beijing
8,turin-2006,2006-02-26,2006-02-10,Italy,Winter,2006,Turin
9,athens-2004,2004-08-29,2004-08-13,Greece,Summer,2004,Athens


In [7]:
# Map the verbose source column names to shorter ones
column_renames = {
    'game_end_date': 'end_date',
    'game_start_date': 'start_date',
    'game_location': 'country',
    'game_season': 'season',
}
host_data.rename(columns=column_renames, inplace=True)

# Preview the renamed DataFrame (up to 20 rows)
host_data.head(20)

Unnamed: 0,game_slug,end_date,start_date,country,season,game_year,city
0,beijing-2022,2022-02-20,2022-02-04,China,Winter,2022,Beijing
1,tokyo-2020,2021-08-08,2021-07-23,Japan,Summer,2020,Tokyo
2,pyeongchang-2018,2018-02-25,2018-02-08,Republic of Korea,Winter,2018,PyeongChang
3,rio-2016,2016-08-21,2016-08-05,Brazil,Summer,2016,Rio
4,sochi-2014,2014-02-23,2014-02-07,Russian Federation,Winter,2014,Sochi
5,london-2012,2012-08-12,2012-07-27,Great Britain,Summer,2012,London
6,vancouver-2010,2010-02-28,2010-02-12,Canada,Winter,2010,Vancouver
7,beijing-2008,2008-08-24,2008-08-08,China,Summer,2008,Beijing
8,turin-2006,2006-02-26,2006-02-10,Italy,Winter,2006,Turin
9,athens-2004,2004-08-29,2004-08-13,Greece,Summer,2004,Athens


In [8]:
# Select the columns in their final order for the cleaned table
column_order = [
    'game_slug',
    'start_date',
    'end_date',
    'season',
    'country',
    'city',
    'game_year',
]
host_cleaned = host_data[column_order]

host_cleaned.head()

Unnamed: 0,game_slug,start_date,end_date,season,country,city,game_year
0,beijing-2022,2022-02-04,2022-02-20,Winter,China,Beijing,2022
1,tokyo-2020,2021-07-23,2021-08-08,Summer,Japan,Tokyo,2020
2,pyeongchang-2018,2018-02-08,2018-02-25,Winter,Republic of Korea,PyeongChang,2018
3,rio-2016,2016-08-05,2016-08-21,Summer,Brazil,Rio,2016
4,sochi-2014,2014-02-07,2014-02-23,Winter,Russian Federation,Sochi,2014


In [9]:
# Write the cleaned hosts table to CSV, leaving out the DataFrame index
host_cleaned.to_csv(path_or_buf="cleaned_olympic_hosts.csv", index=False)