In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from pathlib import Path
# Let wide text columns show up to 400 characters before pandas truncates them.
pd.set_option("display.max_colwidth", 400)

### Extract the olympic_hosts.csv Data.

In [2]:
# Path to the raw Olympic hosts CSV (relative, so it is looked up in the working directory)
olympic_hosts_to_load = Path("olympic_hosts.csv")

# Load the raw file into a DataFrame and preview the first five rows
host_data = pd.read_csv(filepath_or_buffer=olympic_hosts_to_load)
host_data.head(n=5)

Unnamed: 0,game_slug,game_end_date,game_start_date,game_location,game_name,game_season,game_year
0,beijing-2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z,China,Beijing 2022,Winter,2022
1,tokyo-2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,Japan,Tokyo 2020,Summer,2020
2,pyeongchang-2018,2018-02-25T08:00:00Z,2018-02-08T23:00:00Z,Republic of Korea,PyeongChang 2018,Winter,2018
3,rio-2016,2016-08-21T21:00:00Z,2016-08-05T12:00:00Z,Brazil,Rio 2016,Summer,2016
4,sochi-2014,2014-02-23T16:00:00Z,2014-02-07T04:00:00Z,Russian Federation,Sochi 2014,Winter,2014


In [3]:
# Parse the start/end timestamps and keep only the calendar date.
# The trailing 'Z' is matched as a literal character by this format, so the
# parsed values are timezone-naive before the time part is discarded.
TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
for date_column in ('game_end_date', 'game_start_date'):
    parsed_timestamps = pd.to_datetime(host_data[date_column], format=TIMESTAMP_FORMAT)
    host_data[date_column] = parsed_timestamps.dt.date

# Preview the converted columns
host_data.head()

Unnamed: 0,game_slug,game_end_date,game_start_date,game_location,game_name,game_season,game_year
0,beijing-2022,2022-02-20,2022-02-04,China,Beijing 2022,Winter,2022
1,tokyo-2020,2021-08-08,2021-07-23,Japan,Tokyo 2020,Summer,2020
2,pyeongchang-2018,2018-02-25,2018-02-08,Republic of Korea,PyeongChang 2018,Winter,2018
3,rio-2016,2016-08-21,2016-08-05,Brazil,Rio 2016,Summer,2016
4,sochi-2014,2014-02-23,2014-02-07,Russian Federation,Sochi 2014,Winter,2014


In [4]:
# Create new columns for city and year from 'game_name' ("<city> <year>").
# Split on the LAST space (rsplit) so multi-word host cities stay intact:
#   "Salt Lake City 2002" -> city="Salt Lake City", year="2002"
# Splitting on the first space would instead give city="Salt",
# year="Lake City 2002".
# Note: 'year' holds the text after the last space, i.e. it is a string column.
host_data[['city', 'year']] = host_data['game_name'].str.rsplit(' ', n=1, expand=True)

# Display the first few rows
host_data.head()

Unnamed: 0,game_slug,game_end_date,game_start_date,game_location,game_name,game_season,game_year,city,year
0,beijing-2022,2022-02-20,2022-02-04,China,Beijing 2022,Winter,2022,Beijing,2022
1,tokyo-2020,2021-08-08,2021-07-23,Japan,Tokyo 2020,Summer,2020,Tokyo,2020
2,pyeongchang-2018,2018-02-25,2018-02-08,Republic of Korea,PyeongChang 2018,Winter,2018,PyeongChang,2018
3,rio-2016,2016-08-21,2016-08-05,Brazil,Rio 2016,Summer,2016,Rio,2016
4,sochi-2014,2014-02-23,2014-02-07,Russian Federation,Sochi 2014,Winter,2014,Sochi,2014


In [5]:
# Remove the columns that are not wanted in the cleaned table.
# Rebinding the name (instead of inplace=True) keeps the step a plain expression.
columns_to_drop = ['game_name', 'game_year']
host_data = host_data.drop(columns=columns_to_drop)

# Preview the remaining columns
host_data.head()

Unnamed: 0,game_slug,game_end_date,game_start_date,game_location,game_season,city,year
0,beijing-2022,2022-02-20,2022-02-04,China,Winter,Beijing,2022
1,tokyo-2020,2021-08-08,2021-07-23,Japan,Summer,Tokyo,2020
2,pyeongchang-2018,2018-02-25,2018-02-08,Republic of Korea,Winter,PyeongChang,2018
3,rio-2016,2016-08-21,2016-08-05,Brazil,Summer,Rio,2016
4,sochi-2014,2014-02-23,2014-02-07,Russian Federation,Winter,Sochi,2014


In [6]:
# Map the original 'game_*' column names to shorter ones
column_renames = {
    'game_end_date': 'end_date',
    'game_start_date': 'start_date',
    'game_location': 'country',
    'game_season': 'season',
}
host_data = host_data.rename(columns=column_renames)

# Preview the renamed columns
host_data.head()

Unnamed: 0,game_slug,end_date,start_date,country,season,city,year
0,beijing-2022,2022-02-20,2022-02-04,China,Winter,Beijing,2022
1,tokyo-2020,2021-08-08,2021-07-23,Japan,Summer,Tokyo,2020
2,pyeongchang-2018,2018-02-25,2018-02-08,Republic of Korea,Winter,PyeongChang,2018
3,rio-2016,2016-08-21,2016-08-05,Brazil,Summer,Rio,2016
4,sochi-2014,2014-02-23,2014-02-07,Russian Federation,Winter,Sochi,2014


In [7]:
# Final column layout: identifier first, then dates, season and location fields
column_order = [
    'game_slug',
    'start_date',
    'end_date',
    'season',
    'country',
    'city',
    'year',
]
host_cleaned = host_data.loc[:, column_order]

host_cleaned.head()

Unnamed: 0,game_slug,start_date,end_date,season,country,city,year
0,beijing-2022,2022-02-04,2022-02-20,Winter,China,Beijing,2022
1,tokyo-2020,2021-07-23,2021-08-08,Summer,Japan,Tokyo,2020
2,pyeongchang-2018,2018-02-08,2018-02-25,Winter,Republic of Korea,PyeongChang,2018
3,rio-2016,2016-08-05,2016-08-21,Summer,Brazil,Rio,2016
4,sochi-2014,2014-02-07,2014-02-23,Winter,Russian Federation,Sochi,2014


In [8]:
# Write the cleaned hosts table to disk; index=False leaves the row index out of the file.
output_csv = "cleaned_olympic_hosts.csv"
host_cleaned.to_csv(output_csv, index=False)