In [8]:
#Import pandas
import pandas as pd

# Path to the tab-separated EDITIONS table.
file_path = 'Datasets/Summer Olympic medals/Summer Olympic medalists 1896 to 2008 - EDITIONS.tsv'

# Load the table into `editions`; the column selection below (and later
# cells) read this name, so it must be defined here for the notebook to
# survive Restart Kernel -> Run All.
editions = pd.read_csv(file_path, sep='\t')

# Keep only the columns that later cells use.
editions = editions[['Edition', 'Grand Total', 'City', 'Country']]

# Sanity check: show the first two and last two rows.
print(editions.head(2))
print(editions.tail(2))

   Edition  Grand Total    City Country
0     1896          151  Athens  Greece
1     1900          512   Paris  France
    Edition  Grand Total     City Country
24     2004         1998   Athens  Greece
25     2008         2042  Beijing   China


In [9]:
# Path to the comma-separated IOC country-code table.
file_path = 'Datasets/Summer Olympic medals/Summer Olympic medalists 1896 to 2008 - IOC COUNTRY CODES.csv'

# Read the table and immediately narrow it to the country name and its
# NOC code.
ioc_codes = pd.read_csv(file_path)[['Country', 'NOC']]

# Preview: first two rows, then last two rows.
print(ioc_codes.head(2), ioc_codes.tail(2), sep='\n')

       Country  NOC
0  Afghanistan  AFG
1      Albania  ALB
      Country  NOC
199    Zambia  ZAM
200  Zimbabwe  ZIM


## 1) Counting medals by country/edition in a pivot table

In [23]:
# Read the tab-separated table of all medalists, then keep the four columns
# needed for counting.
medal = pd.read_csv(
    'Datasets/Summer Olympic medals/Summer Olympic medalists 1896 to 2008 - ALL MEDALISTS.tsv',
    sep='\t',
)
medals = medal[['Athlete', 'NOC', 'Medal', 'Edition']]

# medal_counts: rows are Editions, columns are NOC codes, and each cell is
# the count of 'Athlete' entries for that pair (NaN where there are none).
medal_counts = pd.pivot_table(
    medals,
    values='Athlete',
    index='Edition',
    columns='NOC',
    aggfunc='count',
)

# Preview the first two rows of medal_counts.
print(medal_counts.head(2))

NOC      AFG  AHO  ALG  ANZ  ARG  ARM  AUS  AUT  AZE  BAH  ...  URS  URU  \
Edition                                                    ...             
1896     NaN  NaN  NaN  NaN  NaN  NaN  2.0  5.0  NaN  NaN  ...  NaN  NaN   
1900     NaN  NaN  NaN  NaN  NaN  NaN  5.0  6.0  NaN  NaN  ...  NaN  NaN   

NOC       USA  UZB  VEN  VIE  YUG  ZAM  ZIM   ZZX  
Edition                                            
1896     20.0  NaN  NaN  NaN  NaN  NaN  NaN   6.0  
1900     55.0  NaN  NaN  NaN  NaN  NaN  NaN  34.0  

[2 rows x 138 columns]


## 2) Computing fraction of medals per Olympic edition

In [22]:
# totals: the 'Grand Total' column of editions as a Series keyed by Edition.
totals = editions.set_index('Edition')['Grand Total']

# fractions: divide every row of medal_counts by the total for the same
# Edition (alignment is on the row index).
fractions = medal_counts.div(totals, axis='index')

# Preview the first two rows of fractions.
print(fractions.head(2))

NOC      AFG  AHO  ALG  ANZ  ARG  ARM       AUS       AUT  AZE  BAH  ...  URS  \
Edition                                                              ...        
1896     NaN  NaN  NaN  NaN  NaN  NaN  0.013245  0.033113  NaN  NaN  ...  NaN   
1900     NaN  NaN  NaN  NaN  NaN  NaN  0.009766  0.011719  NaN  NaN  ...  NaN   

NOC      URU       USA  UZB  VEN  VIE  YUG  ZAM  ZIM       ZZX  
Edition                                                         
1896     NaN  0.132450  NaN  NaN  NaN  NaN  NaN  NaN  0.039735  
1900     NaN  0.107422  NaN  NaN  NaN  NaN  NaN  NaN  0.066406  

[2 rows x 138 columns]


## 3) Computing percentage change in fraction of medals won

In [25]:
# mean_fractions: expanding (running) mean of each column of fractions,
# taken down the rows.
mean_fractions = fractions.expanding().mean()

# fractions_change: row-to-row percentage change of that running mean,
# scaled to percent, with Edition moved from the index back into a column.
fractions_change = mean_fractions.pct_change().mul(100).reset_index()

# Preview the first two rows of fractions_change.
print(fractions_change.head(2))

NOC  Edition  AFG  AHO  ALG  ANZ  ARG  ARM        AUS        AUT  AZE  ...  \
0       1896  NaN  NaN  NaN  NaN  NaN  NaN        NaN        NaN  NaN  ...   
1       1900  NaN  NaN  NaN  NaN  NaN  NaN -13.134766 -32.304688  NaN  ...   

NOC  URS  URU       USA  UZB  VEN  VIE  YUG  ZAM  ZIM        ZZX  
0    NaN  NaN       NaN  NaN  NaN  NaN  NaN  NaN  NaN        NaN  
1    NaN  NaN -9.448242  NaN  NaN  NaN  NaN  NaN  NaN  33.561198  

[2 rows x 139 columns]


## 4) Building hosts DataFrame

In [27]:
# hosts: left-join editions with ioc_codes (no explicit key, so pandas joins
# on the columns the two frames share), then keep only Edition -> NOC with
# Edition as the index.
hosts = (
    editions
    .merge(ioc_codes, how='left')
    [['Edition', 'NOC']]
    .set_index('Edition')
)

# Show the editions whose host got no NOC from the join, then fill those
# in by hand.
print(hosts[hosts['NOC'].isna()])
hosts.at[1972, 'NOC'] = 'FRG'
hosts.at[1980, 'NOC'] = 'URS'
hosts.at[1988, 'NOC'] = 'KOR'

# Move Edition back into a regular column.
hosts = hosts.reset_index()

# Preview the first two rows of hosts.
print(hosts.head(2))

         NOC
Edition     
1972     NaN
1980     NaN
1988     NaN
   Edition  NOC
0     1896  GRE
1     1900  FRA


## 5) Reshaping for analysis

In [28]:
# Reshape fractions_change: reshaped
reshaped = pd.melt(fractions_change, id_vars='Edition', value_name='Change')

# Print reshaped.shape and fractions_change.shape
print(reshaped.shape, fractions_change.shape)

# Extract rows from reshaped where 'NOC' == 'CHN': chn
chn = reshaped.loc[reshaped.NOC == 'CHN']

# Print last 5 rows of chn
print(chn.tail())

(3588, 3) (26, 139)
     Edition  NOC     Change
567     1992  CHN   4.240630
568     1996  CHN   7.860247
569     2000  CHN  -3.851278
570     2004  CHN   0.128863
571     2008  CHN  13.251332


## 6) Merging to compute influence

In [30]:
# Merge reshaped and hosts: merged
merged = pd.merge(reshaped, hosts)

# Print first 5 rows of merged
print(merged.head(2))

# Set Index of merged and sort it: influence
influence = merged.set_index('Edition').sort_index()

# Print first 5 rows of influence
print(influence.head(2))

   Edition  NOC     Change
0     1956  AUS  54.615063
1     2000  AUS  12.554986
         NOC      Change
Edition                 
1896     GRE         NaN
1900     FRA  198.002486


## 7) Plotting influence of host country

In [31]:
# Import pyplot
import matplotlib.pyplot as plt

# change: the 'Change' column of influence, indexed by Edition.
change = influence['Change']

# Draw one bar per row of change; keep the Axes for customisation.
ax = change.plot(kind='bar')

# Customize the plot to improve readability.
ax.set_ylabel("% Change of Host Country Medal Count")
ax.set_title("Is there a Host Country Advantage?")

# Label each bar with the city of the Edition it represents. The lookup is
# keyed on change.index rather than taken positionally from editions, so the
# labels stay correct (and match the number of bars) even if editions is not
# sorted by Edition or some edition is absent from influence.
city_by_edition = editions.set_index('Edition')['City']
ax.set_xticklabels(city_by_edition.reindex(change.index))

# Display the plot.
plt.show()

<Figure size 640x480 with 1 Axes>