# Practical Statistics for Data Scientists

In [1]:
from pathlib import Path

import kagglehub
import pandas as pd

# Fetch the latest version of the Tokyo 2021 Olympics dataset; the call
# returns the local directory the files were downloaded to.
DATASET_HANDLE = "arjunprasadsarkhel/2021-olympics-in-tokyo"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: C:\Users\adminuser\.cache\kagglehub\datasets\arjunprasadsarkhel\2021-olympics-in-tokyo\versions\7


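#### NOTE: Copy the files from the path printed above into the 'learn_Statistics' folder before moving on; the next cell reads `data/2021-olympics-in-tokyo/Medals.xlsx` relative to the notebook.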

In [2]:
# Build the location with pathlib so the separators are correct on Windows,
# macOS and Linux alike (a backslash-separated string only resolves on Windows).
DATA_DIR = Path("data") / "2021-olympics-in-tokyo"
medal_count = pd.read_excel(DATA_DIR / "Medals.xlsx")
medal_count

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,Rank,Team/NOC,Gold,Silver,Bronze,Total,Rank by Total
0,1,United States of America,39,41,33,113,1
1,2,People's Republic of China,38,32,18,88,2
2,3,Japan,27,14,17,58,5
3,4,Great Britain,22,21,22,65,4
4,5,ROC,20,28,23,71,3
...,...,...,...,...,...,...,...
88,86,Ghana,0,0,1,1,77
89,86,Grenada,0,0,1,1,77
90,86,Kuwait,0,0,1,1,77
91,86,Republic of Moldova,0,0,1,1,77


### **Estimates of location**

In [3]:
# Arithmetic mean of the per-team total medal counts
total_medals = medal_count["Total"]
medal_mean = total_medals.mean()
print(f"Mean of all medals: {medal_mean}")

Mean of all medals: 11.612903225806452


In [4]:
# Weighted mean of the total medal counts, using each team's gold-medal
# count as its weight
import numpy as np

weighted_mean = np.average(a=medal_count["Total"], weights=medal_count["Gold"])
print(weighted_mean)

46.832352941176474


In [5]:
# Trimmed mean
#   The mean is sensitive to outliers. To reduce that sensitivity we can trim
#   (drop) a fixed share of the smallest and largest values before averaging.
from scipy import stats
#   Looking at how the gold medals are distributed, the count drops sharply
#   after the first 10 records, and the entries at the bottom of the table are all zero.
#   .tolist() yields plain Python ints, so the printed list stays readable
#   regardless of how the installed NumPy version formats its scalars.
print("First 20 gold entries: ", medal_count['Gold'].iloc[:20].tolist())
#   Open-ended slice: an end bound of -1 would exclude the final row and
#   show only 19 of the last 20 entries.
print("Last 20 gold entries : ", medal_count['Gold'].iloc[-20:].tolist())
#   Trim the top 10% and the bottom 10% of the values, then average the rest.
print("\nMean excluding top and bottom 10% :",stats.trim_mean(medal_count["Gold"], proportiontocut=0.1))

First 20 gold entries:  [39, 38, 27, 22, 20, 17, 10, 10, 10, 10, 7, 7, 7, 7, 6, 6, 4, 4, 4, 4]
Last 20 gold entries :  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Mean excluding top and bottom 10% : 1.96


#### _Median_

In [6]:
# Calculate the median of the total medal count, with the 10% trimmed mean
# printed alongside for comparison
print(f"Median total medal count: {medal_count['Total'].median()}")
# The label names the statistic explicitly: the value printed is a trimmed mean.
print(f"Trimmed mean of total medal count (top and bottom 10% removed): {stats.trim_mean(medal_count['Total'], 0.1)}")

Median total medal count: 4.0
Total medal count after removing top and bottom 10% outliers: 6.8933333333333335


#### _Weighted Median_

In [7]:
# A standard median gives equal weight to every data point; a weighted median does not.
#   A weighted median is a measure of central tendency: the first value (in sorted order)
#   at which the cumulative weight reaches at least half of the total weight.

#   Step 1. Sort the data values
#   Step 2. Pair the values and weights
#   Step 3. Find the total weight
#   Step 4. Find the mid-point of the total weight (total weight / 2)
#   Step 5. Find the cumulative weight (add all sorted weights one by one keeping a running total)
#   Step 6. Return the first value whose cumulative weight is >= the mid-point found in step 4

def weighted_median(df, data_col, weights):
    """Return the (lower) weighted median of ``df[data_col]``.

    Rows are sorted by ``data_col`` in ascending order; the result is the
    first value at which the running total of ``df[weights]`` reaches at
    least half of the overall weight. No interpolation is performed, so the
    result is always a value that occurs in ``df[data_col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the values and their weights.
    data_col : column label
        Column whose weighted median is computed.
    weights : column label
        Column holding the weight of each row; weights must be non-negative.

    Returns
    -------
    scalar
        The element of ``df[data_col]`` selected as the weighted median.

    Raises
    ------
    ValueError
        If any weight is negative, or if the total weight is not positive
        (which includes an empty frame) - the weighted median is undefined
        in those cases.
    """
    df_sorted = df.sort_values(data_col)
    sorted_weights = df_sorted[weights]

    # A negative weight makes the cumulative sum non-monotonic, so the
    # "first row reaching half the weight" rule would be meaningless.
    if (sorted_weights < 0).any():
        raise ValueError(f"weights in column {weights!r} must be non-negative")

    total_weight = sorted_weights.sum()
    # With zero total weight every row trivially satisfies the cutoff test and
    # the minimum value would be returned silently; fail loudly instead.
    if not total_weight > 0:
        raise ValueError(
            f"total weight in column {weights!r} must be positive to compute a weighted median"
        )

    cutoff = total_weight / 2
    cumsum = sorted_weights.cumsum()
    return df_sorted.loc[cumsum >= cutoff, data_col].iloc[0]

# Weighted median of the total medal counts, weighting each team by its gold medals
w_median = weighted_median(df=medal_count, data_col="Total", weights="Gold")
print("Weighted median of 'Total' medals for 'Gold' as weight:", w_median)



Weighted median of 'Total' medals for 'Gold' as weight: 40


#### _Percentile_

In [8]:
# 75th percentile (upper quartile) of the total medal counts
print(np.percentile(a=medal_count['Total'], q=75))

11.0


In [9]:
# Summary statistics for every numeric column: count, mean, standard deviation,
# min, quartiles (25% / 50% / 75%) and max
medal_count.describe()

Unnamed: 0,Rank,Gold,Silver,Bronze,Total,Rank by Total
count,93.0,93.0,93.0,93.0,93.0,93.0
mean,46.333333,3.655914,3.634409,4.322581,11.612903,43.494624
std,26.219116,7.022471,6.626339,6.210372,19.091332,24.171769
min,1.0,0.0,0.0,0.0,1.0,1.0
25%,24.0,0.0,0.0,1.0,2.0,23.0
50%,46.0,1.0,1.0,2.0,4.0,47.0
75%,70.0,3.0,4.0,5.0,11.0,66.0
max,86.0,39.0,41.0,33.0,113.0,77.0
