## Working with Summary Statistics

In [1]:
import pandas as pd
from scipy.stats import iqr

In [2]:
# Short column labels: the month followed by one column per sport
col_names = ['Month', 'Golf', 'Soccer', 'Tennis', 'Hockey', 'Baseball']

# Load the CSV, skipping the first two lines of the file; the next line is
# parsed as the header row and is replaced by col_names right after.
sports = pd.read_csv("sports.csv", skiprows=2)
sports.columns = col_names

# Preview the first five rows
sports.head()

Unnamed: 0,Month,Golf,Soccer,Tennis,Hockey,Baseball
0,2004-01,45,21,13,22,24
1,2004-02,50,24,13,23,32
2,2004-03,63,27,15,23,45
3,2004-04,80,29,16,16,53
4,2004-05,82,31,17,14,52


In [3]:
# Use the Month column as the row index so that only the sport columns remain as data.
# Reassigning (instead of inplace=True) and guarding on the column's presence makes
# this cell safe to re-run: a second run is a no-op rather than a KeyError.
if 'Month' in sports.columns:
    sports = sports.set_index('Month')

In [4]:
# Dimensions of the DataFrame as a (number of rows, number of columns) tuple
sports.shape

(191, 5)

In [5]:
# Descriptive (summary) statistics for each numeric column:
# count, mean, std, min, the 25%/50%/75% percentiles, and max
sports.describe()

Unnamed: 0,Golf,Soccer,Tennis,Hockey,Baseball
count,191.0,191.0,191.0,191.0,191.0
mean,49.387435,28.853403,13.502618,14.879581,34.183246
std,18.247326,8.682073,4.067683,5.916182,14.037279
min,23.0,17.0,7.0,6.0,13.0
25%,34.0,24.0,11.0,11.0,23.5
50%,48.0,28.0,13.0,15.0,30.0
75%,59.0,32.0,16.0,18.0,48.5
max,100.0,84.0,26.0,55.0,61.0


In [6]:
# Same summary, transposed: one row per sport, one column per statistic
sports.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Golf,191.0,49.387435,18.247326,23.0,34.0,48.0,59.0,100.0
Soccer,191.0,28.853403,8.682073,17.0,24.0,28.0,32.0,84.0
Tennis,191.0,13.502618,4.067683,7.0,11.0,13.0,16.0,26.0
Hockey,191.0,14.879581,5.916182,6.0,11.0,15.0,18.0,55.0
Baseball,191.0,34.183246,14.037279,13.0,23.5,30.0,48.5,61.0


In [7]:
# Distinct values of the Golf column, in order of first appearance (not sorted).
# Note: unique() keeps NaN as one of the values when missing data is present;
# call .dropna() first if missing values should be excluded.
sports['Golf'].unique()

array([ 45,  50,  63,  80,  82, 100,  94,  85,  65,  42,  44,  61,  98,
        95,  86,  51,  43,  48,  57,  78,  76,  91,  88,  49,  41,  40,
        46,  56,  71,  77,  87,  62,  36,  39,  52,  68,  69,  90,  66,
        31,  33,  64,  74,  37,  32,  30,  58,  34,  27,  26,  35,  29,
        47,  75,  54,  55,  25,  53,  23,  59,  24,  28])

In [8]:
# Minimum value of each column (one entry per sport)
sports.min()

Golf        23
Soccer      17
Tennis       7
Hockey       6
Baseball    13
dtype: int64

In [9]:
# Maximum value of each column (one entry per sport)
sports.max()

Golf        100
Soccer       84
Tennis       26
Hockey       55
Baseball     61
dtype: int64

### Measures of Central Tendency

In [10]:
# Mean is the arithmetic average: the sum of the values divided by how many there are.
# It is a common summary of the "typical" value, but it is pulled toward extreme values.
sports.mean()

Golf        49.387435
Soccer      28.853403
Tennis      13.502618
Hockey      14.879581
Baseball    34.183246
dtype: float64

In [11]:
# Median is the middle value once the observations are sorted in ascending order.
# With an even number of observations it is the average of the two middle values.
sports.median()

Golf        48.0
Soccer      28.0
Tennis      13.0
Hockey      15.0
Baseball    30.0
dtype: float64

In [12]:
# Mode is the value that occurs most frequently in the data.
# .mode() returns a Series rather than a single number because several values
# can tie for most frequent; each tied value gets its own row.
# The mode does not have to be close to the mean or the median: in this data the
# Golf mode is 26, while the mean is about 49.4 and the median is 48.
sports['Golf'].mode()

0    26
dtype: int64

In [13]:
# Most frequent value(s) in the Tennis column
sports['Tennis'].mode()

0    12
dtype: int64

### Ranges and Percentiles

In [14]:
# Range is one indicator of spread: the largest value minus the smallest value.
# max() and min() work column by column, so the result has one entry per sport.
# The result is named value_range (not range) so the Python built-in range()
# is not shadowed for the rest of the notebook.
value_range = sports.max() - sports.min()
value_range

Golf        77
Soccer      67
Tennis      19
Hockey      49
Baseball    48
dtype: int64

In [15]:
# Summary statistics again, extending the default 25%/50%/75% rows
# with the 10th and 90th percentiles
sports.describe(percentiles=[0.10, 0.25, 0.50, 0.75, 0.90])

Unnamed: 0,Golf,Soccer,Tennis,Hockey,Baseball
count,191.0,191.0,191.0,191.0,191.0
mean,49.387435,28.853403,13.502618,14.879581,34.183246
std,18.247326,8.682073,4.067683,5.916182,14.037279
min,23.0,17.0,7.0,6.0,13.0
10%,28.0,20.0,9.0,9.0,17.0
25%,34.0,24.0,11.0,11.0,23.5
50%,48.0,28.0,13.0,15.0,30.0
75%,59.0,32.0,16.0,18.0,48.5
90%,77.0,36.0,20.0,20.0,54.0
max,100.0,84.0,26.0,55.0,61.0


In [16]:
# Interquartile Range (IQR): the 75th percentile minus the 25th percentile,
# i.e. the spread of the middle half of the observations.
# scipy's iqr is imported in the notebook's first (imports) cell.
IQR = iqr(sports['Golf'])
IQR

25.0

### Measures of Spread

In [17]:
# Standard deviation tells you how far, on average, the data points lie from the mean.
# It is a measure of spread and has the same units as the data.
# This is the *sample* standard deviation (divide by n - 1), the same quantity
# reported in the "std" row of sports.describe().
# How to calculate standard deviation

# Number of non-missing observations, taken from the data rather than hard-coded,
# so the calculation stays correct if the file gains or loses rows.
n_obs = sports['Golf'].count()

# step 1: For each observation, subtract away the mean value
diff = sports['Golf'] - sports['Golf'].mean()

# step 2: Square each difference
square_diff = diff**2

# step 3: Sum up all of these squared differences
sum_squared = square_diff.sum()

# step 4: Divide sum by the total number of observations minus 1
total_divided = sum_squared/(n_obs - 1)

# step 5: Square root of the result
squareroot_result = total_divided**0.5

# standard deviation, units are the same as the original data
squareroot_result

18.247325664605434

In [18]:
# Variance is a measure of spread, similar to standard deviation: it is the
# standard deviation squared (steps 1-4 of the standard deviation, without the square root).
# Its units are the square of the data's units, which makes it harder to interpret directly.
# This is the *sample* variance (divide by n - 1).
# How to calculate variance

# Number of non-missing observations, taken from the data rather than hard-coded,
# so the calculation stays correct if the file gains or loses rows.
n_obs = sports['Golf'].count()

# step 1: For each observation, subtract away the mean value
diff = sports['Golf'] - sports['Golf'].mean()

# step 2: Square each difference
square_diff = diff**2

# step 3: Sum up all of these squared differences
sum_squared = square_diff.sum()

# step 4: Divide sum by the total number of observations minus 1
variance = sum_squared/(n_obs - 1)

print(variance)

332.96489391016814


In [None]:
# end