### Chapter 1: Summary Statistics

In [None]:
""" 
Types of Statistics
- Descriptive Statistics - describe and summarizes the data
- Inferential Statistics -  use of data to make inferences about a larger population

Types of Data
- Numeric (Quantitative)
    - Continuous: Measured - Airplane, Time Spent
    - Discrete: Counted - Number of Pets, Number of Packages Shipped

- Categorical (Qualitative)
    - Nominal: Unordered
    - Ordinal: Ordered
    
    
Continuous vs Discrete vs Categorical

"""

In [None]:
""" 
The field of statistics - the practice or study of collecting and analyzing data
Summary Statistics - a fact about or summary of some data.
"""

In [None]:
# Import numpy with alias np
import numpy as np

food_consumption = None

# Filter for Belgium
be_consumption = food_consumption[food_consumption['country'] == 'Belgium']

# Filter for USA
usa_consumption = food_consumption[food_consumption['country'] == 'USA']

# Calculate mean and median consumption in Belgium
print(np.mean(be_consumption['consumption']))
print(np.median(be_consumption['consumption']))

# Calculate mean and median consumption in USA
print(np.mean(usa_consumption['consumption']))
print(np.median(usa_consumption['consumption']))

In [None]:
# Import numpy as np
import numpy as np

# Subset for Belgium and USA only
be_and_usa = food_consumption[(food_consumption['country'] == 'Belgium') | (food_consumption['country'] == 'USA')]

# Group by country, select consumption column, and compute mean and median
print(be_and_usa.groupby("country")["consumption"].agg([np.mean, np.median]))

In [None]:
# Measure the Spread of the data
# - Standard Deviation: np.std(df["columns"], ddof=1)
# - Variance: np.var(df["columns"], ddof=1)
# - Mean Absolute Deviation

# standard deviation squares distances, penalizing longer distances more than shorter ones.
# Standard Deviation is more common other than Mean Absolute Deviate

In [None]:
# Quantiles
# Interquartile Range (IQR)
# Outliers

In [None]:
df = None

# Quantiles
np.quantile(df["columns"], 0.5)

In [None]:
np.linspace()

### Measure of Spread (Location based within the data)

Measures how apart or close together the data points are.
- Variance
- Standard Deviation
- Mean Absolute Deviation

- Quantiles | Percentiles
- Interquartile Range (IQR)
- Outliers

Note
- Standard Deviation is more common than Mean Absolute Deviation.

In [None]:
# Variance on a sample population
np.var(df["columns"], ddof=1)

In [None]:
# Standard Deviation
np.std(df["columns"], ddof=1)

In [None]:
# Mean Absolute Deviation
# dists = msleep["sleep_total"] - mean(msleep$sleep_total)
# np.mean(np.abs(dists))

In [None]:
# Quantiles
np.quantile(df["column"], 0.5)  # Same as the median
np.quantile(df["column"], [0,0.25,0.5,0.75,1])

In [None]:
# Interquartile Range
np.quantile(df["column"], 0.75) - np.quantile(df["column"], 0.25)

from scipy.stats import iqr
iqr(df["column"])

In [None]:
# Outliers - data point that is substantially different from the others
# Formula
# data < Q1 - 1.5 x IQR or data > Q3 + 1.5 x IQR

# Process
# - calculate the IQR
# - calculate the lower threshold
# - calculate the upper threshold
# - subset or slice the dataframe to get the dataframe

In [None]:
# Calculate the IQR
from scipy.stats import iqr
iqr = iqr(df["columns"])

# Calculate the upper and lower threshold
lower_threshold = np.quantile(df["columns"], 0.25) - 1.5 * iqr
upper_threshold = np.quantile(df["columns"], 0.75) + 1.5 * iqr

# Subset the data (Slicing the data)
df[(df["columns"]<lower_threshold) | (df["columns"]>upper_threshold)]

In [None]:
# Calculating everything
df["columns"].describe

In [2]:
import numpy as np
np.linspace(0, 1, 10)

array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ])

In [None]:
# EXAMPLE

# Print variance and sd of co2_emission for each food_category
print(food_consumption.groupby('food_category')['co2_emission'].agg([np.var, np.std]))

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt

# Create histogram of co2_emission for food_category 'beef'
food_consumption[food_consumption['food_category'] == 'beef']['co2_emission'].hist()
# Show plot
plt.show()

# Create histogram of co2_emission for food_category 'eggs'
food_consumption[food_consumption['food_category'] == 'eggs']['co2_emission'].hist()
# Show plot
plt.show()

### Chapter 2: Random Numbers and Probability

In [4]:
np.random.seed(10)

In [None]:
# With or Without Replacement

In [None]:
# Count the deals for each product
product_count = df["product"].value_count()

# Calculate the probability of picking a deal with each product
product_probability = product_count / df.shape[0]

In [None]:
# Setting the random seed
np.random.seed(24)

# Sample 5 deals without replacement
sample = df.sample(5)

# Sample 5 deals with replacement
sample = df.sample(5, replace=True)

In [6]:
# Discrete Distribution - Video
1/5

0.2

In [None]:
# Sampling from Discrete Distribution
die = None
rolls_10 = None

die.sample(10, replace=True)

# Visualizing the die sample
rolls_10["number"].hist(bins=np.linspace(1,7,7))

# Law of Large Number - as the size of your sample increases, the sample\
    # mean will approach the expected value

In [None]:
# Sampling from Discrete Distribution

# # Create probability distribution
# size_dist = restaurant_groups['group_size'].value_counts() / restaurant_groups.shape[0]

# # Reset index and rename columns
# size_dist = size_dist.reset_index()
# size_dist.columns = ['group_size', 'prob']

# # Expected value
# expected_value = np.sum(size_dist['group_size'] * size_dist['prob'])

# # Subset groups of size 4 or more
# groups_4_or_more = size_dist[size_dist['group_size'] >= 4]

# # Sum the probabilities of groups_4_or_more
# prob_4_or_more = np.sum(groups_4_or_more['prob'])
# print(prob_4_or_more)

In [None]:
# Sampling from Continuous Distribution


In [None]:
""" 
Types of distribution.
- Discrete
- Continuous
- Binomial 

"""

In [2]:
# Binomial Distributions
# Outcome based on two values
# - True or False
# - Pass or Fail 
# - Head or Tail

# Probability distribution of the number of successes in a sequence of independent trials.

# Number of heads in a sequence of coin flips
# - n: total number of trials
# - p: probability of success


from scipy.stats import binom

In [3]:
# Flip 1 coin with 50% chance of success 8 times. 
binom.rvs(1, 0.5, size=8)

array([0, 1, 1, 0, 1, 0, 1, 1])

In [None]:
# Flip 8 coins with 50% chance of success 1 time
binom.rvs(8, 0.5, size=1)

In [None]:
# Flip 3 coins with 50% chance of success 10 times

# Flip 3 coins with 25% chance of success 10 times

In [4]:
# Probability of 7 heads - Getting more probability of 7 heads
# binom.pmf(num_heads, num_trials, prob_of_heads)
binom.pmf(7,10,0.5)

0.11718750000000014

In [5]:
# Probability of 7 or fewer heads
# p(heads<=7)
binom.cdf(1,3,0.3)

# Probability of closing > 1 deal out of 3 deals
1 - binom.cdf(1,3,0.3)

0.9453125

## Chapter 3: Normal Distribution

Normal Distribution or the Bell Curve.
68

In [None]:
from scipy.stats import norm

# Cumulative distribution function
# What percent of women are shorter than 154?
norm.cdf(154, 161, 7)

# What percent of women are taller than 154?


# What percent of women are 154 to 157 cm?
norm.cdf(157, 161, 7) - \
    norm.cdf(154, 161, 7)


In [None]:
# What height are 90% of the women shorter than?
norm.ppf(0.90, 161, 7)

# What height are 90% of the women taller than?
norm.ppf((1-0.90), 161, 7)

In [None]:
# Generating random numbers
norm.rvs(161, 7, size=10)