In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats
import statsmodels.api as sm

# Discrete Probability Distribution

## 1. Binomial Distribution

In [2]:
### Q. n = 3, p = 0.85

# Frozen Binomial(n=3, p=0.85) distribution.
rv = stats.binom(n=3, p=0.85)

# Probability of each possible number of successes: 0, 1, 2, 3.
outcomes = np.arange(3 + 1)
proba = rv.pmf(outcomes)
proba

array([0.003375, 0.057375, 0.325125, 0.614125])

In [4]:
# Sanity check: the probabilities stored in `proba` should add up to 1.
proba.sum()

1.0

In [3]:
# Tabulate each outcome x next to its probability.
pd.DataFrame(
    {
        "x": np.arange(4),
        "proba": proba,
    }
)

Unnamed: 0,x,proba
0,0,0.003375
1,1,0.057375
2,2,0.325125
3,3,0.614125


In [7]:
### Q. p = 0.30 only 30% of students go for higher studies, n = 5 students

n, p = 5, 0.30

# Frozen Binomial(n, p), evaluated on every possible count 0..n.
rv = stats.binom(n, p)
outcomes = np.arange(n + 1)

proba = rv.pmf(outcomes)      # P(X = x)
cdf_proba = rv.cdf(outcomes)  # P(X <= x)

pd.DataFrame({"x": outcomes, "proba": proba, "cdf_proba": cdf_proba})

Unnamed: 0,x,proba,cdf_proba
0,0,0.16807,0.16807
1,1,0.36015,0.52822
2,2,0.3087,0.83692
3,3,0.1323,0.96922
4,4,0.02835,0.99757
5,5,0.00243,1.0


In [9]:
### n = 100, p = 0.68

n, p = 100, 0.68

# Frozen Binomial(n, p), tabulated over every outcome 0..n.
rv = stats.binom(n, p)
outcomes = np.arange(n + 1)

proba = rv.pmf(outcomes)      # P(X = x)
cdf_proba = rv.cdf(outcomes)  # P(X <= x)

# Keep the table in `df`; the next cell looks probabilities up from it.
df = pd.DataFrame({"x": outcomes, "proba": proba, "cdf_proba": cdf_proba})
df

Unnamed: 0,x,proba,cdf_proba
0,0,3.273391e-50,3.273391e-50
1,1,6.955955e-48,6.988689e-48
2,2,7.316795e-46,7.386682e-46
3,3,5.079075e-44,5.152942e-44
4,4,2.617311e-42,2.668840e-42
...,...,...,...
96,96,3.426710e-12,1.000000e+00
97,97,3.002787e-13,1.000000e+00
98,98,1.953343e-14,1.000000e+00
99,99,8.385565e-16,1.000000e+00


In [24]:
# Look the required probabilities up in the binomial table `df`
# (columns: x, proba, cdf_proba).
p_exactly_70 = df.loc[df.x == 70, "proba"].iloc[0]
p_at_most_70 = df.loc[df.x == 70, "cdf_proba"].iloc[0]
# P(X >= 70) = 1 - P(X <= 69)
p_at_least_70 = 1 - df.loc[df.x == 69, "cdf_proba"].iloc[0]

print("Probability of exactly 70 adults use social media =", round(p_exactly_70, 4))
print()
print("Probability of no more than 70 adults use social media =", round(p_at_most_70, 4))
print()
print("Probability of at least 70 adults use social media =", round(p_at_least_70, 4))

Probability of exactly 70 adults use social media = 0.0791

Probability of no more than 70 adults use social media = 0.7007

Probability of at least 70 adults use social media = 0.3784


## 2. Poisson Distribution

In [29]:
### 18 visits in 30 days

mu = 3  ## average visits in 5 days
x = 5   # largest count to tabulate

# Frozen Poisson(mu), evaluated on the counts 0..x.
rv = stats.poisson(mu)
counts = np.arange(x + 1)

proba = rv.pmf(counts)      # P(X = k)
cdf_proba = rv.cdf(counts)  # P(X <= k)
pd.DataFrame({"x": counts, "proba": proba, "cdf_proba": cdf_proba})

Unnamed: 0,x,proba,cdf_proba
0,0,0.049787,0.049787
1,1,0.149361,0.199148
2,2,0.224042,0.42319
3,3,0.224042,0.647232
4,4,0.168031,0.815263
5,5,0.100819,0.916082


## 3. Hypergeometric Distribution

In [46]:
R = 2 # 2 damaged in all 20
n = 5 # sample of 5
N = 20 # total mangoes

# scipy.stats.hypergeom is parameterised as (M, n, N) =
# (population size, number of "success" items in the population, number of draws).
# In this cell's notation that is (N, R, n).
# NOTE: passing (N, n, R) yields the same probabilities, because the
# hypergeometric pmf is symmetric in the success count and the draw count,
# but it mislabels the parameters of the frozen distribution.
rv = stats.hypergeom(N, R, n)

# Evaluate on 0..n damaged mangoes in the sample; counts that cannot occur
# (more than R damaged) get probability 0.
x = np.arange(n+1)

proba = rv.pmf(x)      # P(X = x)
cdf_proba = rv.cdf(x)  # P(X <= x)

pd.DataFrame({"x":x,"proba":proba, "cdf_proba":cdf_proba})

Unnamed: 0,x,proba,cdf_proba
0,0,0.552632,0.552632
1,1,0.394737,0.947368
2,2,0.052632,1.0
3,3,0.0,1.0
4,4,0.0,1.0
5,5,0.0,1.0


In [47]:
# Expected value of the frozen distribution `rv`, obtained from its `expect()` method.
expected_value = rv.expect()
expected_value

0.49999999999999994

In [48]:
R = 15 # have college degree
n = 4 # random sample
N = 25 # total applicants

# scipy.stats.hypergeom is parameterised as (M, n, N) =
# (population size, number of "success" items in the population, number of draws).
# In this cell's notation that is (N, R, n).
# NOTE: passing (N, n, R) yields the same probabilities, because the
# hypergeometric pmf is symmetric in the success count and the draw count,
# but it mislabels the parameters of the frozen distribution.
rv = stats.hypergeom(N, R, n)

# Evaluate on 0..n degree holders in the sample.
x = np.arange(n+1)

proba = rv.pmf(x)      # P(X = x)
cdf_proba = rv.cdf(x)  # P(X <= x)

pd.DataFrame({"x":x,"proba":proba, "cdf_proba":cdf_proba})

Unnamed: 0,x,proba,cdf_proba
0,0,0.016601,0.016601
1,1,0.142292,0.158893
2,2,0.373518,0.532411
3,3,0.359684,0.892095
4,4,0.107905,1.0


# Continuous Probability Distribution

## 1. Uniform Distribution

In [87]:
# Lower and upper boundary of the uniform distribution.
A = np.array([2500, 5000])

# The mean of a uniform distribution is the midpoint of its two boundaries.
mean = np.mean(A)
mean

3750.0

In [90]:
# Standard deviation of a uniform distribution: (upper - lower) * sqrt(1/12).
lower, upper = 2500, 5000
std = (upper - lower) * (1/12)**0.5
std

721.6878364870322

## 2. Normal Distribution

In [97]:
## scipy's normal distribution: loc = mean, scale = standard deviation

# Scores: mean 72, standard deviation 8.
norm = stats.norm(loc=72, scale=8)

In [98]:
# P(score > 60), computed as the complement of the cdf of the frozen normal `norm`.
1 - norm.cdf(60) # probability of scoring greater than 60

0.9331927987311419

In [99]:
# P(68 < score < 84) as a difference of two cdf values.
norm.cdf(84) - norm.cdf(68) ## probability of scoring between 68 and 84

0.624655260005155

In [101]:
## ppf (percent point function) is the inverse of the cdf: it maps a probability to the matching percentile value
norm.ppf(0.90) ## 90th percentile: minimum score needed to place in the top 10%

82.2524125243568

In [102]:
# 25th percentile of the frozen normal `norm`: the value below which 25% of the probability lies.
norm.ppf(0.25)

66.60408199843134

In [103]:
### Q.1 probability that the demand is above 20
### Q.2 probability that the demand is below 15
### Q.3 How much should be bought to meet the 90% of the demand

mean, std = 12, 3.2

# Frozen normal distribution of demand, reused by the three answers that follow.
norm = stats.norm(loc=mean, scale=std)

In [104]:
## 1. P(demand > 20)
norm.sf(20) # survival function: sf(x) = 1 - cdf(x)

0.006209665325776132

In [106]:
## 2. P(demand < 15)
norm.cdf(15)

0.8257492881194576

In [107]:
## 3. 90th percentile of demand: the quantity that demand stays below with probability 0.90
norm.ppf(0.90)

16.100965009742723

## 3. Exponential Distribution

### Formulas

$$ f(x) = \lambda e^{-\lambda x}$$
$$ P(X\leq x) = 1 - e^{-\lambda x} $$
$$ P(X>x) = e^{-\lambda x} $$

# Interval Estimation

https://www.statology.org/confidence-intervals-python/

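- For a population mean with known $\sigma$ (the interval is centred on the sample mean $\bar{x}$, which the code below stores in `mu`)
$$ UCI = \bar{x} + z_{\alpha /2} \frac{\sigma} {\sqrt n}$$
$$ LCI = \bar{x} - z_{\alpha /2} \frac{\sigma} {\sqrt n}$$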

In [10]:
mu = 106           # centre of the interval
sigma = 15         # standard deviation used for the standard error
n = 22             # sample size
z_alpha_2 = 1.645  # critical z value for a 90% confidence interval

# Margin of error = z * sigma / sqrt(n); the interval is centre +/- margin.
margin = z_alpha_2*(sigma/n**0.5)
uci = mu + margin
lci = mu - margin

lci, uci

(100.73927232392532, 111.26072767607468)

### Student's t - distribution

In [11]:
# Sample of 25 observations held in a single `cars` column.
df = pd.DataFrame(
    {
        'cars': [
            97, 87, 102, 113, 112,
            117, 78, 98, 111, 89,
            93, 83, 82, 90, 92,
            79, 94, 96, 101, 96,
            97, 96, 113, 99, 98,
        ]
    }
)
df

Unnamed: 0,cars
0,97
1,87
2,102
3,113
4,112
5,117
6,78
7,98
8,111
9,89


In [12]:
# Mean and standard deviation of the `cars` column (pandas' `std` uses ddof=1 by default).
mean, std = df["cars"].mean(), df["cars"].std()

In [14]:
# Standard error of the mean of the `cars` column, used as the scale of the t interval.
standard_error = stats.sem(df.cars)

In [18]:
## 90% confidence interval from Student's t distribution, centred on `mean` and scaled by `standard_error`
## NOTE: the `df=` keyword is the t distribution's degrees of freedom (sample size - 1);
## it is unrelated to the DataFrame that is also named `df`.
stats.t.interval(0.90, df=len(df)-1, loc=mean, scale=standard_error)

(92.85972516071553, 100.18027483928445)

In [19]:
# Six monthly stock prices, January to June.
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun']
prices = [71, 73, 76, 78, 81, 75]

dataframe = pd.DataFrame({'month': months, 'stock_price': prices})
dataframe

Unnamed: 0,month,stock_price
0,jan,71
1,feb,73
2,mar,76
3,apr,78
4,may,81
5,jun,75


In [20]:
# Mean and standard deviation of the stock prices (pandas' `std` uses ddof=1 by default).
mean, std = dataframe["stock_price"].mean(), dataframe["stock_price"].std()

mean, std

(75.66666666666667, 3.559026084010437)

In [22]:
## 90% confidence interval from Student's t distribution:
## degrees of freedom = number of rows - 1, centred on `mean`, scaled by the standard error of the mean
stats.t.interval(0.90, df=len(dataframe)-1, loc=mean, scale=stats.sem(dataframe.stock_price))

(72.73886925906302, 78.59446407427032)

In [24]:
## Q. 

data = pd.DataFrame({'year':[2010, 2011, 2012, 2013, 2014],
                    'electronics':[17, -8, 4, 39, 38],
                    'utilities':[11, 13, 7, 21, 22]})
data = data.set_index('year')
data

Unnamed: 0_level_0,electronics,utilities
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,17,11
2011,-8,13
2012,4,7
2013,39,21
2014,38,22


In [26]:
## 99% confidence interval

# Column-wise centre and standard error, so one interval is produced per column of `data`.
column_means = data.mean(axis=0)
column_sems = stats.sem(data)

lci, uci = stats.t.interval(0.99, df=len(data)-1, loc=column_means, scale=column_sems)

In [27]:
# Lower confidence bounds, one entry per column of `data`.
lci

array([-24.62208345,   1.42433179])

In [28]:
# Upper confidence bounds, one entry per column of `data`.
uci

array([60.62208345, 28.17566821])

In [29]:
### (lower, upper) bounds for electronics, the first column of `data`
lci[0], uci[0]

(-24.622083447836552, 60.62208344783655)

In [30]:
### (lower, upper) bounds for utilities, the second column of `data`
lci[1], uci[1]

(1.424331788752216, 28.175668211247785)

### Confidence Interval for the Proportion

- Population Proportion

In [31]:
## successes = 7, trials = 25, alpha = 0.10 or 90% confidence interval
sm.stats.proportion_confint(count=7, nobs=25, alpha=0.10)

(0.13229251063732272, 0.42770748936267733)

In [35]:
# Same 7 successes in 25 trials, now with alpha = 0.01.
sm.stats.proportion_confint(count=7, nobs=25, alpha=0.01) # 99% confidence interval

(0.048691102223379495, 0.5113088977766206)