In [1]:
import matplotlib.pyplot as plt
# numpy for vectorized array operations
import numpy as np
# pandas for proper tabular manipulation
import pandas as pd
# scipy stats for our subversions
from scipy import stats

In [14]:
# 1. How likely is it that you roll doubles when rolling two dice?
n_simulations = 100_000
n_trials = 2
outcomes = [1, 2, 3, 4, 5, 6]

In [15]:
double_roll = np.random.choice(outcomes, (n_simulations, n_trials))

In [16]:
double_roll.T

array([[6, 5, 3, ..., 6, 3, 2],
       [4, 1, 1, ..., 2, 5, 3]])

In [17]:
double_roll[:,0]

array([6, 5, 3, ..., 6, 3, 2])

In [18]:
double_roll[:,1]

array([4, 1, 1, ..., 2, 5, 3])

In [19]:
new_double_roll = double_roll[:,0] == double_roll[:,1]
new_double_roll

array([False, False, False, ..., False, False, False])

In [20]:
new_double_roll.sum()

16808

In [22]:
len(new_double_roll)

100000

In [25]:
new_double_roll.mean()

0.16808

In [26]:
# 2. If you flip 8 coins, what is the probability of getting exactly 3 heads? 
# What is the probability of getting more than 3 heads?
outcomes = ['H', 'T']
# n_simulations = 10_000
n_trials = 8

In [27]:
three_heads = np.random.choice(outcomes, (n_simulations, n_trials))

In [28]:
three_heads[:5]

array([['H', 'H', 'T', 'T', 'T', 'T', 'T', 'T'],
       ['H', 'H', 'H', 'H', 'T', 'T', 'T', 'T'],
       ['T', 'H', 'T', 'H', 'H', 'T', 'H', 'T'],
       ['H', 'H', 'T', 'H', 'H', 'H', 'T', 'H'],
       ['H', 'H', 'H', 'H', 'T', 'T', 'H', 'H']], dtype='<U1')

In [31]:
three_head_flips = ((three_heads == 'H').sum(axis=1) == 3)
three_head_flips

array([False, False, False, ..., False,  True,  True])

In [32]:
three_head_flips.sum()

21950

In [188]:
((three_heads == 'H').sum(axis=1) == 3).sum() / len(three_heads)

0.218

In [64]:

((three_heads == 'H').sum(axis=1) == 3).mean()

0.2195

In [65]:
# another way
three_head_flips.mean()

0.2195

In [36]:
other_way_three_heads = (three_heads == 'H').sum(axis=1)
other_way_three_heads

array([2, 4, 4, ..., 4, 3, 3])

In [38]:
(other_way_three_heads == 3).mean()

0.2195

In [189]:
# What is the probability of getting more than 3 heads?
more_than_three_heads = np.random.choice(outcomes, (n_simulations, n_trials))

In [190]:
((more_than_three_heads == 'H').sum(axis=1) > 3).sum()

6355

In [191]:
((more_than_three_heads == 'H').sum(axis=1) > 3).sum() / len(more_than_three_heads)

0.6355

In [192]:
((more_than_three_heads == 'H').sum(axis=1) > 3).mean()

0.6355

In [66]:
# another way
(other_way_three_heads > 3).mean()

0.63554

In [43]:
# 3. There are approximitely 3 web development cohorts for every 1 data science cohort at Codeup. 
# Assuming that Codeup randomly selects an alumni to put on a billboard, 
# what are the odds that the two billboards I drive past both have data science students on them?
n_percentage = 0.25
n_select = 2
# n_simulation = 10_000

In [50]:
data_science_cohort = np.random.random((n_simulations, n_select))

In [51]:
data_science_cohort[:3]

array([[0.89090574, 0.67373999],
       [0.66089249, 0.00781302],
       [0.20438636, 0.06617435]])

In [52]:
data_science_selected = (data_science_cohort < n_percentage)

In [53]:
data_science_selected[:3]

array([[False, False],
       [False,  True],
       [ True,  True]])

In [54]:
(data_science_selected.sum(axis=1) == 2).mean()

0.06111

In [68]:
# another way
outcomes = ['ds', 'wd', 'wd', 'wd']
another_billboards = np.random.choice(outcomes, (n_simulations, n_select))
another_billboards

array([['ds', 'ds'],
       ['wd', 'ds'],
       ['wd', 'wd'],
       ...,
       ['ds', 'wd'],
       ['wd', 'wd'],
       ['wd', 'wd']], dtype='<U2')

In [57]:
# another way
outcomes = ['ds', 'wd']
billboards = np.random.choice(outcomes, (n_simulations, n_select), p=[0.25, 0.75])
billboards

array([['wd', 'wd'],
       ['ds', 'wd'],
       ['ds', 'wd'],
       ...,
       ['wd', 'wd'],
       ['wd', 'wd'],
       ['wd', 'ds']], dtype='<U2')

In [61]:
new_billboards = (billboards == 'ds').sum(axis=1)
new_billboards

array([0, 1, 1, ..., 0, 0, 1])

In [63]:
(new_billboards == 2).mean()

0.06319

In [None]:
# 4. Codeup students buy, on average, 3 poptart packages with a standard deviation of 1.5 a day from the snack vending machine. 
# If on monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts 
# on Friday afternoon? (Remember, if you have mean and standard deviation, use the np.random.normal) You'll need to make a 
# judgement call on how to handle some of your values

In [69]:
mean = 3
sd = 1.5
n_days = 5

In [70]:
poptarts_purchased_daily = np.random.normal(3, 1.5, (n_simulations, n_days))
poptarts_purchased_daily

array([[3.93055973, 2.77730501, 4.78382062, 3.3974185 , 3.01642459],
       [1.068082  , 2.47103279, 4.78614053, 1.42559707, 3.14564917],
       [5.44124322, 0.72017767, 1.6049281 , 1.39376781, 2.57590005],
       ...,
       [1.38244748, 2.76771626, 6.17348197, 3.68062317, 3.58827296],
       [4.4390653 , 4.1958379 , 3.53093673, 5.73877804, 1.87969396],
       [1.39305205, 1.44853021, 2.43064324, 1.11791738, 3.20339081]])

In [71]:
poptarts_purchased_weekly = poptarts_purchased_daily.sum(axis=1)
poptarts_purchased_weekly

array([17.90552845, 12.89650156, 11.73601685, ..., 17.59254183,
       19.78431194,  9.5935337 ])

In [226]:
(17 - poptarts_purchased_weekly) >= 1

array([ True, False, False, ...,  True,  True, False])

In [72]:
((17 - poptarts_purchased_weekly) >= 1).mean()

0.617

In [75]:
# another way
(poptarts_purchased_weekly < 16).mean()

0.617

In [76]:
# 5. Compare Heights

# Men have an average height of 178 cm and standard deviation of 8cm.
# Women have a mean of 170, sd = 6cm.
# Since you have means and standard deviations, you can use np.random.normal to generate observations.
# If a man and woman are chosen at random, what is the likelihood the woman is taller than the man?
man_height = 178
man_sd = 8

woman_height = 170
woman_sd = 6

In [78]:
man_length = np.random.normal(man_height, man_sd, n_simulations) 
man_length

array([178.11658034, 194.66589413, 175.45856036, ..., 165.85398067,
       171.30444752, 160.6099252 ])

In [80]:
woman_length = np.random.normal(woman_height, woman_sd, n_simulations)
woman_length

array([174.57699562, 170.1049334 , 167.70526292, ..., 159.75670734,
       161.23755253, 165.78105075])

In [81]:
(woman_length > man_length).mean()

0.21229

In [83]:
# 6. When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted 
# and the installation fails. What are the odds that after having 50 students download anaconda, no one has an 
# installation issue? 100 students?
# What is the probability that we observe an installation issue within the first 150 students that download 
# anaconda?
# How likely is it that 450 students all download anaconda without an issue?

# 50 students
outcomes = ['p', 'f']
probability = [249/250, 1/250]
n_students = 50

In [85]:
no_corrupt_file = np.random.choice(outcomes, (n_simulations, n_students), p=probability)
no_corrupt_file

array([['p', 'p', 'p', ..., 'p', 'p', 'p'],
       ['p', 'p', 'p', ..., 'p', 'p', 'p'],
       ['p', 'p', 'p', ..., 'p', 'p', 'p'],
       ...,
       ['p', 'p', 'p', ..., 'p', 'p', 'p'],
       ['p', 'p', 'p', ..., 'p', 'p', 'p'],
       ['p', 'p', 'p', ..., 'p', 'p', 'p']], dtype='<U1')

In [88]:
total_fails = (no_corrupt_file == 'f').sum(axis=1)
total_fails

array([0, 0, 0, ..., 0, 0, 2])

In [90]:
(total_fails == 0).mean()

0.81871

In [95]:
# 100 students?
n_students = 100
installs = np.random.choice(outcomes, (n_simulations, n_students), p=probability)

total_fails = (installs == 'f').sum(axis=1)
(total_fails == 0).mean()

0.67078

In [96]:
# What is the probability that we observe an installation issue within the first 150 students that download anaconda?
# 150 students
n_students = 150
installs = np.random.choice(outcomes, (n_simulations, n_students), p=probability)

# update to at least one failure
total_fails = (installs == 'f').sum(axis=1)
(total_fails > 0).mean()

0.45033

In [97]:
# How likely is it that 450 students all download anaconda without an issue?
# 450 students
n_students = 450
installs = np.random.choice(outcomes, (n_simulations, n_students), p=probability)

# update to no failure
total_fails = (installs == 'f').sum(axis=1)
(total_fails == 0).mean()

0.1648

In [111]:
# 7. There's a 70% chance on any given day that there will be at least one food truck at Travis Park. 
# However, you haven't seen a food truck there in 3 days. How unlikely is this?

# How likely is it that a food truck will show up sometime this week?

truck_chances = 0.7
n_days = 3

In [112]:
no_show_truck = np.random.random((n_simulations, n_days))
no_show_truck

array([[0.84032485, 0.09131886, 0.55522139],
       [0.95460936, 0.71962397, 0.73948612],
       [0.38433545, 0.92492784, 0.72867246],
       ...,
       [0.32367305, 0.20974638, 0.01968729],
       [0.77801062, 0.40740114, 0.16756167],
       [0.40715336, 0.81688399, 0.2443765 ]])

In [113]:
new_truck_chances = (no_show_truck < truck_chances)
new_truck_chances

array([[False,  True,  True],
       [False, False, False],
       [ True, False, False],
       ...,
       [ True,  True,  True],
       [False,  True,  True],
       [ True, False,  True]])

In [114]:
# how many times there is no truck show
(new_truck_chances.sum(axis=1) == 0).mean()

0.02742

In [118]:
# How likely is it that a food truck will show up sometime this week?
n_days = 5

In [119]:
no_show_truck = np.random.random((n_simulations, n_days))
no_show_truck

array([[0.72602781, 0.01618549, 0.00837105, 0.0963317 , 0.79433765],
       [0.64958057, 0.93689054, 0.77678875, 0.65199346, 0.97753424],
       [0.96621488, 0.12717625, 0.50455861, 0.02613516, 0.33002077],
       ...,
       [0.19181892, 0.97824417, 0.37864818, 0.71773422, 0.32132403],
       [0.7421283 , 0.56068112, 0.79510681, 0.68306108, 0.49772687],
       [0.46624691, 0.57277732, 0.2390975 , 0.65484004, 0.39954227]])

In [122]:
truck_will_show = no_show_truck < truck_chances
truck_will_show

array([[False,  True,  True,  True, False],
       [ True, False, False,  True, False],
       [False,  True,  True,  True,  True],
       ...,
       [ True, False,  True, False,  True],
       [False,  True, False,  True,  True],
       [ True,  True,  True,  True,  True]])

In [126]:
# chances of food truck will show this week
(truck_will_show.sum(axis=1) >= 1).mean()

0.99756

In [146]:
# 8. If 23 people are in the same room, what are the odds that two of them share a birthday? 
# What if it's 20 people? 40?

n_people = 23
outcomes = list(range(1, 366))

In [147]:
birthdays = np.random.choice(outcomes, (n_simulations, n_people))
birthdays

array([[  8, 110,   7, ...,  68, 364, 232],
       [309, 170, 252, ..., 204, 353, 168],
       [185, 239, 347, ...,   9, 259, 180],
       ...,
       [ 71, 255, 253, ..., 234, 118, 282],
       [233, 126, 291, ...,  15, 246, 181],
       [152, 164, 224, ..., 337, 129, 119]])

In [148]:
birthdays = pd.DataFrame(birthdays)
birthdays

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,8,110,7,357,36,128,41,256,213,59,...,140,343,161,22,192,242,65,68,364,232
1,309,170,252,343,101,363,199,121,352,150,...,245,171,56,65,107,143,3,204,353,168
2,185,239,347,334,123,82,130,144,115,354,...,226,18,260,71,46,209,10,9,259,180
3,108,336,176,26,330,109,118,311,79,277,...,324,61,261,228,119,113,326,184,336,98
4,194,329,288,274,326,232,115,116,348,81,...,51,365,347,50,246,242,248,137,91,102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,330,247,162,300,299,253,127,95,237,121,...,364,182,53,278,50,127,196,41,133,285
99996,197,201,79,311,358,240,292,131,298,117,...,211,149,186,320,316,89,351,360,256,248
99997,71,255,253,68,225,101,99,11,322,80,...,188,131,306,199,288,238,126,234,118,282
99998,233,126,291,352,232,71,21,74,240,143,...,250,140,262,58,221,198,249,15,246,181


In [149]:
# how many unique values exist for each row
birthdays.nunique(axis=1)

0        23
1        23
2        23
3        20
4        23
         ..
99995    22
99996    22
99997    23
99998    22
99999    22
Length: 100000, dtype: int64

In [150]:
(birthdays.nunique(axis=1) < n_people).mean()

0.50791

In [151]:
# What if it's 20 people?
n_people = 20
birthdays = np.random.choice(outcomes, (n_simulations, n_people))
birthdays = pd.DataFrame(birthdays)
birthdays

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,155,83,263,130,163,347,232,220,138,209,280,2,290,104,352,242,29,156,123,90
1,266,267,115,193,188,304,300,218,131,185,2,227,59,95,239,246,97,240,63,136
2,220,356,237,72,103,316,359,337,156,29,346,295,215,218,68,76,46,244,132,108
3,114,48,356,188,212,80,28,287,318,268,347,140,86,357,132,27,203,226,54,335
4,150,316,92,129,307,326,171,209,242,271,202,155,184,266,88,192,221,48,99,96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,259,342,167,89,323,47,364,182,215,246,25,359,321,189,166,316,94,161,252,238
99996,206,144,52,326,121,338,74,100,312,348,27,49,85,360,304,228,154,307,365,153
99997,211,83,204,255,112,266,157,248,52,250,297,360,206,360,186,44,90,254,67,21
99998,22,287,133,251,51,273,101,114,20,285,122,342,248,30,325,298,142,239,164,230


In [154]:
(birthdays.nunique(axis=1) < n_people).mean()

0.41375

In [155]:
# What if it's 40 people?
n_people = 40
birthdays = np.random.choice(outcomes, (n_simulations, n_people))
birthdays = pd.DataFrame(birthdays)
birthdays

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,315,304,227,64,246,302,37,49,204,139,...,74,226,313,240,70,287,292,212,315,274
1,285,270,198,16,357,322,61,92,205,125,...,228,146,145,83,61,142,260,105,173,116
2,177,286,176,116,70,86,235,16,261,320,...,264,94,166,98,210,28,362,353,124,39
3,109,102,94,35,169,205,150,261,245,46,...,212,105,266,96,126,264,32,294,199,318
4,152,4,101,194,165,248,64,295,135,319,...,308,309,279,42,106,217,347,252,221,221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,232,303,163,12,281,80,80,179,79,234,...,123,39,289,245,224,295,63,289,54,102
99996,327,212,103,61,105,351,175,293,147,213,...,224,347,4,23,290,359,156,226,163,103
99997,292,94,305,279,108,114,185,353,181,316,...,102,247,139,13,187,57,40,156,328,352
99998,203,150,79,171,191,348,252,83,136,12,...,293,311,7,224,158,209,20,229,70,140


In [158]:
(birthdays.nunique(axis=1) < n_people).mean()

0.89127