In [13]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
# Create two populations with similar binomial distributions
pop1 = np.random.binomial(10, 0.2, 10000)
pop2 = np.random.binomial(10, 0.5, 10000)

# Sampling each population, with 100 samples
sample1 = np.random.choice(pop1, 100, replace=True)
sample2 = np.random.choice(pop2, 100, replace=True)

sample1_mean = sample1.mean()
sample1_std = sample1.std()
sample2_mean = sample2.mean()
sample2_std = sample2.std()

print('Sample 1 (100 samples) mean is {}'.format(sample1_mean))
print('Sample 1 (100 samples) std is {}'.format(round(sample1_std, 2)))
print('Sample 2 (100 samples) mean is {}'.format(sample2_mean))
print('Sample 2 (100 samples) std is {}'.format(round(sample2_std, 2)))

Sample 1 (100 samples) mean is 1.97
Sample 1 (100 samples) std is 1.25
Sample 2 (100 samples) mean is 5.22
Sample 2 (100 samples) std is 1.67


> For each of the following tasks, first write what you expect will happen, then code the changes and observe what does happen. 

> 1. Increase the size of your samples from 100 to 1000, then calculate the means and standard deviations for your new samples and create histograms for each.

**With a larger sample size, the sample means will vary less and be closer to the population mean after each run (if one were to run the code multiple times).  The standard deviation of each should decrease.**

In [11]:
# Sampling each population, with 1000 samples
sample1_1k = np.random.choice(pop1, 1000, replace=True)
sample2_1k = np.random.choice(pop2, 1000, replace=True)

sample1_1k_mean = sample1_1k.mean()
sample1_1k_std = sample1_1k.std()
sample2_1k_mean = sample2_1k.mean()
sample2_1k_std = sample2_1k.std()

print('Sample 1 (1000 samples) mean is {}'.format(sample1_1k_mean))
print('Sample 1 (1000 samples) std is {}'.format(round(sample1_1k_std, 2)))
print('Sample 2 (1000 samples) mean is {}'.format(sample2_1k_mean))
print('Sample 2 (1000 samples) std is {}'.format(round(sample2_1k_std, 2)))

Sample 1 (1000 samples) mean is 2.001
Sample 1 (1000 samples) std is 1.24
Sample 2 (1000 samples) mean is 4.987
Sample 2 (1000 samples) std is 1.59


As expected the sample means are closer to the population mean than with 100 samples. The standard deviations have also decreased slightly, but not as much as expected.  It seems like 100 samples was probably a large enough sample size to converge on the sample mean.



> Decrease the sample size to 20 and discuss the changes.

**I expect that the sample means will not have converged close to the population means yet and the standard deviations my be larger than the previous samples**

In [12]:
# Sampling each population, with 20 samples
sample1_20 = np.random.choice(pop1, 20, replace=True)
sample2_20 = np.random.choice(pop2, 20, replace=True)

sample1_20_mean = sample1_20.mean()
sample1_20_std = sample1_20.std()
sample2_20_mean = sample2_20.mean()
sample2_20_std = sample2_20.std()

print('Sample 1 (20 samples) mean is {}'.format(sample1_20_mean))
print('Sample 1 (20 samples) std is {}'.format(round(sample1_20_std, 2)))
print('Sample 2 (20 samples) mean is {}'.format(sample2_20_mean))
print('Sample 2 (20 samples) std is {}'.format(round(sample2_20_std, 2)))

Sample 1 (20 samples) mean is 2.1
Sample 1 (20 samples) std is 1.09
Sample 2 (20 samples) mean is 4.75
Sample 2 (20 samples) std is 1.37


Interestingly, the standard deviations are smaller than previous samples, but that is likely just the luck of this random sample draw. The means are not as close to the population means as previously seen, however, they are closer than I anticipated.

>2. Change the probability value (p in the NumPy documentation) for pop1 to 0.3, then take new samples and compute the t-statistic and p-value.

In [14]:
pop1_p3 = np.random.binomial(10, .3, 1000)
sample_p3_100 = np.random.choice(pop1_p3, 100, replace=True)

from scipy.stats import ttest_ind
print(ttest_ind(sample2, sample_p3_100, equal_var=False))

Ttest_indResult(statistic=9.42377677756789, pvalue=1.2439282968548267e-17)


> Then change the probability value p for group 1 to 0.4, and do it again. What changes, and why?

In [16]:
pop1_p4 = np.random.binomial(10, .4, 1000)
sample_p4_100 = np.random.choice(pop1_p4, 100, replace=True)

print(ttest_ind(sample2, sample_p4_100, equal_var=False))

Ttest_indResult(statistic=5.454837424065928, pvalue=1.4524653697494402e-07)


The t-statistic decreases which says that the population means are closer together.  The p-value increase slightly which means that its a bit less likely that this difference meaningfully represents the populations.


>3. Change the distribution of your populations from binomial to a distribution of your choice. Do the sample mean values still accurately represent the population values?

In [56]:
# Creating 2 log normal distributions
pop1_lognormal = np.random.lognormal(0, 1, 10000) # mean=0, std=1 - of underlying normal dist
pop2_lognormal = np.random.lognormal(1, 1, 10000) # mean=1, std=1 - of underlying normal dist

# Sampling each population, with 100 samples
sample1_lognormal = np.random.choice(pop1_lognormal, 100, replace=True)
sample2_lognormal = np.random.choice(pop2_lognormal, 100, replace=True)

sample1_lognormal_mean = sample1_lognormal.mean()
sample1_lognormal_std = sample1_lognormal.std()
sample2_lognormal_mean = sample2_lognormal.mean()
sample2_lognormal_std = sample2_lognormal.std()

print('Pop1 lognormal mean is {}'.format(round(pop1_lognormal.mean())))
print('Pop2 lognormal mean is {}'.format(round(pop2_lognormal.mean())))

print('Sample 1 lognormal mean is {}'.format(round(sample1_lognormal_mean, 2)))
print('Sample 1 lognormal std is {}'.format(round(sample1_lognormal_std, 2)))
print('Sample 2 lognormal mean is {}'.format(round(sample2_lognormal_mean, 2)))
print('Sample 2 lognormal std is {}'.format(round(sample2_lognormal_std, 2)))

Pop1 lognormal mean is 2.0
Pop2 lognormal mean is 5.0
Sample 1 lognormal mean is 1.3
Sample 1 lognormal std is 1.39
Sample 2 lognormal mean is 4.91
Sample 2 lognormal std is 6.42


In [57]:
print(ttest_ind(sample2_lognormal, sample1_lognormal, equal_var=False))

Ttest_indResult(statistic=5.467123440719485, pvalue=2.963581089146306e-07)


The sample means are moderately close to the population means, however, it seems like a larger sample size is needed with a log normal distribution beacuse it is significantly different than a normal distribution. 