# Reference Sheet for DSC 10

Side notes: 
* should `bpd.DataFrame.append()` be added to the **Building and Organizing DataFrames** section? probably no? I feel that it is not introduced in this class (dsc80)
* replace % of data in certain standard deviation ranges with a for loop?
* add NaN values and 0 in the example DataFrame `pets`?
* add a dictionary section and a data format conversion section?

<a id='table of contents'></a>
## Table of Contents
### !! Table of Contents does not work in Github !!
1. [Building and Organizing DataFrames](#building-and-organizing-dataframes)
2. [Accessing Data](#accessing-data)
3. [Series Methods](#series-methods)
4. [Arrays and NumPy](#arrays-and-numpy)
5. [Querying](#querying)
6. [Plotting](#plotting)
7. [Grouping](#grouping)
8. [Writing Functions](#writing-functions)
9. [Applying Functions](#applying-functions)
10. [for-loops and if-statements](#for-loops-and-if-statements) #split this into two sections
11. [Random Sampling](#random-sampling)
12. [Bootstrapping and Confidence Intervals](#bootstrapping-and-confidence-intervals)
13. [Statistics, Hypothesis Testing, and Permutation Testing](#statistics-hypothesis-testing-and-permutation-testing)
14. [Spread of a Distribution](#spread-of-a-distribution)
15. [Standard Units, Correlation, Regression](#standard-units-correlation-regression)
16. [The Standard Normal Distribution](#the-standard-normal-distribution)
17. [The Central Limit Theorem](#the-central-limit-theorem)

In [None]:
import babypandas as bpd
import numpy as np
import matplotlib.pyplot as plt

<a id='building-and-organizing-dataframes'></a>
## Building and Organizing DataFrames
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
pets = bpd.read_csv("pets.csv")
pets

In [None]:
pets = bpd.DataFrame().assign(
    Species=['dog', 'cat', 'cat', 'dog', 'dog', 'hamster', 'hamster'],
    Color=['black', 'golden', 'black', 'white', 'black', 'black', 'golden'],
    Weight=[40, 15, 20, 80, 25, 1, 0.25],
    Age=[5, 8, 9, 2, 0.5, 3, 0.2]
)
pets

In [None]:
pets = pets.assign(ID=['dog_001', 'cat_001', 'cat_002', 'dog_002', 'dog_003', 'ham_001', 'ham_002'],
                   New_column_1=['this', 'is', 'a', 'new', 'column', 'I', 'assigned'], 
                  New_column_2=['this', 'is', 'another', 'new', 'column', 'I', 'assigned'])
pets

In [None]:
pets = pets.drop(columns=['New_column_1', 'New_column_2'])
pets

In [None]:
pets=pets.set_index('ID')
pets

In [None]:
pets=pets.reset_index()
pets

In [None]:
pets.sort_values(by='Weight', ascending=True)

In [None]:
pets.sort_values(by='Age', ascending=False)

In [None]:
pets_info = bpd.DataFrame().assign(
    Pet_ID=['dog_001', 'cat_001', 'cat_002', 'dog_002', 'dog_003', 'ham_001'],
    Breed=['Labrador', 'Sphinx', 'Siamese', 'Chihuahua', 'Labrador', 'Roborovski'],
    Owner=['Jason', 'Lauren', 'Cassidy', 'Bobby', 'Dhruv', 'Cassidy']
)
pets_info

In [None]:
pets.merge(pets_info, left_on='ID', right_on='Pet_ID')

In [None]:
pets_info = pets_info.set_index('Pet_ID')
pets.merge(pets_info, left_on='ID', right_index=True)

In [None]:
pets=pets.set_index('ID')
pets

<a id='accessing-data'></a>
## Accessing Data
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
pets.shape[0], pets.shape[1]

In [None]:
pets.get('Species')

In [None]:
#new
pets.get(['Species'])

In [None]:
pets.get(['Species', 'Color'])

In [None]:
species_ser=pets.get('Species')
species_ser

In [None]:
species_ser.loc['dog_002']

In [None]:
species_ser.iloc[2]

In [None]:
species_ser.iloc[-1]

In [None]:
pets.index[0]

In [None]:
pets.index[-3]

In [None]:
pets.take([0, 3])

In [None]:
pets.take(np.arange(0, 6, 2))

In [None]:
pets[[True, False, True, False, True, False, True]]

In [None]:
weight_ser = pets.get('Weight')
weight_ser

<a id='series-methods'></a>
## Series Methods
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
weight_ser.count()

In [None]:
weight_ser.max()

In [None]:
weight_ser.min()

In [None]:
weight_ser.sum()

In [None]:
weight_ser.mean()

In [None]:
weight_ser.median()

In [None]:
weight_ser.unique()

<a id='arrays-and-numpy'></a>
## Arrays and NumPy
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
weight_arr = np.array(weight_ser)
weight_arr

In [None]:
np.append(weight_arr, 2)

In [None]:
#to do: add 0s and NaNs
np.count_nonzero(weight_arr)

In [None]:
np.arange(1, 9, 2)

In [None]:
np.arange(10, 2, -3)

In [None]:
np.percentile(weight_arr, 50)

## Dictionary (new)
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

## Data Format Conversion (new)
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

<a id='querying'></a>
## Querying
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
(pets.get('Species')=='dog') & (pets.get('Color')=='white')

In [None]:
(pets.get('Species')=='dog') | (pets.get('Color')=='white')

In [None]:
pets[(pets.get('Species')=='dog') & (pets.get('Color')=='white')]

In [None]:
pets[(pets.get('Species')=='dog') | (pets.get('Color')=='white')]

In [None]:
pets[pets.get('Weight') >= 25]

In [None]:
pets[(pets.get('Weight') >= 25) &  (pets.get('Weight') < 80)]

In [None]:
pets[pets.get('Color').str.contains('e')]

In [None]:
pets[pets.index.str.contains('cat')]

In [None]:
pets = pets.reset_index()
pets[pets.index > 3]

<a id='plotting'></a>
## Plotting
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
pets.plot(kind='scatter', x='Age', y='Weight')

In [None]:
pets.plot(kind='hist', y='Age', bins=np.arange(0, 15, 3), density=True)

In [None]:
pets.plot(kind='hist', y='Weight', bins=5, density=True)

<a id='grouping'></a>
## Grouping
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
pets.groupby('Species').count()

In [None]:
pets.groupby('Species').mean()

In [None]:
pets.groupby('Species').median()

In [None]:
pets.groupby('Species').sum()

In [None]:
pets.groupby('Species').max()

In [None]:
pets.groupby('Species').min()

In [None]:
pets.groupby(['Species', 'Color']).count().reset_index()

<a id='writing-functions'></a>
## Writing Functions
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
def descriptive_name(df):
    """Build a '<Color>_<Species>' label from the 'Color' and 'Species' entries of df.

    The two values are fetched with df.get and joined with an underscore
    using `+`, so the result is whatever that addition produces for the
    retrieved values (element-wise when they are string columns).
    """
    color_part = df.get('Color')
    species_part = df.get('Species')
    return color_part + '_' + species_part
descriptive_name(pets)

In [None]:
#to do: change to a more complex task
def age_in_days(ser):
    """Convert an age in years to an age in days, counting 365 days per year.

    Despite its name, `ser` is a single age value when this function is
    passed to Series.apply, which calls it once per element.
    """
    days_per_year = 365
    return days_per_year * ser
pets.get('Age').apply(age_in_days)

In [None]:
def more_descriptive_name(id_str, species, color, weight, age):
    """Return a one-sentence description of a pet.

    Parameters:
        id_str: the pet's identifier, placed before the colon.
        species: the pet's species.
        color: the pet's color.
        weight: the pet's weight in lbs, as a string or a number.
        age: the pet's age in years, as a string or a number.

    Returns:
        A string of the form
        '<id_str>: This <color> <species> weighs <weight> lbs and is <age> years old'.
    """
    # An f-string formats each value with str(), so numeric weight/age no
    # longer raise a TypeError; string arguments give the same text as before.
    return f'{id_str}: This {color} {species} weighs {weight} lbs and is {age} years old'

<a id='applying-functions'></a>
## Applying Functions
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
chosen_pet = pets.iloc[0]
more_descriptive_name(chosen_pet.get('ID'), chosen_pet.get('Species'), chosen_pet.get('Color'), \
                      str(chosen_pet.get('Weight')), str(chosen_pet.get('Age')))



<a id='for-loops-and-if-statements'></a>
## for-loops and if-statements
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
#to do: make it two different examples, one for for loop and one for if else
# Visit every row of pets by position and print one line per pet.
for i in np.arange(pets.shape[0]):
    # Pull out the i-th row, then read the values the branches below need.
    chosen_pet = pets.iloc[i]
    pet_id = chosen_pet.get('ID')
    species = chosen_pet.get('Species')
    age = chosen_pet.get('Age')
    weight = chosen_pet.get('Weight')
    
    # Checked first, so a young and light pet is reported as a baby
    # regardless of its species.
    if (age < 1) and (weight < 1):
        print(pet_id + ': This pet is a baby :)')
        
    # Only reached when the first condition is False.
    elif (species == 'dog') or (species == 'cat'):
        color = chosen_pet.get('Color')
        # Convert the numbers to text before handing them to more_descriptive_name.
        weight = str(weight)
        age = str(age)
        print(more_descriptive_name(pet_id, species, color, weight, age))
        
    # Everything that is neither a baby nor a dog/cat.
    else:
        print(pet_id + ': This pet is not a dog or a cat')

<a id='random-sampling'></a>
## Random Sampling
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
example_array = np.array([1, 2, 3, 4, 5])

# choice 

np.random.choice(example_array, 3, replace=True)

In [None]:
np.random.choice(example_array, 3, replace=False)

In [None]:
np.random.choice(example_array, 3, replace=True, p=[0.05, 0.05, 0.05, 0.05, 0.8])

In [None]:
np.random.choice(example_array, 3, replace=True, p=[0, 0, 0, 0, 1])

In [None]:
# multinomial

np.random.multinomial(100, [0.2, 0.5, 0.3])

In [None]:
np.random.multinomial(100, [0.1, 0.1, 0.8]) 

In [None]:
# permutation

# Print five independent random shufflings of example_array.
# The original label always used the suffix 'st' ("2st", "3st", ...); look up
# the proper ordinal suffix instead. 'th' is the fallback, which is correct
# for the counts 4 and 5 used here.
ordinal_suffixes = {1: 'st', 2: 'nd', 3: 'rd'}
for i in range(5):
    suffix = ordinal_suffixes.get(i + 1, 'th')
    print(f'{i + 1}{suffix} random permutation:')
    print(np.random.permutation(example_array), end='\n\n')

In [None]:
pets.sample(3, replace=True)

In [None]:
pets.sample(3, replace=False)

<a id='bootstrapping-and-confidence-intervals'></a>
## Bootstrapping and Confidence Intervals
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
pets

In [None]:
extra_pets = bpd.DataFrame().assign(
    ID=['dog_004', 'dog_005', 'dog_006', 'cat_003', 'cat_004', 'ham_003', 'dog_007', 'ham_004', 'dog_008', 'dog_009', 'cat_005'],
    Species=['dog', 'dog', 'dog', 'cat', 'cat', 'hamster', 'dog', 'hamster', 'dog', 'dog', 'cat'],
    Color=['black', 'white', 'golden', 'black', 'white', 'black', 'white', 'golden', 'black', 'white', 'black'],
    Weight=[45, 10, 35, 10, 15, 0.5, 50, 0.25, 40, 30, 5],
    Age=[6.7, 7.0, 4.0, 1.2, 2.9, 0.1, 6.1, 0.2, 5.0, 4.8, 0.5]
)

all_pets = pets.append(extra_pets, ignore_index=True)

all_pets

In [None]:
# Magic to ensure that we get the same results every time this code is run. 
np.random.seed(42)

# sample
pets_sample = all_pets.sample(12, replace=False)
pets_sample

In [None]:
print('Median of pets weight:', all_pets.get('Weight').median())
print('Median of pets_sample weight:', pets_sample.get('Weight').median())

In [None]:
# Bootstrap the median weight: repeat 10,000 times, collecting one median
# per resample into boot_medians.
boot_medians = np.array([])
for i in np.arange(10000):
    # 1. resample the data: draw as many rows as pets_sample has, with replacement
    resample = pets_sample.sample(pets_sample.shape[0], replace=True)

    # 2. calculate the median of the resample
    boot_median = resample.get('Weight').median()

    # 3. append the median to the array
    #    (np.append returns a new array, so the result must be reassigned)
    boot_medians = np.append(boot_medians, boot_median)

In [None]:
# Get the 90% confidence interval
left = np.percentile(boot_medians, 5) # 5th percentile
right = np.percentile(boot_medians, 95) # 95th percentile

In [None]:
left, right

In [None]:
# Plot the histogram of boot_medians
plt.hist(boot_medians, bins=20, density=True)

plt.show()

We are 90% confident that the population median weight falls between 15 and 40.

<a id='statistics-hypothesis-testing-and-permutation-testing'></a>
## Statistics, Hypothesis Testing, and Permutation Testing
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

Our pair of hypotheses is:
* **Null Hypothesis:** The mean weights of dogs and cats are the *same*.
* **Alternative Hypothesis:** The mean weights of dogs and cats are *different*.

In other words:
* **Null Hypothesis:** $\mu_{dog\_weight} = \mu_{cat\_weight}$ <br>
* **Alternative Hypothesis:** $\mu_{dog\_weight} \neq \mu_{cat\_weight}$

Since the alternative hypothesis is of the form "A and B are different," the test statistic should measure distance and should contain an absolute value.

***

$\therefore$ Use **absolute difference in group means** as the test statistic.

In [None]:
cats_dogs = all_pets[(all_pets.get('Species') == 'dog') | (all_pets.get('Species') == 'cat')]
cats_dogs

In [None]:
def difference_in_means(cats_dogs):
    """Return the absolute difference between the mean 'Weight' of dogs and of cats.

    cats_dogs is grouped by 'Species' and averaged; the 'dog' and 'cat'
    entries of the resulting 'Weight' column are subtracted and the absolute
    value of that difference is returned.
    """
    mean_weights = cats_dogs.groupby('Species').mean().get('Weight')
    dog_mean = mean_weights.loc['dog']
    cat_mean = mean_weights.loc['cat']
    return np.abs(dog_mean - cat_mean)

In [None]:
# Simulate the test statistic under the null hypothesis: n times, shuffle the
# 'Species' labels and record the resulting absolute difference in mean weights.
n = 500
statistics = np.array([])
for i in np.arange(n):
    # 1. Shuffle the species.
    #    assign replaces the 'Species' column with a random reordering of itself;
    #    the other columns are left as they are.
    shuffled = cats_dogs.assign(Species=np.random.permutation(cats_dogs.get('Species')))

    # 2. Compute the test statistic.
    statistic = difference_in_means(shuffled)

    # 3. Save the result.
    statistics = np.append(statistics, statistic)

In [None]:
# 4. Calculate the p-value
observed = difference_in_means(cats_dogs)
p_value = np.count_nonzero(statistics >= observed) / n

print("The observed value of the test statistic is:", observed)
print("The p-value is:", p_value)

Using an alpha level of 0.05...

**Conclusion:** 
* Under the null hypothesis, we rarely see a difference greater than 26.444 units.
* Therefore, we reject the null hypothesis: the evidence implies that the groups do not come from the same distribution.
* Still, we cannot conclude that species causes a different weight because there may be other factors at play.

<a id='spread-of-a-distribution'></a>
## Spread of a Distribution
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

$$\text{SD} = \sqrt{\frac{(value_1-mean)^2 + \dots + (value_n - mean)^2}{n}}$$

In [None]:
weights = all_pets.get('Weight')

standard_deviation = np.std(weights, ddof=0)
standard_deviation

In [None]:
# how to implement the standard deviation formula
# Compute the SD by hand, following the formula above.
mean = weights.mean()
# Sum of squared deviations from the mean.
numerator = 0
for value in weights.values:
    numerator += (value - mean) ** 2
# Divide by the number of rows in all_pets (weights is a column of all_pets).
variance = numerator / (all_pets.shape[0])
# The SD is the square root of the variance.
variance ** 0.5 

In [None]:
weights.plot(kind='hist', bins=5, density=True)

Chebyshev's Inequality states that in any dataset, at least $1 - \frac{1}{z^2}$ of the data falls within *z* SDs of the mean.

| **Range**        | **All Distributions** | **Normal Distribution**|
|:-----------------|:----------------------|:-----------------------|
|mean $\pm$ 1 SD   |at least 0%            | about 68%              |
|mean $\pm$ 2 SD   |at least 75%           | about 95%              |
|mean $\pm$ 3 SD   |at least 88%           | about 99.73%           |

In [None]:
import matplotlib.pyplot as plt

In [None]:
sd_1left = mean - 1*standard_deviation
sd_1right = mean + 1*standard_deviation

plt.hist(weights, bins=5, density=True, alpha=0.6)
plt.axvline(sd_1right, color='green', linestyle='dashed', linewidth=2)
plt.axvline(sd_1left, color='green', linestyle='dashed', linewidth=2)

plt.show()

print(str(((weights > sd_1left) & (weights < sd_1right)).sum()/all_pets.shape[0] * 100) + '% of the data falls within 1 standard deviation of the mean')


In [None]:
# Endpoints of the interval mean +/- 2 SDs.
sd_2left = mean - 2*standard_deviation
sd_2right = mean + 2*standard_deviation

# Histogram of the weights with the two endpoints drawn as dashed lines.
plt.hist(weights, bins=5, density=True, alpha=0.6)
plt.axvline(sd_2right, color='green', linestyle='dashed', linewidth=2)
plt.axvline(sd_2left, color='green', linestyle='dashed', linewidth=2)

plt.show()

# Percentage of weights strictly between the two endpoints.
# (Message fixed to use the plural "standard deviations".)
print(str(((weights > sd_2left) & (weights < sd_2right)).sum()/all_pets.shape[0] * 100) + '% of the data falls within 2 standard deviations of the mean')

In [None]:
# Endpoints of the interval mean +/- 3 SDs.
sd_3left = mean - 3*standard_deviation
sd_3right = mean + 3*standard_deviation

# Histogram of the weights with the two endpoints drawn as dashed lines.
plt.hist(weights, bins=5, density=True, alpha=0.6)
plt.axvline(sd_3right, color='green', linestyle='dashed', linewidth=2)
plt.axvline(sd_3left, color='green', linestyle='dashed', linewidth=2)

plt.show()

# Percentage of weights strictly between the two endpoints.
# (Message fixed to use the plural "standard deviations".)
print(str(((weights > sd_3left) & (weights < sd_3right)).sum()/all_pets.shape[0] * 100) + '% of the data falls within 3 standard deviations of the mean')

<a id='standard-units-correlation-regression'></a>
## Standard Units, Correlation, Regression
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
def standard_units(col):
    """Convert col to standard units.

    Each value has the column mean subtracted and is then divided by the
    column's standard deviation as computed by np.std.
    """
    deviations = col - col.mean()
    spread = np.std(col)
    return deviations / spread

def calculate_r(df, x, y):
    """Return the correlation coefficient r between columns x and y of df.

    Both columns are converted to standard units; r is the mean of their
    element-wise product.
    """
    products = standard_units(df.get(x)) * standard_units(df.get(y))
    return products.mean()

def slope(df, x, y):
    """Return the slope of the regression line predicting column y from column x of df.

    The slope is in original units: r * (SD of y) / (SD of x).
    """
    correlation = calculate_r(df, x, y)
    sd_y = np.std(df.get(y))
    sd_x = np.std(df.get(x))
    return correlation * sd_y / sd_x

def intercept(df, x, y):
    """Return the intercept of the regression line predicting column y from column x of df.

    The intercept is in original units: (mean of y) - slope * (mean of x).
    """
    mean_y = df.get(y).mean()
    mean_x = df.get(x).mean()
    return mean_y - slope(df, x, y) * mean_x



In [None]:
r = calculate_r(all_pets, 'Age', 'Weight')
m = slope(all_pets, 'Age', 'Weight')
b = intercept(all_pets, 'Age', 'Weight')

In [None]:
def predict_weight(age):
    """Predict the weight of a pet that is `age` years old.

    Uses the regression line with slope m and intercept b computed in the
    previous cell.
    """
    predicted = m * age + b
    return predicted

In [None]:
all_predictions = np.array([])

for age in all_pets.get('Age').values:
    all_predictions = np.append(all_predictions, predict_weight(age))

In [None]:
plt.scatter(x=all_pets.get('Age'), y=all_pets.get('Weight'))
plt.plot(all_pets.get('Age'), all_predictions, color='red')

<a id='the-standard-normal-distribution'></a>
## The Standard Normal Distribution
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
# Import the stats submodule explicitly: a bare `import scipy` is not
# guaranteed to load scipy.stats on every SciPy version. This still binds the
# name `scipy`, so the `scipy.stats.norm.cdf(...)` calls below work unchanged.
import scipy.stats

In [None]:
print("Area from -inf to -2:", scipy.stats.norm.cdf(-2))
print("Area from -inf to -1:", scipy.stats.norm.cdf(-1))
print("Area from -inf to 0:", scipy.stats.norm.cdf(0))
print("Area from -inf to 1:", scipy.stats.norm.cdf(1))
print("Area from -inf to 2:", scipy.stats.norm.cdf(2))

In [None]:
print("Area from -1 to 1:", scipy.stats.norm.cdf(1) - scipy.stats.norm.cdf(-1))
print("Area from -2 to 2:", scipy.stats.norm.cdf(2) - scipy.stats.norm.cdf(-2))
print("Area from -3 to 3:", scipy.stats.norm.cdf(3) - scipy.stats.norm.cdf(-3))

<a id='the-central-limit-theorem'></a>
## The Central Limit Theorem
<small>(<a href="#table of contents">return to the table of contents</a>)</small>

In [None]:
# By the Central Limit Theorem, the distribution of possible sample means has
#     SD = (population SD) / sqrt(sample size).
# The original divided by the number of rows in the population, which makes
# the SD -- and the confidence interval built from it -- far too small.
sample_size = 12  # must match the size of the sample drawn from all_pets below
sd_dist_possible_sample_means = np.std(all_pets.get('Weight')) / np.sqrt(sample_size)
sd_dist_possible_sample_means

In [None]:
#this sample size may not be large enough
pets_sample = all_pets.sample(12, replace=True)
pets_sample

In [None]:
# 95% confidence interval
left = pets_sample.get('Weight').mean() - 2*sd_dist_possible_sample_means
right = pets_sample.get('Weight').mean() + 2*sd_dist_possible_sample_means

print("95% confidence interval: " + "[" + str(left) + ", " + str(right) + "]")

In [None]:
print("Population mean: " + str(all_pets.get('Weight').mean()))