In [None]:
import pyspark
# getOrCreate reuses an already-running context, so this cell can be re-run
# without failing; constructing SparkContext directly a second time raises.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [None]:
import numpy as np
import functools
from functools import reduce
from scipy.stats import kurtosis, skew
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Statistics with PySpark and RDDs

In this exercise we will use PySpark RDD transformations (map and reduce) with Python to compute, from a list of N numbers:

   - The mean - the first moment
   
   $\mu = \frac{1}{N} \sum_{i=1}^N x_i.$
        
        
  
   - The standard deviation - second moment
  
   $\sigma = \sqrt{\frac{1}{N} \sum_{i=1}^N (x_i - \mu)^2}$
        
        
        
   - The skew  - the third moment
   
   $\gamma_1 = \frac{1}{N} \sum_{i=1}^N \left[\frac{x_i - \mu}{\sigma}\right]^3$
        
        
        
   - The kurtosis - the fourth moment
  
   $\gamma_2 = \Big\{   \frac{1}{N} \sum_{i=1}^N \left[\frac{x_i - \mu}{\sigma}\right]^4  \Big\} - 3$
      

In [None]:
def test_assert():
    """Sanity check that `assert` statements run in this kernel.

    Args:
        None
    Returns:
        the integer 5, after asserting that it equals 5
    """
    value = 5
    assert value == 5
    return value


# Call the check once and confirm the returned value as well.
y = test_assert()
assert y == 5

# Define test data set X

In [None]:
# Python variables can be capital letters
# Seed NumPy's global generator so the same 100 normal samples are drawn on every run.
np.random.seed(5)
X = np.random.normal(size=100)
sns.set(color_codes=True)
# distplot is deprecated in seaborn; histplot with a KDE overlay on a density
# scale is its documented replacement (requires seaborn >= 0.11).
sns.histplot(X, kde=True, stat="density")
print(X)

# Convert a list into an RDD


In [None]:
# Distribute the samples in X as an RDD.
rdd = sc.parallelize(X)
# Peek at 5 elements without replacement; the fixed seed makes the displayed
# sample repeatable when the cell is re-run.
rdd.takeSample(False, 5, seed=5)

# First Moment

### Define a function, first_moment_term, to  compute the MEAN

 $\mu = \frac{1}{N} \sum_{i=1}^N x_i.$
 
 We will:
 
 1.  Map each term of the list to the correct moment
 2.  Sum the transformed elements over N
 

####  Define the transformation for the first moment terms 
    hint - it is the identity 

In [None]:
def first_moment_term(x):
    """Define the first moment term

    Args:
        x - An element of a list - a term
    Returns:
        transformed term - the identity in this case, x
    """
    # The mean sums the raw values, so each term is passed through unchanged.
    return x
    


In [None]:
# Spot check: the first-moment transform must hand its input back unchanged.
x = 1
new_term = first_moment_term(x)
print(new_term)
assert new_term == 1

In [None]:
def my_mean(rdd):
    """Compute the mean

    Args:
        rdd - an rdd of numbers

    Returns:
         mu - the mean of the rdd
    Raises:
        ValueError - if the rdd has no elements
    """
    n = rdd.count()
    if n == 0:
        raise ValueError("cannot compute the mean of an empty RDD")
    # The first-moment term is the element itself, so the raw values are
    # summed directly and divided by N.
    return rdd.sum() / n
    

In [None]:
mu = my_mean(rdd)
print(mu)
a = np.array(list(X))
print(np.mean(a))
# Summation order differs between Spark and NumPy, so exact equality is too
# strict. Compare within a tolerance in both directions; the np.float alias
# used previously no longer exists in current NumPy releases.
assert np.isclose(mu, np.mean(a))

# Second Moment

### Define a function, second_moment_term, to compute the standard deviation

$\sigma = \sqrt{\frac{1}{N} \sum_{i=1}^N (x_i - \mu)^2}$
 
 We will:
 
 1.  Map each term of the list to the correct moment
 2.  Sum the transformed elements over N
 
 


####  Define the transformation for the second moment terms 

$(x_i - \mu)^2$
   

In [None]:
def second_moment_term(x, mu):
    """Define the second moment term

    Args:
        x - An element of a list - a term
        mu - the mean
    Returns:
        transformed term - the squared deviation from the mean, (x - mu)**2
    """
    return (x - mu) ** 2
  

In [None]:
val = 5
mu = 3
# Pass `val`, the value defined in this cell, rather than an `x` left over
# from an earlier cell.
term = second_moment_term(val, mu)
assert term == 4


### Transform and reduce to get the standard deviation


   $\sigma = \sqrt{\frac{1}{N} \sum_{i=1}^N (x_i - \mu)^2}$
   
    
   N = rdd.count(), the number of elements in the RDD
   
   $\mu$ is the mean
   
   
##### Note:  Use Numpy for the square root  - 

      np.sqrt(x)
 

In [None]:
def my_stdev(rdd, mu):
    """Define a function to compute the standard deviation for an RDD

    Args:
        rdd - an rdd of numbers
        mu - the mean of the rdd

    Returns:
        the (population) standard deviation of the rdd
    Raises:
        ValueError - if the rdd has no elements
    """
    n = rdd.count()
    if n == 0:
        raise ValueError("cannot compute the standard deviation of an empty RDD")
    # Map every element to its squared deviation (x - mu)**2, sum over the
    # RDD, divide by N and take the square root.
    squared_deviations = rdd.map(lambda x: (x - mu) ** 2)
    return np.sqrt(squared_deviations.sum() / n)
    

In [None]:
mu = my_mean(rdd)
print(mu)
sigma = my_stdev(rdd, mu)
print(sigma)
a = np.array(list(X))
print(np.std(a))
# Compare within a tolerance in both directions: a one-sided difference would
# also accept results that are far too small, and the np.float alias used
# previously no longer exists in current NumPy releases.
assert np.isclose(mu, np.mean(a))
assert np.isclose(sigma, np.std(a))

# Third Moment



## Define a function, third_moment_term, to compute the Skew

$\gamma_1 = \frac{1}{N} \sum_{i=1}^N \left[\frac{x_i - \mu}{\sigma}\right]^3$

We will:

1. Map each term of the list to the correct moment
2. Sum the transformed elements over N


In [None]:
def third_moment_term(x, mu, sigma):
    """Define the third moment term

    Args:
        x - An element of a list - a term
        mu - the mean
        sigma - the standard deviation
    Returns:
        transformed term - the cubed standardized deviation, ((x - mu)/sigma)**3
    """
    return ((x - mu) / sigma) ** 3
  

In [None]:
# Worked example: x = 5 sits two standard deviations above mu = 3 when
# sigma = 1, so the third-moment term is 2**3 = 8.
x, mu, sigma = 5, 3, 1
y = third_moment_term(x, mu, sigma)
assert y == 8

### Transform and reduce to obtain the skew


   $\gamma_1 = \frac{1}{N} \sum_{i=1}^N \left[\frac{x_i - \mu}{\sigma}\right]^3$
   
    
   N = rdd.count(), the number of elements in the RDD
   
   $\mu$ is the mean
   
   
   $\sigma$ is the standard deviation
   
   

 

In [None]:
def my_skew(rdd, mu, sigma):
    """Compute the third moment, the skew

    Args:
        rdd - an rdd of numbers
        mu - the mean of the rdd
        sigma - the standard deviation

    Returns:
        the skew of the rdd
    Raises:
        ValueError - if the rdd has no elements
    """
    n = rdd.count()
    if n == 0:
        raise ValueError("cannot compute the skew of an empty RDD")
    # Map every element to its cubed standardized deviation, then average.
    cubed_terms = rdd.map(lambda x: ((x - mu) / sigma) ** 3)
    return cubed_terms.sum() / n
    

In [None]:
mu = my_mean(rdd)
print(mu)
sigma = my_stdev(rdd, mu)
print(sigma)
gamma_1 = my_skew(rdd, mu, sigma)
print(gamma_1)
# Build the reference array before it is used, so this cell does not depend
# on `a` having been left behind by an earlier cell.
a = np.array(list(X))
print(skew(a))
# Compare within a tolerance in both directions: a one-sided difference would
# also accept results that are far too small, and the np.float alias used
# previously no longer exists in current NumPy releases.
assert np.isclose(mu, np.mean(a))
assert np.isclose(sigma, np.std(a))
assert np.isclose(gamma_1, skew(a))

# Fourth Moment

## Define a function, fourth_moment_term, to compute the kurtosis

$\gamma_2 = \Big\{   \frac{1}{N} \sum_{i=1}^N \left[\frac{x_i - \mu}{\sigma}\right]^4  \Big\} - 3$

We will:

1. Map each term of the list to the correct moment
2. Sum the transformed elements over N

In [None]:
def fourth_moment_term(x, mu, sigma):
    """Define the fourth moment term

    Args:
        x - An element of a list - a term
        mu - the mean
        sigma - the standard deviation
    Returns:
        transformed term - the standardized deviation to the fourth power,
        ((x - mu)/sigma)**4
    """
    return ((x - mu) / sigma) ** 4
  

In [None]:
# Worked example: x = 5 sits two standard deviations above mu = 3 when
# sigma = 1, so the fourth-moment term is 2**4 = 16.
x, mu, sigma = 5, 3, 1
y = fourth_moment_term(x, mu, sigma)
assert y == 16

 ### Transform and reduce to obtain the kurtosis

   $\gamma_2 = \Big\{   \frac{1}{N} \sum_{i=1}^N \left[\frac{x_i - \mu}{\sigma}\right]^4  \Big\} - 3$

    
   N = rdd.count(), the number of elements in the RDD
   
   $\mu$ is the mean
   
   
   $\sigma$ is the standard deviation
   

In [None]:
def my_kurtosis(rdd, mu, sigma):
    """Define the fourth moment, the kurtosis

    Args:
        rdd - an rdd of numbers
        mu - the mean of the rdd
        sigma - the standard deviation

    Returns:
        the (excess) kurtosis of the rdd
    Raises:
        ValueError - if the rdd has no elements
    """
    n = rdd.count()
    if n == 0:
        raise ValueError("cannot compute the kurtosis of an empty RDD")
    # Map every element to its standardized deviation raised to the fourth
    # power, average, then subtract 3 as in the formula for gamma_2.
    fourth_power_terms = rdd.map(lambda x: ((x - mu) / sigma) ** 4)
    return fourth_power_terms.sum() / n - 3
    

In [None]:
mu = my_mean(rdd)
print(mu)
sigma = my_stdev(rdd, mu)
print(sigma)
gamma_1 = my_skew(rdd, mu, sigma)
print(gamma_1)
gamma_2 = my_kurtosis(rdd, mu, sigma)
print(gamma_2)
a = np.array(list(X))
# Compare within a tolerance in both directions: a one-sided difference would
# also accept results that are far too small, and the np.float alias used
# previously no longer exists in current NumPy releases.
assert np.isclose(mu, np.mean(a))
assert np.isclose(sigma, np.std(a))
assert np.isclose(gamma_1, skew(a))
# The kurtosis check keeps the looser float32-epsilon absolute tolerance.
assert np.isclose(gamma_2, kurtosis(a), atol=np.finfo(np.float32).eps)
print("Congratulations you passed!")
