# Python for Data Science__Numpy 


## 1. Pearson's correlation $r(x,y)$
<br>$$r(x,y) = \frac{\sum_{i=1}^n (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum\limits_{i=1}^n (x_i - \bar{x})^2 \sum\limits_{i=1}^n(y_i - \bar{y})^2}} $$

In [9]:
# Find the pearson correlation between two vectors

def pearson_correlation(x, y):
    """Compute Pearson's correlation coefficient between vectors x and y.

    Parameters
    ----------
    x, y : array_like
        Two samples with the same number of elements. Each input is
        flattened first, so row vectors of shape (1, n), column vectors of
        shape (n, 1) and plain 1-D sequences are all treated as n-vectors.

    Returns
    -------
    numpy.float64
        The correlation coefficient r(x, y). The result is nan when either
        input is constant, because the denominator is then zero.

    Raises
    ------
    ValueError
        If x and y do not contain the same number of elements.
    """
    # Flatten both inputs: without this, pairing a (1, n) row vector with an
    # (n, 1) column vector would silently broadcast into an (n, n) grid of
    # products and return a wrong coefficient.
    x = np.ravel(x)
    y = np.ravel(y)
    if x.size != y.size:
        raise ValueError("x and y must contain the same number of elements")

    x_dif = x - np.mean(x)    # deviation of each x from the mean of x
    y_dif = y - np.mean(y)    # deviation of each y from the mean of y

    # numerator: sum of the products of paired deviations
    numerator = np.sum(x_dif * y_dif)
    # denominator: square root of the product of the two sums of squared deviations
    denominator = np.sqrt(np.sum(x_dif**2) * np.sum(y_dif**2))

    return numerator / denominator

import numpy as np
np.random.seed(10)            # fix NumPy's global RNG state so x and y are reproducible

n = 10                        # x and y are both NumPy arrays of shape (1, n)
x = np.random.random((1, n))  # create a 1-by-n matrix with uniform(0,1) entries
y = np.random.random((1, n))

# check my code against NumPy's own correlation coefficient (rounded to 6 decimals)
np.round(pearson_correlation(x,y),6) == np.round(np.corrcoef(x,y)[0,1],6)

True


## 2. Pearson's Correlations inside a matrix X

In [10]:
# Approach_1
def max_corr(n,m):
    """Find the maximum correlation between columns of an (n, m) random uniform(0, 1) array.

    Parameters
    ----------
    n : int
        Number of rows (observations) of the random array.
    m : int
        Number of columns (variables) of the random array; must be at least 2.

    Returns
    -------
    float
        The largest Pearson correlation coefficient among all "m choose 2"
        pairs of distinct columns, rounded to 4 decimal places.

    Raises
    ------
    ValueError
        If m < 2, because there is then no pair of columns to compare.
    """
    array = np.random.random((n, m))

    # Pearson correlation coefficient for each of the "m choose 2" pairs of
    # columns; restricting j to j > i visits every unordered pair exactly once.
    r_list = [
        np.corrcoef(array[:, i], array[:, j])[0, 1]
        for i in range(m)
        for j in range(i + 1, m)
    ]
    if not r_list:
        raise ValueError("max_corr needs at least two columns (m >= 2)")

    # Rounding is monotone, so rounding the maximum once gives the same value
    # as rounding every coefficient before taking the maximum.
    return round(max(r_list), 4)

import numpy as np
# Seed NumPy's global generator so the random array drawn inside max_corr is reproducible.
np.random.seed(10)
# Largest pairwise column correlation of a random 10-by-20 array.
max_corr(10,20)

0.7339

In [None]:
# Approach_2

import numpy as np
np.random.seed(10)  # fix NumPy's global RNG state so the max_corr call below is repeatable

def max_corr(n,m):
    """Return the largest Pearson correlation between distinct columns of a random array.

    An (n, m) array of uniform(0, 1) draws is generated, the correlation of
    every ordered pair of columns is tabulated in an (m, m) matrix, and the
    largest off-diagonal entry is returned rounded to 2 decimal places.
    """
    sample = np.random.random((n, m))
    corr_table = np.zeros((m, m))   # corr_table[a, b] holds r(column a, column b)

    # visit every (a, b) cell of the table in row-major order
    for a, b in np.ndindex(m, m):
        pair_corr = np.corrcoef(sample[:, a], sample[:, b])
        corr_table[a, b] = pair_corr[0, 1]

    # the diagonal holds each column's correlation with itself, so only the
    # off-diagonal cells take part in the maximum
    off_diagonal = np.eye(m) == 0
    largest = max(corr_table[off_diagonal])
    return round(largest, 2)

max_corr(10,20)  # largest off-diagonal column correlation of a random 10-by-20 array, rounded to 2 decimals
    

## K-nearest neighbor

Recall that the Euclidean distance between two points $x=(x_1, x_2)$ and $y=(y_1, y_2)$ is defined as
<br><br>$$d_{\mbox{Euclidean}}(x, y) = \sqrt{(x_1-y_1)^2 + (x_2-y_2)^2}$$


In [11]:
# Find the pairwise Euclidean distances
import numpy as np
import random
# NOTE: this seeds Python's built-in `random` module only; the points below
# are drawn with np.random, whose state is not affected by this call.
random.seed(10)

# Ten random 2-D points with integer coordinates between 1 and 9.
X = np.random.randint(1, 10, size=(10, 2))

# Broadcasting a (10, 1, 2) view against a (1, 10, 2) view yields every
# pairwise coordinate difference, shape (10, 10, 2).
X3d = X[:, None, :] - X[None, :, :]
dist_squared = np.square(X3d)

# Collapse the coordinate axis: squared distance between each pair of points.
sum_squared_dist = dist_squared.sum(axis=2)

# Take the square root of the whole matrix at once to get Euclidean distances.
dist_eucl = np.sqrt(sum_squared_dist)

In [12]:
# Find the three nearest neighbors of each point.
# Full ranking of every row by distance, closest first.
neighbors = np.argsort(dist_eucl, axis = 1)

# Exclude each point from its own candidate list explicitly. Simply dropping
# column 0 of the ranking assumes the point itself always sorts first, which
# fails when two points coincide (both are at distance 0 and the duplicate may
# be ranked ahead of the point itself).
dist_no_self = dist_eucl.copy()
np.fill_diagonal(dist_no_self, np.inf)

# Indices of the three smallest remaining entries of each row; the stable sort
# breaks distance ties by the lower index.
near_neighbors = np.argsort(dist_no_self, axis=1, kind="stable")[:, :3]
near_neighbors

array([[6, 7, 1],
       [6, 7, 4],
       [9, 1, 6],
       [5, 8, 2],
       [7, 1, 0],
       [3, 8, 2],
       [0, 1, 7],
       [4, 0, 1],
       [3, 5, 2],
       [2, 6, 1]])

## 1. Covariance between vectors

In [13]:
def covariance(x, y):
    """Return the sample covariance of the paired observations in x and y.

    The product of the deviations from the two means is formed for each
    index along the first axis of x, the products are summed, and the sum is
    divided by the number of products minus one. For 1-D inputs the result
    is a scalar; for (n, 1) column arrays it is a one-element array.
    """
    mean_x, mean_y = np.mean(x), np.mean(y)
    # one deviation product per entry along the first axis of x
    products = [
        (x_val - mean_x) * (y[idx] - mean_y)
        for idx, x_val in enumerate(x)
    ]
    degrees_of_freedom = len(products) - 1
    return sum(products) / degrees_of_freedom

import numpy as np
x = np.random.random((10,1))  # 10-by-1 column vector with uniform(0,1) entries
y = np.random.random((10,1))
# compare with NumPy (rounded to 6 decimals); np.cov treats each row as a variable, hence the transposes
np.round(covariance(x,y),6) == np.round(np.cov(x.T, y.T)[0,1],6)

array([ True])

## 2. Covariance matrix of matrix X
Suppose that $X$ is a matrix with n rows and m columns. In statistics, these matrices arise frequently when we collect (numerical) data on m variables which are each observed on n independent individuals. Denote the columns of $X$ by $x_1, \ldots, x_m$, so that $X = (x_1, \ldots, x_m)$. Then the covariance matrix of $X$ is defined as 

$$ \mbox{Cov}(X) = \left( \begin{array}{cccc}
Var(x_1) & Cov(x_1, x_2) & \cdots & Cov(x_1, x_m) \\
Cov(x_2, x_1) & Var(x_2) & \cdots & Cov(x_2, x_m) \\
\vdots & \vdots & \ddots & \vdots \\
Cov(x_m, x_1) & Cov(x_m, x_2) & \cdots & Var(x_m) 
\end{array} \right) $$

where Var($x_i$) is the sample variance of the entries in the $i^{th}$ column of $X$ and Cov($x_i, x_j$) ($i \neq j$) is the sample covariance of the entries in columns $i$ and $j$. 

$$ Cov(x_i, x_j) = \frac{1}{n-1}\sum\limits_{k=1}^n (x_{ki} - \bar{x}_i)(x_{kj}-\bar{x}_j)$$

That is, the covariance matrix is a square $m\times m$ matrix whose diagonal entries are the sample variances of the columns of $X$ and whose off-diagonal entries are covariances between two columns of $X$, respectively. 

In [14]:
# Approach_1

def covariance_matrix(X):
    """Find the sample covariance matrix of an (n, m) shaped array X.

    Parameters
    ----------
    X : array_like, shape (n, m)
        Data matrix with one observation per row and one variable per column.

    Returns
    -------
    numpy.ndarray, shape (m, m)
        Entry (i, j) is the sample covariance (divisor n - 1) of columns i
        and j of X; the diagonal holds the sample variances of the columns.
    """
    X = np.asarray(X)                 # also accept nested lists
    n = X.shape[0]                    # number of rows (observations)
    centered = X - X.mean(axis=0)     # subtract each column's mean; shape (n, m)

    # Entry (i, j) of centered.T . centered is sum_k centered[k, i] * centered[k, j],
    # i.e. the summed products of deviations for columns i and j. The matrix
    # product avoids building an (m, n, m) intermediate array.
    return np.dot(centered.T, centered) / (n - 1)

# check your work
# or make up some other matrix for X 
X = np.random.random((5,3))  # 5 observations of 3 variables, uniform(0,1) entries

np.array_equal(np.round(np.cov(X.T),4), np.round(covariance_matrix(X),4))  
# NumPy treats each row of the array as a separate variable, so X is transposed before calling np.cov
# again, we're rounding because your "hand" computation will differ slightly from NumPy's internal computations

True

In [None]:
# Approach_2

def covariance_matrix(X):
    """Build the sample covariance matrix of an (n, m) array X entry by entry.

    Entry (i, j) of the returned (m, m) array is the sum over the rows of the
    product of the mean-centred columns i and j, scaled by 1/(n-1).
    """
    shape = np.shape(X)
    n_rows, n_cols = shape[0], shape[1]

    # centre every column once up front instead of inside the double loop
    centred = [X[:, c] - X[:, c].mean() for c in range(n_cols)]

    result = np.zeros((n_cols, n_cols))  # filled with covariances below
    for i, col_i in enumerate(centred):
        for j, col_j in enumerate(centred):
            result[i, j] = (1/(n_rows-1)) * np.sum(col_i * col_j)
    return result
    
## check your work
X = np.random.random((5,3))  # or make up some other matrix for X 
# np.cov treats each row as a variable, so X is transposed; both sides are rounded to 4 decimals
np.array_equal(np.round(np.cov(X.T),4), np.round(covariance_matrix(X),4)) 
# again, we're rounding because your "hand" computation will differ slightly from NumPy's internal computations