### Finding the Mean
The mean is a common and intuitive way to summarize a set of numbers (the average). 

In [2]:
shortlist = [1, 2, 3]
sum(shortlist)

6

In [3]:
len(shortlist)

3

In [4]:
def calculate_mean(numbers):
    s = sum(numbers)
    N = len(numbers)
    mean = s/N
    return mean

if __name__ == '__main__':
    donations = [100, 60, 70, 900, 100, 200, 500, 500, 503, 600, 1000, 1200]
    mean = calculate_mean(donations)
    N = len(donations)
    print('Mean donation over the last {0} days is {1}'.format(N, mean))
    


Mean donation over the last 12 days is 477.75


### Finding the Median
The median is a middle value in a collection of numbers sorted in ascending order. If the length of the list of numbers is odd, the number in the middle of the list is the median. If the length of the list is even, the median is the mean of the two middle numbers. 

Ex. Given 100, 60, 70, 900, 100, 200, 500, 500, 503, 600, 1000, 1200
The middle numbers are 500 and 500, so the mean is (500 + 500 / 2) = 500.


In [5]:
samplelist = [4, 1, 3]
samplelist.sort()
samplelist

[1, 3, 4]

In [6]:
def calculate_median(numbers):
    N = len(numbers)
    numbers.sort()
    
    if N % 2 == 0:
        m1 = N/2
        m2 = (N/2) + 1
        m1 = int(m1) - 1
        m2 = int(m2) - 1
        median = (numbers[m1] + numbers[m2]/2)
    else:
        m = (N+1)/2
        m = int(m) - 1
        median = numbers[m]
        
    return median

if __name__ == '__main__':
    donations = [100, 60, 70, 900, 100, 200, 500, 500, 503, 600, 1000, 1200]
    median = calculate_median(donations)
    N = len(donations)
    print ('Median donations over the last {0} days is {1}'.format(N, median))

Median donations over the last 12 days is 750.0


### Finding the Mode and Creating a Frequency Table

The Mode is the number that occurs most frequently in a set of numbers. 

#### Finding the Most Common Elements

In [7]:
simplelist = [4, 2, 1, 3, 4]
from collections import Counter
c = Counter(simplelist)

# each member of the list is a tuple
# the first member of the first tuple is the number that occurs most frequently
# the second element of the first tuple is the number of occurances
# the remaining tuples are the values and number of occurances
print(c.most_common())

# print only the most common element
print(c.most_common(1))

# print the two most common elements
print (c.most_common(2))

# print only the the element and not the number of occurances
mode = c.most_common(1)
print(mode[0][0])

[(4, 2), (1, 1), (2, 1), (3, 1)]
[(4, 2)]
[(4, 2), (1, 1)]
4


#### Finding the Mode

In [8]:
from collections import Counter

def calculate_mode(numbers):
    c = Counter(numbers)
    mode = c.most_common(1)
    return mode[0][0]

if __name__=='__main__':
    scores = [7, 8, 9, 2, 10, 9, 9, 9, 9, 4, 5, 6, 1, 5, 6, 7, 8, 6, 1, 10]
    mode = calculate_mode(scores)
    
    print('The mode of the list of numbers is: {0}'.format(mode))

The mode of the list of numbers is: 9


#### Creating a Frequency Table

In [9]:
from collections import Counter

def frequency_table(numbers):
    table = Counter(numbers)
    print('Number\tFrequency')
    for number in table.most_common():
        print('{0}\t{1}'.format(number[0], number[1]))
        
if __name__=='__main__':
    scores = [7, 8, 9, 2, 10, 9, 9, 9, 9, 4, 5, 6, 1, 5, 6, 7, 8, 6, 1, 10]
    frequency_table(scores)
    

Number	Frequency
9	5
6	3
1	2
5	2
7	2
8	2
10	2
2	1
4	1


In [10]:
# print the frequency tables sorted by value of lowest to highest with sort()
from collections import Counter

def frequency_table(numbers):
    table = Counter(numbers)
    numbers_freq = table.most_common()
    numbers_freq.sort()
    
    print('Number\tFrequency')
    for number in numbers_freq:
        print('{0}\t{1}'.format(number[0], number[1]))
        
if __name__=='__main__':
    scores = [7, 8, 9, 2, 10, 9, 9, 9, 9, 4, 5, 6, 1, 5, 6, 7, 8, 6, 1, 10]
    frequency_table(scores)

Number	Frequency
1	2
2	1
4	1
5	2
6	3
7	2
8	2
9	5
10	2


### Measuring the Dispersion
Dispersion is an indicator of how far away numbers in a set of data are from the mean of the data set. 
#### Finding the Range of a Set of Numbers
For a list of numbers, the range is the difference between the highest number and the lowest number. 

In [11]:
def find_range(numbers):
    
    lowest = min(numbers)
    highest = max(numbers)
    r = highest - lowest
    
    return lowest, highest, r

if __name__=="__main__":
    donations = [100, 60, 70, 900, 100, 200, 500, 500, 503, 600, 1000, 1200]
    lowest, highest, r = find_range(donations)
    print('Lowest: {0} Highest: {1} Range: {2}'.format(lowest, highest, r))

Lowest: 60 Highest: 1200 Range: 1140


#### Finding the Variance and Standard Deviation
The variance is the average of the squares of the difference of each number from the mean. 
\begin{equation*}
variance = (\sum(x_1 - {\overline x})^2) / n
\end{equation*}

In [12]:
def calculate_mean(numbers):
    s = sum(numbers)
    N = len(numbers)
    mean = s/N
    
    return mean

def find_differences(numbers):
    mean = calculate_mean(numbers)
    diff = []
    for num in numbers:
        diff.append(num-mean)
        
    return diff

def calculate_variance(numbers):
    
    diff = find_differences(numbers)
    squared_diff = []
    
    for d in diff:
        squared_diff.append(d**2)
        
    sum_squared_diff = sum(squared_diff)
    variance = sum_squared_diff / len(numbers)
    
    return variance

if __name__=='__main__':
    donations = [100, 60, 70, 900, 100, 200, 500, 500, 503, 600, 1000, 1200]
    variance = calculate_variance(donations)
    
    # if the variance and standard deviation are large, the daily donations
    # vary greatly from the mean.
    print('The variance of the list of numbers is {0}'.format(variance))
    
    std = variance**0.5
    print('The standard deviation of the list of numbers is {0}'.format(std))

The variance of the list of numbers is 141047.35416666666
The standard deviation of the list of numbers is 375.5627166887931


### Calculating the Correlation Between Two Data Sets
#### Calculating the Correlation Coefficient
The correlation coefficent meansures the strength of the linear relationship. 

A correlation coefficent of 0 indicates there there is no linear correlation between the two quantities. 

A correlation coefficient of 1 (or close to 1) indicates that there is a strong positive linear correlation; a coefficient of exactly 1 is referred to as a perfect positive correlation. 

\begin{equation*}
correlation = n\sum xy - \sum x \sum y / (\sqrt (n \sum x^2 - (\sum x)^2) (n \sum y^2 - (\sum y)^2))
\end{equation*}


##### Sum of the products of the individual elements of the two sets of numbers (x, y)

\begin{equation*}
\sum xy 
\end{equation*}

##### Sum of the numbers is set x

\begin{equation*}
\sum x 
\end{equation*}

##### Sum of the numbers is set y

\begin{equation*}
\sum y
\end{equation*}

##### Square of the sum of the numbers in set x

\begin{equation*}
(\sum x)^2 
\end{equation*}

##### Square of the sum of the numbers in set y

\begin{equation*}
(\sum y)^2 
\end{equation*}

##### Sum of the squares of the numbers in set x

\begin{equation*}
\sum x^2 
\end{equation*}

##### Sum of the squares of the numbers in set y

\begin{equation*}
\sum y^2 
\end{equation*}

In [13]:
# zip function: helps calculate the sum of products from two sets of numbers
simple_list1 = [1, 2, 3]
simple_list2 = [4, 5, 6]
print("zip() returns pairs of corresponding elements")
for x,y in zip(simple_list1, simple_list2):
    print (x,y)


zip() returns pairs of corresponding elements
1 4
2 5
3 6


In [14]:
def find_corr_x_y(x, y):
    n = len(x)
    
    prod = []
    for xi, yi in zip(x, y):
        prod.append(xi*yi)
        
    sum_prod_x_y = sum(prod)
    sum_x = sum(x)
    sum_y = sum(y)
    squared_sum_x = sum_x**2
    squared_sum_y = sum_y**2
    
    x_square = []
    for xi in x:
        x_square.append(xi**2)
    x_square_sum = sum(x_square)
    
    y_square = []
    for yi in y:
        y_square.append(yi**z)
    y_square_sum = sum(y_square)
    
    numerator = n*sum_prod_x_y - sum_x*sum_y
    denominator_term1 = n*x_square_sum - squared_sum_x
    denominator_term2 = n*y_square_sum - squared_sum_y
    denominator = (denominator_term1 * denominator_term2)**0.5
    correlation = numerator / denominator

### Reading Data from Files
#### Reading Data from a Text File

In [15]:
# find the sum of numbers stored in a file
def sum_data(filename):
    s = 0
    with open(filename) as f:
        for line in f:
            s = s + float(line)
    print('Sum of the numbers: {0}'.format(s))
    
if __name__=='__main__':
    sum_data('mydata.txt')

Sum of the numbers: 5733.0


In [16]:
# Calculate the mean of numbers stored in a file
def read_data(filename):
    
    numbers = []
    with open(filename) as f:
        for line in f:
            numbers.append(float(line))
    return numbers

def calculate_mean(numbers):
    s = sum(numbers)
    N = len(numbers)
    mean = s/N
    
    return mean

if __name__=="__main__":
    data = read_data('mydata.txt')
    mean = calculate_mean(data)
    print('Mean: {0}'.format(mean))
    

Mean: 477.75


#### Reading Data from a CSV File