# ITNPBD2 Representing and Manipulating Data

# NumPy
## Array processing in Python

* Mostly used for numeric data
* Installed by default in Anaconda Base Distribution
* Use `pip install numpy` if you don't have it
* Can handle 1D, 2D or higher dimensional arrays. Here we stick to 1D and 2D

In [4]:
import numpy as np
import timeit

# Build a 1D NumPy array from a plain Python list and display it.
values = [1, 2, 3]
x = np.array(values)
print(x)

# IPython `%timeit` magics (only valid inside IPython/Jupyter, not plain Python).
# The first times building a NumPy array from a three-element list literal;
# the second times evaluating the equivalent plain Python list on its own.
%timeit y = np.array([3, 4, 5])
%timeit y = ([3, 4, 5])

[1 2 3]
849 ns ± 14.4 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
44.3 ns ± 0.793 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


# Indexing
* Integer index. In 2D the order is row, column
* Slicing using `:` to specify a whole row or column

In [None]:
# A 3x3 array used throughout the indexing examples.
y = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)

# Each pair is (label, selection); 2D indexes are given as row, column.
for label, selection in (
    ("Array", y),
    ("Row 1", y[1]),
    ("Row 1, Column 2 intersect", y[1, 2]),
    ("Column 1", y[:, 1]),
):
    print(label, selection)

# More Slicing
* `s:e` to slice from `s` to `e-1`

In [None]:
# Slice bounds held in variables; the end bound is exclusive.
s, e = 0, 2

# Row slices: a literal range first, then the same range via s and e.
print("Rows 0 to 1", y[0:2])
print("The same, but with variables defining start and end", y[s:e])

# Column slice (all rows), then a slice on both axes at once.
print("Columns 1 and 2", y[:, 1:3])
print("The top left square of 2 by 2", y[0:2, 0:2])

# Conditional Selection
* Select those elements of an array that satisfy a condition
* Or select with a mask - a NumPy array of `True` or `False`

In [None]:
# Keep only the elements of y that are greater than 5.
gr_five = y[y > 5]
print(gr_five)

In [None]:
# The comparison alone yields a boolean array with the same shape as y.
print(y > 5)

In [None]:
# Hand-written boolean mask: True marks the positions to select from y.
my_sel = np.array(
    [
        [True, False, False],
        [False, True, False],
        [False, False, True],
    ]
)
print(my_sel)
print(y[my_sel])

# Arrays of indexes
* Give an array of rows, then an array of columns
* Extract the values from the intersection locations

In [None]:
# Pair each row index with the column index at the same position,
# which extracts locations [0,0], [1,1], [2,2].
rows = [0, 1, 2]
cols = [0, 1, 2]
print(y[rows, cols])

# NumPy Array Shapes
## Specified by a tuple
* In 2D the tuple is (rows, cols)
* In 1D it is `(elements,)` - note the trailing `,`, which shows this is still a tuple
* Higher dimensions have more entries: (x,y,z) etc.

## Define an array by shape

In [None]:
# Create a 2-row by 3-column array filled with ones (floating point by default).
ones = np.ones(shape=(2, 3))
print(ones)

# Integer variant, left disabled. The `np.int` alias has been removed from
# NumPy, so the builtin `int` is used as the dtype here instead.
# int_ones = np.ones((2, 3), dtype=int)
# print(int_ones)

In [None]:
# A 1D array of five random floats; the shape is the one-element tuple (5,).
rand_ar = np.random.random(size=(5,))
print(rand_ar)

# Get the shape with `shape`
# Reshape an existing array with `reshape`

In [None]:
# Start with a 2x4 array.
z = np.array(
    [
        [1, 2, 3, 4],
        [5, 6, 7, 8],
    ]
)

# The shape is available as an attribute or through np.shape(z).
print(z.shape)
# print(np.shape(z))

# Rearrange the same eight values into 4 rows of 2.
z = z.reshape((4, 2))
print(z)

## Now we can see a higher dimensional example
* `arange` creates an array of numbers spanning a given range
* Then we reshape that into a 3D 3 by 3 by 3 array

In [None]:
# The integers 0..26 laid out as a 3 x 3 x 3 array.
flat = np.arange(27)
a = np.reshape(flat, (3, 3, 3))
print(a)

# Elementwise Maths
* Simple operators like `+,-,*,/` are overloaded in NumPy to operate on whole arrays, one element at a time

In [None]:
# Add 10 to every element; the result is a new array rebound to y.
y = np.add(y, 10)
print(y)

# Select and Operate
* Combine selection and elementwise operations to operate only on selected elements

In [None]:
# Add 100 in place, but only to the elements currently greater than 13.
above_13 = y > 13
y[above_13] += 100
print(y)

# Reading and Writing With Files
* Read a whole file into a NumPy array
* Write data to a new file

In [None]:
# Read the whole CSV into a NumPy array, skipping the single header row.
# The path uses a forward slash: the backslash form only resolves to a
# sub-directory on Windows and depends on "\s" not being a recognised escape
# sequence, which newer Python versions warn about.
sleep = np.loadtxt("data/sleep.csv", skiprows=1, delimiter=",")
sleep

In [None]:
# Write the array to a new CSV file with a header line naming each column.
# The path uses a forward slash so it works on every platform and avoids the
# invalid "\s" escape sequence of the backslash form.
np.savetxt(
    "data/sleep2.csv",
    sleep,
    delimiter=",",
    header="Exercise Minutes,Coffees,Av HR,Eat after 9pm,Steps,Age,Hours awake,Day,Sleep Rating",
)

In [None]:
# Write the same file again, this time formatting every value with "%d" so the
# numbers are saved as integers.
# The path uses a forward slash so it works on every platform and avoids the
# invalid "\s" escape sequence of the backslash form.
np.savetxt(
    "data/sleep2.csv",
    sleep,
    delimiter=",",
    header="Exercise Minutes,Coffees,Av HR,Eat after 9pm,Steps,Age,Hours awake,Day,Sleep Rating",
    fmt="%d",
)

# Aggregation and Other Functions
* Aggregating arrays or parts of arrays with functions like `sum` and `average`
* Note that `sleep[:,i]` selects the `i`th column

In [None]:
# Every row of column 0 (the exercise-minutes column).
exercise_minutes = sleep[:, 0]
print(exercise_minutes)

In [None]:
# Aggregate whole columns: mean of column 0 and total of column 1.
print("Average exercise minutes:", sleep[:, 0].mean())
print("Total Coffees:", sleep[:, 1].sum())

## Last variable is `sleep rating`
* Let's find the average sleep rating for 3 coffees and for 0 coffees

In [None]:
# Range of the sleep-rating column (index 8).
print("Max sleep rating:", sleep[:, 8].max())
print("Min sleep rating:", sleep[:, 8].min())

# Range of the coffees column (index 1).
print("Max Coffees:", sleep[:, 1].max())
print("Min Coffees:", sleep[:, 1].min())

## Extract Only rows where coffees==0
## Find Average
## Then same for coffees==3

In [None]:
# Row subsets chosen by the value in the coffees column (index 1).
no_coffee = sleep[sleep[:, 1] == 0]
lots_coffee = sleep[sleep[:, 1] == 3]

# For each subset, show its rows and then its average sleep rating (column 8).
for subset in (no_coffee, lots_coffee):
    print(subset)
    print(np.average(subset[:, 8]))

# Broadcasting
* When performing an arithmetic operation with 2 arrays of different size
* Smaller of the 2 is broadcast across the bigger to line them up

## Simple version - multiply by a scalar

# Extract Indices of interest with `where`
## Then extract a given column using the indices to get the rows

In [None]:
# The name refers to high-coffee days, so match 3 coffees (the original
# condition compared against 0, which selects the no-coffee rows instead).
# np.where(condition) returns a tuple holding one index array per dimension.
lots_coffee_indices = np.where(sleep[:, 1] == 3)
print(lots_coffee_indices)

# Index with the row-index array itself so the result is a 1D array of sleep
# ratings; indexing with the whole tuple would add a leading axis of length 1.
print(sleep[lots_coffee_indices[0], 8])