# NUMPY PRACTICE - MSDS 7331 DATA MINING
## Cory Nichols

In [166]:
# provides for standard mathematical functions for fast operations on entire arrays of data without having
# to write loops
# linear algebra, random number gen, fourier transformation capabilities
# tools for integrating code written in C, C++ and Fortran

import numpy as np
from numpy.random import randn
import pandas as pd

In [173]:
# lets start with something called an ndarray
# ndarray is a fast and space efficient multidimensional array providing vectorized arithmetic operations
# and sophisticated broadcasting capabilities

anArray = np.array(np.arange(20)) # np.arange is like range() in pure Python, returns an array
print anArray, '\n----------'
aManualArray = np.array([[1,2,3,4,5],[6,7,8,9,10]]) # a manual 2D array
print aManualArray
print 'the shape of the array is', aManualArray.shape
print 'the dimensions of the array are', aManualArray.ndim, 'dimensions' # check num of dimensions

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] 
----------
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]]
the shape of the array is (2, 5)
the dimensions of the array are 2 dimensions


In [179]:
# easily apply vector operations without having to write a loop:
multiArray = anArray * 20
print multiArray, '\n--------'
print anArray / 2, '\n--------'
print anArray ** 2, '\n--------'
print anArray.shape, 'shows that this array has', anArray.shape[0],  'rows and no columns'

 [  0  20  40  60  80 100 120 140 160 180 200 220 240 260 280 300 320 340
 360 380] 
--------
[0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9] 
--------
[  0   1   4   9  16  25  36  49  64  81 100 121 144 169 196 225 256 289
 324 361] 
--------
(20,) shows that this array has 20 rows and no columns


In [189]:
# array.reshape(rows,cols) will create a matrix out of the array
# recreate array to 4 rows by 5 cols
anArray = anArray.reshape(4,5)
print anArray, anArray.shape, anArray.ndim, '\n------'
print anArray[1], anArray[1].shape, anArray[1].ndim, '\n------' 
# results in a one dimensional array w/5 elements: the row at position 1 of the 4x5 matrix
# also reduces dimensionality since an integer index was used: [5,6,7,8,9] comes back as a 1D array
print anArray[1:2,:], anArray[1:2,:].shape, anArray[1:2,:].ndim,'\n------' 
# gets row at position 1 and all columns, keeps dimensionality to 2D
# in 2 dimensions, results in a 1x5 array: 1 row with 5 cols

 [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]] (4, 5) 2 
------
[5 6 7 8 9] (5,) 1 
------
[[5 6 7 8 9]] (1, 5) 2 
------


In [None]:
# np.array tries to infer a suitable data type (mostly int or float) from the data passed to it
# can check data types with variable.dtype
print anArray.dtype
# make sure to pass tuples...
anotherArray = np.zeros((5,4))
print anotherArray
# create an "empty" array: np.empty allocates memory without initializing it, so the values are arbitrary, not zeros
emptyArray = np.empty((10,20))
print emptyArray
# create a 1D array with arange
rangeArray = np.array(np.arange(20))
print rangeArray
# reshape it to a 4x5 matrix
rangeArray = np.array(np.arange(20)).reshape(4,5)
print rangeArray
# identity matrix, identity matrices are always square, so pass one integer
# eg: 5 = 5x5
identMatrix = np.identity(5)
print identMatrix

In [190]:
# explicitly casting and changing data types
# can change data types with array.astype(new_dtype)
arr1 = np.array([1,2,3], dtype = np.float64)
print arr1.dtype
# dtypes are powerful, most map directly onto an underlying machine representation
# makes them fast, easy to read and write binary streams of data to disk and also connect to code
# written in a low level language like C or Fortran
# beware of floating precision as always
# cast an array as another type
arr1 = arr1.astype(np.int64)
arr1.dtype

float64


dtype('int64')

In [191]:
# lets convert an array of numbers represented as strings back to numbers
numStrings = np.array(['25.3','97.5','32.6','100.0'])
print numStrings.dtype
print 'dtype before conversion: ', numStrings.dtype
numStrings = numStrings.astype('float64')
print 'dtype after conversion:', numStrings.dtype


|S5
dtype before conversion:  |S5
dtype after conversion: float64


In [192]:
# you can also cast an array based on another array's dtype
int_array = np.arange(10)
print int_array.dtype
calibers = np.array([.22,.270,.357,.380,.44,.5], dtype=np.float64)
int_array = int_array.astype(calibers.dtype)
print 'The array is now a float array:'
print int_array.dtype

int64
The array is now a float array:
float64


### Calling astype always creates a new array -- a copy of the data, even if the new dtype is the same as the old type

In [198]:
# arrays are critical because they enable you to express batch operations on data without writing for loops
# which is just awesome
# this is called, formally, vectorization
# ANY arithmetic operation between equal size arrays applies the operation ELEMENTWISE

# lets create a 2D array
array = np.array([[1.,2.,3.], [4.,5.,6.]])
print array.ndim
print array.shape
print array, '\n--------'
# elementwise multiplication (NOT a dot product): array[1] is the row [4,5,6], and it multiplies every row of array
print array*array[1], '\n--------'
# elementwise subtraction
print array-array, '\n--------'
# still elementwise (NOT matrix multiplication): array - 2*array gives -array
print array-(array*2), '\n--------'
# scalar operations
print array * 5

2
(2, 3)
[[ 1.  2.  3.]
 [ 4.  5.  6.]] 
--------
[[  4.  10.  18.]
 [ 16.  25.  36.]] 
--------
[[ 0.  0.  0.]
 [ 0.  0.  0.]] 
--------
[[-1. -2. -3.]
 [-4. -5. -6.]] 
--------
[[  5.  10.  15.]
 [ 20.  25.  30.]]


### operations between different sized arrays (matrices) is called BROADCASTING 

In [199]:
# indexing and slicing NumPy arrays
# 1D arrays act similarly to Python lists:
arr = np.array([1,2,3,4,5])
# negative indices count back from the end, starting at -1 (the last element), not 0
print arr[-1]
print arr[0:2]
# up to but not including, as usual
arr[2:4] = 50
print arr

5
[1 2]
[ 1  2 50 50  5]


### IMPORTANT NOTE: array slices in NumPy are simply VIEWS to the original array in memory
### BEWARE: ANY changes to the array via a slice will propagate back to the original array

In [None]:
# lets use the original array "arr" like a pirate
array_slice = arr[1:3]
print array_slice
# now check this out
array_slice[0] = 9999
print arr
# the ORIGINAL array, arr, now has 9999 in position 1 because we edited array_slice directly
# this is similar to Java's reference side effect, whereby an array is stored in memory
# if we make edits to the array, within another variable, it points to the same space in memory, and thus edits the
# original array
# numpy does this for performance reasons, to better deal with larger data sets, avoiding copying tons of records
# and storing additional memory



In [None]:
# if you want a true copy, explicitly call it out with .copy()

array_slice = arr[1:3].copy()
print arr
array_slice[0] = 12345
print arr
# no effect, new space in memory
del array_slice

In [201]:
# higher dimensional array slicing
# elements at each index are no longer scalars but rather one-dimensional arrays
arr2d = np.array([[1,2,3],[4,5,6],[7,8,9]])
print 'get array in position 2 of 2-dimensional array: ', arr2d[2]
print 'get 0th position array, and the scalar in the 2nd position:', arr2d[0][2]
print 'same as above, except with comma sep list:', arr2d[0,2]
print '''
Two Dimensional Arrays or Matrices look and index to the following diagram
Rows are axis 0, cols are axis 1

         Axis 1
           0    1    2
Axis 0  0
        
        1
        
        2
'''

get array in position 2 of 2-dimensional array:  [7 8 9]
get 0th position array, and the scalar in the 2nd position: 3
same as above, except with comma sep list: 3

Two Dimensional Arrays or Matrices look and index to the following diagram
Rows are axis 0, cols are axis 1

         Axis 1
           0    1    2
Axis 0  0
        
        1
        
        2



In [208]:
# if you omit later indices, the result will be a lower dimensional ndarray consisting of all the data along
# the higher dimensions
# create a 3D array
arr3d = np.array([[[1,2,3], [4,5,6]], [[7,8,9], [10,11,12]]])
print arr3d
print arr3d.ndim, 'dimensions \n--------'
# get first 2D array
print arr3d[0], '\n---------'
# get element 1 in the first array of the first 2D array (whew!)
print 'element 1 in first array:', arr3d[0,0,1], '\n---------'
# change the first 2d array to 99
arr3d[0] = 99
print arr3d, '\n---------'
arr3d[0].ndim
print arr3d, '\n---------'
print arr3d[1,0]

[[[ 1  2  3]
  [ 4  5  6]]

 [[ 7  8  9]
  [10 11 12]]]
3 dimensions 
--------
[[1 2 3]
 [4 5 6]] 
---------
element 1 in first array: 2 
---------
[[[99 99 99]
  [99 99 99]]

 [[ 7  8  9]
  [10 11 12]]] 
---------
[[[99 99 99]
  [99 99 99]]

 [[ 7  8  9]
  [10 11 12]]] 
---------
[7 8 9]


In [215]:
# indexing with slices
# a slice selects a range of elements among an axis
# you can pass multiple slices like you can pass multiple indexes
print arr2d, '\n---------------'
print arr2d[2:].shape, ' keeps dimensionality (2D) because its a slice \n---------------'
print arr2d[2:].ndim, 'dimensions \n---------------'
# when slicing like this, you always get same number of dimensions back
# by mixing integer indexes with slices, you get lower dimensional slices
print arr2d[1,:2], 'reduces dimensions to 1 \n---------------'
print arr2d[1,:2].shape
print arr2d[1,:2].ndim, 'dimensions \n---------------'
# slice only higher dimensional axes with colons
print arr2d[:,:1], 'keeps dimensions since only slices! \n---------------'
print arr2d[:,:1].shape, '\n---------------'
print arr2d[:,:1].ndim, 'dimensions \n---------------'
# set some slices to zero
arr2d[:2,1:] = 0
print arr2d

[[1 0 0]
 [4 0 0]
 [7 8 9]] 
---------------
(1, 3)  keeps dimensionality (2D) because its a slice 
---------------
2 dimensions 
---------------
[4 0] reduces dimensions to 1 
---------------
(2,)
1 dimensions 
---------------
[[1]
 [4]
 [7]] keeps dimensions since only slices! 
---------------
(3, 1) 
---------------
2 dimensions 
---------------
[[1 0 0]
 [4 0 0]
 [7 8 9]]


In [None]:
### Boolean Indexing

In [217]:
# using arbitrary strings to index arrays and matrices
# NOTE: pass the builtin str as the dtype; np.str was only an alias for it, deprecated in NumPy 1.20 and
# removed in 1.24, so the builtin gives the same dtype on every NumPy version
names = np.array(['Bob','Joe','Will','Bob','Will','Joe','Joe'], dtype=str)
data = randn(7,4) # create a random matrix of normally distributed values
print data
print '-----'
# Bob is in position 0 and 3
print names == 'Bob', '\n--------'
print data[names == 'Bob']
# since Bob's position is 0, 3, prints arrays at position 0 and 3
print '------'
print data[names == 'Bob', 2:]
# since Bob's position is 0, 3, prints arrays at position 0 and 3 with elements 2 and greater in each array

[[ 1.29313774  0.00514901 -1.26392151  0.08542281]
 [-0.62979118 -1.15246246 -0.26780408  0.27893627]
 [-1.40263692 -2.10162292 -0.31976389 -0.44329617]
 [ 1.76418262  0.11052763 -1.55802298 -0.11021456]
 [ 0.2076581  -0.18809767  1.9111004  -1.64087642]
 [ 0.06850057 -0.86553932 -0.40980652 -0.73576753]
 [-1.44441441 -1.05848685  0.08451821 -0.32478464]]
-----
[ True False False  True False False False] 
-------
[[ 1.29313774  0.00514901 -1.26392151  0.08542281]
 [ 1.76418262  0.11052763 -1.55802298 -0.11021456]]
------
[[-1.26392151  0.08542281]
 [-1.55802298 -0.11021456]]


In [218]:
# use multiple names to index the data
# have to use | for or and & for and; the Python keywords "and" / "or" do not work on boolean arrays
mask = (names == 'Bob') | (names == 'Will')
print data[mask]
# mask returns all of the arrays in data indexed by Bob and Will (4 of them in this case)
# set all negatives to zero in the data matrix
data[data < 0] =0 
print '-----\n', data

[[ 1.29313774  0.00514901 -1.26392151  0.08542281]
 [-1.40263692 -2.10162292 -0.31976389 -0.44329617]
 [ 1.76418262  0.11052763 -1.55802298 -0.11021456]
 [ 0.2076581  -0.18809767  1.9111004  -1.64087642]]
-----
[[ 1.29313774  0.00514901  0.          0.08542281]
 [ 0.          0.          0.          0.27893627]
 [ 0.          0.          0.          0.        ]
 [ 1.76418262  0.11052763  0.          0.        ]
 [ 0.2076581   0.          1.9111004   0.        ]
 [ 0.06850057  0.          0.          0.        ]
 [ 0.          0.          0.08451821  0.        ]]


## FANCY INDEXING

In [220]:
# fancy indexing is the term used for indexing with integer arrays (or lists of ints)
# lets take an 8x4 matrix

arr = np.empty((8,4))

# set values in matrix to iterator position
for i in range(8):
    arr[i] = i
    
print arr, '\n---------'

# now let's select a subset of rows in a particular order
# pass a list of ints to the index
print arr[[4,3,0,6]], '\n---------'
# use negative indices to go backward across a matrix
print arr[[-1, -3, -5]] # should be 7, 5, 3


[[ 0.  0.  0.  0.]
 [ 1.  1.  1.  1.]
 [ 2.  2.  2.  2.]
 [ 3.  3.  3.  3.]
 [ 4.  4.  4.  4.]
 [ 5.  5.  5.  5.]
 [ 6.  6.  6.  6.]
 [ 7.  7.  7.  7.]] 
---------
[ 1.  1.  1.  1.]
[[ 4.  4.  4.  4.]
 [ 3.  3.  3.  3.]
 [ 0.  0.  0.  0.]
 [ 6.  6.  6.  6.]] 
---------
[[ 7.  7.  7.  7.]
 [ 5.  5.  5.  5.]
 [ 3.  3.  3.  3.]]


In [None]:
# passing multiple index arrays: it selects a 1D array of elements corresponding to each tuple of indices
arr = np.arange(32).reshape(8,4)
print arr, '\n----------'
print arr[[1,5,7,2],[0,3,1,2]]
# essentially passing tuple elements into the overall array positions: (1,0), (5,3), (7,1), (2,2) and bringing them
# back as a 1D array
testArray = arr[[1,5,7,2], [0,3,1,2]]
# to get the rectangular region formed by selecting a subset of a matrix's rows and columns:
arr[np.ix_([1, 5, 7, 2],[0, 3, 1, 2])]
# in this case, we get row at position 1, with elements at position 0, 3, 1, 2
# then we get row at position 5, with elements at position 0, 3, 1, 2
# and so on....
print arr[:,:1].shape

# # Transposing Arrays and Swapping Axes

In [232]:
# transposing is a special form of reshaping which returns a view on the underlying data without
# copying anything
# arrays have the transpose method and also the special T attribute
arr = np.arange(15).reshape(3,5)
print arr, '\n'
print arr.T # this is the transpose of arr, 3x5 becomes 5x3
# compute the inner matrix product X^tX using np.dot
print np.dot(arr.T,arr)

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]] 

[[ 0  5 10]
 [ 1  6 11]
 [ 2  7 12]
 [ 3  8 13]
 [ 4  9 14]]
[[125 140 155 170 185]
 [140 158 176 194 212]
 [155 176 197 218 239]
 [170 194 218 242 266]
 [185 212 239 266 293]]


ValueError: operands could not be broadcast together with shapes (5,3) (3,5) 

In [243]:
# swapping axes is also easy
# lets swap axis 1 with axis 2
# (0,4) (8,12) become rows
arr = np.array([[[0,1,2,3],
                 [4,5,6,7]],
                 [[8,9,10,11],
                 [12,13,14,15]]])

print arr.swapaxes(1,2)
print arr.transpose((1,2,0))


[[[ 0  4]
  [ 1  5]
  [ 2  6]
  [ 3  7]]

 [[ 8 12]
  [ 9 13]
  [10 14]
  [11 15]]]
[[[ 0  8]
  [ 1  9]
  [ 2 10]
  [ 3 11]]

 [[ 4 12]
  [ 5 13]
  [ 6 14]
  [ 7 15]]]


# Universal Functions in NumPy
### function that performs elementwise operations on data in ndarrays
### fast vectorized wrappers for simple functions that take one or more scalar values
### and produce one or more scalar results

In [251]:
# simple unary universal funcs
arr = np.arange(10)
print np.sqrt(arr)
print np.exp(arr)
# binary universal functions that return a single value
x = randn(8)
y = randn(8)
print np.maximum(x,y) # prints out maximum of each position in each array compared
# universal functions can also return multiple arrays
arr = randn(7) *5 
print np.modf(arr)

lister = np.array([1.25, 1.75])
print np.ceil(lister)
print np.modf(lister)

[ 0.          1.          1.41421356  1.73205081  2.          2.23606798
  2.44948974  2.64575131  2.82842712  3.        ]
[  1.00000000e+00   2.71828183e+00   7.38905610e+00   2.00855369e+01
   5.45981500e+01   1.48413159e+02   4.03428793e+02   1.09663316e+03
   2.98095799e+03   8.10308393e+03]
[-0.32627871  1.54095124  0.05516584  0.95156608  0.76087284  0.1927715
  0.84374381 -0.46102333]
(array([-0.19103279,  0.20196978,  0.97522022,  0.84610689, -0.62295877,
       -0.10734091,  0.98487673]), array([-0.,  0.,  4.,  2., -1., -0.,  0.]))
[ 2.  2.]
(array([ 0.25,  0.75]), array([ 1.,  1.]))
[False False]


#### common unary functions
##### abs, fabs - absolute element wise value, fabs is a faster alternative for non-complex values
##### sqrt, square, exp e^x of each element
##### log, log10, log2, log1p log1p = log(1+x)
##### sign: compute sign (pos, neg, zero) of each element
##### ceil, floor - compute the ceiling of each element (smallest integer greater than or equal to it); floor computes the largest integer less than or equal to it
##### rint - rounds elements to nearest integer, preserving dtype
##### modf - return fractional and integral parts of array as separate array
##### isnan - return boolean array indicating whether each value is not a number
##### isfinite, isinf - finite or infinite
##### cos, cosh, sin, sinh, tan, tanh - regular and hyperbolic trig functions
##### arccos, arccosh, arcsin, arcsinh, arctan, arctanh - inverse regular and hyperbolic trig functions
##### logical_not - compute truth value of not x element wise, equivalent to ~arr for boolean arrays


#### common binary functions
##### add, subtract, multiply, divide, floor_divide, power (raise elements in first array to power of second), maximum or fmax, fmax ignore nan
##### minimum or fmin
##### mod
##### copysign - copy sign of values in second arg to values in first arg
##### greater, greater_equal, less, less_equal, equal, not_equal
##### logical_and, logical_or, logical_xor - equivalent to operators & | ^


In [254]:
#### data processing using arrays
# vectorization much faster than pure python
# evaluate sqrt(x^2 + y^2) across a regular grid of values
# np.meshgrid function takes two 1D arrays and produces two 2D matrices corresponding to all pairs of x,y in the
# two arrays

points = np.arange(-5,5, 0.01)
xs, ys = np.meshgrid(points, points)
print xs
print ys

[[-5.   -4.99 -4.98 ...,  4.97  4.98  4.99]
 [-5.   -4.99 -4.98 ...,  4.97  4.98  4.99]
 [-5.   -4.99 -4.98 ...,  4.97  4.98  4.99]
 ..., 
 [-5.   -4.99 -4.98 ...,  4.97  4.98  4.99]
 [-5.   -4.99 -4.98 ...,  4.97  4.98  4.99]
 [-5.   -4.99 -4.98 ...,  4.97  4.98  4.99]]
[[-5.   -5.   -5.   ..., -5.   -5.   -5.  ]
 [-4.99 -4.99 -4.99 ..., -4.99 -4.99 -4.99]
 [-4.98 -4.98 -4.98 ..., -4.98 -4.98 -4.98]
 ..., 
 [ 4.97  4.97  4.97 ...,  4.97  4.97  4.97]
 [ 4.98  4.98  4.98 ...,  4.98  4.98  4.98]
 [ 4.99  4.99  4.99 ...,  4.99  4.99  4.99]]


In [None]:
import matplotlib.pyplot as plt
z = np.sqrt(xs** 2 + ys **2)
print z

plt.imshow(z, cmap=plt.cm.gray); plt.colorbar()
# raw string so the backslash in the LaTeX \sqrt is kept literally instead of being read as a Python escape sequence
plt.title(r"Image plot of $\sqrt{x^2 + y^2}$ for a grid of values")


### Conditional Logic as Array Operations

In [5]:
# using numpy.where
import numpy as np
xarr = np.array([1.1,1.2,1.3,1.4,1.5])
yarr = np.array([2.1,2.2,2.3,2.4,2.5])
cond = np.array([True,False,False,True,False])

# pure python method
result = [(x if c else y) for x, y, c in zip(xarr,yarr,cond)]
print result
# slow on large data sets, lets use NumPy instead
# conditional, true, else
result = np.where(cond, xarr, yarr)
print result

[1.1000000000000001, 2.2000000000000002, 2.2999999999999998, 1.3999999999999999, 2.5]
[ 1.1  2.2  2.3  1.4  2.5]


In [None]:
# arguments to where do not need to be arrays, they can be scalars
from numpy.random import randn
arr = randn(4,4)
print arr
# replace all positive values with 2, everything else with -2
arrCleaned = np.where(arr > 0, 2, -2)
# set only positive values to 2
arrPos= np.where(arr > 0, 2, arr)

In [None]:
# you can even nest np.where calls to express an if / elif / else chain
# example conditions to run the chain on
cond1 = np.array([True, True, False, False])
cond2 = np.array([True, False, True, False])
# both true -> 0, only cond1 -> 1, only cond2 -> 2, else 3 in this case
nested = np.where(cond1 & cond2, 0,
                  np.where(cond1, 1,
                           np.where(cond2, 2, 3)))
nested

## Mathematical and Statistical Methods

In [17]:
# can call standard methods with an array instance method or using top level NumPy function
arr = np.random.randn(5,4) # matrix of normally distributed data
print arr.mean()
print np.mean(arr)
print arr.sum()

# functions like mean and sum take an optional axis arg which computes statistic over the given axis, resulting in
# an array with one fewer dimension
print arr.mean(axis=1) # mean across the columns: one value per row (5 values for this 5x4 matrix)
print arr.sum(axis = 0), '\n-------' # sum down the rows: one value per column (4 values for this 5x4 matrix)

# methods like cumsum and cumprod do not aggregate, instead produce an array of intermediate results
arr = np.array([[0,1,2], [3,4,5], [6,7,8]])
print arr.cumsum(0), '\n-------' # cumulative sum along axis 0 (running total down each column)
print arr.cumprod(1), '\n-------' # cumulative product along axis 1 (running product across each row)
print arr.sum(axis=1)

-0.204359423718
-0.204359423718
-4.08718847436
[-0.41103287 -0.01426361 -0.26313526 -0.11243094 -0.22093443]
[-3.18063461  2.13130826 -1.36991743 -1.66794469] 
-------
[[ 0  1  2]
 [ 3  5  7]
 [ 9 12 15]] 
-------
[[  0   0   0]
 [  3  12  60]
 [  6  42 336]] 
-------
[ 3 12 21]


## Methods For Boolean Arrays

In [21]:
arr = np.random.randn(100)
print (arr>0).sum() # COUNT of values > 0 (not their sum): each True counts as 1 when summing a boolean array

bools = np.array([False, False, True, False])
print bools.any() # True if at least one value is True
print bools.all() # True only if every value is True

True
False


## Sorting

In [30]:
arr = np.random.randn(8)
print arr
arr.sort() # sorts in place, ascending
print arr

# sorting multidimensional arrays
arr = np.random.randn(5,3)
print arr, '\n---------'
arr.sort(0) # the argument is the AXIS to sort along, not a direction: 0 sorts each column, 1 sorts each row; order is ascending
print arr

np.sort(arr,1) # np.sort() returns a sorted COPY and leaves arr itself untouched

[-0.91534096 -0.35537131  1.65798528 -0.00176562  0.58127101  1.23023396
 -0.69027989  0.42250992]
[-0.91534096 -0.69027989 -0.35537131 -0.00176562  0.42250992  0.58127101
  1.23023396  1.65798528]
[[ 1.27138577  0.97537439 -0.21153676]
 [ 1.05102066  0.41981847 -0.06825443]
 [-0.20778216  0.20925159 -1.41952762]
 [ 1.34398439  0.53550408 -2.16345514]
 [ 1.8119909  -0.47351535  1.54445315]] 
---------
[[-0.20778216 -0.47351535 -2.16345514]
 [ 1.05102066  0.20925159 -1.41952762]
 [ 1.27138577  0.41981847 -0.21153676]
 [ 1.34398439  0.53550408 -0.06825443]
 [ 1.8119909   0.97537439  1.54445315]]


array([[-2.16345514, -0.47351535, -0.20778216],
       [-1.41952762,  0.20925159,  1.05102066],
       [-0.21153676,  0.41981847,  1.27138577],
       [-0.06825443,  0.53550408,  1.34398439],
       [ 0.97537439,  1.54445315,  1.8119909 ]])

In [33]:
# get quantiles of array
large_arr = np.random.randn(1000)
large_arr.sort()
# find 5% quantile
large_arr[int(0.05 * len(large_arr))]

-1.5597482637739664

## Unique and Other Set Logic

In [34]:
names = np.array(['Bob','Joe','Will','Sergio','Bob','John','Will'])
print np.unique(names)

['Bob' 'Joe' 'John' 'Sergio' 'Will']


## Linear Algebra Methods in NumPy

In [45]:
# dot products are taken with np.dot() or array.dot()
from numpy.linalg import inv, qr

x = np.array([[1.,2.,3.], [4.,5.,6.]])
y = np.array([[6.,23.],[-1,7],[8,9]])
x.dot(y)
print x.dot(y)

# create 5x5 matrix of normally distributed random float values
x = np.random.randn(5,5)
# multiply the transpose of x by x for inner dot product
mat = x.T.dot(x)
# take the inverse of mat
inv(mat)
print inv(mat), '\n INVERSE ABOVE -----------'
# multiply the inner dot product by its inverse
print mat.dot(inv(mat)), '\n------------' 

# get q,r decomposition
q, r = qr(mat)
print r



[[  28.   64.]
 [  67.  181.]]
[[ 0.46637678  0.34932925 -0.02242434  0.14925006 -0.36715409]
 [ 0.34932925  0.78996947  0.07312474  0.58401039 -0.1865664 ]
 [-0.02242434  0.07312474  0.1368611   0.23371375 -0.02150665]
 [ 0.14925006  0.58401039  0.23371375  1.11566338 -0.13619677]
 [-0.36715409 -0.1865664  -0.02150665 -0.13619677  0.98801382]] 
 INVERSE ABOVE -----------
[[  1.00000000e+00  -1.72027621e-17   1.59541359e-17  -1.45147617e-17
    9.54259617e-19]
 [ -1.55584135e-16   1.00000000e+00   3.19684337e-17  -4.43836097e-16
    3.67817948e-17]
 [ -2.14226699e-17   1.39370486e-16   1.00000000e+00   1.69939228e-17
   -6.72608508e-18]
 [ -1.13783189e-16   3.28686670e-17   1.01453236e-16   1.00000000e+00
   -2.42950425e-18]
 [  6.80464963e-17   1.23491590e-16   2.26627562e-17   1.76419110e-16
    1.00000000e+00]] 
------------
[[-5.95341786  3.2361119  -4.70210565 -0.20928955 -1.99034197]
 [ 0.         -2.89439394 -8.18486785  3.61117134 -0.36468076]
 [ 0.          0.         -9.09783

##### other linear algebra functions (diag, dot and trace live in the top-level numpy namespace; the rest are in numpy.linalg):
##### diag (return diagonal or off diagonal elements of a square matrix as 1D array or convert a 1D array into a square matrix with zeros off the diagonal)
##### dot - matrix multiplication
##### trace - sum of all diagonal elements
##### det - compute matrix determinant
##### eig - compute the eigenvalues and eigenvectors of square matrix
##### inv - compute inverse of square matrix
##### pinv - compute Moore Penrose pseudo inverse of a matrix
##### qr - compute the QR decomposition
##### svd - compute the singular value decomposition
##### solve - solve the linear system Ax = b for x, where A is a square matrix
##### lstsq - compute the least square solution to Ax = b