# Numpy Quick Review

## Basic Operation
파이썬 리스트와 numpy 배열과의 차이를 알아 보고 numpy가 제공하는 기본적인 수학 연산을 살펴보자.

In [1]:
import numpy as np

In [2]:
# Build a 1-D array from a Python range.
a = np.array(range(10))
print(a)

# ndarray means n-dimensional array
print(type(a))

# number of dimensions (axes)
print(a.ndim)

# array shape: a tuple with the length of each dimension
print(a.shape)

# Bytes per element
print(a.itemsize)

# Bytes of memory used by all elements (number of elements * itemsize)
print(a.nbytes)

[0 1 2 3 4 5 6 7 8 9]
<class 'numpy.ndarray'>
1
(10,)
8
80


In [3]:
# (10,) is a one-element tuple: the trailing comma creates the tuple.
x = 10,
# Parentheses alone only group the expression, so y is the plain int 10.
y = (10)

print('x:', x)
print('y:', y)

x: (10,)
y: 10


In [4]:
# A NumPy array is mutable, but all of its elements share one dtype.
a = np.array(range(10))

# element data type
print(a.dtype)

# Beware of coercion: a float assigned into an integer array is truncated
# (10.6 is stored as 10).
a[0] = 10.6
print(a)

# fill() casts its argument the same way (-1.3 is stored as -1).
a.fill(-1.3)
print(a)

int64
[10  1  2  3  4  5  6  7  8  9]
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]


In [5]:
# Only integers: an integer dtype is inferred.
a = np.array([1, 2, 3])
print(a.dtype)

# A single float promotes the whole array to a float dtype.
a = np.array([1., 2, 3])
print(a.dtype)

# A single complex value promotes the whole array to a complex dtype.
a = np.array([1.0+1j, 2, 3])
print(a.dtype)

# An explicit dtype overrides the inference; the float 1. is cast to int32.
a = np.array([1., 2, 3], dtype='int32')
print(a.dtype)

int64
float64
complex128
int32


In [6]:
# Iterating over a plain Python list.
x = [1, 2, 3]

for i in x:
    print(i)

# An ndarray can be iterated the same way, but an element-by-element Python
# loop is not a good idea in NumPy because of its performance cost.
x = np.array([1, 2, 3])

for i in x:
    print(i)

# A broadcast operation is the better solution: the scalar 10 is applied to
# every element of x at once, without a Python-level loop.
print(x * 10)

1
2
3
1
2
3
[10 20 30]


In [7]:
x = [1, 2, 3]
x.append(4)

# For lists, '+' concatenates: adding [5] gives the same contents as
# append(5), but builds a new list instead of modifying x in place.
x = x + [5]

print(x)

[1, 2, 3, 4, 5]


numpy는 list와는 다르게 '+'는 append 기능을 하지 않는다.

In [8]:
x = np.array([1, 2, 3])
# An ndarray has no 'append' method, and '+' (__add__) does not concatenate:
# the one-element list [5] is broadcast and added to every element.

# x.append(4)
x = x + [5]
print(x)

[6 7 8]


In [9]:
# For plain lists, the '+' operator (__add__) concatenates the two operands.
a = [1, 2, 3]
b = [2, 3, 4]

d = a + b
print('list: ',d)

# For NumPy arrays, the '+' operator adds the two operands element by element.
a = np.array([1, 2, 3])
b = np.array([2, 3, 4])

d = a + b
print('np: ', d)

list:  [1, 2, 3, 2, 3, 4]
np:  [3 5 7]


In [10]:
x = [1, 2, 3]
# A plain list does not support exponentiation; the following is not defined (it raises TypeError).
# print(x**2)

# Multiplication is defined for lists, but it repeats the list instead of scaling its elements.
print(x*2)

[1, 2, 3, 1, 2, 3]


In [11]:
x = np.array([1, 2, 3])

# Element-wise square; the values stay integers.
x = x**2
print(x)
# Element-wise square root; the result is floating point.
x = np.sqrt(x)
print(x)
# Element-wise natural logarithm.
x = np.log(x)
print(x)
# Element-wise exponential, which undoes the logarithm above.
x = np.exp(x)
print(x)

[1 4 9]
[1. 2. 3.]
[0.         0.69314718 1.09861229]
[1. 2. 3.]


In [72]:
# Slicing in practice: shuffle a dataset, then cut it into a 90% training
# part and a 10% test part.

data = np.arange(1000)
np.random.shuffle(data)

# Index of the first test sample, which is also the number of training samples.
total_size = int(data.size * 0.9)

# np.split cuts at the given index and returns the two parts as views,
# exactly like data[:total_size] and data[total_size:].
train_data, test_data = np.split(data, [total_size])

print('train_data.size: ', train_data.size)
print('test_data.size: ', test_data.size)

train_data.size:  900
test_data.size:  100


##  Slicing

The following is the basic form of slicing

```python
var[lower:upper:step]
```

In [109]:
# General slicing form: var[lower:upper:step]

a = np.array(range(10, 20))
print(a)

# Slicing does not make a deep copy: c is a view on a's data, so a write
# through a is visible in c.
c = a[:5]
print('c before changing an element :', c)
a[1] = 3
print('c after changing an element :', c)

# OWNDATA is True for a and False for c, which confirms that c is a view.
print('a\n', a.flags)
print('c\n', c.flags)

# Negative bounds count from the end of the array.
# NOTE(review): the same slice is printed twice; the second print was
# presumably meant to show a different slice - confirm with the author.
print('a[5:-2] ', a[5:-2])
print('a[5:-2] ', a[5:-2])

# Omitted boundaries are assumed to be the beginning or the end of the array.
print('a[:-5] ', a[:-5])

# Every third element.
print('a[::3] ', a[::3])

# From index 2 up to (not including) index -3, taking every second element.
print('a[2:-3:2] ', a[2:-3:2])

[10 11 12 13 14 15 16 17 18 19]
c before changing an element : [10 11 12 13 14]
c after changing an element : [10  3 12 13 14]
a
   C_CONTIGUOUS : True
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False
c
   C_CONTIGUOUS : True
  F_CONTIGUOUS : True
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False
a[5:-2]  [15 16 17]
a[5:-2]  [15 16 17]
a[:-5]  [10  3 12 13 14]
a[::3]  [10 13 16 19]
a[2:-3:2]  [12 14 16]


In [27]:
# Multi-dimensional slicing
a = np.arange(36).reshape(6, 6)
print(a,'\n')

# In NumPy, a[3, 3] (one index per axis inside a single pair of brackets) is
# the preferred way to access an element of a multi-dimensional array.
print('a[3,3]:\n', a[3,3])

# Row 0, columns 3 and 4.
print('a[0, 3:5]:\n', a[0, 3:5])
# Rows 4 onward, columns 4 onward (the bottom-right 2x2 corner).
print('a[4:, 4:]:\n', a[4:, 4:])
# Every second row starting at row 2, and every second column.
print('a[2::2, ::2]:\n', a[2::2, ::2])

[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]
 [30 31 32 33 34 35]] 

a[3,3]:
 21
a[0, 3:5]:
 [3 4]
a[4:, 4:]:
 [[28 29]
 [34 35]]
a[2::2, ::2]:
 [[12 14 16]
 [24 26 28]]


## Fancy Indexing in 1-D

In [50]:
# Fancy indexing: index with a list of positions; negative positions count
# from the end of the array.
a = np.arange(0, 100, 10)
indices = [1, 2, -2]

b = a[indices]

print(a)
print(b)

[ 0 10 20 30 40 50 60 70 80 90]
[10 20 80]


In [83]:
# Boolean mask: indexing with a boolean array keeps the elements whose mask
# entry is True.
a = np.arange(10) * 10
# Ten random 0/1 draws converted to booleans.
mask = np.random.randint(2, size=10).astype(bool)

print(a)
print(mask)
print(a[mask])

# np.nonzero gives the positions of the True entries
# (as a tuple holding one index array per dimension).
print(np.nonzero(mask))

[ 0 10 20 30 40 50 60 70 80 90]
[ True False False  True False  True  True  True False  True]
[ 0 30 50 60 70 90]
(array([0, 3, 5, 6, 7, 9]),)


In [74]:
# Conditional mask: comparing an array with a value yields a boolean array
# that can be used as a mask.
a = np.arange(0, 100, 10)
mask = a < 30

print(a)
print(mask)
print(a[mask])
# The condition can also be written directly inside the brackets.
print(a[a<30])

[ 0 10 20 30 40 50 60 70 80 90]
[ True  True  True False False False False False False False]
[ 0 10 20]
[ 0 10 20]


In [90]:
# Tricky part: combining several conditions
a = np.arange(0, 100, 10)

print('a < 80: ', a < 80)
print('a > 40: ', a > 40)

# Logical (boolean) operators: and, or, not
# Logical operators need a single True/False value from each operand. However, a > 40 is an array
# of booleans, not a single True or False.
# Thus the following doesn't work
# print(a > 40 and a < 80)

# In contrast, the bitwise operators work element by element on boolean arrays
# Bitwise operators:
# & (and)
# | (or)
# ~ (not)
# ^ (xor)

# So, you can use the following (each comparison needs its own parentheses because & binds tighter)
print('(a > 40) & (a < 80): ', (a > 40) & (a < 80))

# Another way to solve the above: intersect the index sets of the two conditions
less = a < 80
more = a > 40

print('intersected indices: ',np.intersect1d(np.nonzero(less), np.nonzero(more)))

a < 80:  [ True  True  True  True  True  True  True  True False False]
a > 40:  [False False False False False  True  True  True  True  True]
(a > 40) & (a < 80):  [False False False False False  True  True  True False False]
intersected indices:  [5 6 7]


## Fancy Indexing in 2-D

In [114]:
a = np.arange(36).reshape(6, 6)
print(a,'\n')

# a[[row indices], [column indices]]: the two lists are paired up, so this
# selects a[0, 1], a[1, 2], a[2, 3] and a[3, 4]
print('a[[0, 1, 2, 3], [1, 2, 3, 4]]: ', a[[0, 1, 2, 3], [1, 2, 3, 4]])

# you can use both slicing and indexing in the same bracket 
print('a[3:, [0, 2, 4]]:', '\n', a[3:, [0, 2, 4]])

# One thing you should know: while plain slicing gives a view (shallow copy),
# indexing with a list gives a new copy, so changing a afterwards does not
# change c (both prints of c below show the same values)
c = a[3:, [0, 2, 4]]
print('a[3:, [0, 2, 4]]:', '\n', c)
a[3, 2] = 1
print('a[3:, [0, 2, 4]]:', '\n', c)

[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]
 [30 31 32 33 34 35]] 

a[[0, 1, 2, 3], [1, 2, 3, 4]]:  [ 1  8 15 22]
a[3:, [0, 2, 4]]: 
 [[18 20 22]
 [24 26 28]
 [30 32 34]]
a[3:, [0, 2, 4]]: 
 [[18 20 22]
 [24 26 28]
 [30 32 34]]
a[3:, [0, 2, 4]]: 
 [[18 20 22]
 [24 26 28]
 [30 32 34]]


## Multi-Dimensional Arrays

- Operations between multiple array objects are first checked for proper shape match
- Mathematical operators (+, -, *, /, exp, log, etc.) apply element by element to the values
- Reduction operations (mean, std, skew, kurt, sum, prod, etc) apply to the whole array, unless an axis is specified
- Missing values propagate unless explicitly ignored  
<br>
<br>
<img src="img/md.png" alt="drawing" style="width:400px;"/>

## Dot product
내적은 딥러닝을 포함해서 다양한 머신러닝 알고리즘에서 다양하게 사용된다.

In [14]:
a = np.array([2, 2])
b = np.array([2, 1])

# Multiplying by a scalar multiplies every element.
print(a*2)

# '*' between two vectors is the element-wise product.
print(a*b)

# The dot product is the sum of the element-wise products.
print(np.sum(a*b))

# NumPy also provides the dot product directly, as a function and as a method.
print(np.dot(a, b))
print(a.dot(b))

[4 4]
[4 2]
6
6
6


List를 이용한 dot product와 numpy를 이용한 dot product의 성능 비교를 살펴보자.

In [15]:
from time import perf_counter

# Dot product with plain Python lists and an explicit Python-level loop.
a = list(range(100))
b = list(range(100))

# perf_counter() is a monotonic, high-resolution clock meant for timing short
# intervals; a wall clock can report a zero or even negative duration here.
t0 = perf_counter()

scalar = 0
for left, right in zip(a, b):
    scalar += left * right

list_seconds = perf_counter() - t0

print("List: ", list_seconds)

# np.random.randn returns random values from the normal distribution;
# its parameters give the shape.
# print(np.random.randn(2, 3))

# The same dot product with NumPy arrays.
a = np.array(range(100))
b = np.array(range(100))

t0 = perf_counter()

np_scalar = a.dot(b)

np_seconds = perf_counter() - t0

print("Numpy: ", np_seconds)

# Guard the ratio so that a measurement below the timer resolution cannot
# raise ZeroDivisionError.
if np_seconds > 0:
    print("Numpy is faster than list {} times".format(list_seconds / np_seconds))
else:
    print("Numpy finished faster than the timer can resolve")

List:  0.000137
Numpy:  5.6e-05
Numpy is faster than list 2.446428571428571 times


## Vectors and Matrices

In [42]:
# A 2-D ndarray built from nested lists.
x = np.array([[1, 2], [3, 4]]) 
print('type(x): ', type(x))
print(f'x shape: {x.shape}')
print('x:', x)
# A single index selects a row and returns it as a 1-D array.
print('x[0]: ', x[0])
# Two indices select a single element.
print('x[0, 0]: ', x[0, 0])
# .T is the transpose (rows and columns swapped).
print('x.T: \n', x.T)

type(x):  <class 'numpy.ndarray'>
x shape: (2, 2)
x: [[1 2]
 [3 4]]
x[0]:  [1 2]
x[0, 0]:  1
x.T: 
 [[1 3]
 [2 4]]


In [44]:
# The same data as an np.matrix instead of an ndarray.
# NOTE(review): the NumPy documentation discourages np.matrix in favor of
# regular 2-D arrays - confirm before using it in new code.
x = np.matrix([[1, 2], [3, 4]])
print('type(x): ', type(x))
print(f'x shape: {x.shape}')
print('x:', x)
# Unlike an ndarray, a single index on a matrix returns a 2-D result ([[1 2]]).
print('x[0]: ', x[0])
print('x[0, 0]: ', x[0, 0])
print('x.T: \n', x.T)

type(x):  <class 'numpy.matrixlib.defmatrix.matrix'>
x shape: (2, 2)
x: [[1 2]
 [3 4]]
x[0]:  [[1 2]]
x[0, 0]:  1
x.T: 
 [[1 3]
 [2 4]]


In [18]:
# A 1-D array of five zeros.
print(np.zeros(5))
# The shape is not passed as two parameters; it is passed as one tuple in the first parameter.
print(np.zeros((3,3)))
# Returns an array in which every value is 1.
print(np.ones((3,3)))
# Returns random values between 0 and 1.
print(np.random.random((3,3)))
# Returns a 3x3 array drawn from the normal distribution. Here the dimensions are passed as two separate arguments.
print(np.random.randn(3,3))

[0. 0. 0. 0. 0.]
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]
[[0.23026583 0.56888947 0.74525327]
 [0.76372824 0.96930679 0.80709215]
 [0.54143546 0.89154384 0.02657295]]
[[-0.48045304  2.03111105 -0.35220643]
 [ 0.54623093 -0.31747465 -0.98978466]
 [-0.4734849   0.35937731 -1.9028411 ]]


In [19]:
# Draw a 100 x 100 sample with randn and report its mean and variance.
x = np.random.randn(100, 100)
print(np.mean(x))
print(np.var(x))

0.0015261588589282084
1.0079976566255637


In [20]:
x = np.array([[1,2],[3,4]])
# inverse matrix
i = np.linalg.inv(x)
print(i)
# x times its inverse gives the identity matrix, up to floating-point rounding
print(x.dot(i))
# determinant (as the last expression of the cell, its value is shown as the cell result)
np.linalg.det(x)


[[-2.   1. ]
 [ 1.5 -0.5]]
[[1.00000000e+00 1.11022302e-16]
 [0.00000000e+00 1.00000000e+00]]


-2.0000000000000004

In [21]:
x = np.array([1, 2])
y = np.array([3, 4])

# outer product: entry [i, j] of the result is x[i] * y[j]
print(np.outer(x, y))

[[3 4]
 [6 8]]


In [22]:
# trace: the sum of the diagonal values
x = np.array([[1,2],[3,4]])
# computed by hand: extract the diagonal and sum it
print(np.diag(x).sum())
# computed with the dedicated function
print(np.trace(x))

5
5


## Save and Load

저장된 파일의 확장자를 통해서 저장된 데이터 타입을 알 수 있다.

In [23]:
# When the file extension is .npy, np.load returns a single array.
x = np.array([[1, 2, 3], [4, 5, 6]])
y = np.array([1, 2, 3])

# np.save adds the '.npy' extension when the given name does not have it,
# which is why the file is loaded back as './array.npy'.
np.save('./array', x)
data = np.load('./array.npy')
print('x: ', data)

# mmap_mode='r' memory-maps the file read-only instead of reading it all
# into memory up front.
data = np.load('./array.npy', mmap_mode='r')
print('x(mmap_mode) : ', data[1, :])

# When the file extension is .npz, np.load returns a dictionary-like object
# of key-value pairs: {keyword name passed to savez: array}
np.savez('dict.npz', a=x, b=y)
# The loaded archive keeps its file open until it is closed, so use it as a
# context manager to release the file deterministically.
with np.load('./dict.npz') as data:
    print('a: ', data['a'])
    print('b: ', data['b'])


x:  [[1 2 3]
 [4 5 6]]
x(mmap_mode) :  [4 5 6]
a:  [[1 2 3]
 [4 5 6]]
b:  [1 2 3]


## Miscellaneous

In [24]:
# Column access: data[:, j] takes every row of column j.
data = np.array([[1, 2, 3], [5, 6, 7]])

# Print each of the three columns in turn.
for j in range(data.shape[1]):
    print(data[:, j])

[1 5]
[2 6]
[3 7]
