### Numpy Tutorial Part 2 – Vital Functions for Data Analysis
### https://www.machinelearningplus.com/python/numpy-tutorial-python-part2/

In [1]:
import numpy as np

# 1. How to get index locations that satisfy a given condition using np.where?

In [2]:
# Create an array
import numpy as np
arr_rand = np.array([8, 8, 3, 7, 7, 0, 4, 2, 5, 2])
print("Array: ", arr_rand)

# Positions where value > 5
index_gt5 = np.where(arr_rand > 5)
print("Positions where value > 5: ", index_gt5)

Array:  [8 8 3 7 7 0 4 2 5 2]
Positions where value > 5:  (array([0, 1, 3, 4], dtype=int64),)


In [3]:
# Take items at given index
arr_rand.take(index_gt5)

array([[8, 8, 7, 7]])

In [4]:
# If value > 5, then yield 'gt5' else 'le5'
np.where(arr_rand > 5, 'gt5', 'le5')

array(['gt5', 'gt5', 'le5', 'gt5', 'gt5', 'le5', 'le5', 'le5', 'le5',
       'le5'], dtype='<U3')

In [5]:
# Location of the max
print('Position of max value: ', np.argmax(arr_rand))  

# Location of the min
print('Position of min value: ', np.argmin(arr_rand))  

Position of max value:  0
Position of min value:  5


# 2. How to import and export data as a csv file?

In [6]:
# Turn off scientific notation
np.set_printoptions(suppress=True)  

# Import data from csv file url
path = 'case/Auto.csv'
# path = 'https://raw.githubusercontent.com/selva86/datasets/master/Auto.csv'
data = np.genfromtxt(path, delimiter=',', skip_header=1, filling_values=-999, dtype='float')
data[:3]  # see first 3 rows

array([[  18. ,    8. ,  307. ,  130. , 3504. ,   12. ,   70. ,    1. ,
        -999. ],
       [  15. ,    8. ,  350. ,  165. , 3693. ,   11.5,   70. ,    1. ,
        -999. ],
       [  18. ,    8. ,  318. ,  150. , 3436. ,   11. ,   70. ,    1. ,
        -999. ]])

# 2.1 How to handle datasets that has both numbers and text columns?

In [7]:
# data2 = np.genfromtxt(path, delimiter=',', skip_header=1, dtype='object')
data2 = np.genfromtxt(path, delimiter=',', skip_header=1, dtype=None)
data2[:3]  # see first 3 rows

  


array([(18., 8, 307., 130, 3504, 12. , 70, 1, b'"chevrolet chevelle malibu"'),
       (15., 8, 350., 165, 3693, 11.5, 70, 1, b'"buick skylark 320"'),
       (18., 8, 318., 150, 3436, 11. , 70, 1, b'"plymouth satellite"')],
      dtype=[('f0', '<f8'), ('f1', '<i4'), ('f2', '<f8'), ('f3', '<i4'), ('f4', '<i4'), ('f5', '<f8'), ('f6', '<i4'), ('f7', '<i4'), ('f8', 'S38')])

In [8]:
# Save the array as a csv file
np.savetxt("case\out.csv", data, delimiter=",")

# 3. How to save and load numpy objects?

In [9]:
# Save single numpy array object as .npy file
np.save('case\\myarray.npy', data2)  

# Save multile numy arrays as a .npz file
np.savez('case\\array.npz', data2, data)

In [10]:
# Load a .npy file
a = np.load('case\\myarray.npy')
print(a[:3])

[(18., 8, 307., 130, 3504, 12. , 70, 1, b'"chevrolet chevelle malibu"')
 (15., 8, 350., 165, 3693, 11.5, 70, 1, b'"buick skylark 320"')
 (18., 8, 318., 150, 3436, 11. , 70, 1, b'"plymouth satellite"')]


In [11]:
# Load a .npz file
b = np.load('case\\array.npz')
print(b.files)
b['arr_1']

['arr_0', 'arr_1']


array([[  18.,    8.,  307., ...,   70.,    1., -999.],
       [  15.,    8.,  350., ...,   70.,    1., -999.],
       [  18.,    8.,  318., ...,   70.,    1., -999.],
       ...,
       [  32.,    4.,  135., ...,   82.,    1., -999.],
       [  28.,    4.,  120., ...,   82.,    1., -999.],
       [  31.,    4.,  119., ...,   82.,    1., -999.]])

# 4. How to concatenate two numpy arrays columnwise and row wise
There are 3 different ways of concatenating two or more numpy arrays.

Method 1: np.concatenate by changing the axis parameter to 0 and 1
Method 2: np.vstack and np.hstack
Method 3: np.r_ and np.c_

In [12]:
a = np.zeros([4, 4])
b = np.ones([4, 4])
print(a)
print(b)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


In [13]:
# Vertical Stack Equivalents (Row wise)
np.concatenate([a, b], axis=0)  
np.vstack([a,b])  
np.r_[a,b] 

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [14]:
# Horizontal Stack Equivalents (Coliumn wise)
np.concatenate([a, b], axis=1) 
np.hstack([a,b])  
np.c_[a,b]

array([[0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.]])

In [15]:
np.r_[[1,2,3], 0, 0, [4,5,6]]

array([1, 2, 3, 0, 0, 4, 5, 6])

# 5. How to sort a numpy array based on one or more columns?

In [16]:
arr = np.random.randint(1,6, size=[8, 4])
arr

array([[2, 3, 5, 5],
       [2, 4, 5, 1],
       [3, 1, 2, 1],
       [2, 4, 1, 5],
       [1, 4, 2, 1],
       [4, 1, 2, 3],
       [2, 4, 5, 4],
       [2, 1, 5, 1]])

In [17]:
# Sort each columns of arr
np.sort(arr, axis=0)

array([[1, 1, 1, 1],
       [2, 1, 2, 1],
       [2, 1, 2, 1],
       [2, 3, 2, 1],
       [2, 4, 5, 3],
       [2, 4, 5, 4],
       [3, 4, 5, 5],
       [4, 4, 5, 5]])

In [18]:
# Get the index positions that would sort the array
x = np.array([1, 10, 5, 2, 8, 9])
sort_index = np.argsort(x)
print(sort_index)

[0 3 2 4 5 1]


In [19]:
x[sort_index]

array([ 1,  2,  5,  8,  9, 10])

In [20]:
# Argsort the first column
sorted_index_1stcol = arr[:, 0].argsort()

# Sort 'arr' by first column without disturbing the integrity of rows
arr[sorted_index_1stcol]

array([[1, 4, 2, 1],
       [2, 3, 5, 5],
       [2, 4, 5, 1],
       [2, 4, 1, 5],
       [2, 4, 5, 4],
       [2, 1, 5, 1],
       [3, 1, 2, 1],
       [4, 1, 2, 3]])

In [21]:
# Descending sort
arr[sorted_index_1stcol[::-1]]

array([[4, 1, 2, 3],
       [3, 1, 2, 1],
       [2, 1, 5, 1],
       [2, 4, 5, 4],
       [2, 4, 1, 5],
       [2, 4, 5, 1],
       [2, 3, 5, 5],
       [1, 4, 2, 1]])

## 5.2 How to sort a numpy array based on 2 or more columns?

In [22]:
# Sort by column 0, then by column 1
lexsorted_index = np.lexsort((arr[:, 1], arr[:, 0])) 
arr[lexsorted_index]

array([[1, 4, 2, 1],
       [2, 1, 5, 1],
       [2, 3, 5, 5],
       [2, 4, 5, 1],
       [2, 4, 1, 5],
       [2, 4, 5, 4],
       [3, 1, 2, 1],
       [4, 1, 2, 3]])

# 6. Working with dates

In [23]:
# Create a datetime64 object
date64 = np.datetime64('2018-02-04 23:10:10')
date64

numpy.datetime64('2018-02-04T23:10:10')

In [24]:
# Drop the time part from the datetime64 object
dt64 = np.datetime64(date64, 'D')
dt64

numpy.datetime64('2018-02-04')

In [25]:
# Create the timedeltas (individual units of time)
tenminutes = np.timedelta64(10, 'm')  # 10 minutes
tenseconds = np.timedelta64(10, 's')  # 10 seconds
tennanoseconds = np.timedelta64(10, 'ns')  # 10 nanoseconds

print('Add 10 days: ', dt64 + 10)
print('Add 10 minutes: ', dt64 + tenminutes)
print('Add 10 seconds: ', dt64 + tenseconds)
print('Add 10 nanoseconds: ', dt64 + tennanoseconds)

Add 10 days:  2018-02-14
Add 10 minutes:  2018-02-04T00:10
Add 10 seconds:  2018-02-04T00:00:10
Add 10 nanoseconds:  2018-02-04T00:00:00.000000010


In [26]:
# Convert np.datetime64 back to a string
np.datetime_as_string(dt64)

'2018-02-04'

In [27]:
print('Date: ', dt64)
print("Is it a business day?: ", np.is_busday(dt64))  
print("Add 2 business days, rolling forward to nearest biz day: ", np.busday_offset(dt64, 2, roll='forward'))  
print("Add 2 business days, rolling backward to nearest biz day: ", np.busday_offset(dt64, 2, roll='backward'))  

Date:  2018-02-04
Is it a business day?:  False
Add 2 business days, rolling forward to nearest biz day:  2018-02-07
Add 2 business days, rolling backward to nearest biz day:  2018-02-06


## 6.1 How to create a sequence of dates?

In [28]:
# Create date sequence
dates = np.arange(np.datetime64('2018-02-01'), np.datetime64('2018-02-10'))
print(dates)

# Check if its a business day
np.is_busday(dates)

['2018-02-01' '2018-02-02' '2018-02-03' '2018-02-04' '2018-02-05'
 '2018-02-06' '2018-02-07' '2018-02-08' '2018-02-09']


array([ True,  True, False, False,  True,  True,  True,  True,  True])

## 6.2 How to convert numpy.datetime64 to datetime.datetime object?

In [29]:
# Convert np.datetime64 to datetime.datetime
import datetime
dt = dt64.tolist()
dt

datetime.date(2018, 2, 4)

In [30]:
print('Year: ', dt.year)  
print('Day of month: ', dt.day)
print('Month of year: ', dt.month)  
print('Day of Week: ', dt.weekday())  # Sunday

Year:  2018
Day of month:  4
Month of year:  2
Day of Week:  6


# 7. Advanced numpy functions
## 7.1 vectorize – Make a scalar function work on vectors

In [31]:
# Define a scalar function
def foo(x):
    if x % 2 == 1:
        return x**2
    else:
        return x/2

# On a scalar
print('x = 10 returns ', foo(10))
print('x = 11 returns ', foo(11))

# On a vector, doesn't work
# print('x = [10, 11, 12] returns ', foo([10, 11, 12]))  # Error 

x = 10 returns  5.0
x = 11 returns  121


In [32]:
# Vectorize foo(). Make it work on vectors.
foo_v = np.vectorize(foo, otypes=[float])

print('x = [10, 11, 12] returns ', foo_v([10, 11, 12]))
print('x = [[10, 11, 12], [1, 2, 3]] returns ', foo_v([[10, 11, 12], [1, 2, 3]]))

x = [10, 11, 12] returns  [  5. 121.   6.]
x = [[10, 11, 12], [1, 2, 3]] returns  [[  5. 121.   6.]
 [  1.   1.   9.]]


## 7.2 apply_along_axis – Apply a function column wise or row wise

In [33]:
# Create a 4x10 random array
np.random.seed(100)
arr_x = np.random.randint(1,10,size=[4,10])
arr_x

array([[9, 9, 4, 8, 8, 1, 5, 3, 6, 3],
       [3, 3, 2, 1, 9, 5, 1, 7, 3, 5],
       [2, 6, 4, 5, 5, 4, 8, 2, 2, 8],
       [8, 1, 3, 4, 3, 6, 9, 2, 1, 8]])

In [34]:
# Define func1d
def max_minus_min(x):
    return np.max(x) - np.min(x)

# Apply along the rows
print('Row wise: ', np.apply_along_axis(max_minus_min, 1, arr=arr_x))

# Apply along the columns
print('Column wise: ', np.apply_along_axis(max_minus_min, 0, arr=arr_x))

Row wise:  [8 8 6 8]
Column wise:  [7 8 2 7 6 5 8 5 5 5]


## 7.3 searchsorted – Find the location to insert so the array will remain sorted

In [62]:
# example of searchsorted
x = np.arange(10)
print('Where should 5 be inserted?: ', np.searchsorted(x, 5))
print('Where should 5 be inserted (right)?: ', np.searchsorted(x, 5, side='right'))

Where should 5 be inserted?:  5
Where should 5 be inserted (right)?:  6


In [63]:
fx = x.astype('float')
fx

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [68]:
probs = fx/10  # probabilities
probs

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [71]:
probs.cumsum()

array([0. , 0.1, 0.3, 0.6, 1. , 1.5, 2.1, 2.8, 3.6, 4.5])

In [66]:
# Randomly choose an item from a list based on a predefined probability
lst = range(10000)  # the list
probs = np.random.random(10000); probs /= probs.sum()  # probabilities

%timeit lst[np.searchsorted(probs.cumsum(), np.random.random())]
%timeit np.random.choice(lst, p=probs)

33.7 µs ± 691 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
1.15 ms ± 56.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## 7.4 How to add a new axis to a numpy array?

In [72]:
# Create a 1D array
x = np.arange(5)
print('Original array: ', x)

# Introduce a new column axis
x_col = x[:, np.newaxis]
print('x_col shape: ', x_col.shape)
print(x_col)

# Introduce a new row axis
x_row = x[np.newaxis, :]
print('x_row shape: ', x_row.shape)
print(x_row)

Original array:  [0 1 2 3 4]
x_col shape:  (5, 1)
[[0]
 [1]
 [2]
 [3]
 [4]]
x_row shape:  (1, 5)
[[0 1 2 3 4]]


# 7.5 More Useful Functions

In [74]:
# Create the array and bins
x = np.arange(10)
bins = np.array([0, 3, 6, 9])
print(x)
print(bins)
# Get bin allotments
np.digitize(x, bins)

[0 1 2 3 4 5 6 7 8 9]
[0 3 6 9]


array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4], dtype=int64)

In [78]:
print(x)
# Cap all elements of x to lie between 3 and 8
# All number lesser than the lower limit will be replaced by the lower limit. Same applies to the upper limit also.
np.clip(x, 3, 8)

[0 1 2 3 4 5 6 7 8 9]


array([3, 3, 3, 3, 4, 5, 6, 7, 8, 8])

In [79]:
# Bincount example
x = np.array([1,1,2,2,2,4,4,5,6,6,6]) # doesn't need to be sorted
np.bincount(x) # 0 occurs 0 times, 1 occurs 2 times, 2 occurs thrice, 3 occurs 0 times, ...

# Histogram example
counts, bins = np.histogram(x, [0, 2, 4, 6, 8])
print('Counts: ', counts)
print('Bins: ', bins)

Counts:  [2 3 3 3]
Bins:  [0 2 4 6 8]
