# 101 Numpy Exercises for Data Analysis (Python)
source: Machine Leanring +

**1. Import numpy as np and see the version**

In [1]:
import numpy as np

print(np.__version__)

2.1.3


**2. How to create a 1D array?**

In [2]:
arr = np.array([0,1,2,3,4,5,6,7,8,9])

# arr = np.arange(10) (alternate answer)

arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

**3. How to create a boolean array?**

In [3]:
arr = np.array([[True, True, True], [True, True, True],[True, True, True]])

# commnet: need to include dtype=bool in the array

arr

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

**4. How to extract items that satisfy a given condition from 1D array?**

In [4]:
# Example: Extract all odd numbers from `arr`

arr = np.arange(10)

arr[arr % 2 == 1] # "modulo is 1" => the number is odd

array([1, 3, 5, 7, 9])

**5. How to replace items that satisfy a condition with another value in numpy array?**

In [5]:
# Example: Replace all odd numbers in `arr` with -1

arr = np.arange(10)

np.where((arr % 2 == 1), arr*-1, arr)

# alternate answer: arr[arr % 2 == 1] = -1

array([ 0, -1,  2, -3,  4, -5,  6, -7,  8, -9])

**6. How to replace items that satisfy a condition without affecting the original array?**

Q: Replace all odd numbers in `arr` with -1 without changing arr


In [6]:


arr = np.arange(10)
out = arr.copy()

np.where((out % 2 == 1), -1, out) # np.where it self makes a COPY!!

# alternate answer: 
# arr = np.arange(10)
# out = np.where(arr % 2 == 1, -1, arr)

# print(arr)
# out

array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

**7. How to reshape an array?**
Q: Convert a 1D array to a 2D array with 2 rows

In [7]:


arr = np.arange(10)

arr.reshape(2,5)

# alternate answer:
# arr.reshape(2, -1) setting -1 automatically decides the number of cols

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

**8. How to stack two arrays vertically?**

Q: Stack arrays `a` and `b` vertically

In [8]:

a = np.arange(10).reshape(2, -1)
b = np.repeat(1, 10).reshape(2, -1)

np.vstack([a,b])

# alternate answers:
# method 1:
np.concatenate([a,b], axis=0)

#method 2
np.r_[a,b]

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

**9. How to stack two arrays horizontally?**

Q: Stack the arrays `a` and `b` horizontally

In [9]:
a = np.arange(10).reshape(2, -1)
b = np.repeat(1, 10).reshape(2, -1)

# method 1
np.hstack([a,b])

# method 2

np.concatenate([a,b], axis=1)

# alternate answer:

# methods 3
np.c_[a,b]

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

**10. How to generate custom sequences in numpy without hardcoding?**

Q: Create the following pattern without hardcoding. Use only numpy functions and the below input array `a`.

In [10]:
a = np.array([1,2,3])

repeat = np.repeat(a, 3)
tile = np.tile(a, 3)

np.hstack([repeat, tile])

array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3])

**11. How to get the common items between two python numpy arrays?**

Q: Get the common items between `a` and `b`

In [11]:
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

np.intersect1d(a,b)

array([2, 4])

**12. How to remove from one array those items that exist in another?**

Q: From array `a` remove all items present in array `b`

In [12]:
a = np.array([1,2,3,4,5])
b = np.array([5,6,7,8,9])

a[~np.isin(a,b)]

# alternate answer:
np.setdiff1d(a,b)

array([1, 2, 3, 4])

**13. How to get the positions where elements of two arrays match?**

Q: Get the positions where elements of `a` and `b` match

In [13]:
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

np.argwhere(a==b).flatten()

# alternate answer:
np.where(a==b)

(array([1, 3, 5, 7]),)

**14. How to extract all numbers between a given range from a numpy array?**

Q: Get all items between 5 and 10 from `a`.

In [14]:
a = np.array([2,6,1,9,10,3,27])

a[np.where((a >= 5) & (a <= 10))]

array([ 6,  9, 10])

**15. How to make a python function that handles scalars to work on numpy arrays?**

Q: Convert the function `maxx` that works on two scalars, to work on two arrays.

In [15]:
def maxx(x, y):
    """Get the maximum of two items"""

    if x >= y:
        return x
    else:
        return y
    

pair_max = np.vectorize(maxx, otypes=[float])

a = np.array([5,7,9,8,6,4,5])
b = np.array([6,3,4,8,9,7,1])

pair_max(a,b)

array([6., 7., 9., 8., 9., 7., 5.])

**16. How to swap two columns in a 2d numpyt array?**

Q: Swap columns 1 and 2 in the array `arr`.

In [16]:
arr = np.arange(9).reshape(3,3)

# Answer:
arr[:, [1,0,2]]

array([[1, 0, 2],
       [4, 3, 5],
       [7, 6, 8]])

**17. How to swap two rows in a 2d numpy array?**

Q: Swap rows 1 and 2 in the array `arr`

In [17]:
arr = np.arange(9).reshape(3,3)

arr[[1,0,2],:]

array([[3, 4, 5],
       [0, 1, 2],
       [6, 7, 8]])

**18. How to reverse the rows of a 2D array?**

Q: Reverse the rows of a 2D array `arr`

In [18]:
arr = np.arange(9).reshape(3,3)

arr[::-1, :] # can just be arr[::-1]

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

**19. How to reverse the columns of a 2D array?**

Q: Reverse the columns of a 2D array `arr`

In [19]:
arr = np.arange(9).reshape(3,3)

arr[:, ::-1]

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

**20. How to create a 2D array containing random floats between 5 and 10?**

Q: Create a 2D array of shape 5x3 to contain random decimal numbers between 5 and 10.

In [20]:
arr = np.random.uniform(5, 10, (5,3))

arr

array([[9.70041542, 9.2409287 , 5.95013704],
       [9.97173198, 9.85632295, 5.60722558],
       [9.63466391, 5.5007834 , 6.72528867],
       [9.03781119, 6.96894915, 9.49285407],
       [6.18427814, 7.78499114, 9.61211027]])

**21. How to print only 3 decimal places in python numpy array?**

Q: Print or show only 3 decimal places of the numpy array `rand_arr`

In [21]:
rand_arr = np.random.random((5,3))

rand_arr.round(3)


# Answer:

rand_arr = np.random.random((5,3))

rand_arr = np.random.random([5,3])

np.set_printoptions(precision=3)
rand_arr[:4]

array([[0.758, 0.646, 0.862],
       [0.697, 0.329, 0.236],
       [0.061, 0.516, 0.305],
       [0.585, 0.274, 0.7  ]])

**22. How to pretty print a numpy array by suppressing the scientific notation (like 1e10)?**

Q: Pretty print `rand_arr` by suppressing the scientific notation (like 1e10)

In [22]:
# Answer:
np.set_printoptions(suppress=True, precision=6)

np.random.seed(100)
rand_arr = np.random.random([3,3])/1e3

rand_arr

array([[0.000543, 0.000278, 0.000425],
       [0.000845, 0.000005, 0.000122],
       [0.000671, 0.000826, 0.000137]])

**23. How to limit the number of items printed in output of numpy array?**

Q: Limit the number of items printed in python numpy array `a` to a maximum of 6 elements.

In [23]:
# Answer:
np.set_printoptions(threshold=6)

a = np.arange(15)

a

array([ 0,  1,  2, ..., 12, 13, 14])

**24. How to print the full numpy array without truncating**

Q: Print the full numpy array `a` without truncating

In [24]:
np.set_printoptions(threshold=np.inf)
a = np.arange(15)

a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

**25. How to import a dataset with numbers and texts keeping the text intact in python numpy?**

Q: Import the iris dataset keeping the text intact

In [25]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

iris[:3]

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa']], dtype=object)

**26. How to extrat a particular column from 1D array of tuples?**

Q: Extract the text column `species` from the 1D `iris` imported in previous quetion.

In [26]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_1d = np.genfromtxt(url, delimiter=',', dtype=None)

# Answer:
species = np.array([row[4] for row in iris_1d]) # you can use list comprehension

species[:5]

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa'], dtype='<U15')

**27. How to convert a 1d array of tuples to a 2d numpy array?**

Q: Convert the 1D `iris` to 2D array `iris_2d` by omitting the `species` text field.

In [27]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_1d = np.genfromtxt(url, delimiter=',', dtype=None)

# Answer:

iris_2d = np.array([row.tolist()[:4] for row in iris_1d])

iris_2d[:4]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]])

**28. How to compute the mean, median, standard deviation of a numpy array?**

Q: Find the mean, median, standard deviation of iris's `sepallength` (1st column)

In [28]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Answer:

sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0]) # changing the dtype to float and use usecols to extract the first column

print(np.mean(sepallength), np.median(sepallength), np.std(sepallength))

5.843333333333334 5.8 0.8253012917851409


**29. How to normalize an array so the values range exactly between 0 and 1?**

Q: Create a normalized form of `iris`'s `sepallength` whose values range exactly between 0 and 1 so that the minimum has value 0 and maximum has value 1.

In [29]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

# WRONG!!!!!!!!
(sepallength-np.mean(sepallength))/np.std(sepallength)

# Answer:

Smax, Smin = sepallength.max(), sepallength.min()
S = (sepallength - Smin) / (Smax - Smin)

S

array([0.222222, 0.166667, 0.111111, 0.083333, 0.194444, 0.305556,
       0.083333, 0.194444, 0.027778, 0.166667, 0.305556, 0.138889,
       0.138889, 0.      , 0.416667, 0.388889, 0.305556, 0.222222,
       0.388889, 0.222222, 0.305556, 0.222222, 0.083333, 0.222222,
       0.138889, 0.194444, 0.194444, 0.25    , 0.25    , 0.111111,
       0.138889, 0.305556, 0.25    , 0.333333, 0.166667, 0.194444,
       0.333333, 0.166667, 0.027778, 0.222222, 0.194444, 0.055556,
       0.027778, 0.194444, 0.222222, 0.138889, 0.222222, 0.083333,
       0.277778, 0.194444, 0.75    , 0.583333, 0.722222, 0.333333,
       0.611111, 0.388889, 0.555556, 0.166667, 0.638889, 0.25    ,
       0.194444, 0.444444, 0.472222, 0.5     , 0.361111, 0.666667,
       0.361111, 0.416667, 0.527778, 0.361111, 0.444444, 0.5     ,
       0.555556, 0.5     , 0.583333, 0.638889, 0.694444, 0.666667,
       0.472222, 0.388889, 0.333333, 0.333333, 0.416667, 0.472222,
       0.305556, 0.472222, 0.666667, 0.555556, 0.361111, 0.333

**30. How to compute the softmax score?**

Q: Compute the softmax score of `sepallength`

In [30]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

# Answer:

def softmax(x):
    """Compute softmax values for each sets of scores in x."""

    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

print(softmax(sepallength))

[0.00222  0.001817 0.001488 0.001346 0.002008 0.002996 0.001346 0.002008
 0.001102 0.001817 0.002996 0.001644 0.001644 0.000997 0.00447  0.004044
 0.002996 0.00222  0.004044 0.00222  0.002996 0.00222  0.001346 0.00222
 0.001644 0.002008 0.002008 0.002453 0.002453 0.001488 0.001644 0.002996
 0.002453 0.003311 0.001817 0.002008 0.003311 0.001817 0.001102 0.00222
 0.002008 0.001218 0.001102 0.002008 0.00222  0.001644 0.00222  0.001346
 0.002711 0.002008 0.01484  0.008144 0.013428 0.003311 0.009001 0.004044
 0.007369 0.001817 0.009947 0.002453 0.002008 0.00494  0.005459 0.006033
 0.003659 0.010994 0.003659 0.00447  0.006668 0.003659 0.00494  0.006033
 0.007369 0.006033 0.008144 0.009947 0.01215  0.010994 0.005459 0.004044
 0.003311 0.003311 0.00447  0.005459 0.002996 0.005459 0.010994 0.007369
 0.003659 0.003311 0.003311 0.006033 0.00447  0.002008 0.003659 0.004044
 0.004044 0.006668 0.00222  0.004044 0.007369 0.00447  0.016401 0.007369
 0.009001 0.02704  0.001817 0.020032 0.010994 0.01812

**31. How to find the percentile scores of a numpy array?**

Q: Find the 5th and 95th percentile of iris's `sepallength`

In [31]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

print(np.percentile(sepallength, 0.05), np.percentile(sepallength,0.95))

# np.percentile(sepallength, q=[5,95]) # why does it have to be this!!!!!??!!!!?!!??!/?!@?!?!?!?!??!?!!?!?!

4.30745 4.4


**32. How to insert values at random positions in an array?**

Q: Insert `np.nan` values at 20 random positions in `iris_2d` dataset

In [32]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')

pos1 = np.random.randint(0,len(iris_2d), 20)
pos2 = np.random.randint(0, 5, 20)

iris_2d[pos1, pos2] = np.nan

iris_2d

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa'],
       [nan, b'3.6', b'1.4', b'0.2', b'Iris-setosa'],
       [b'5.4', b'3.9', b'1.7', b'0.4', b'Iris-setosa'],
       [b'4.6', b'3.4', b'1.4', b'0.3', b'Iris-setosa'],
       [b'5.0', b'3.4', b'1.5', b'0.2', b'Iris-setosa'],
       [b'4.4', b'2.9', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.1', b'1.5', b'0.1', b'Iris-setosa'],
       [b'5.4', b'3.7', b'1.5', b'0.2', b'Iris-setosa'],
       [b'4.8', b'3.4', b'1.6', b'0.2', b'Iris-setosa'],
       [b'4.8', b'3.0', b'1.4', b'0.1', b'Iris-setosa'],
       [b'4.3', b'3.0', b'1.1', b'0.1', b'Iris-setosa'],
       [b'5.8', b'4.0', b'1.2', b'0.2', b'Iris-setosa'],
       [b'5.7', b'4.4', b'1.5', b'0.4', b'Iris-setosa'],
       [b'5.4', b'3.9', b'1.3', b'0.4', b'Iris-setosa'],
       [b'5.1', b'3.5', b'1.4', b'

**34. How to filter a numpy array based on two or more conditions?**

Q: Filter the rows of `iris_2d` that has `petallength (3rd column) > 1.5` and `sepallength (1st column) < 5.0`>

In [33]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

iris_2d[(iris_2d[:,2] > 1.5) & (iris_2d[:,0] < 5.0)]

array([[4.8, 3.4, 1.6, 0.2],
       [4.8, 3.4, 1.9, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [4.9, 2.4, 3.3, 1. ],
       [4.9, 2.5, 4.5, 1.7]])

**35. How to drop rows that contain a missing value from a numpy array?**

Q: Select the rows of iris_2d that does not have any `nan` value.

In [34]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# Answer:

any_nan_in_row = np.array([~np.any(np.isnan(row)) for row in iris_2d])
iris_2d[any_nan_in_row][:5]

array([[4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [4.6, 3.4, 1.4, 0.3]])

**36. How to find the correlation between two columns of a numpt array?**

Q: Find the correlation between SepalLength (1st column) and PetalLength (3rd column) in iris_2d

In [35]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

# Answers:

# Method 1:
np.corrcoef(iris_2d[:, 0], iris_2d[:, 2])[0, 1]

# Method 2:
from scipy.stats.stats import pearsonr
corr, p_value = pearsonr(iris_2d[:, 0], iris_2d[:, 2])
print(corr, p_value)

0.871754157304871 1.0384540627942314e-47


  from scipy.stats.stats import pearsonr


**37. How to find if a given array has any null values?**

Q: Find out if `iris_2d` has any missing values.

In [36]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

np.isnan(iris_2d).any(axis=1)

# Answer:
np.isnan(iris_2d).any()

np.False_

**38. How to replace all missing values with 0 in a numpy array?**

Q: Replace all occurrences of `nan` with 0 in numpy array.

In [37]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

iris_2d[np.isnan(iris_2d)] = 0

iris_2d[:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0. ],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 0. , 1.5, 0.1]])

**39. How to find the count of unique values in a numpy array?**

Q: Find the unique values and the count of unique values in iris's `species`.

In [38]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

print(np.unique(iris[:, 4], return_counts=True))

# Alternate answer:
species = np.array([row.tolist()[4] for row in iris])
np.unique(species, return_counts=True)

(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
      dtype=object), array([50, 50, 50]))


(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
       dtype='|S15'),
 array([50, 50, 50]))

**40. How to convert a numeric to a categorical (text) array?**

Q: Bin the petal length (3rd) column of iris_2d to form a text array, such that if petal length is:
* less than 3 -> 'small'
* 3-5 -> 'medium'
* greater than equal to 5' -> 'large'

In [39]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

def petalLengthToCategory(length, breaks=[3,5], category=['small', 'medium', 'large']):
    i = np.searchsorted(breaks, length)
    return category[i]

[petalLengthToCategory(length) for length in iris[:,2]][:4]

# Alternate answer:
petal_length_bin = np.digitize(iris[:, 2].astype('float'), [0,3,5,10])

label_map = {1: 'small', 2: 'medium', 3: 'large', 4: np.nan}
petal_length_cat = [label_map[x] for x in petal_length_bin]

petal_length_cat[:4]

['small', 'small', 'small', 'small']

**41. How to create a new column from existing columns of a numpy array?**

Q: Create a new column for volume in iris_2d, where volume is `(pi x petallength x sepal_length^2)/3`

In [40]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

petallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[2])
sepal_length = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

new_column = (np.pi*petallength*sepal_length**2)/3

new_iris = np.c_[iris,new_column]

new_iris[:4]



array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa',
        38.13265162927291],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa',
        35.200498485922445],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa', 30.0723720777127],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa',
        33.238050274980004]], dtype=object)

**42. How to do probabilistic sampling in numpy?**

Q: Randomly sample `iris`'s `species` such that `setose` is twice the number of `versicolor` and `virginica`.

In [41]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Answer:

species = iris[:, 4]

# Method 1:
np.random.seed(100)
a = np.array(['Iris-setosa','Iris-versicolor','Iris-virginica'])
species_out = np.random.choice(a, 150, p=[0.5, 0.25, 0.25])

# Method 2:
np.random.seed(100)
probs = np.r_[np.linspace(0,0.500, num=50), np.linspace(0.501, 0.750, num=50), np.linspace(0.751, 1.0, num=50)]
index = np.searchsorted(probs, np.random.random(150))
species_out = species[index]
print(np.unique(species_out, return_counts=True))

(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
      dtype=object), array([77, 37, 36]))


**43. How to get the second largest value of an array when grouped by another array?**

Q: What is value of second longest `petallength` of species `setosa`.

In [42]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

setosa = iris[iris[:, 4] == b'Iris-setosa']

setosa[:, 2].astype('float').max(axis=0)

# Answer:

petal_len_setosa = iris[iris[:, 4] == b'Iris-setosa', [2]].astype('float')

np.unique(np.sort(petal_len_setosa))[-2]

np.float64(1.7)

**44. How to sort a 2D array by a column**

Q: Sort iris dataset based on `sepallength` column.

In [43]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Answer:
iris[iris[:, 0].argsort()][:20]

array([[b'4.3', b'3.0', b'1.1', b'0.1', b'Iris-setosa'],
       [b'4.4', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.4', b'3.0', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.4', b'2.9', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.5', b'2.3', b'1.3', b'0.3', b'Iris-setosa'],
       [b'4.6', b'3.6', b'1.0', b'0.2', b'Iris-setosa'],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa'],
       [b'4.6', b'3.4', b'1.4', b'0.3', b'Iris-setosa'],
       [b'4.6', b'3.2', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.6', b'0.2', b'Iris-setosa'],
       [b'4.8', b'3.0', b'1.4', b'0.1', b'Iris-setosa'],
       [b'4.8', b'3.0', b'1.4', b'0.3', b'Iris-setosa'],
       [b'4.8', b'3.4', b'1.9', b'0.2', b'Iris-setosa'],
       [b'4.8', b'3.4', b'1.6', b'0.2', b'Iris-setosa'],
       [b'4.8', b'3.1', b'1.6', b'0.2', b'Iris-setosa'],
       [b'4.9', b'2.4', b'3.3', b'1.0', b'Iris-versicolor'],
       [b'4.9', b'2.5', b'4

**45. How to find the most frequent value in a numpy array?**

Q: Find the most frequent value of petal length (3rd column) in iris dataset.

In [44]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Answer:
val, counts = np.unique(iris[:, 2], return_counts=True)
val[np.argmax(counts)]

b'1.5'

**46. How to find the position of the first occurence of a value greater than a given value?**

Q: Find the position of the first occurrence of a value greater than 1.0 in petalwidth 4th column of iris dataset.

In [45]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

threshold = 1.0

petalwidth = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[3])
np.searchsorted(petalwidth, threshold)

# alternate answer
np.argwhere(iris[:, 3].astype(float) > 1.0)[0]

array([50])

**47. How to replace all values greater than a given value to a given cutoff?**

Q: From the array `a`, replace all values greater than 30 to 30 and less than 10 to 10.

In [46]:
np.random.seed(100)
a = np.random.uniform(1,50, 20)

np.clip(a, a_min=10, a_max=30).round(2)

array([27.63, 14.64, 21.8 , 30.  , 10.  , 10.  , 30.  , 30.  , 10.  ,
       29.18, 30.  , 11.25, 10.08, 10.  , 11.77, 30.  , 30.  , 10.  ,
       30.  , 14.43])

**48. How to get the positions of top values from a numpy array?**

Q: Get the positions of top 5 maximum values in a given array `a`.

In [47]:
np.random.seed(100)
a = np.random.uniform(1,50, 20)

a.argsort()

array([ 4, 13,  5,  8, 17, 12, 11, 14, 19,  1,  2,  0,  9,  6, 16, 18,  7,
        3, 10, 15])

**49. How to compute the row wise counts of all possible values in an array?**

Q: Compute the counts of unique values row-wise.


In [48]:
np.random.seed(100)
arr = np.random.randint(1,11,size=(6, 10))



In [49]:
# create a plain array with zeros
result = np.zeros_like(arr)

In [50]:
for row_pos, row in enumerate(arr.tolist()):
    for value in row:
        result[row_pos, value-1] += 1

result 

array([[1, 0, 2, 1, 1, 1, 0, 2, 2, 0],
       [2, 1, 3, 0, 1, 0, 1, 0, 1, 1],
       [0, 3, 0, 2, 3, 1, 0, 1, 0, 0],
       [1, 0, 2, 1, 0, 1, 0, 2, 1, 2],
       [2, 2, 2, 0, 0, 1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1, 2, 0, 0, 2, 1]])

**50. How to convert an array of arrays into a flat 1d array?**

Q: Convert `array_of_arrays` into a flat linear 1d array.

In [51]:
arr1 = np.arange(3)
arr2 = np.arange(3,7)
arr3 = np.arange(7,10)

array_of_arrays = np.r_[arr1, arr2, arr3]

array_of_arrays

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

**51. How to generate one-hot encodings for an array in numpy?**

Q: Compute the one-hot encodings (dummy binary variables for each unique value in the array)

In [60]:
np.random.seed(101)

def addOneToPosition(pos):
    """makes an array of 3 values of 0 and add 1 to the given position"""

    result = np.array([0,0,0])
    result[pos-1] += 1

    return result

arr = np.random.randint(1,4, size=6)

np.array([addOneToPosition(row[0]) for row in np.vstack(arr)])

# This works for this particular problem. The solution should cover all the possible array size.

# Answer 1:

def one_hot_encodings(arr):
    uniqs = np.unique(arr)
    out = np.zeros((arr.shape[0], uniqs.shape[0]))
    for i,k in enumerate(arr):
        out[i, k-1] = 1
    return out

one_hot_encodings(arr)

# Answer 2:

(arr[:, None] == np.unique(arr)).view(np.int8) # each numbers in bits

array([[0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [1, 0, 0]], dtype=int8)

**52. How to create row numbers grouped by a categorical variable?**

Q: Create row numbers grouped by a categorical variable. Use the following sample from `iris` `species` as input.

In [61]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
species = np.genfromtxt(url, delimiter=',', dtype='str', usecols=4)
species_small = np.sort(np.random.choice(species, size=20))
species_small

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica'], dtype='<U15')

In [85]:
result = []
for species in np.unique(species_small).tolist():

    left_pos = np.searchsorted(species_small, species, side='left')
    right_pos = np.searchsorted(species_small, species, side='right')
    diff = right_pos - left_pos
    freq = list(range(0, diff))

    result.extend(freq)

result

# Answer:

print([i for val in np.unique(species_small) for i, grp in enumerate(species_small[species_small==val])])

[0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 6, 7, 8]


**53. How to create group ids based on a given categorical variable?**

Q: Create group ids based on a given categorical variable. Use the following sample from `iris species` as input.

In [86]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
species = np.genfromtxt(url, delimiter=',', dtype='str', usecols=4)
species_small = np.sort(np.random.choice(species, size=20))
species_small

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica'], dtype='<U15')

In [93]:
[i for val in species_small for i, grp in enumerate(np.unique(species_small)) if val==grp]

# Answer 1:

output = [np.argwhere(np.unique(species_small) == s).tolist()[0][0] for val in np.unique(species_small) for s in species_small[species_small==val]]

# Answer 2:
output = []
uniqs = np.unique(species_small)

for val in uniqs:  # uniq values in group
    for s in species_small[species_small==val]:  # each element in group
        groupid = np.argwhere(uniqs == s).tolist()[0][0]  # groupid
        output.append(groupid)

print(output)

[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2]


**54. How to rank items in an array using numpy?**

Q: Create the ranks for the given numeric array `a`.

In [94]:
np.random.seed(10)
a = np.random.randint(20, size=10)
print(a)

[ 9  4 15  0 17 16 17  8  9  0]


In [96]:
# Answer:

a.argsort().argsort().tolist()

[4, 2, 6, 0, 8, 7, 9, 3, 5, 1]

**55. How to rank items in a multidimensional array using numpy?**

Q: Create a rank array of the same shape as a given numeric array `a`.

In [97]:
np.random.seed(10)
a = np.random.randint(20, size=[2,5])
print(a)

[[ 9  4 15  0 17]
 [16 17  8  9  0]]


In [98]:
# Answer
print(a.ravel().argsort().argsort().reshape(a.shape))

[[4 2 6 0 8]
 [7 9 3 5 1]]


**56. How to find the maximum value in each row of a numpy array 2d?**

Q: Compute the maximum for each row in the given array.

In [99]:
np.random.seed(100)
a = np.random.randint(1,10, [5,3])
a

array([[9, 9, 4],
       [8, 8, 1],
       [5, 3, 6],
       [3, 3, 3],
       [2, 1, 9]])

In [101]:
a.max(axis=1)

# Answer 1:
np.amax(a, axis=1)

# Answer 2:
np.apply_along_axis(np.max, arr=a, axis=1)

array([9, 8, 6, 3, 9])

**57. How to compute the min-by-max for each row for a numpy array 2d?**

Q: Compute the min-by-max for each row for given 2d numpy array.

In [102]:
np.random.seed(100)
a = np.random.randint(1,10, [5,3])
a

array([[9, 9, 4],
       [8, 8, 1],
       [5, 3, 6],
       [3, 3, 3],
       [2, 1, 9]])

In [105]:
np.apply_along_axis(lambda x: np.min(x)/np.max(x), arr=a, axis=1)

array([0.444444, 0.125   , 0.5     , 1.      , 0.111111])

**58. How to find the duplicte records in a numpy array?**

Q: Find the duplicate entries (2nd occurrence onwards) in the given numpy array and mark them as `True`. First time occurrences should be `False`.

In [106]:
np.random.seed(100)
a = np.random.randint(0, 5, 10)
print('Array: ', a)

Array:  [0 0 3 0 2 4 2 2 2 2]


In [None]:
## Answer:

# Create an all True array
out = np.full(a.shape[0], True)

# Find the index position of unique elements
unique_positions = np.unique(a, return_index=True)[1]

# Mark those positions as False
out[unique_positions] = False

print(out)

[False  True False  True False False  True  True  True  True]


**59. How to find the grouped mean in numpy?**

Q: Find the mean of a numeric column grouped by a categorical column in a 2D numpy array**

In [124]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

iris[:4]

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa']], dtype=object)

In [142]:
sepalwidth = iris[:, 1].astype('float')
species = iris[:, 4]

for species in np.unique(species):
    data = iris[iris[:, 4]==species][:, 1].astype('float')

    print(np.hstack([species, np.mean(data).round(3)]))

[b'Iris-setosa' b'3.418']
[b'Iris-versicolor' b'2.77']
[b'Iris-virginica' b'2.974']


In [140]:
## Answer:

sepalwidth = iris[:, 1].astype('float')
species = iris[:, 4]

[[group_val, sepalwidth[species==group_val].mean().round(3)] for group_val in np.unique(species)]

[[b'Iris-setosa', np.float64(3.418)],
 [b'Iris-versicolor', np.float64(2.77)],
 [b'Iris-virginica', np.float64(2.974)]]

**60. How to convert a PIL image to numpy array?**

Q: Import the image from the following URL and convert it to a numpy array

In [144]:
from io import BytesIO
from PIL import Image
import PIL, requests

# Import image from URL
URL = 'https://upload.wikimedia.org/wikipedia/commons/8/8b/Denali_Mt_McKinley.jpg'
response = requests.get(URL)

# Read it as Image
I = Image.open(BytesIO(response.content))

# Optionally resize
I = I.resize([150,150])

# Convert to numpy array
arr = np.asarray(I)

# Optionaly Convert it back to an image and show
im = PIL.Image.fromarray(np.uint8(arr))
Image.Image.show(im)

**61. How to drop all missing values from a numpy array?**

Q: Drop all `nan` values from a 1D numpy array

In [147]:
a = np.array([1,2,3,np.nan,5,6,7,np.nan])
a

array([ 1.,  2.,  3., nan,  5.,  6.,  7., nan])

In [149]:
a[~np.isnan(a)]

array([1., 2., 3., 5., 6., 7.])

**62. How to compute the euclidean distance between two arrays?**

Q: Compute the euclidean distance between two arrays `a` and `b`.

In [151]:
a = np.array([1,2,3,4,5])
b = np.array([4,5,6,7,8])

In [153]:
euclidean_distance = np.sqrt(np.sum((a-b)**2))

print(euclidean_distance)

## Answer:

dist = np.linalg.norm(a-b)
dist

6.708203932499369


np.float64(6.708203932499369)

**63. How to find all the local maxima (or peaks) in a 1d array?**

Q: Find all the peaks in a 1D numpy array `a`. Peaks are points surrounded by smaller values on both sides.

In [154]:
a = np.array([1,3,7,1,2,6,0,1])
a

array([1, 3, 7, 1, 2, 6, 0, 1])

In [161]:
## Answer:

doublediff = np.diff(np.sign(np.diff(a)))
peak_locations = np.where(doublediff == -2)[0] + 1
peak_locations

array([2, 5])

**64. How to subtract a 1d array from a 2d array, where each item of 1d array subtracts from respective row?**

Q: Subtract the 1d array `b_1d` from the 2d array `a_2d`, such that each item of `b_1d` subtracts from respective row of `a_2d`.

In [169]:
a_2d = np.array([[3,3,3],[4,4,4],[5,5,5]])
b_1d = np.array([1,2,3])

a_2d, b_1d

(array([[3, 3, 3],
        [4, 4, 4],
        [5, 5, 5]]),
 array([1, 2, 3]))

In [176]:
a_2d - np.vstack(b_1d)

## Answer:

print(a_2d - b_1d[:,None])

[[2 2 2]
 [2 2 2]
 [2 2 2]]


**65. How to find the index of n'th repetition of an item in an array**

Q: Find the index of 5th repetition of number 1 in `x`

In [177]:
x = np.array([1, 2, 1, 1, 3, 4, 3, 1, 1, 2, 1, 1, 2])
x

array([1, 2, 1, 1, 3, 4, 3, 1, 1, 2, 1, 1, 2])

In [187]:
## Answer 1:
n = 5

[i for i, v in enumerate(x) if v == 1][n-1]

## Answer 2:

np.where(x==1)[0][n-1]

np.int64(8)

**66. How to convert numpy's `datetime64` object to datetime's `datetime` object?**

Q: Convert numpy's `datetime64` object to datetime's `datetime` object

In [194]:
dt64 = np.datetime64('2018-02-25 22:10:10')

dt64

np.datetime64('2018-02-25T22:10:10')

In [199]:
## Answer 1:

dt64.tolist()

## Answer 2:
from datetime import datetime
dt64.astype(datetime)

datetime.datetime(2018, 2, 25, 22, 10, 10)

**67. How to compute the moving average of a numpy array?**

Q: Compute the moving average of window size 3, for the given 1D array.

In [201]:
np.random.seed(100)
Z = np.random.randint(10, size=10)
Z

array([8, 8, 3, 7, 7, 0, 4, 2, 5, 2])

In [203]:
# Source: https://stackoverflow.com/questions/14313510/how-to-calculate-moving-average-using-numpy
def moving_average(a, n=3) :
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

np.random.seed(100)
Z = np.random.randint(10, size=10)
print('array: ', Z)

## Answer 1:
moving_average(Z, n=3).round(2)

# Answer 2:
# np.ones(3)/3 gives equal weights. Use np.ones(4)/4 for window size 4.
np.convolve(Z, np.ones(3)/3, mode='valid')


array:  [8 8 3 7 7 0 4 2 5 2]


array([6.333333, 6.      , 5.666667, 4.666667, 3.666667, 2.      ,
       3.666667, 3.      ])

**68. How to create a numpy array sequence given only the starting point, length and the step?**

Q: Create a numpy array of length 10, starting from 5 and has a step of 3 between consecutive numbers

In [210]:
arr = np.arange(start=5, stop= 35, step=3)

arr

## Answer:
length = 10
start = 5
step = 3

def seq(start, length, step):
    end = start + (step*length)
    return np.arange(start, end, step)

seq(start, length, step)


array([ 5,  8, 11, 14, 17, 20, 23, 26, 29, 32])

**69. How to fill in missing dates in an irregular series of numpy dates?**

Q: Given an array of a non-continuous sequence of dates. Make it a continuous sequence of dates, by filling in the missing dates.

In [211]:
dates = np.arange(np.datetime64('2018-02-01'), np.datetime64('2018-02-25'), 2)
print(dates)

['2018-02-01' '2018-02-03' '2018-02-05' '2018-02-07' '2018-02-09'
 '2018-02-11' '2018-02-13' '2018-02-15' '2018-02-17' '2018-02-19'
 '2018-02-21' '2018-02-23']


In [213]:
# Answer 1 ---------------
filled_in = np.array([np.arange(date, (date+d)) for date, d in zip(dates, np.diff(dates))]).reshape(-1)

# add the last day
output = np.hstack([filled_in, dates[-1]])
output

# Answer 2 -------
out = []
for date, d in zip(dates, np.diff(dates)):
    out.append(np.arange(date, (date+d)))

filled_in = np.array(out).reshape(-1)

# add the last day
output = np.hstack([filled_in, dates[-1]])
output


array(['2018-02-01', '2018-02-02', '2018-02-03', '2018-02-04',
       '2018-02-05', '2018-02-06', '2018-02-07', '2018-02-08',
       '2018-02-09', '2018-02-10', '2018-02-11', '2018-02-12',
       '2018-02-13', '2018-02-14', '2018-02-15', '2018-02-16',
       '2018-02-17', '2018-02-18', '2018-02-19', '2018-02-20',
       '2018-02-21', '2018-02-22', '2018-02-23'], dtype='datetime64[D]')

**70. How to create strides from a given 1D array?**

Q: From the given 1d array `arr`, generate a 2d matrix using strides, with a window length of 4 and strides of 2, like [[0,1,2,3], [2,3,4,5],[4,5,6,7]..]

In [214]:
arr = np.arange(15) 
arr

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [215]:
# Answer:

def gen_strides(a, stride_len=5, window_len=5):
    n_strides = ((a.size-window_len)//stride_len) + 1
    # return np.array([a[s:(s+window_len)] for s in np.arange(0, a.size, stride_len)[:n_strides]])
    return np.array([a[s:(s+window_len)] for s in np.arange(0, n_strides*stride_len, stride_len)])

print(gen_strides(np.arange(15), stride_len=2, window_len=4))

[[ 0  1  2  3]
 [ 2  3  4  5]
 [ 4  5  6  7]
 [ 6  7  8  9]
 [ 8  9 10 11]
 [10 11 12 13]]
