# Machine Learning Zoomcamp
## 1.7 Introduction to NumPy

**Plan:**
- Creating arrays
- Multi-dimensional arrays
- Randomly generated arrays
- Element-wise operations
    - Comparison operations
    - Logical operations
- Summarizing operations

In [1]:
import numpy as np
np

<module 'numpy' from '/home/codespace/.local/lib/python3.12/site-packages/numpy/__init__.py'>

## Creating arrays

In [2]:
np.zeros(10)


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [3]:
np.ones(10)


array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [4]:
np.full(10, 2.5)


array([2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5])

In [5]:

a = np.array([1, 2, 3, 4, 5])
a


array([1, 2, 3, 4, 5])

In [6]:

a[2] = 10
a


array([ 1,  2, 10,  4,  5])

In [7]:

np.arange(0, 10, 2)  # start at 0, stop before 10, step by 2

array([0, 2, 4, 6, 8])

In [8]:
np.linspace(0, 1, 5)  # 5 evenly spaced values from 0 to 1, both ends included

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

## Multi-dimensional arrays

In [9]:
np.zeros((5, 2))



array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [10]:
n = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])
n


array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [11]:

n[0, 1] = 20
n


array([[ 1, 20,  3],
       [ 4,  5,  6],
       [ 7,  8,  9]])

In [12]:

n[2] = [1, 1, 1]
n


array([[ 1, 20,  3],
       [ 4,  5,  6],
       [ 1,  1,  1]])

In [13]:

n[:, 2] = [0, 1, 2]
n


array([[ 1, 20,  0],
       [ 4,  5,  1],
       [ 1,  1,  2]])

In [14]:

n[:, 1]  # all rows, column 1

array([20,  5,  1])

In [15]:
n[1:, :2]  # from row 1 to end, columns 0 and 1

array([[4, 5],
       [1, 1]])

## Reshaping Arrays

In [16]:
x = np.arange(12)
x


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [17]:
# Only the last expression in a cell is displayed, so just the (2, 6) result shows below.
x.reshape(3, 4)      # 3 rows × 4 columns
x.reshape(2, -1)     # 2 rows; -1 lets NumPy infer the column count (6 here)


array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11]])

In [18]:

grid = np.arange(16).reshape((4, 4))
grid

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

## Concatenation & Splitting

In [19]:
r = np.array([1, 2, 3])
s = np.array([3, 2, 1])

In [20]:
r

array([1, 2, 3])

In [21]:
s

array([3, 2, 1])

In [22]:

np.concatenate([r, s])

array([1, 2, 3, 3, 2, 1])

In [23]:
np.vstack([r, s])


array([[1, 2, 3],
       [3, 2, 1]])

In [24]:
# reshape(-1, 1) turns each 1-D array into a single column; hstack puts the columns side by side
np.hstack([r.reshape(-1,1), s.reshape(-1,1)])



array([[1, 3],
       [2, 2],
       [3, 1]])

In [25]:
# splitting
t = np.arange(10)
t

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [26]:
np.split(t, [3, 6])  # cut before indices 3 and 6 -> three sub-arrays

[array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8, 9])]

## Broadcasting

In [49]:
radio = np.arange(12).reshape(3, 4)
radio

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [50]:
radio + 5 #adds 5 to every element

array([[ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16]])

In [51]:
radio * np.array([1, 2, 3, 4])  # multiplies each row element-wise by [1, 2, 3, 4]

array([[ 0,  2,  6, 12],
       [ 4, 10, 18, 28],
       [ 8, 18, 30, 44]])

## Universal Functions (ufuncs) - vectorized operations

In [53]:
universe = np.arange(5)
universe

array([0, 1, 2, 3, 4])

In [58]:
np.sqrt(universe) # square root of each element

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ])

In [57]:
np.exp(universe)  # e raised to the power of each element

array([ 1.        ,  2.71828183,  7.3890561 , 20.08553692, 54.59815003])

In [60]:
np.log(universe[1:])  # natural logarithm of each element; the slice skips 0 because log(0) is -inf

array([0.        , 0.69314718, 1.09861229, 1.38629436])

In [62]:
# Only the last expression's value (the standard deviation) is displayed below.
np.max(universe)  # maximum value
np.min(universe)  # minimum value
np.mean(universe)  # mean value
np.std(universe)  # standard deviation

np.float64(1.4142135623730951)

## Aggregations

In [63]:
# the same kind of aggregations (sum, mean, std) are used in pandas too
agg = np.random.randint(0, 100, (5, 5))  # 5x5 array of random integers in [0, 100)
agg

array([[80, 52, 76, 50,  4],
       [90, 63, 79, 49, 39],
       [46,  8, 50, 15,  8],
       [17, 22, 73, 57, 90],
       [62, 83, 96, 43, 32]])

In [65]:
agg.sum() #total sum of all elements

np.int64(1284)

In [66]:
agg.sum(axis=0)  # sum of each column

array([295, 228, 374, 214, 173])

In [67]:
agg.sum(axis=1)  # sum of each row

array([262, 320, 127, 259, 316])

In [68]:
agg.mean() # average of all elements

np.float64(51.36)

In [69]:
agg.std()  # standard deviation of all elements

np.float64(27.51273159829827)

## Boolean indexing

In [71]:
#filter data like a pro
z = np.random.randint(0, 100, 10)
z

array([70, 19, 56, 82,  1, 68, 40, 81, 61, 70])

In [74]:
z > 50 # returns a boolean array

array([ True, False,  True,  True, False,  True, False,  True,  True,
        True])

In [80]:
z[z > 50]  # returns elements greater than 50

array([70, 56, 82, 68, 81, 61, 70])

In [81]:
# real ML example: filter outliers
data = np.random.normal(0, 1, 1000)  # 1000 samples from a normal distribution with mean 0 and std 1
data

array([ 1.40473201e+00,  5.39801647e-01, -5.28282347e-01, -8.44260731e-01,
       -2.19216319e+00,  3.67973869e-01,  1.99151856e-01, -2.91949847e+00,
       -1.15925113e+00, -8.51427514e-01,  1.12721446e+00, -3.84014659e-01,
       -7.22558355e-01,  1.41621298e+00,  5.70141267e-01, -1.59014501e+00,
       -6.35243791e-02, -2.44173380e+00, -1.79591943e+00, -1.02477173e+00,
        4.44304318e-01,  4.78942513e-02, -3.37651271e-01, -7.00147654e-01,
       -1.14533806e+00,  6.32191767e-01,  1.81669697e+00, -1.05138862e+00,
       -9.98889774e-01,  4.89501829e-01,  2.81790074e-01,  3.86844084e-01,
        7.71259807e-01,  6.42604143e-01, -1.49327115e+00,  2.31858787e-01,
        1.68888089e-01,  7.67475759e-01, -1.26904429e+00, -8.26050665e-01,
       -1.57208265e-01,  8.10705885e-01,  2.26911211e+00,  8.40992624e-01,
        4.24977984e-01,  1.12208984e+00,  3.42654126e-01, -1.21388478e+00,
        2.97314956e+00, -6.66140046e-01, -1.03942856e+00,  8.15879385e-01,
       -2.71685068e-02, ...])

In [83]:
clean_data = data[np.abs(data) < 3]
# keep only values within 3 standard deviations of the mean
# (data was generated with mean 0 and std 1, so this is simply |value| < 3),
# i.e. remove anything beyond 3 std
clean_data


array([ 1.40473201e+00,  5.39801647e-01, -5.28282347e-01, -8.44260731e-01,
       -2.19216319e+00,  3.67973869e-01,  1.99151856e-01, -2.91949847e+00,
       -1.15925113e+00, -8.51427514e-01,  1.12721446e+00, -3.84014659e-01,
       -7.22558355e-01,  1.41621298e+00,  5.70141267e-01, -1.59014501e+00,
       -6.35243791e-02, -2.44173380e+00, -1.79591943e+00, -1.02477173e+00,
        4.44304318e-01,  4.78942513e-02, -3.37651271e-01, -7.00147654e-01,
       -1.14533806e+00,  6.32191767e-01,  1.81669697e+00, -1.05138862e+00,
       -9.98889774e-01,  4.89501829e-01,  2.81790074e-01,  3.86844084e-01,
        7.71259807e-01,  6.42604143e-01, -1.49327115e+00,  2.31858787e-01,
        1.68888089e-01,  7.67475759e-01, -1.26904429e+00, -8.26050665e-01,
       -1.57208265e-01,  8.10705885e-01,  2.26911211e+00,  8.40992624e-01,
        4.24977984e-01,  1.12208984e+00,  3.42654126e-01, -1.21388478e+00,
        2.97314956e+00, -6.66140046e-01, -1.03942856e+00,  8.15879385e-01,
       -2.71685068e-02, ...])

In [86]:
clean_data.size

996

## Fancy Indexing (pass lists of indices)

In [93]:
apple = np.arange(0, 40, 2)
apple

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32,
       34, 36, 38])

In [94]:
apple[[0, 5, 10, 15]] #pick specific elements/positions

array([ 0, 10, 20, 30])

In [95]:
matrix = np.arange(16).reshape(4, 4)
matrix

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [97]:
matrix[[0, 2, 3], [1, 0, 3]]  
# picks elements in rows 0,2,3 and columns 1,0,3 respectively
# picks elements (0,1), (2,0), (3,3)

array([ 1,  8, 15])

## Randomly generated arrays

In [27]:
np.random.seed(2)  # fix the seed so the same "random" numbers are produced on every run
np.random.rand(5, 2)  # 5x2 array of uniform samples in [0, 1)


array([[0.4359949 , 0.02592623],
       [0.54966248, 0.43532239],
       [0.4203678 , 0.33033482],
       [0.20464863, 0.61927097],
       [0.29965467, 0.26682728]])

In [28]:
np.random.seed(2)
100 * np.random.rand(5, 2)

array([[43.59949021,  2.59262318],
       [54.96624779, 43.53223926],
       [42.03678021, 33.0334821 ],
       [20.4648634 , 61.92709664],
       [29.96546737, 26.68272751]])

In [29]:
np.random.seed(2)  # same seed as above, for reproducible output
np.random.randn(5, 2)  # 5x2 array of samples from the standard normal distribution (mean 0, std 1)


array([[-0.41675785, -0.05626683],
       [-2.1361961 ,  1.64027081],
       [-1.79343559, -0.84174737],
       [ 0.50288142, -1.24528809],
       [-1.05795222, -0.90900761]])

In [30]:
np.random.seed(2)
np.random.randint(low=0, high=100, size=(5, 2))

array([[40, 15],
       [72, 22],
       [43, 82],
       [75,  7],
       [34, 49]])

## Array Attributes

In [31]:
att = np.random.randint(0, 10, (4, 5))
att

array([[5, 4, 4, 5, 7],
       [3, 6, 4, 3, 7],
       [6, 1, 3, 5, 8],
       [4, 6, 3, 9, 2]])

In [32]:
att.shape

(4, 5)

In [33]:
att.ndim

2

In [34]:
att.size

20

In [35]:
att.dtype

dtype('int64')

## Element-wise operations

In [36]:
a = np.arange(5)
a

array([0, 1, 2, 3, 4])

In [37]:
a / 2

array([0. , 0.5, 1. , 1.5, 2. ])

In [38]:
b = (10 + (a * 2)) ** 3 / 100
b

array([10.  , 17.28, 27.44, 40.96, 58.32])

In [39]:
a + b + 10

array([20.  , 28.28, 39.44, 53.96, 72.32])

## Comparison operations

In [40]:
a >= 2

array([False, False,  True,  True,  True])

In [41]:
a > b

array([False, False, False, False, False])

In [42]:
a[a < b]   # elements of a that are smaller than corresponding b

array([0, 1, 2, 3, 4])

## Summarizing operations

In [43]:
a.min()

np.int64(0)

In [44]:
a.max()

np.int64(4)

In [45]:
a.sum()

np.int64(10)

In [46]:
a.std()

np.float64(1.4142135623730951)

In [47]:
n.min()

np.int64(0)

### Next → Linear algebra refresher