First, let's talk about numpy (short for Numerical Python). It's a python library used for exploratory data analysis and it allows us to simplify and optimize many data related operations.

In [19]:
import numpy as np # first we import numpy as shown, if you are using it locally it might have to be downloaded first

In [2]:
# some random data, in this case: array of heights of 45 US presidents
heights = [189, 170, 189, 163, 183, 171, 185,
           168, 173, 183, 173, 173, 175, 178, 183, 193,
           178, 173, 174, 183, 183, 180, 168, 180, 170,
           178, 182, 180, 183, 178, 182, 188, 175, 179,
           183, 193, 182, 183, 177, 185, 188, 188, 182,
           185, 191]

Say we want to check how many presidents are higher than 188 cm. This operation will not necessarily be faster, due to its relative simplicity.



In [18]:
#Without numpy:
import timeit

# code snippet whose execution time is to be measured
setup = "import numpy as np"

code = '''
heights = [189, 170, 189, 163, 183, 171, 185,
           168, 173, 183, 173, 173, 175, 178, 183, 193,
           178, 173, 174, 183, 183, 180, 168, 180, 170,
           178, 182, 180, 183, 178, 182, 188, 175, 179,
           183, 193, 182, 183, 177, 185, 188, 188, 182,
           185, 191]

def high_presidents_normal():
  count = 0
  for height in heights:
    if height > 188:
      count += 1
  print(count)
high_presidents_normal()
'''

time = timeit.timeit(setup=setup, stmt=code, number=1)
print(f'Time: {round(time * 1000)}ms')

5
Time: 1ms


In [17]:
# And with numpy:

setup = "import numpy as np"

code = '''
heights = [189, 170, 189, 163, 183, 171, 185,
           168, 173, 183, 173, 173, 175, 178, 183, 193,
           178, 173, 174, 183, 183, 180, 168, 180, 170,
           178, 182, 180, 183, 178, 182, 188, 175, 179,
           183, 193, 182, 183, 177, 185, 188, 188, 182,
           185, 191]

def high_presidents_numpy():
  heights_arr = np.array(heights)
  print((heights_arr > 188).sum())
high_presidents_numpy()
'''

time = timeit.timeit(setup=setup, stmt=code, number=1)
print(f'Time: {round(time * 1000)}ms')

5
Time: 2ms


We can do many things in numpy:

In [20]:
# check size of the arrays
heights_arr = np.array(heights)
print(heights_arr.size)

45


In [21]:
# check shape (dimensions of the arrays)
heights_arr.shape

(45,)

In [22]:
ages = [57, 61, 57, 57, 58, 57, 61, 54, 68,
        51, 49, 64, 50, 48, 65, 52, 56, 46, 54, 49,
        51, 47, 55, 55, 54, 42, 51, 56, 55, 51, 54,
        51, 60, 62, 43, 55, 56, 61, 52, 69, 64, 46,
        54, 47, 70]

In [28]:
# work with arrays
heights_and_ages = heights + ages
h_and_a = np.array(heights_and_ages)
heights_ages = h_and_a.reshape((2, 45))
print(heights_ages, heights_ages.shape)

[[189 170 189 163 183 171 185 168 173 183 173 173 175 178 183 193 178 173
  174 183 183 180 168 180 170 178 182 180 183 178 182 188 175 179 183 193
  182 183 177 185 188 188 182 185 191]
 [ 57  61  57  57  58  57  61  54  68  51  49  64  50  48  65  52  56  46
   54  49  51  47  55  55  54  42  51  56  55  51  54  51  60  62  43  55
   56  61  52  69  64  46  54  47  70]] (2, 45)


In [24]:
# check the type of the array
h_and_a.dtype

dtype('int64')

In [25]:
# index arrays
heights_arr[2]

189

In [30]:
heights_ages[0, 2]

189

In [31]:
# first three elements
heights_ages[0, 0:3] # does the same thing as heights_ages[0, :3]

array([189, 170, 189])

In [35]:
ages_arr = np.array(ages)
ages_arr = ages_arr.reshape((45,1))
print(ages_arr.shape, ages_arr[:3,])

(45, 1) [[57]
 [61]
 [57]]


In [53]:
# combining (stacking) arrays horizontally
heights_arr = heights_arr.reshape((45,1))
ages_arr = ages_arr.reshape((45,1))
height_age_arr = np.hstack((heights_arr, ages_arr))
print(height_age_arr.shape, height_age_arr[:3,])

(45, 2) [[189  57]
 [170  61]
 [189  57]]


In [44]:
# combining (stacking) arrays vertically
ages_arr = ages_arr.reshape((1,45))
heights_arr = heights_arr.reshape((1,45))
age_height_arr = np.vstack((heights_arr, ages_arr))
print(height_age_arr.shape, height_age_arr[:3,:3])

(45, 2) [[189  57]
 [170  61]
 [189  57]]


In [51]:
height_age_arr = np.concatenate((heights_arr, ages_arr), axis=1)
height_age_arr

array([[189, 170, 189, 163, 183, 171, 185, 168, 173, 183, 173, 173, 175,
        178, 183, 193, 178, 173, 174, 183, 183, 180, 168, 180, 170, 178,
        182, 180, 183, 178, 182, 188, 175, 179, 183, 193, 182, 183, 177,
        185, 188, 188, 182, 185, 191,  57,  61,  57,  57,  58,  57,  61,
         54,  68,  51,  49,  64,  50,  48,  65,  52,  56,  46,  54,  49,
         51,  47,  55,  55,  54,  42,  51,  56,  55,  51,  54,  51,  60,
         62,  43,  55,  56,  61,  52,  69,  64,  46,  54,  47,  70]])

In [54]:
# heights in feet
height_age_arr[:,0]*0.0328084

array([6.2007876, 5.577428 , 6.2007876, 5.3477692, 6.0039372, 5.6102364,
       6.069554 , 5.5118112, 5.6758532, 6.0039372, 5.6758532, 5.6758532,
       5.74147  , 5.8398952, 6.0039372, 6.3320212, 5.8398952, 5.6758532,
       5.7086616, 6.0039372, 6.0039372, 5.905512 , 5.5118112, 5.905512 ,
       5.577428 , 5.8398952, 5.9711288, 5.905512 , 6.0039372, 5.8398952,
       5.9711288, 6.1679792, 5.74147  , 5.8727036, 6.0039372, 6.3320212,
       5.9711288, 6.0039372, 5.8070868, 6.069554 , 6.1679792, 6.1679792,
       5.9711288, 6.069554 , 6.2664044])

In [61]:
young_presidents = height_age_arr[height_age_arr[:, 1] < 55]
young_presidents.reshape((2, 22))

array([[168,  54, 183,  51, 173,  49, 175,  50, 178,  48, 193,  52, 173,
         46, 174,  54, 183,  49, 183,  51, 180,  47],
       [170,  54, 178,  42, 182,  51, 178,  51, 182,  54, 188,  51, 183,
         43, 177,  52, 188,  46, 182,  54, 185,  47]])

In [63]:
fifty_one = (young_presidents[:, :] == 51).sum()
fifty_one

5

In [64]:
mask = height_age_arr[:, 0] >= 182
mask.sum()

23

In [66]:
tall_presidents = height_age_arr[mask, ]
tall_presidents.shape

(23, 2)

In [69]:
mask_2 = (height_age_arr[:, 0] >= 182) & (height_age_arr[:, 1] <= 50)
height_age_arr[mask_2,]

array([[183,  49],
       [183,  43],
       [188,  46],
       [185,  47]])

In [84]:
X = ['1.5', '1', '2', '2.9']
X = [(lambda x: float(x))(x) for x in X]
X = (np.array(X)).reshape(2, 2)
X.mean(axis=1)

array([1.25, 2.45])