In [0]:
import numpy as np

## Array manipulation

In [0]:
# Base 1-D array holding the integers 0 through 8.
arr = np.arange(0, 9)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [0]:
# Reinterpret the nine elements of `arr` as a 3x3 grid (filled row by row).
arr2D_1 = np.reshape(arr, (3, 3))
arr2D_1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [0]:
# Second 3x3 grid, holding 10 through 18, to combine with arr2D_1 below.
arr2D_2 = np.reshape(np.arange(10, 19), (3, 3))
arr2D_2

array([[10, 11, 12],
       [13, 14, 15],
       [16, 17, 18]])

In [0]:
# Join along axis 0 (the default): the rows of the second array follow the first.
np.concatenate([arr2D_1, arr2D_2], axis=0)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [10, 11, 12],
       [13, 14, 15],
       [16, 17, 18]])

In [0]:
# Join along axis 1: each row of the second array is appended to the matching row.
np.concatenate([arr2D_1, arr2D_2], axis=1)

array([[ 0,  1,  2, 10, 11, 12],
       [ 3,  4,  5, 13, 14, 15],
       [ 6,  7,  8, 16, 17, 18]])

In [0]:
# concatenate accepts any number of arrays; here three are stacked along axis 0.
np.concatenate([arr2D_1, arr2D_2, arr2D_1], axis=0)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [10, 11, 12],
       [13, 14, 15],
       [16, 17, 18],
       [ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8]])

In [0]:
# Alternative spelling: vstack stacks row-wise, which for these 2-D inputs
# gives the same result as np.concatenate along axis 0.
np.vstack([arr2D_1, arr2D_2])

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [10, 11, 12],
       [13, 14, 15],
       [16, 17, 18]])

In [0]:
# hstack stacks column-wise, which for these 2-D inputs gives the same result
# as np.concatenate along axis 1.
np.hstack([arr2D_1, arr2D_2])

array([[ 0,  1,  2, 10, 11, 12],
       [ 3,  4,  5, 13, 14, 15],
       [ 6,  7,  8, 16, 17, 18]])

## np.argsort

In [0]:
# Two parallel arrays: score[i] belongs to name[i].
score = np.array([70, 60, 50, 10, 90, 40, 80])
name = np.array(['Ada', 'Ben', 'Charlie', 'Danny', 'Eden', 'Fanny', 'George'])

# argsort returns the indices that would sort `score` ascending; indexing
# `name` with them ranks the names from lowest to highest score.
ascending_order = np.argsort(score)
sorted_name = name[ascending_order]
print(sorted_name)   # ['Danny' 'Fanny' 'Charlie' 'Ben' 'Ada' 'George' 'Eden']

# The argsort of a permutation is its inverse, so applying it to the sorted
# names restores the original ordering.
inverse_order = np.argsort(ascending_order)
original_name = sorted_name[inverse_order]
print(original_name) # ['Ada' 'Ben' 'Charlie' 'Danny' 'Eden' 'Fanny' 'George']

# Micro-benchmark 1: rank the names by score with NumPy (argsort + fancy indexing).
# The figure in the comment below is from an earlier run; timings vary by machine.
%timeit name[np.argsort(score)] 
# 1.83 µs ± 182 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

# Micro-benchmark 2: pure-Python counterpart that sorts (score, name) pairs.
# Note it produces a list of tuples rather than an array of names.
%timeit sorted(zip(score, name))
# 3.2 µs ± 76.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

['Danny' 'Fanny' 'Charlie' 'Ben' 'Ada' 'George' 'Eden']
['Ada' 'Ben' 'Charlie' 'Danny' 'Eden' 'Fanny' 'George']
The slowest run took 40.26 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 2.19 µs per loop
The slowest run took 12.29 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 4.11 µs per loop


## Broadcasting - shapes

In [0]:
# Broadcasting compares shapes from the trailing axis backwards: two sizes are
# compatible when they are equal or one of them is 1, and a missing leading
# axis is treated as size 1.
#
#   Argument 1  (4D array): 7 × 5 × 3 × 1
#   Argument 2  (3D array):     1 × 3 × 9
#   Output      (4D array): 7 × 5 × 3 × 9
#
# The table above used to be pasted as raw text, which made this cell raise a
# SyntaxError. It is now a comment, and the claim is checked with real arrays.
arg1 = np.zeros((7, 5, 3, 1))
arg2 = np.zeros((1, 3, 9))
print(np.broadcast(arg1, arg2).shape)  # (7, 5, 3, 9)

(7, 5, 3, 9)

## Ellipsis and NewAxis — dimensions

In [0]:
# 1000 consecutive integers laid out as a 5-D array; the trailing -1 lets
# reshape infer the last axis (1000 / (2*5*2*10) = 5).
arr = np.arange(1000).reshape((2, 5, 2, 10, -1))

# `...` stands for "as many full slices as needed", so both spellings select
# the same (2, 5, 2) sub-array.
explicit_slices = arr[:, :, :, 3, 2]
with_ellipsis = arr[..., 3, 2]
print(explicit_slices == with_ellipsis)

# Prints a (2, 5, 2) boolean array in which every entry is True.

# np.newaxis inserts a length-1 axis; with `...` in front it lands just before
# the last three axes.
print(arr.shape)                       # (2, 5, 2, 10, 5)
expanded = arr[..., np.newaxis, :, :, :]
print(expanded.shape)                  # (2, 5, 1, 2, 10, 5)

[[[ True  True]
  [ True  True]
  [ True  True]
  [ True  True]
  [ True  True]]

 [[ True  True]
  [ True  True]
  [ True  True]
  [ True  True]
  [ True  True]]]
(2, 5, 2, 10, 5)
(2, 5, 1, 2, 10, 5)


## Masked Array — selection

In [0]:
# np.ma.MaskedArray(data=arr, mask=invalid_mask)

In [0]:
import math

def is_prime(n):
    """Return True when ``n`` is a prime number, False otherwise.

    Uses trial division: even numbers above 2 are rejected immediately, then
    only odd divisors from 3 up to ``int(sqrt(n))`` are tried.

    Raises:
        AssertionError: if ``n`` is not larger than 1.
    """
    assert n > 1, 'Input must be larger than 1'
    if n > 2 and n % 2 == 0:
        return False
    # A composite number always has a divisor no larger than its square root.
    limit = int(math.sqrt(n))
    for divisor in range(3, limit + 1, 2):
        if n % divisor == 0:
            return False
    return True

# Candidate integers 2 through 99.
arr = np.arange(2, 100)

# A True mask entry hides the element, so flag every value that is NOT prime.
non_prime_mask = []
for candidate in arr:
    non_prime_mask.append(not is_prime(candidate))

prime_arr = np.ma.MaskedArray(arr, mask=non_prime_mask)
print(prime_arr)

# [2 3 -- 5 -- 7 -- -- -- 11 -- 13 -- -- -- 17 -- 19 -- -- -- 23 -- -- -- --
#  -- 29 -- 31 -- -- -- -- -- 37 -- -- -- 41 -- 43 -- -- -- 47 -- -- -- --
#  -- 53 -- -- -- -- -- 59 -- 61 -- -- -- -- -- 67 -- -- -- 71 -- 73 -- --
#  -- -- -- 79 -- -- -- 83 -- -- -- -- -- 89 -- -- -- -- -- -- -- 97 -- --]

# Integers 0 through 10; the plain sum counts every element.
arr = np.arange(11)
plain_total = arr.sum()
print(plain_total)      # 55

# Overwrite the last entry with -999, used here as a "missing value" sentinel.
arr[len(arr) - 1] = -999

# masked_values hides every entry equal to the sentinel, so the reduction
# skips it instead of adding -999 to the total.
masked_arr = np.ma.masked_values(arr, -999)
masked_total = masked_arr.sum()
print(masked_total)     # 45

[2 3 -- 5 -- 7 -- -- -- 11 -- 13 -- -- -- 17 -- 19 -- -- -- 23 -- -- -- --
 -- 29 -- 31 -- -- -- -- -- 37 -- -- -- 41 -- 43 -- -- -- 47 -- -- -- --
 -- 53 -- -- -- -- -- 59 -- 61 -- -- -- -- -- 67 -- -- -- 71 -- 73 -- --
 -- -- -- 79 -- -- -- 83 -- -- -- -- -- 89 -- -- -- -- -- -- -- 97 -- --]
55
45


In [0]:
### Percentile

In [8]:
# Raw sample data: 195 ascending integers, whitespace-separated over several lines.
#
# split() with no argument splits on any run of whitespace (spaces AND
# newlines) and never yields empty tokens. The previous split(' ') left empty
# strings and newline-carrying tokens behind, and only parsed correctly because
# every continuation line happens to start with a space.
s = """
6   44   52   72  197  217  219  230  279  283  298  356  392  422
  463  468  479  487  507  510  520  537  540  550  557  559  560  567
  586  594  595  604  623  628  637  661  706  720  732  802  803  823
  895  903  928  950  974  994 1009 1018 1047 1086 1093 1115 1217 1230
 1271 1378 1446 1478 1504 1508 1535 1553 1597 1671 1686 1697 1702 1723
 1807 1862 1914 1962 1989 2037 2105 2260 2276 2320 2326 2461 2466 2492
 2502 2532 2550 2581 2633 2672 2792 2794 2958 2983 2989 3029 3034 3086
 3105 3113 3138 3277 3340 3350 3356 3468 3470 3497 3526 3545 3624 3675
 3781 3846 3948 3968 3987 4012 4015 4016 4036 4060 4112 4267 4321 4373
 4479 4514 4528 4578 4663 4675 4725 4748 4836 4840 4851 4876 4974 5140
 5185 5237 5240 5308 5311 5355 5426 5442 5501 5506 5523 5533 5618 5673
 5720 5725 5820 5842 5921 5949 6019 6045 6156 6257 6447 6772 7106 7166
 7181 7196 7222 7231 7249 7455 7466 7503 7513 7557 7583 7669 7686 7722
 7931 7983 8051 8078 8186 8323 8701 8988 9028 9074 9234 9308 9465
""".split()
numbers = np.array([int(token) for token in s])
print(len(numbers), numbers)

# Raw sample data: 195 floating-point values, whitespace-separated over several lines.
#
# split() with no argument splits on any run of whitespace (spaces AND
# newlines) and never yields empty tokens. The previous split(' ') left empty
# strings and newline-carrying tokens behind, and only parsed correctly because
# every continuation line happens to start with a space.
s = """
0.5002664  0.5030543  0.5011834  0.50345784 0.5030445  0.50096476
 0.5014635  0.5052597  0.5015462  0.5009198  0.5042279  0.50006586
 0.50178766 0.5023342  0.50314224 0.5004783  0.5055111  0.50520676
 0.502752   0.50549245 0.5023156  0.50282675 0.50116205 0.50143045
 0.5035101  0.5000821  0.50057685 0.50085443 0.5011579  0.5002063
 0.50650465 0.50328237 0.5013409  0.5014749  0.5012579  0.50692064
 0.505105   0.5018501  0.5014952  0.50276613 0.50555366 0.50691855
 0.5031804  0.50596476 0.50178415 0.50021064 0.5042699  0.50111026
 0.50641996 0.5032469  0.50847775 0.50062954 0.5001466  0.50336856
 0.5024362  0.50006026 0.5009796  0.5019523  0.5001754  0.50096774
 0.50319767 0.50054866 0.50182873 0.5046981  0.5033597  0.50512415
 0.50483483 0.5014438  0.50194305 0.50123465 0.50008434 0.50341696
 0.50045264 0.5025949  0.50230265 0.5001563  0.50238717 0.5011572
 0.50144017 0.50044996 0.50233644 0.50212455 0.5012261  0.50025487
 0.5008068  0.50028175 0.5020612  0.5019552  0.5036699  0.5035833
 0.5019106  0.5008689  0.5000663  0.5018568  0.500888   0.500508
 0.5014499  0.5001171  0.503839   0.5002827  0.5018912  0.50058854
 0.50156236 0.501077   0.5004424  0.50066143 0.5021202  0.50402784
 0.5026449  0.5004557  0.5001347  0.5005555  0.50083715 0.50087285
 0.5012383  0.50178754 0.50007164 0.50061536 0.50035024 0.50409687
 0.5041618  0.50725734 0.50100785 0.5059609  0.5033471  0.5000567
 0.5003834  0.5026191  0.50031006 0.5008642  0.5008177  0.50193226
 0.5019959  0.50106937 0.5004341  0.50066406 0.50575113 0.5023617
 0.50140333 0.5010693  0.5000706  0.5018855  0.50137466 0.5004089
 0.50492716 0.50401115 0.50000346 0.500923   0.50069857 0.5001628
 0.5006791  0.50340325 0.50013334 0.5011139  0.50137734 0.50342333
 0.50395393 0.50051075 0.5035721  0.5021655  0.5002876  0.50285715
 0.50163585 0.50263757 0.50299925 0.50205386 0.5027933  0.5011403
 0.501143   0.50303525 0.500594   0.5029259  0.5011528  0.5018968
 0.50152516 0.5013398  0.50072265 0.50102705 0.50118047 0.50084674
 0.50000525 0.5034939  0.5034052  0.5060979  0.5025529  0.5020137
 0.5032606  0.50090754 0.50194335 0.50144184 0.5018807  0.50249344
 0.5002677  0.50301665 0.50083494
""".split()
probas = np.array([float(token) for token in s])
print(len(probas), probas)

195 [   6   44   52   72  197  217  219  230  279  283  298  356  392  422
  463  468  479  487  507  510  520  537  540  550  557  559  560  567
  586  594  595  604  623  628  637  661  706  720  732  802  803  823
  895  903  928  950  974  994 1009 1018 1047 1086 1093 1115 1217 1230
 1271 1378 1446 1478 1504 1508 1535 1553 1597 1671 1686 1697 1702 1723
 1807 1862 1914 1962 1989 2037 2105 2260 2276 2320 2326 2461 2466 2492
 2502 2532 2550 2581 2633 2672 2792 2794 2958 2983 2989 3029 3034 3086
 3105 3113 3138 3277 3340 3350 3356 3468 3470 3497 3526 3545 3624 3675
 3781 3846 3948 3968 3987 4012 4015 4016 4036 4060 4112 4267 4321 4373
 4479 4514 4528 4578 4663 4675 4725 4748 4836 4840 4851 4876 4974 5140
 5185 5237 5240 5308 5311 5355 5426 5442 5501 5506 5523 5533 5618 5673
 5720 5725 5820 5842 5921 5949 6019 6045 6156 6257 6447 6772 7106 7166
 7181 7196 7222 7231 7249 7455 7466 7503 7513 7557 7583 7669 7686 7722
 7931 7983 8051 8078 8186 8323 8701 8988 9028 9074 9234 9308 9465]
195 [0

In [13]:
# With only a condition, np.where returns a tuple of index arrays (one per
# axis); here it holds the positions of every value below 1000.
below_threshold = numbers < 1000
idxs = np.where(below_threshold)
print(idxs)

# Indexing with that tuple picks out exactly those values.
selected_numbers = numbers[idxs]
print(len(selected_numbers), selected_numbers)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]),)
48 [  6  44  52  72 197 217 219 230 279 283 298 356 392 422 463 468 479 487
 507 510 520 537 540 550 557 559 560 567 586 594 595 604 623 628 637 661
 706 720 732 802 803 823 895 903 928 950 974 994]


In [11]:
# Unweighted mean over every value in `probas`.
avg_all_probas = np.mean(probas)
print(avg_all_probas)

0.5020542709230769


In [15]:
# Reuse the positions found in `numbers` to index `probas`.
# NOTE(review): this presumes numbers[i] and probas[i] describe the same item - confirm.
selected_probas = probas[idxs]
avg_selected_probas = np.average(selected_probas)
print(selected_probas)
print(avg_selected_probas)

[0.5002664  0.5030543  0.5011834  0.50345784 0.5030445  0.50096476
 0.5014635  0.5052597  0.5015462  0.5009198  0.5042279  0.50006586
 0.50178766 0.5023342  0.50314224 0.5004783  0.5055111  0.50520676
 0.502752   0.50549245 0.5023156  0.50282675 0.50116205 0.50143045
 0.5035101  0.5000821  0.50057685 0.50085443 0.5011579  0.5002063
 0.50650465 0.50328237 0.5013409  0.5014749  0.5012579  0.50692064
 0.505105   0.5018501  0.5014952  0.50276613 0.50555366 0.50691855
 0.5031804  0.50596476 0.50178415 0.50021064 0.5042699  0.50111026]
0.5026514897916666


In [16]:
# Within the selected subset, keep the entries whose probability is above the
# subset's own average, then show the matching numbers.
above_selected_avg = probas[idxs] > avg_selected_probas
selected_idxs = np.where(above_selected_avg)
print(selected_numbers[selected_idxs])

[ 44  72 197 230 298 463 479 487 507 510 537 557 595 604 661 706 802 803
 823 895 903 974]


In [17]:
# Same selection, but compared against the average over ALL probabilities
# rather than the subset's own average.
above_overall_avg = probas[idxs] > avg_all_probas
selected_idxs = np.where(above_overall_avg)
print(selected_numbers[selected_idxs])

[ 44  72 197 230 298 422 463 479 487 507 510 520 537 557 595 604 661 706
 802 803 823 895 903 974]
