In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Creating an Array

In [3]:
# An ndarray can be built from any Python sequence; a list and a tuple
# holding the same items produce equal arrays.
a1 = np.array(['a', 'b', 'c'])    # built from a list
a2 = np.array(('a', 'b', 'c'))    # built from a tuple

for label, value in (("a1:", a1), ("a2:", a2)):
    print(label, value)
print("Are they equal?", np.array_equal(a1, a2))

a1: ['a' 'b' 'c']
a2: ['a' 'b' 'c']
Are they equal? True


## Slicing

In [4]:
## Slicing an array

# Slices are half-open: arr[start:stop] includes start and excludes stop.
arr = np.array([5, 10, 15, 20, 25, 30, 35, 40])
print("Array:", arr)

index_2_and_3 = arr[2:4]   # zero-based positions 2 and 3
leading_pair = arr[:2]     # omitted start -> from the beginning
trailing_pair = arr[-2:]   # negative start counts back from the end

print("Elements 2 and 3:", index_2_and_3)
print("First 2 elements:", leading_pair)
print("Last 2 elements:", trailing_pair)

Array: [ 5 10 15 20 25 30 35 40]
Elements 2 and 3: [15 20]
First 2 elements: [ 5 10]
Last 2 elements: [35 40]


## Subsetting

In [5]:
arr = np.array([5, 10, 15])
print("Array:", arr)

# A comparison on an array yields a boolean mask; indexing with the mask
# keeps only the elements where it is True.
above_8 = arr > 8
below_12 = arr < 12
print("Greater than 8:", arr[above_8])

# Two equivalent ways to combine masks: the & operator or np.logical_and.
print("Greater than 8 and less than 12:", arr[above_8 & below_12])
print("Greater than 8 and less than 12:", arr[np.logical_and(above_8, below_12)])

Array: [ 5 10 15]
Greater than 8: [10 15]
Greater than 8 and less than 12: [10]
Greater than 8 and less than 12: [10]


## Mapping

In [6]:
def squarer(i):
    """Return ``i`` raised to the power of 2."""
    return i ** 2


arr = np.array([4, 6, 8, 10])
# The ** operator is applied element-wise to an ndarray, so calling the
# function on the whole array squares every element without a loop.
squared = squarer(arr)
print(squared)

[ 16  36  64 100]


## Reshaping

In [7]:
arr = np.array([[3, 4, 5], [6, 7, 8]])
print("Array:\n", arr)

# reshape takes the target shape as (num_rows, num_columns).
as_single_row = arr.reshape((1, 6))
print("\n1 row, 1 element with 6 columns:\n", as_single_row)

# A bare integer may be passed instead of a tuple: "If an integer, then the
# result will be a 1-D array of that length." The values match the (1, 6)
# result above, but this array is 1-D rather than 2-D.
as_flat = arr.reshape(6)
print("\n1 row, 6 columns:\n", as_flat)

print("\n2 rows, 3 columns:\n", arr.reshape((2, 3)))

as_single_column = arr.reshape((6, 1))
print("\n6 rows, 1 column:\n", as_single_column)

# Same result with an inferred row count: "One shape dimension can be -1.
# In this case, the value is inferred from the length of the array and
# remaining dimensions."
print("\nAs many rows as needed, 1 columns:\n", arr.reshape((-1, 1)))

Array:
 [[3 4 5]
 [6 7 8]]

1 row, 1 element with 6 columns:
 [[3 4 5 6 7 8]]

1 row, 6 columns:
 [3 4 5 6 7 8]

2 rows, 3 columns:
 [[3 4 5]
 [6 7 8]]

6 rows, 1 column:
 [[3]
 [4]
 [5]
 [6]
 [7]
 [8]]

As many rows as needed, 1 columns:
 [[3]
 [4]
 [5]
 [6]
 [7]
 [8]]


## Creating an array of evenly spaced numbers between two values

In [8]:
# linspace includes both endpoints, so `num` is the total number of samples.
for sample_count in (2, 3):
    print(np.linspace(1, 2, num=sample_count))

[1. 2.]
[1.  1.5 2. ]


## Stacking

In [9]:
# Horizontal stacking
# For 1-D inputs, hstack joins the arrays end to end into one longer array.
a1 = np.array(['a', 'b', 'c'])
a2 = np.array(['d', 'e', 'f'])
joined = np.hstack((a1, a2))
print(joined)

['a' 'b' 'c' 'd' 'e' 'f']


In [10]:
# Vertical stacking
# vstack places each 1-D input on its own row of a 2-D result.
a1 = np.array(['a', 'b', 'c'])
a2 = np.array(['d', 'e', 'f'])
stacked_rows = np.vstack((a1, a2))
print(stacked_rows)

[['a' 'b' 'c']
 ['d' 'e' 'f']]


In [11]:
# Stacking 1-d arrays
# column_stack turns each 1-D input into one column of a 2-D result.
a1 = np.array(['a', 'b', 'c'])
a2 = np.array(['d', 'e', 'f'])
paired = np.column_stack([a1, a2])
print(paired)

# The same array can be built with:
# print(np.array(list(zip(a1, a2))))

[['a' 'd']
 ['b' 'e']
 ['c' 'f']]


## Calculating the maximum values element-wise

In [12]:
# np.maximum compares the inputs position by position and keeps the larger
# value at each position.
a1 = np.array([2, 5, 10])
a2 = np.array([3, 4, 6])
larger = np.maximum(a1, a2)
print(larger)

[ 3  5 10]


In [13]:
# Can also supply a number for the second argument in which case
# it will be broadcast to an array and compared element-wise:
a = np.array([2, 5, 10])
# Use the array defined in this cell so the cell does not depend on a
# variable left over from an earlier cell.
print(np.maximum(a, 6))

[ 6  6 10]


## Min and Max

In [14]:
arr = np.array([2, 5, 10])

# Each reduction is available both as an ndarray method and as a NumPy
# function; the two forms give the same value.
smallest_method, smallest_func = arr.min(), np.min(arr)
largest_method, largest_func = arr.max(), np.max(arr)

print("Min:", smallest_method)
print("Min:", smallest_func)
print("Max:", largest_method)
print("Max:", largest_func)

Min: 2
Min: 2
Max: 10
Max: 10


## Finding the positions of the largest values in an array

In [15]:
arr = np.array([[1, 2, 3], [4, 5, 6]])
print("Array:\n", arr)

# Without an axis, argmax returns an index into the flattened array.
flat_position = arr.argmax()
# axis=0 gives one row index per column; axis=1 gives one column index per row.
per_column = arr.argmax(axis=0)
per_row = arr.argmax(axis=1)

print("\nPosition of the largest value in the (flattened) array:", flat_position)
print("\nPosition of largest value in each column:\n", per_column)
print("\nPosition of largest value in each row:\n", per_row)

Array:
 [[1 2 3]
 [4 5 6]]

Position of the largest value in the (flattened) array: 5

Position of largest value in each column:
 [1 1 1]

Position of largest value in each row:
 [2 2]


## Returning the indices that would sort an array

In [16]:
arr = np.array([3, 10, 2])
# argsort returns the indices that would put the values in ascending order:
# index 2 (value 2), then index 0 (value 3), then index 1 (value 10).
sort_order = np.argsort(arr)
print(sort_order)

[2 0 1]


## Finding array values that contain a string

In [17]:
arr = np.array(['company_id', 'days_after_sign_up', 'user_count'])

# Method 1: vectorised substring search. np.char.find returns, for each
# element, the index of the first occurrence of the substring, or -1 when
# it is absent. np.char is the public namespace for these routines;
# np.core is a private module and is deprecated as of NumPy 2.0.
contains_count = np.char.find(arr, "_count") != -1
matches = arr[np.flatnonzero(contains_count)]
print("Method 1:\n", matches)

# Method 2: plain Python substring test on each element.
matches = [column for column in arr if "_count" in column]
print("\nMethod 2:\n", matches)

Method 1:
 ['user_count']

Method 2:
 ['user_count']


## Calculating statistics row and column wise

In [18]:
arr = np.array([[1, 2, 3], [4, 5, 6]])
print("Array:\n", arr)

# axis=0 averages down the rows (one mean per column);
# axis=1 averages across the columns (one mean per row).
column_means = np.mean(arr, axis=0)
row_means = np.mean(arr, axis=1)

print("\nMean column wise:", column_means)
print("\nMean row wise:", row_means)

Array:
 [[1 2 3]
 [4 5 6]]

Mean column wise: [2.5 3.5 4.5]

Mean row wise: [2. 5.]


## Squeeze

In [19]:
arr = np.array([[1], [4], [7]])
print("Array:\n", arr)

# squeeze drops the length-1 axis, turning the 3x1 column into a 1-D array.
squeezed = np.squeeze(arr)
print("\nSqueezed:\n", squeezed)

Array:
 [[1]
 [4]
 [7]]

Squeezed:
 [1 4 7]


## Median

In [20]:
arr = np.array([np.nan, 4, 8, 10])
print("Array:\n", arr)

# median propagates NaN, whereas nanmedian ignores the NaN entries.
plain_median = np.median(arr)
nan_aware_median = np.nanmedian(arr)

print("Median of an array with a nan will be nan:", plain_median)
print("Median of an array with with nan removed will be a number:", nan_aware_median)

Array:
 [nan  4.  8. 10.]
Median of an array with a nan will be nan: nan
Median of an array with with nan removed will be a number: 8.0


## Count Nonzero

In [21]:
arr = np.array([0, 0, 2, 4, 8])

# For this 1-D array np.size and len agree; count_nonzero skips the zeros.
total_elements = np.size(arr)
length = len(arr)
nonzero_elements = np.count_nonzero(arr)

print("Size:", total_elements)
print("Count:", length)
print("Non-zero count:", nonzero_elements)

Size: 5
Count: 5
Non-zero count: 3


## Indexing

In [22]:
arr = np.array([0, 1, 2, 4, 8])

# Basic slice syntax is i:j:k -- i is the starting index, j the stopping
# index and k the step. Any of the three may be omitted.
slice_examples = (
    ("All:", arr[:]),
    ("Every other starting at index 0:", arr[::2]),
    ("Every other starting at index 1:", arr[1::2]),
    ("Reversed:", arr[::-1]),
    ("Reversed except last element:", arr[-2::-1]),
)
for label, selection in slice_examples:
    print(label, selection)

All: [0 1 2 4 8]
Every other starting at index 0: [0 2 8]
Every other starting at index 1: [1 4]
Reversed: [8 4 2 1 0]
Reversed except last element: [4 2 1 0]


## Slicing a 2-D array

In [23]:
arr = np.array([[2, 4, 8], [3, 6, 9], [4, 8, 12], [5, 10, 15]])
print(arr)

# 2-D indexing is arr[rows, columns]: the row selector and the column
# selector are separated by a comma, and each may be an index or a slice.
# The labels below count rows from 1, so "Rows: 1" is index 0.

print("\nRows: 1 and 2, Column: 0:\n", arr[0:2,0])
print("\nRows: All, Column: 0:\n", arr[:,0])
print("\nRows: 1, Column: All:\n", arr[0,:])
print("\nRows: 1, Column: Last:\n", arr[0,-1])
print("\nRows: All, Column: All except last:\n", arr[:,:-1])
print("\nRows: Last, Column: Last:\n", arr[-1,-1])

[[ 2  4  8]
 [ 3  6  9]
 [ 4  8 12]
 [ 5 10 15]]

Rows: 1 and 2, Column: 0:
 [2 3]

Rows: All, Column: 0:
 [2 3 4 5]

Rows: 1, Column: All:
 [2 4 8]

Rows: 1, Column: Last:
 8

Rows: All, Column: All except last:
 [[ 2  4]
 [ 3  6]
 [ 4  8]
 [ 5 10]]

Rows: Last, Column: Last:
 15


## Flattening an array

In [24]:
y = np.array([[2], [4], [6]])

# ravel: "Return a contiguous flattened array."
y.ravel()

# Same as:
# y.reshape(-1)
# (y.reshape(1, -1) would instead give a 2-D array of shape (1, 3).)

array([2, 4, 6])

## Generating a range of values

In [25]:
# arange(start, stop, step): stop is exclusive, so the last value is 95.
np.arange(start=50, stop=100, step=5)

array([50, 55, 60, 65, 70, 75, 80, 85, 90, 95])

In [26]:
# The built-in range() yields the same values as a plain list, but it only
# accepts integer arguments, so it cannot produce decimal steps.
[*range(50, 100, 5)]

[50, 55, 60, 65, 70, 75, 80, 85, 90, 95]

## Generating a specific number of values between two other values

In [27]:
# linspace = linear space: 6 evenly spaced samples from 0 to 50, with both
# endpoints included.
np.linspace(start=0, stop=50, num=6)

array([ 0., 10., 20., 30., 40., 50.])

## Calculating the dot product

In [28]:
a = np.array([3, 4])
b = np.array([5, 6])

# Dot product: multiply matching elements, then add them up.
# 3 * 5 + 4 * 6
a.dot(b)

39

### Shorthand

In [29]:
# The @ operator applied to two 1-D arrays gives their dot product.
a @ b

39

## Measuring cosine similarity

In [30]:
a = np.array([4, 7, 1])
b = np.array([5, 2, 3])

# Cosine similarity = dot(a, b) / (|a| * |b|), where |v| is the square root
# of the sum of the squared components.
dot_product = a @ b
norm_a = np.sqrt((a ** 2).sum())
norm_b = np.sqrt((b ** 2).sum())
magnitude = norm_a * norm_b
print("Manually:", dot_product / magnitude)

# cosine_similarity returns the full pairwise matrix: the diagonal is each
# vector against itself and the off-diagonal entries are a against b.
sim = cosine_similarity([a, b])
print("\nFunction:\n", sim)

Manually: 0.7388188340435563

Function:
 [[1.         0.73881883]
 [0.73881883 1.        ]]


## Multiplication of two arrays is element-wise

In [36]:
a = np.array([-1, 1])
b = np.array([2, 3])
# Multiplying two arrays pairs up matching positions: [-1 * 2, 1 * 3].
c = np.multiply(a, b)

In [37]:
# Add up the element-wise products.
np.sum(c)

1

### Same as dot product

In [34]:
# Dot product of the same two arrays, for comparison with the summed
# element-wise product.
a @ b

1

In [42]:
# The / operator performs true division and yields a float.
4 / 3

1.3333333333333333