In [1]:
import numpy as np
import pandas as pd
from pprint import pprint

In [2]:
# Peek at the raw CSV before loading it: the header line plus the first nine data rows.
!head -n 10 data/iris.csv

sepal_length,sepal_width,petal_length,petal_width,class
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa


### Reading a table using pandas

In [3]:
# Load the iris measurements into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data/iris.csv')
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Basic information about the DataFrame

In [4]:
# Print a concise summary of the frame: index range, column names,
# non-null counts, dtypes and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


### Calculate various statistical metrics for each numeric column
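`describe()` returns a table of summary statistics (shown below). A single value can be pulled out of it as well, for example the standard deviation of one column: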
```py
df['sepal_length'].describe().loc['std']
```

In [5]:
# Summary statistics for every numeric column: count, mean, std, min,
# quartiles and max. pandas computes std with ddof=1 (sample formula).
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


### Statistical metrics for petal width using NumPy

In [6]:
# Copy the petal-width column into a plain float ndarray.
ndarr = np.array(df['petal_width'], dtype='float')

# 25th and 75th percentiles, computed once and stored as a plain list below.
quartiles = np.percentile(ndarr, (25, 75))

# NOTE: var() and std() use NumPy's default ddof=0 (population formulas),
# which is why the standard deviation printed here (0.76061) is slightly
# smaller than the 0.763161 reported by df.describe(), which uses ddof=1.
pw_dict = {
    'min': ndarr.min(),
    'max': ndarr.max(),
    'average': round(ndarr.mean(), 5),
    'dispersion': round(ndarr.var(), 5),
    'stdeviation': round(ndarr.std(), 5),
    'median': np.median(ndarr),
    'percentiles': quartiles.tolist(),
}

# sort_dicts=False keeps the keys in the insertion order used above.
pprint(pw_dict, sort_dicts=False)

{'min': 0.1,
 'max': 2.5,
 'average': 1.19867,
 'dispersion': 0.57853,
 'stdeviation': 0.76061,
 'median': 1.3,
 'percentiles': [0.3, 1.8]}


## NumPy arrays

### ndarray initialization

In [7]:
# One-dimensional array of unsigned integers: the first six powers of two.
arr = np.array([2 ** exponent for exponent in range(6)], dtype='uint')
print(arr)

[ 1  2  4  8 16 32]


In [8]:
# Two-dimensional (3 x 4) signed-integer array; each inner sequence is one row.
arr = np.array(
    [
        (-1, -2, -3, 0),
        (7, 12, 45, 17),
        (89, 4, 67, -8),
    ],
    dtype='int',
)

print(arr)

[[-1 -2 -3  0]
 [ 7 12 45 17]
 [89  4 67 -8]]


In [9]:
# Evenly spaced floats starting at 0 in steps of 10; the stop value 100 is excluded.
arr = np.arange(start=0, stop=100, step=10, dtype='float')
print(arr)

[ 0. 10. 20. 30. 40. 50. 60. 70. 80. 90.]


In [10]:
# Twelve-element vector filled with ones, stored as unsigned integers.
arr = np.ones(shape=12, dtype='uint')
print(arr)

[1 1 1 1 1 1 1 1 1 1 1 1]


In [11]:
# 8 x 8 matrix filled with zeros, stored as unsigned integers.
arr = np.zeros(shape=(8, 8), dtype='uint')
print(arr)

[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]


In [12]:
# 17 equally spaced points covering [-4, 4], both endpoints included
# (i.e. a spacing of 0.5).
arr = np.linspace(start=-4, stop=4, num=17)
print(arr)

[-4.  -3.5 -3.  -2.5 -2.  -1.5 -1.  -0.5  0.   0.5  1.   1.5  2.   2.5
  3.   3.5  4. ]


In [13]:
# 5 x 5 matrix of floats drawn uniformly from [0, 1).
# The generator is not seeded, so the values differ on every run.
arr = np.random.random(size=(5, 5))
print(arr)

[[0.73077629 0.84379404 0.96378788 0.0283987  0.14244728]
 [0.70971158 0.38880207 0.23912219 0.71920356 0.0660731 ]
 [0.98519819 0.70469275 0.02577313 0.167078   0.58887726]
 [0.86900249 0.23294121 0.3002606  0.27162322 0.30684567]
 [0.0830974  0.04561795 0.15381157 0.72786386 0.75013246]]


In [14]:
# 3 x 3 matrix of floats drawn uniformly from [0, 10); unseeded, so the
# values differ on every run.
arr = np.random.uniform(low=0, high=10, size=(3, 3))
print(arr)

[[9.76422448 3.45012014 5.15166064]
 [3.97330508 3.2711411  1.6142528 ]
 [8.91913001 1.39427902 6.9954141 ]]


In [15]:
# 7 x 7 matrix of random integers from 20 (inclusive) to 50 (exclusive);
# unseeded, so the values differ on every run.
arr = np.random.randint(low=20, high=50, size=(7, 7))
print(arr)

[[29 47 22 40 26 46 45]
 [46 29 48 40 26 42 20]
 [26 23 45 45 45 40 41]
 [41 35 45 38 46 45 36]
 [42 31 49 45 39 25 25]
 [23 21 21 29 34 35 22]
 [28 41 41 39 49 44 29]]


In [16]:
# np.empty only allocates the buffer and does not initialise it, so the
# printed values are whatever happened to be in that memory. In the recorded
# output they are the values of the 3 x 3 array printed by In [14]; do not
# rely on the contents of an "empty" array.
arr = np.empty(shape=(3, 3))
print(arr)

[[9.76422448 3.45012014 5.15166064]
 [3.97330508 3.2711411  1.6142528 ]
 [8.91913001 1.39427902 6.9954141 ]]


### Indexing, slicing and basic operations

In [17]:
# 5 x 5 matrix of random unsigned integers drawn from [0, 100); unseeded, so
# the values change on every run.
arr = np.random.randint(low=0, high=100, size=(5, 5), dtype='uint')
print(arr, end="\n\n")

# Shape-related attributes.
print(f"dimension: {arr.ndim}")
print(f"size of array dimensions: {arr.shape}")
print(f"number of elements in the array: {arr.size}\n")

# Storage-related attributes.
print(f"array data type: {arr.dtype}")
print(f"array size in bytes: {arr.nbytes}")
print(f"element size in bytes: {arr.itemsize}\n")

# Element access: first index selects the row, second the column;
# negative indices count from the end.
print("Array indexing: ")
print(f"arr[2][3] = {arr[2][3]}")
print(f"arr[4][0] = {arr[4][0]}")
print(f"arr[4][-2] = {arr[4][-2]}\n")

print("Slices:")
print(f"arr[0][1:4] = {arr[0][1:4]}")
# The start index 5 lies past the end of the five-element row, so the
# reversed slice simply begins at the last element and steps back by two.
print(f"arr[2][5::-2] = {arr[2][5::-2]}")

[[36 39 51 64 21]
 [13 47 39 48 70]
 [23 18 43 58 53]
 [66 50 31 26 52]
 [87 93 92 30 15]]

dimension: 2
size of array dimensions: (5, 5)
number of elements in the array: 25

array data type: uint64
array size in bytes: 200
element size in bytes: 8

Array indexing: 
arr[2][3] = 58
arr[4][0] = 87
arr[4][-2] = 30

Slices:
arr[0][1:4] = [39 51 64]
arr[2][5::-2] = [53 43 23]


In [18]:
# In-place arithmetic: each augmented operator updates every element of the
# same array object.
arr = np.array([1, 2, 3, 4, 2], dtype='uint')
print("arr:", arr)

arr += 1   # add one to every element
print("arr+1 =", arr)

arr **= 2  # square every element
print("arr^2 =", arr)

arr: [1 2 3 4 2]
arr+1 = [2 3 4 5 3]
arr^2 = [ 4  9 16 25  9]


In [19]:
arr = np.array([2, 3, 4, 2], dtype='uint')

# reduce folds the array left to right with the ufunc:
# ((2 ** 3) ** 4) ** 2 = 16777216
arr_pow = np.power.reduce(arr)

# accumulate performs the same fold but keeps every intermediate result.
arr_acc = np.power.accumulate(arr)

print(arr_pow)
print(arr_acc)

16777216
[       2        8     4096 16777216]


In [20]:
arr_1 = np.arange(1, 12, 2, dtype='uint')  # [1 3 5 7 9 11]
arr_2 = np.arange(1, 7, dtype='uint')      # [1 2 3 4 5 6]

# The outer method applies the ufunc to every (arr_1[i], arr_2[j]) pair, so
# with np.multiply the entry [i, j] is the product arr_1[i] * arr_2[j].
# The result is therefore named for products rather than sums.
product_of_pairs = np.multiply.outer(arr_1, arr_2)

# Previous (misleading) name, kept as an alias so any cell that still refers
# to it keeps working.
sum_of_pairs = product_of_pairs

print(product_of_pairs)

[[ 1  2  3  4  5  6]
 [ 3  6  9 12 15 18]
 [ 5 10 15 20 25 30]
 [ 7 14 21 28 35 42]
 [ 9 18 27 36 45 54]
 [11 22 33 44 55 66]]


In [21]:
# Even numbers from 2 to 12 as unsigned integers.
arr = np.arange(start=2, stop=13, step=2, dtype='uint')  # [2 4 6 8 10 12]

# Extremes and mean.
print(f"min: {arr.min()}")
print(f"max: {arr.max()}")
print(f"average value: {arr.mean()}\n")

# Spread and centre. std() and var() use NumPy's default ddof=0, i.e. the
# population formulas (variance 70 / 6 for this array).
print(f"standard deviation: {arr.std()}")
print(f"dispersion: {arr.var()}")
print(f"median: {np.median(arr)}")

# Lower and upper quartiles, linearly interpolated between data points.
print(f"percentiles [25%, 75%]: {np.percentile(arr, (25, 75))}")

min: 2
max: 12
average value: 7.0

standard deviation: 3.415650255319866
dispersion: 11.666666666666666
median: 7.0
percentiles [25%, 75%]: [4.5 9.5]
