## **NumPy**

NumPy stands for Numerical Python. Its core is implemented in C, so array operations run at compiled-language speed. The main object is the multidimensional array (`ndarray`).

In [2]:
## import
import numpy as np

## 1D array (vector): one GPA value per student
gpa = np.array([3.2, 4.0, 2.9, 3.7, 3.0])

In [3]:
## type of object: np.array() returns a numpy.ndarray
type(gpa)

numpy.ndarray

In [4]:
## to see attached methods and attributes of an object
## use dir(); it returns their names as a list of strings
## (names wrapped in double underscores are Python's special "dunder" methods)
dir(gpa)

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_finalize__',
 '__array_function__',
 '__array_interface__',
 '__array_prepare__',
 '__array_priority__',
 '__array_struct__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__class_getitem__',
 '__complex__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__divmod__',
 '__dlpack__',
 '__dlpack_device__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__ilshift__',
 '__imatmul__',
 '__imod__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',

In [5]:
## Some attached objects are attributes (accessed without parentheses)
## .T gets the transpose of the array
## (for a 1D array the result looks the same as the original)
gpa.T

array([3.2, 4. , 2.9, 3.7, 3. ])

In [7]:
## Some attached objects are methods (functions), so they are called with parentheses
## .mean() gets the average of the array
gpa.mean()

3.3600000000000003

In [9]:
## The standard library's statistics.mean also accepts the array.
## Compare the printed value with gpa.mean(): the two implementations
## round differently in the last floating-point digits.
from statistics import mean
mean(gpa)

3.36

In [10]:
## Important NumPy methods/attributes
## .shape shows the size of the array along each axis
## (a 1D array has a single axis, so the tuple has one entry)
gpa.shape

(5,)

In [12]:
## Indexing in Python starts at zero,
## so the 5 values below sit at positions 0, 1, 2, 3 and 4
gpa

array([3.2, 4. , 2.9, 3.7, 3. ])

In [13]:
## subset example
## what is the gpa for student number 2?
## (the second student is at position 1 because counting starts at 0)
gpa[1]

4.0

In [14]:
## what is the gpa for the last student?
## negative positions count from the end, so -1 is the last element
gpa[-1]

3.0

In [18]:
## slicing: array[start:stop] -- start is included, stop is excluded
## what are the gpas for the first 3 students? (positions 0, 1 and 2)
gpa[0:3]

array([3.2, 4. , 2.9])

In [19]:
## same slice with the start left out: a missing start means "begin at position 0"
gpa[:3]

array([3.2, 4. , 2.9])

In [21]:
## What are the gpas for the last 3 students?
## A negative start counts from the end, so this works for an array of any length
## (gpa[2:] only gives the last 3 because this array happens to have 5 elements).
gpa[-3:]

array([2.9, 3.7, 3. ])

#### **2D Array (matrices)**

Two axes:

* axis 0: rows
* axis 1: columns

This is the most common array in tabular data.

In [22]:
## matrix (list of lists): each inner list becomes one row
A = np.array([
    [5.2, 3.0, 4.5],
    [9.1, 0.1, 0.3],
])

In [23]:
## type of A: still numpy.ndarray, the same type as the 1D array
type(A)

numpy.ndarray

In [25]:
## display the matrix (2 rows, 3 columns)
A

array([[5.2, 3. , 4.5],
       [9.1, 0.1, 0.3]])

In [24]:
## some attached objects
## .T is the transpose: rows become columns, so the 2 x 3 matrix becomes 3 x 2
A.T

array([[5.2, 9.1],
       [3. , 0.1],
       [4.5, 0.3]])

In [27]:
## shape: (number of rows, number of columns)
A.shape

(2, 3)

In [28]:
## max: with no arguments, the largest value over the whole matrix
A.max()

9.1

In [29]:
## 2D arrays have double indexing: A[row, column]
A

array([[5.2, 3. , 4.5],
       [9.1, 0.1, 0.3]])

In [30]:
## select the first row (row position 0; ":" keeps every column)
A[0,:]

array([5.2, 3. , 4.5])

In [31]:
## select column 2, i.e. the second column (column position 1; ":" keeps every row)
A[:,1]

array([3. , 0.1])

### **k-dimensional Array (Tensor)**

We can stack several matrices together to form a higher-dimensional array. The one used most often in this course is the 3-dimensional array that represents an image.

* axis 0: channel (red, green, blue)
* axis 1: rows
* axis 2: cols

In [32]:
## three 2 x 3 matrices stacked along axis 0, one per colour channel
## (the three channels hold identical values in this toy example)
image_tensor = np.array([
    [[5.2, 3.0, 4.5], [9.1, 0.1, 0.3]],
    [[5.2, 3.0, 4.5], [9.1, 0.1, 0.3]],
    [[5.2, 3.0, 4.5], [9.1, 0.1, 0.3]],
])

In [33]:
## shape: (channels, rows, columns)
image_tensor.shape

(3, 2, 3)

In [34]:
## show me the green channel
## axis 0 is the channel axis: red = 0, green = 1, blue = 2
image_tensor[1, :, :]

array([[5.2, 3. , 4.5],
       [9.1, 0.1, 0.3]])

In [35]:
## Generate data with numpy
## A seeded generator returns the same 100 standard-normal draws on every
## Restart & Run All, so anything computed from X is reproducible.
rng = np.random.default_rng(seed=42)
X = rng.standard_normal(100)

## **Pandas**

Pandas is an extension of NumPy for data manipulation. Once all the manipulation is done, we generally convert the data back to NumPy. Pandas adds indices (row labels) to observations and names to variables (columns).

In [36]:
## import
import pandas as pd

## Series: a 1D labelled array, built here from a plain Python list
gpa_series = pd.Series(data=[3.2, 4.0, 2.9, 3.7, 3.0])

In [37]:
## display the Series: the left column is the index (labels), the right column the values
gpa_series

Unnamed: 0,0
0,3.2
1,4.0
2,2.9
3,3.7
4,3.0


In [38]:
## what is the gpa for student with ID 0?
## .loc looks the value up by its index label (the student ID)
gpa_series.loc[0]

3.2

In [39]:
## To get the NumPy 1D vector from a series
## .to_numpy() is the accessor pandas recommends over the older .values attribute
gpa_series.to_numpy()

array([3.2, 4. , 2.9, 3.7, 3. ])

#### **Pandas DataFrame**

The 2D NumPy array equivalent, with variable names (columns) and IDs (row index).

In [40]:
## Read a data set
## NOTE(review): when df is displayed its column labels are 7, 0, 0.1, 0.2, ... which
## look like a data row that was consumed as the header -- presumably this CSV has no
## header row; confirm, and if so pass header=None so the first record is kept as data.
## NOTE(review): the /content/sample_data path looks Colab-specific -- TODO confirm
## before running this notebook anywhere else.
df = pd.read_csv('/content/sample_data/mnist_test.csv')

In [41]:
## display the DataFrame; for a large frame pandas shows only the first and last
## rows/columns and puts "..." in between
df

Unnamed: 0,7,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.658,0.659,0.660,0.661,0.662,0.663,0.664,0.665,0.666,0.667
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9995,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
## Read the cereal data set directly from its raw GitHub URL
df1 = pd.read_csv(
    'https://raw.githubusercontent.com/martinwg/ISA591/main/data/cereal.csv'
)

In [43]:
## display the cereal DataFrame: one row per cereal, one named column per variable
df1

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Triples,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174
73,Trix,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00,27.753301
74,Wheat Chex,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445
75,Wheaties,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00,51.592193
