## NumPy Arrays

More efficient than lists or tuples. Mutable objects (we can change them in place). Have **methods**, **attributes** attached to them for more functionality.

In [1]:
## import
import numpy as np

## numpy array
x = np.array([7,2,9,10])

In [2]:
## type
type(x)

numpy.ndarray

In [3]:
## dir() lists all attached methods and attributes
dir(x)

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_finalize__',
 '__array_function__',
 '__array_interface__',
 '__array_prepare__',
 '__array_priority__',
 '__array_struct__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__class_getitem__',
 '__complex__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__divmod__',
 '__dlpack__',
 '__dlpack_device__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__ilshift__',
 '__imatmul__',
 '__imod__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',

In [4]:
## .T calls the attached object
## if it is a method we want to use ()
## An attribute just used the att name
x.T   ## computes the transpose of a matrix

array([ 7,  2,  9, 10])

In [7]:
## .mean() is a method
x.mean()

7.0

In [8]:
## .shape attribute shows the dimensions
x.shape

(4,)

In [9]:
## subsetting and slicing works
x

array([ 7,  2,  9, 10])

In [10]:
## indexing starts at zero
x[0]

7

In [11]:
## slicing does NOT contain the ending number
x[0:3]

array([7, 2, 9])

In [12]:
## if you start from 0, there is no need to include it
x[:3]

array([7, 2, 9])

In [13]:
x

array([ 7,  2,  9, 10])

In [14]:
x[1:]

array([ 2,  9, 10])

## NumPy 2D arrays (Matrices)

* Have both obs (axis 0) and variables (axis 1)
* Most common array to represent tabular data

In [15]:
A = np.array([ [5.2,3.0,4.0], [9.1, 0.1, 0.2] ])

In [16]:
A

array([[5.2, 3. , 4. ],
       [9.1, 0.1, 0.2]])

In [17]:
type(A)

numpy.ndarray

In [18]:
## shape
A.shape

(2, 3)

In [19]:
A

array([[5.2, 3. , 4. ],
       [9.1, 0.1, 0.2]])

In [20]:
## select first row
A[0,:]

array([5.2, 3. , 4. ])

In [21]:
## select column 2
A[:,1]

array([3. , 0.1])

## Tensors (k-dimensional NumPy arrays)

* concatenations of different matrices.

In [22]:
image_tensor = np.array([  [ [5.2,3.0,4.0], [9.1, 0.1, 0.2] ] ,
                           [ [5.2,3.0,4.0], [9.1, 0.1, 0.2] ] ,
                           [ [5.2,3.0,4.0], [9.1, 0.1, 0.2] ] ])

In [23]:
## shape (number of channels, number of obs, number of cols)
image_tensor.shape

(3, 2, 3)

In [24]:
image_tensor[0,:,:]

array([[5.2, 3. , 4. ],
       [9.1, 0.1, 0.2]])

## Generating Data

* From distributions
* Random integers
* Constant values (matrices of 0s, 1s)

In [25]:
## e.g., np.zeros

## **PANDAS**

* Extension to NumPy for Data Manipulation
* When done with data manipulation, you want to revert back to NumPy
* Adds the index
* Selection of variables is easier

In [26]:
import pandas as pd

## gpa Series (equivalent to 1D array)
gpa = pd.Series([3.2, 4.0, 2.9, 3.0, 3.7])

In [27]:
## type
type(gpa)

In [29]:
## index is the ID variable
## pull gpa for student with ID 0
gpa[0]

3.2

In [30]:
## attached methods
gpa.mean()

3.3600000000000003

In [33]:
## slicing does NOT WORK as numpy
## you want to use .iloc
gpa.iloc[:3]

Unnamed: 0,0
0,3.2
1,4.0
2,2.9


In [34]:
## Supposed you are done manipulating the data
## extract the np array
gpa_numpy = gpa.values

In [35]:
type(gpa_numpy)

numpy.ndarray

## Data Frame (2D NumPy Equivalent)

* axis 0: rows
* axis 1: cols

In [36]:
df = pd.read_csv('https://raw.githubusercontent.com/martinwg/ISA591/main/data/cereal.csv')

In [37]:
## type
type(df)

In [38]:
## .shape attribute
df.shape

(77, 16)

In [39]:
## extract the numpy array
df.values

array([['100% Bran', 'N', 'C', ..., 1.0, 0.33, 68.402973],
       ['100% Natural Bran', 'Q', 'C', ..., 1.0, 1.0, 33.983679],
       ['All-Bran', 'K', 'C', ..., 1.0, 0.33, 59.425505],
       ...,
       ['Wheat Chex', 'R', 'C', ..., 1.0, 0.67, 49.787445],
       ['Wheaties', 'G', 'C', ..., 1.0, 1.0, 51.592193],
       ['Wheaties Honey Gold', 'G', 'C', ..., 1.0, 0.75, 36.187559]],
      dtype=object)

In [40]:
## .head() .tail() show obs
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [41]:
## Selecting variables
## object notation .
df.calories

Unnamed: 0,calories
0,70
1,120
2,70
3,50
4,110
...,...
72,110
73,110
74,100
75,100


In [42]:
## subsetting notation
df['calories']

Unnamed: 0,calories
0,70
1,120
2,70
3,50
4,110
...,...
72,110
73,110
74,100
75,100


In [44]:
df.set_index(['name'], inplace = True)

In [45]:
df.head(3)

Unnamed: 0_level_0,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505


In [50]:
## select All-Bran
## select the cereal with ID 'All-Bran'
df['All-Bran ']

KeyError: 'All-Bran '

In [51]:
## remove the index
df.reset_index(inplace = True)

In [52]:
df

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Triples,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174
73,Trix,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00,27.753301
74,Wheat Chex,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445
75,Wheaties,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00,51.592193


In [53]:
## remove variables
df.drop(['name'], axis = 1, inplace = True)

In [55]:
## Checking how missing values we have
## missing values per variables
df.isna().sum()

Unnamed: 0,0
mfr,0
type,0
calories,0
protein,0
fat,0
sodium,0
fiber,0
carbo,0
sugars,0
potass,0


In [58]:
## Indexing does NOT work
## instead use .iloc
df.iloc[:,:3]

Unnamed: 0,mfr,type,calories
0,N,C,70
1,Q,C,120
2,K,C,70
3,K,C,50
4,R,C,110
...,...,...,...
72,G,C,110
73,G,C,110
74,R,C,100
75,G,C,100


In [62]:
## .describe()
## 8 number summary for numeric variables
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
calories,77.0,106.883117,19.484119,50.0,100.0,110.0,110.0,160.0
protein,77.0,2.545455,1.09479,1.0,2.0,3.0,3.0,6.0
fat,77.0,1.012987,1.006473,0.0,0.0,1.0,2.0,5.0
sodium,77.0,159.675325,83.832295,0.0,130.0,180.0,210.0,320.0
fiber,77.0,2.151948,2.383364,0.0,1.0,2.0,3.0,14.0
carbo,77.0,14.597403,4.278956,-1.0,12.0,14.0,17.0,23.0
sugars,77.0,6.922078,4.444885,-1.0,3.0,7.0,11.0,15.0
potass,77.0,96.077922,71.286813,-1.0,40.0,90.0,120.0,330.0
vitamins,77.0,28.246753,22.342523,0.0,25.0,25.0,25.0,100.0
shelf,77.0,2.207792,0.832524,1.0,1.0,2.0,3.0,3.0


In [63]:
## .info() to list variable types and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mfr       77 non-null     object 
 1   type      77 non-null     object 
 2   calories  77 non-null     int64  
 3   protein   77 non-null     int64  
 4   fat       77 non-null     int64  
 5   sodium    77 non-null     int64  
 6   fiber     77 non-null     float64
 7   carbo     77 non-null     float64
 8   sugars    77 non-null     int64  
 9   potass    77 non-null     int64  
 10  vitamins  77 non-null     int64  
 11  shelf     77 non-null     int64  
 12  weight    77 non-null     float64
 13  cups      77 non-null     float64
 14  rating    77 non-null     float64
dtypes: float64(5), int64(8), object(2)
memory usage: 9.1+ KB


In [64]:
df

Unnamed: 0,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00,33.983679
2,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50,93.704912
4,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174
73,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00,27.753301
74,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445
75,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00,51.592193


In [72]:
## rename
df.rename(columns = {'carbo': 'carbs'}, inplace = True)

In [73]:
df

Unnamed: 0,mfr,type,calories,protein,fat,sodium,fiber,carbs,sugars,potass,vitamins,shelf,weight,cups,rating
0,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00,33.983679
2,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50,93.704912
4,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174
73,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00,27.753301
74,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445
75,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00,51.592193
