# Lab to introduce NumPy and Pandas

In [2]:
# First we always import the libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Pandas is a nice library to let you read CSV files.
# The main pandas datatype is the "data frame."  
# That's why the usual panda variable name is "df."

df = pd.read_csv("housing.csv")
df  # The last expression in a cell is auto-evaluated and printed.

Unnamed: 0,sqft,bedrooms,price
0,2104,3,399900
1,1600,3,329900
2,2400,3,369000
3,1416,2,232000
4,3000,4,539900
5,1985,4,299900
6,1534,3,314900
7,1427,3,198999
8,1380,3,212000
9,1494,3,242500


In [4]:
# .head() only shows the first few rows

df.head()

Unnamed: 0,sqft,bedrooms,price
0,2104,3,399900
1,1600,3,329900
2,2400,3,369000
3,1416,2,232000
4,3000,4,539900


In [5]:
# Select one column with square brackets and a string with the name of the column:

df['sqft']

0     2104
1     1600
2     2400
3     1416
4     3000
5     1985
6     1534
7     1427
8     1380
9     1494
10    1940
11    2000
12    1890
13    4478
14    1268
15    2300
16    1320
17    1236
18    2609
19    3031
20    1767
21    1888
22    1604
23    1962
24    3890
25    1100
26    1458
27    2526
28    2200
29    2637
30    1839
31    1000
32    2040
33    3137
34    1811
35    1437
36    1239
37    2132
38    4215
39    2162
40    1664
41    2238
42    2567
43    1200
44     852
45    1852
46    1203
Name: sqft, dtype: int64

In [6]:
# Select multiple columns with square brackets and a list with the names of the column:

df[['sqft', 'price']]

Unnamed: 0,sqft,price
0,2104,399900
1,1600,329900
2,2400,369000
3,1416,232000
4,3000,539900
5,1985,299900
6,1534,314900
7,1427,198999
8,1380,212000
9,1494,242500


In [7]:
# Pandas cannot select rows easily (but you rarely need to do so).
# To do it, use .iloc:

print("Row 0:", df.iloc[0])

print("Row 20:", df.iloc[20])

# Select row 20, column 0 (square feet)
print(df.iloc[20][0])

Row 0: sqft          2104
bedrooms         3
price       399900
Name: 0, dtype: int64
Row 20: sqft          1767
bedrooms         3
price       252900
Name: 20, dtype: int64
1767


In [8]:
# You can calculate various aggregate functions (mean, median, max, standard deviation, etc) of
# a single column:

print("Mean:", df['sqft'].mean())
print("Median:", df['sqft'].median())
print("Max:", df['sqft'].max())
print("Standard deviation:", df['sqft'].std())
print()

# You can calculate these for multiple columns at once:

print("Mean of two columns:\n", df[['sqft', 'price']].mean())
print()

# What this is often useful for is applying a single transformation across the rows or
# columns of a dataframe.  

# For instance:

# Create a data frame with the square feet column minus the mean of the square feet column
# Notice how we are subtracting a *single number* (the mean) from an entire column at once.
print("Using the mean of a column in a calculation on that column:")
df2 = df['sqft'] - df['sqft'].mean()
print(df2.head())   # The head is just to save space
print()

# This can be done with multiple columns at once!
# Create a data frame with the square feet and price columns minus their respective means.
# Notice how we are subtracting a pair of numbers (the means) from an data frame of two columns.
print("Using the means of 2 columns in a calculation:")
df3 = df[['sqft', 'price']] - df[['sqft', 'price']].mean()
print(df3.head())   # The head is just to save space

Mean: 2000.6808510638298
Median: 1888.0
Max: 4478
Standard deviation: 794.7023535338897

Mean of two columns:
 sqft       2000.680851
price    340412.659574
dtype: float64

Using the mean of a column in a calculation on that column:
0    103.319149
1   -400.680851
2    399.319149
3   -584.680851
4    999.319149
Name: sqft, dtype: float64

Using the means of 2 columns in a calculation:
         sqft          price
0  103.319149   59487.340426
1 -400.680851  -10512.659574
2  399.319149   28587.340426
3 -584.680851 -108412.659574
4  999.319149  199487.340426


In [9]:
# Once you have your data selected in a Pandas dataframe, I find it easier to move to Numpy:

# Use the .to_numpy() method to turn your dataframe into a NumPy "ndarray."  (n-dimensional array)

matrix1 = df.to_numpy()
matrix1  

array([[  2104,      3, 399900],
       [  1600,      3, 329900],
       [  2400,      3, 369000],
       [  1416,      2, 232000],
       [  3000,      4, 539900],
       [  1985,      4, 299900],
       [  1534,      3, 314900],
       [  1427,      3, 198999],
       [  1380,      3, 212000],
       [  1494,      3, 242500],
       [  1940,      4, 239999],
       [  2000,      3, 347000],
       [  1890,      3, 329999],
       [  4478,      5, 699900],
       [  1268,      3, 259900],
       [  2300,      4, 449900],
       [  1320,      2, 299900],
       [  1236,      3, 199900],
       [  2609,      4, 499998],
       [  3031,      4, 599000],
       [  1767,      3, 252900],
       [  1888,      2, 255000],
       [  1604,      3, 242900],
       [  1962,      4, 259900],
       [  3890,      3, 573900],
       [  1100,      3, 249900],
       [  1458,      3, 464500],
       [  2526,      3, 469000],
       [  2200,      3, 475000],
       [  2637,      3, 299900],
       [  

In [10]:
matrix2 = df[['sqft', 'price']].head().to_numpy()
matrix2

array([[  2104, 399900],
       [  1600, 329900],
       [  2400, 369000],
       [  1416, 232000],
       [  3000, 539900]])

In [11]:
vector1 = df[['sqft']]
vector2 = df['sqft']

In [12]:
# The basic NumPy data type is the "ndarray" (n-dimensional array).  This can be used
# to represente vectors (1-dim), matrices (2-dim), or structures with even more dimensions.

# You can always get the dimensions and size/length of each dimension by using .shape
# NOTICE THIS IS NOT A METHOD: no parentheses

print("matrix1 shape: ", matrix1.shape)
print("matrix2 shape: ", matrix2.shape)
print("vector1 shape: ", vector1.shape)
print("vector2 shape: ", vector2.shape)

matrix1 shape:  (47, 3)
matrix2 shape:  (5, 2)
vector1 shape:  (47, 1)
vector2 shape:  (47,)


In [13]:
# Be careful of how you represent vectors in NumPy!  They can be presented as 1-dimensional ndarrays,
# or 2-dimensional ndarrays with one dimension having length 1.  Compare vector1 and vector2.

# Usually NumPy can operate correctly with either type and will figure out what you mean, just
# watch out for errors with dimensions.

# I recommend using 1-dimensional ndarrays for vectors (like vector1, not vector2).


In [14]:
# Example illustrating the dot-product (np.dot() function):

v1 = np.array([1, 2, 3])       # 1-d array (vector)
v2 = np.array([[1, 2, 3]])     # 2-d array, 1 row by 3 columns (row vector), but can be used usually like a 1-d array
v3 = np.array([[1], [2], [3]]) # 2-d array, 3 rows by 1 column (column vector), also will usually work

# v1 is the easiest.  v2 and v3 will mostly work, except when they don't :)

print("v1", v1)
print("v2", v2)
print("v3\n", v3)

# np.dot() is the dot product function:

#print(np.dot(v1, v2))  # doesn't work
#print(np.dot(v1, v3))   # works ok
#print(np.dot(v2, v3))   # also works ok
#print(np.dot(v3, v2))   # "works", but probably doesn't do what you meant
print(np.dot(v1, v1))   # works fine, probably what you meant

v1 [1 2 3]
v2 [[1 2 3]]
v3
 [[1]
 [2]
 [3]]
14


In [15]:
# Getting rows and columns of a matrix:
# Use .shape[0] for rows and .shape[1] for columns

m1 = np.array([[1, 2, 3], [4, 5, 6]])

print(m1.shape[0])
print(m1.shape[1])
print(m1.shape)

2
3
(2, 3)


In [16]:
# Creating new arrays: 

# Use np.array() like above to create 1-d or 2-d arrays from 1 or 2-d lists.

# Use np.zeros() to make a vector/matrix of all zeroes.

print(np.zeros(3))       # [0. 0. 0.]
print(np.zeros((3, 4)))  # 3 rows, 4 columns
#print(np.zeros(3, 40))   # nope

# Use np.ones() to make a vector/matrix of all ones.

print(np.ones(3))       # [0. 0. 0.]
print(np.ones((3, 4)))  # 3 rows, 4 columns
#print(np.ones(3, 40))   # nope


[0. 0. 0.]
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
[1. 1. 1.]
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


In [17]:
# Matrix operations:

# Addition and subtraction (only on matrices of identical dimensions, including vectors)

m1 = np.array([[1, 2, 3], [4, 5, 6]])
m2 = np.ones((2, 3))
print(m1 + m2)
print()
print(m1 - m2)
print()

v1 = np.array([1, 2, 3])
v2 = np.array([4, 5, 6])
print(v1 + v2)
print(v1 - v2)
print()

# Can add or subtract a scalar to/from a vector or matrix; will be applied to all elements:
print(m1 + 1)
print()
print(v1 + 1)

[[2. 3. 4.]
 [5. 6. 7.]]

[[0. 1. 2.]
 [3. 4. 5.]]

[5 7 9]
[-3 -3 -3]

[[2 3 4]
 [5 6 7]]

[2 3 4]


In [18]:
# Matrix multiplication

# Multiplying by a scalar multiplies all entries by that number:

print(2 * m1)
print()

# Can multiply a matrix by a matrix only if inner dimensions match:
# USE THE "AT" SIGN: @

#print(m1 @ m2)  # doesn't work

m3 = np.array([[7, 8], [9, 10]])

#print(m1 @ m3)  # doesn't work

print(m3 @ m1)  # fine



[[ 2  4  6]
 [ 8 10 12]]

[[39 54 69]
 [49 68 87]]


In [19]:
# Matrix multiplication by vector

# Different when on the left versus on right:

print(m1)
print()
print(v1)
print()
print(m1 @ v1)  # treats v1 as a column vector, so multiplies (2x3) by (3x1)
print()
#print(v1 @ m1)  # treats v1 as a row vector, but won't work: v1 has wrong dimensions: trying to multiply (3x1) by (2x3)
v2 = np.array([2, 4])
print(v2 @ m1)   # fine


[[1 2 3]
 [4 5 6]]

[1 2 3]

[14 32]

[18 24 30]


In [20]:
# Matrix transposition: .T 
# Swaps rows for columns

print(m1)
print()
print(m1.T)
print()
print(m1.T @ m1)

[[1 2 3]
 [4 5 6]]

[[1 4]
 [2 5]
 [3 6]]

[[17 22 27]
 [22 29 36]
 [27 36 45]]


In [21]:
# Matrix inverse: Use np.linalg.inv().  Matrix must be square.

m1 = np.array([[1, 2, 3], [6, 5, 4], [7, 9, 8]])
print(m1)
print()
inverse_m1 = np.linalg.inv(m1)
print(inverse_m1)

# the inverse of a matrix A is a matrix A^-1 such that A @ A^-1 or A^1 @ A is the identity matrix.

print(m1 @ inverse_m1)

[[1 2 3]
 [6 5 4]
 [7 9 8]]

[[ 0.19047619  0.52380952 -0.33333333]
 [-0.95238095 -0.61904762  0.66666667]
 [ 0.9047619   0.23809524 -0.33333333]]
[[ 1.00000000e+00  0.00000000e+00  2.22044605e-16]
 [ 8.88178420e-16  1.00000000e+00 -6.66133815e-16]
 [ 0.00000000e+00 -2.22044605e-16  1.00000000e+00]]


In [22]:
# Selecting rows and columns from a matrix:

m1 = np.array([[1, 2, 3], [4, 5, 6]])

print(m1[0]) # row 0
print()
print(m1[0, :]) # another way to get a row
print()
print(m1[:, 0]) # column 0 (as 1-d vector)
print()
print(m1[0][0]) # get a single element

[1 2 3]

[1 2 3]

[1 4]

1


In [23]:
# Adding rows and columns to a matrix (makes a new matrix):

m1 = np.array([[1, 2, 3], [4, 5, 6]])

# Adding one row

v1 = np.zeros(3)  # 1-d array of zeros works OK

print("New row on bottom")
print(np.vstack((m1, v1)))
print()

print("New row on top")
print(np.vstack((v1, m1)))
print()

# Adding one column

v2 = np.zeros((2,1))  # 2-d array of zeros (required for column vector)

print("New column on right")
print(np.hstack((m1, v2)))
print()

print("New column on left")
print(np.hstack((v2, m2)))
print()

# Can also horizontally or vertically stack a matrix with a matrix as long as dimensions match
# along the correct axis.

New row on bottom
[[1. 2. 3.]
 [4. 5. 6.]
 [0. 0. 0.]]

New row on top
[[0. 0. 0.]
 [1. 2. 3.]
 [4. 5. 6.]]

New column on right
[[1. 2. 3. 0.]
 [4. 5. 6. 0.]]

New column on left
[[0. 1. 1. 1.]
 [0. 1. 1. 1.]]



In [24]:
np.ones(3)

array([1., 1., 1.])

In [25]:
m1 = np.array([[1, 2, 3], [4, 5, 6]])

In [26]:
m1

array([[1, 2, 3],
       [4, 5, 6]])

In [27]:
np.ones(2)

array([1., 1.])

In [28]:
np.hstack((np.ones(2), m1))

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)