# Lab to introduce NumPy and Pandas

In [None]:
# First we always import the libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Pandas is a nice library to let you read CSV files.
# The main pandas datatype is the "data frame."
# That's why the usual pandas variable name is "df."
# "housing.csv" is a relative path, so the file is looked up in the
# current working directory.

df = pd.read_csv("housing.csv")
df  # The last expression in a cell is auto-evaluated and printed.

In [None]:
# .head() shows only the first few rows of the data frame, which keeps the
# cell output short when the data frame is long.

df.head()

In [None]:
# Select one column with square brackets and a string with the name of the column.
# The result is a single column (a pandas Series), not a data frame.

df['sqft']

In [None]:
# Select multiple columns with square brackets and a list with the names of the columns.
# Note the double brackets: the outer pair does the selecting, the inner pair is the list.
# The result is a (smaller) data frame.

df[['sqft', 'price']]

In [None]:
# Square brackets on a data frame select columns, so rows need a different
# tool (you rarely need to select rows, though).
# To select rows by integer position, use .iloc:

# .iloc[i] returns row i as a Series labelled by the column names.
print("Row 0:", df.iloc[0])

print("Row 20:", df.iloc[20])

# Select row 20, column 0 (square feet).
# Give .iloc the row and column positions together.  The chained form
# df.iloc[20][0] first builds the whole row (which may change the value's
# type if the columns have different dtypes) and then looks up the integer 0
# among the column-name labels, which only works through a deprecated
# positional fallback.
print(df.iloc[20, 0])

In [None]:
# Aggregate functions (mean, median, max, standard deviation, etc.) can be
# computed on a single column:

sqft = df['sqft']
print("Mean:", sqft.mean())
print("Median:", sqft.median())
print("Max:", sqft.max())
print("Standard deviation:", sqft.std())
print()

# The same aggregates work on several columns at once; the result then holds
# one value per selected column:

sqft_price = df[['sqft', 'price']]
column_means = sqft_price.mean()
print("Mean of two columns:\n", column_means)
print()

# Aggregates are most useful as inputs to a transformation that is applied
# across all rows or columns of a data frame.

# For instance:

# Subtract the mean of the square feet column from the square feet column.
# Notice how a *single number* (the mean) is subtracted from an entire column at once.
print("Using the mean of a column in a calculation on that column:")
df2 = sqft - sqft.mean()
print(df2.head())   # .head() is just to save space
print()

# This can be done with multiple columns at once!
# Subtract from the square feet and price columns their respective means.
# Notice how a pair of numbers (the means) is subtracted from a data frame of
# two columns; each mean is matched to its column by name.
print("Using the means of 2 columns in a calculation:")
df3 = sqft_price - column_means
print(df3.head())   # .head() is just to save space

In [None]:
# Once you have your data selected in a Pandas dataframe, I find it easier to move to NumPy:

# Use the .to_numpy() method to turn your dataframe into a NumPy "ndarray."  (n-dimensional array)
# Here the whole data frame is converted, so matrix1 has one row per data
# frame row and one column per data frame column.

matrix1 = df.to_numpy()
matrix1  # last expression in the cell, so the array is displayed

In [None]:
# Selection and conversion can be chained: pick two columns, keep only the
# first few rows with .head(), and convert that smaller data frame to an ndarray.
matrix2 = df[['sqft', 'price']].head().to_numpy()
matrix2

In [None]:
# Two ways to pull the 'sqft' column out as a NumPy array.
# Selecting with a list of names keeps a second dimension (one column wide);
# selecting with a single name gives a plain 1-dimensional result.
# .to_numpy() turns both into real ndarrays -- without it they would still be
# a pandas DataFrame and a pandas Series.

vector1 = df[['sqft']].to_numpy()  # 2-d ndarray with a single column
vector2 = df['sqft'].to_numpy()    # 1-d ndarray

In [None]:
# The basic NumPy data type is the "ndarray" (n-dimensional array).  It can
# represent vectors (1-dim), matrices (2-dim), or structures with even more dimensions.

# .shape gives the number of dimensions and the size/length of each one, as a tuple.
# NOTICE THIS IS AN ATTRIBUTE, NOT A METHOD: no parentheses

for label, arr in (
    ("matrix1", matrix1),
    ("matrix2", matrix2),
    ("vector1", vector1),
    ("vector2", vector2),
):
    print(label + " shape: ", arr.shape)

In [None]:
# Be careful of how you represent vectors in NumPy!  They can be presented as 1-dimensional ndarrays,
# or 2-dimensional ndarrays with one dimension having length 1.  Compare vector1 and vector2.

# Usually NumPy can operate correctly with either type and will figure out what you mean, just
# watch out for errors with dimensions.

# I recommend using 1-dimensional ndarrays for vectors (like vector2, not vector1).


In [None]:
# The dot product (np.dot() function), tried on three representations of the
# same vector:

values = [1, 2, 3]
v1 = np.array(values)                # 1-d array (vector)
v2 = np.array([values])              # 2-d array, 1 row by 3 columns (row vector); can usually be used like a 1-d array
v3 = np.array([[x] for x in values]) # 2-d array, 3 rows by 1 column (column vector); will also usually work

# v1 is the easiest.  v2 and v3 will mostly work, except when they don't :)

for tag, vec in (("v1", v1), ("v2", v2), ("v3\n", v3)):
    print(tag, vec)

# np.dot() is the dot product function.  Uncomment a line to try it:

#print(np.dot(v1, v2))  # doesn't work
#print(np.dot(v1, v3))   # works ok
#print(np.dot(v2, v3))   # also works ok
#print(np.dot(v3, v2))   # "works", but probably doesn't do what you meant
self_dot = np.dot(v1, v1)   # works fine, probably what you meant
print(self_dot)

In [None]:
# Counting the rows and columns of a matrix:
# .shape is a tuple; for a 2-d array it is (rows, columns), so
# .shape[0] is the number of rows and .shape[1] the number of columns.

m1 = np.array([
    [1, 2, 3],
    [4, 5, 6],
])

row_count, column_count = m1.shape
print(row_count)     # same value as m1.shape[0]
print(column_count)  # same value as m1.shape[1]
print(m1.shape)

In [None]:
# Creating new arrays:

# np.array() (used above) creates 1-d or 2-d arrays from 1-d or 2-d lists.

# np.zeros() and np.ones() create a vector/matrix filled with zeros or ones.
# Pass a single int for a vector, or ONE tuple (rows, columns) for a matrix.

vector_length = 3
matrix_shape = (3, 4)  # 3 rows, 4 columns

# All zeros:

print(np.zeros(vector_length))  # [0. 0. 0.]
print(np.zeros(matrix_shape))   # 3 rows, 4 columns
#print(np.zeros(3, 40))   # nope: the dimensions must be given together as one tuple

# All ones:

print(np.ones(vector_length))   # [1. 1. 1.]
print(np.ones(matrix_shape))    # 3 rows, 4 columns
#print(np.ones(3, 40))    # nope: the dimensions must be given together as one tuple


In [None]:
# Matrix operations:

# Addition and subtraction work element by element, on matrices (or vectors)
# of identical dimensions.

m1 = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
m2 = np.ones(m1.shape)  # matrix of ones with the same 2x3 shape as m1

for result in (m1 + m2, m1 - m2):
    print(result)
    print()

v1 = np.array([1, 2, 3])
v2 = np.array([4, 5, 6])
vector_sum = v1 + v2
vector_difference = v1 - v2
print(vector_sum)
print(vector_difference)
print()

# A scalar can be added to / subtracted from a vector or matrix; it is applied to every element:
print(m1 + 1)
print()
print(v1 + 1)

In [None]:
# Matrix multiplication

# A scalar times a matrix multiplies every entry by that number:

doubled = m1 * 2
print(doubled)
print()

# A matrix can be multiplied by a matrix only if the inner dimensions match:
# (rows x k) @ (k x columns).
# USE THE "AT" SIGN: @

#print(m1 @ m2)  # doesn't work

m3 = np.array([
    [7, 8],
    [9, 10],
])

#print(m1 @ m3)  # doesn't work

product = m3 @ m1  # fine
print(product)



In [None]:
# Matrix multiplication by a vector

# The result depends on whether the vector is on the left or on the right.

# m1 @ v1 treats v1 as a column vector, so it multiplies (2x3) by (3x1).
for shown in (m1, v1, m1 @ v1):
    print(shown)
    print()

#print(v1 @ m1)  # treats v1 as a row vector, but won't work: v1 has wrong dimensions: trying to multiply (1x3) by (2x3)

# A 2-element vector on the left does work: (1x2) by (2x3).
v2 = np.array([2, 4])
print(v2 @ m1)   # fine


In [None]:
# Matrix transposition: .T
# Swaps rows and columns (again an attribute: no parentheses).

m1_transposed = m1.T

print(m1)
print()
print(m1_transposed)
print()
# The inner dimensions of m1.T and m1 always match, so this product is always defined.
print(m1_transposed @ m1)

In [None]:
# Matrix inverse: use np.linalg.inv().  The matrix must be square.

m1 = np.array([
    [1, 2, 3],
    [6, 5, 4],
    [7, 9, 8],
])
print(m1)
print()
inverse_m1 = np.linalg.inv(m1)
print(inverse_m1)

# The inverse of a matrix A is a matrix A^-1 such that A @ A^-1 or A^-1 @ A is the identity matrix.
# Because of floating-point rounding, the printed product may be only very close to the identity.

identity_check = m1 @ inverse_m1
print(identity_check)

In [None]:
# Selecting rows, columns and single elements from a matrix.
# Inside the square brackets the order is [row, column]; a colon means "all".

m1 = np.array([[1, 2, 3], [4, 5, 6]])

print(m1[0]) # row 0
print()
print(m1[0, :]) # another way to get a row
print()
print(m1[:, 0]) # column 0 (as 1-d vector)
print()
# Get a single element with one [row, column] index.  m1[0][0] gives the same
# value but first extracts the whole row as a temporary array.
print(m1[0, 0])

In [None]:
# Adding rows and columns to a matrix (makes a new matrix; m1 itself is not changed):

m1 = np.array([[1, 2, 3], [4, 5, 6]])

# Adding one row: np.vstack() stacks its arguments vertically, in the order given.

v1 = np.zeros(3)  # 1-d array of zeros works OK; its length matches the number of columns of m1

print("New row on bottom")
print(np.vstack((m1, v1)))
print()

print("New row on top")
print(np.vstack((v1, m1)))
print()

# Adding one column: np.hstack() stacks its arguments horizontally, in the order given.

v2 = np.zeros((2,1))  # 2-d array of zeros (required for column vector); 2 rows to match m1

print("New column on right")
print(np.hstack((m1, v2)))
print()

print("New column on left")
print(np.hstack((v2, m1)))
print()

# Can also horizontally or vertically stack a matrix with a matrix as long as dimensions match
# along the correct axis.