<a href="https://colab.research.google.com/github/olcaykursun/ML/blob/main/Fall2025/Week2/intro_np_pd_iris_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Intro to NumPy: let's start with the very basics.
# A longer and more comprehensive tutorial is available at:
# https://colab.research.google.com/drive/1RYrHHaVBSc4rW7K5H87LqYa5UIodT_oa?usp=sharing

import numpy as np

# Plain Python list of lists (has no .shape and no transpose)
lst = [[1, 2, 3],
       [4, 5, 6]]
print("Python list:", lst)

# Convert to a NumPy array (ndarray); the dtype is inferred from the values
A = np.array(lst)                    # shape (2, 3)
print("Array A:\n", A)
print("shape:", A.shape, "ndim:", A.ndim, "dtype:", A.dtype)

# Transpose swaps the axes (2x3 -> 3x2)
AT = A.T
print("A.T:\n", AT)
print("A.T shape:", AT.shape)

# Reshape keeps the same number of elements (2*3 == 3*2).
# Note that the result differs from A.T: reshape refills row by row, it does not swap axes.
B = A.reshape(3, 2)
print("reshape(3,2):\n", B)

# Flatten vs ravel: both give the elements as a 1D array
print("ravel():", A.ravel())   # returns a view when possible (no copy)
print("flatten():", A.flatten())  # always returns a copy

# Elementwise vs matrix multiply: * works entry by entry, @ is the matrix product
print("A * 2 (elementwise):\n", A * 2)
print("A @ A.T (matrix mult 2x3 @ 3x2):\n", A @ A.T)

# Broadcasting example: the (3,) vector is added to every row of the (2, 3) array
row_vec = np.array([10, 20, 30])
print("A + [10,20,30]:\n", A + row_vec)

# Gotcha: 1D arrays don't change with .T (there is only one axis to "swap")
v = np.array([1, 2, 3])
print("v shape:", v.shape, "v.T shape:", v.T.shape)  # both (3,)

# Tips:
# - Force a specific dtype: np.array(lst, dtype=float)
# - Avoid ragged lists (rows of different lengths): NumPy >= 1.24 raises a ValueError for them
#   unless dtype=object is requested explicitly (older versions warned and built an object array),
#   and object arrays do not support the fast vectorized math shown above.


Python list: [[1, 2, 3], [4, 5, 6]]
Array A:
 [[1 2 3]
 [4 5 6]]
shape: (2, 3) ndim: 2 dtype: int64
A.T:
 [[1 4]
 [2 5]
 [3 6]]
A.T shape: (3, 2)
reshape(3,2):
 [[1 2]
 [3 4]
 [5 6]]
ravel(): [1 2 3 4 5 6]
flatten(): [1 2 3 4 5 6]
A * 2 (elementwise):
 [[ 2  4  6]
 [ 8 10 12]]
A @ A.T (matrix mult 2x3 @ 3x2):
 [[14 32]
 [32 77]]
A + [10,20,30]:
 [[11 22 33]
 [14 25 36]]
v shape: (3,) v.T shape: (3,)


In [None]:
# For NumPy arrays with a plain numeric dtype (like the one below), .copy() gives an
# independent copy: later writes to the original do not show up in the copy.
# (Caveat: for dtype=object arrays the copy is shallow - the contained Python objects
# are still shared, just like with lists.)
# For Python lists, .copy() (or [:]) only makes a shallow copy -
# nested objects are still shared. Use copy.deepcopy() if you need
# a full recursive copy of all elements.

import numpy as np
import copy

# --------------------------
# NumPy array example
# --------------------------
arr = np.array([1, 2, 3])
arr_copy = arr.copy()
arr[0] = 99            # modify the original AFTER copying

print("NumPy original:", arr)       # [99  2  3]
print("NumPy copy    :", arr_copy)  # [1 2 3]  <-- independent


# --------------------------
# Python list shallow copy
# --------------------------
lst = [[1, 2], [3, 4]]
lst_copy = lst.copy()   # or lst[:]  (new outer list, same inner lists)
lst[0][0] = 99          # mutates an inner list that both outer lists point to

print("List original:", lst)       # [[99, 2], [3, 4]]
print("List copy    :", lst_copy)  # [[99, 2], [3, 4]]  <-- nested objects still shared!


# --------------------------
# Python list deep copy
# --------------------------
lst = [[1, 2], [3, 4]]
lst_deep = copy.deepcopy(lst)   # inner lists are duplicated as well
lst[0][0] = 99

print("List original:", lst)        # [[99, 2], [3, 4]]
print("List deepcopy:", lst_deep)   # [[1, 2], [3, 4]]  <-- fully independent


NumPy original: [99  2  3]
NumPy copy    : [1 2 3]
List original: [[99, 2], [3, 4]]
List copy    : [[99, 2], [3, 4]]
List original: [[99, 2], [3, 4]]
List deepcopy: [[1, 2], [3, 4]]


In [None]:
# Another handy NumPy pattern: turning per-class probabilities into predicted labels.
# We use 4 examples (rows) and 3 classes (columns).
class_names = np.array(['setosa', 'versicolor', 'virginica'])

# A real ML model would produce these probabilities; here the numbers are simply made up.
row1 = [0.1, 0.5, 0.4]
row2 = [0.9, 0.05, 0.05]
row3 = [0, 0.4, 0.6]
row4 = [0, 0.2, 0.8]
probabilities = np.array([row1, row2, row3, row4])   # shape (4, 3)

# Column index of the largest probability in each row ...
best_class_index = probabilities.argmax(axis=1)
# ... used as an index array to look up the matching class name for every example.
y_pred = class_names[best_class_index]

# Made-up "expected" (true) class labels to compare the predictions against.
y_true = ['versicolor', 'setosa', 'setosa', 'versicolor']

print(y_pred)

['versicolor' 'setosa' 'virginica' 'virginica']


In [None]:
# y_true is a plain Python list while y_pred is a NumPy array, so NumPy carries out the
# comparison element by element and returns a boolean array
# (comparing two plain lists with == would give a single True/False instead).
y_true == y_pred  # two of the four predictions are correct

array([ True,  True, False, False])

In [None]:
# Quick pandas introduction:
# build a small DataFrame by hand so that reading CSV files later feels familiar,
# mixing numeric columns with a string (categorical) column.
import pandas as pd

# A DataFrame can be built from a dictionary:
# - each key becomes a column name
# - each value is the list (or array) holding that column's data
my_dictionary = {
    'col1': [1, 2, 3],
    'col2': [4, 5, 6],
    'col3': ['class1', 'class2', 'class1'],
}

toy_dataframe = pd.DataFrame(data=my_dictionary)
toy_dataframe.head()


Unnamed: 0,col1,col2,col3
0,1,4,class1
1,2,5,class2
2,3,6,class1


In [None]:
# Summary statistics; by default only the numeric columns (col1, col2) are summarized.
toy_dataframe.describe()

Unnamed: 0,col1,col2
count,3.0,3.0
mean,2.0,5.0
std,1.0,1.0
min,1.0,4.0
25%,1.5,4.5
50%,2.0,5.0
75%,2.5,5.5
max,3.0,6.0


In [None]:
# include='all' summarizes every column, so the string column col3 shows up too:
# it gets count/unique/top/freq, while the numeric statistics stay empty for it.
toy_dataframe.describe(include='all')

Unnamed: 0,col1,col2,col3
count,3.0,3.0,3
unique,,,2
top,,,class1
freq,,,2
mean,2.0,5.0,
std,1.0,1.0,
min,1.0,4.0,
25%,1.5,4.5,
50%,2.0,5.0,
75%,2.5,5.5,


In [None]:
# Storing 'col3' with the 'category' dtype is optional, but it:
# - saves memory when the same strings repeat many times
# - speeds up grouping/sorting (integer codes are used internally)
# - documents the intent: 'col3' holds a fixed set of labels, not free text
col3_as_category = toy_dataframe['col3'].astype('category')
toy_dataframe['col3'] = col3_as_category

# The integer code assigned to each row's label
toy_dataframe['col3'].cat.codes

Unnamed: 0,0
0,0
1,1
2,0


In [None]:
# Let's start with a toy problem: read a simple dataset and evaluate the performance
# of some predefined rules.

import pandas as pd

# iris.data has no header row (header=None), so the column names are supplied here.
column_names = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
iris = pd.read_csv("iris.data", header=None, names=column_names)

# Split into target labels (y) and features (X).
# drop() without inplace=True returns a new frame, so the loaded data in `iris`
# stays intact and this cell gives the same X and y every time it is re-run.
y = iris["class"]
X = iris.drop(columns=["class"])
X.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the four feature columns.
X.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [None]:
# Selecting a single column by name returns that column's values along with the row index.
X['sepal_width']

Unnamed: 0,sepal_width
0,3.5
1,3.0
2,3.2
3,3.1
4,3.6
...,...
145,3.0
146,2.5
147,3.0
148,3.4


In [None]:
# (number of rows, number of columns): 150 flowers, 4 features each.
X.shape

(150, 4)

In [None]:
# The class label of every flower (the column that was separated from X).
y

Unnamed: 0,class
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-setosa
...,...
145,Iris-virginica
146,Iris-virginica
147,Iris-virginica
148,Iris-virginica


In [None]:
# For a string column, describe() reports count, number of unique values,
# the most frequent value (top) and how often it occurs (freq).
y.describe()

Unnamed: 0,class
count,150
unique,3
top,Iris-setosa
freq,50


In [None]:
# Store the class labels with the pandas 'category' dtype
# (the same conversion that was shown for col3 of the toy DataFrame).
y = y.astype('category')


In [None]:
# Peek at the first five labels.
y.head()

Unnamed: 0,class
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-setosa


In [None]:
# The distinct labels; for a categorical column the output also lists the categories.
y.unique()

['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
Categories (3, object): ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

In [None]:
# Feature matrix as a plain NumPy array (rows = flowers, columns = the four features).
# to_numpy() is the accessor the pandas documentation recommends over .values.
X_np = X.to_numpy()

In [None]:
# Same summary as shown earlier; X itself is unchanged by extracting the NumPy array.
X.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [None]:
# Encode the class labels as integer category codes and take them as a NumPy array.
# The astype('category') is repeated so this cell also works if y is still a plain
# string column; to_numpy() is the accessor pandas recommends over .values.
y_np = y.astype('category').cat.codes.to_numpy()

In [None]:
# Confirm the encoded labels are a plain NumPy array (not a pandas object).
type(y_np)

numpy.ndarray

In [None]:
# Echo the feature matrix.
# NOTE(review): this prints every row; X_np[:5] would keep the output short.
X_np

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [None]:
# Echo the encoded labels: one integer code (0, 1 or 2) per flower.
y_np

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int8)

In [None]:
# Append the label codes as an extra (fifth) column next to the features.
# The result is a single float array, so the integer codes are displayed as 0., 1., 2.
np.column_stack((X_np, y_np))

array([[5.1, 3.5, 1.4, 0.2, 0. ],
       [4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [4.6, 3.1, 1.5, 0.2, 0. ],
       [5. , 3.6, 1.4, 0.2, 0. ],
       [5.4, 3.9, 1.7, 0.4, 0. ],
       [4.6, 3.4, 1.4, 0.3, 0. ],
       [5. , 3.4, 1.5, 0.2, 0. ],
       [4.4, 2.9, 1.4, 0.2, 0. ],
       [4.9, 3.1, 1.5, 0.1, 0. ],
       [5.4, 3.7, 1.5, 0.2, 0. ],
       [4.8, 3.4, 1.6, 0.2, 0. ],
       [4.8, 3. , 1.4, 0.1, 0. ],
       [4.3, 3. , 1.1, 0.1, 0. ],
       [5.8, 4. , 1.2, 0.2, 0. ],
       [5.7, 4.4, 1.5, 0.4, 0. ],
       [5.4, 3.9, 1.3, 0.4, 0. ],
       [5.1, 3.5, 1.4, 0.3, 0. ],
       [5.7, 3.8, 1.7, 0.3, 0. ],
       [5.1, 3.8, 1.5, 0.3, 0. ],
       [5.4, 3.4, 1.7, 0.2, 0. ],
       [5.1, 3.7, 1.5, 0.4, 0. ],
       [4.6, 3.6, 1. , 0.2, 0. ],
       [5.1, 3.3, 1.7, 0.5, 0. ],
       [4.8, 3.4, 1.9, 0.2, 0. ],
       [5. , 3. , 1.6, 0.2, 0. ],
       [5. , 3.4, 1.6, 0.4, 0. ],
       [5.2, 3.5, 1.5, 0.2, 0. ],
       [5.2, 3.4, 1.4, 0.2, 0. ],
       [4.7, 3

In [None]:
# Hand-written rules, applied to one flower (one row of X_np) at a time.
y_pred = []
for row in X_np:
    sepal_width, petal_length = row[1], row[2]   # columns 1 and 2 of the feature matrix

    cls = 2                      # fallback class when neither rule below fires
    if petal_length < 3:
        cls = 0
    elif sepal_width < 3.05:
        cls = 1

    y_pred.append(cls)

print(y_pred)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 1]


In [None]:
from sklearn.metrics import accuracy_score
# Fraction of flowers whose rule-based prediction equals the true class code.
accuracy_score(y_np, y_pred)

0.7266666666666667

In [None]:
# Boolean mask with one entry per flower: True where column 1 (sepal width) is below 3.05,
# i.e. the same test the second rule of the loop uses.
X_np[:,1]<3.05

array([False,  True, False, False, False, False, False, False,  True,
       False, False, False,  True,  True, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False,  True, False, False, False,
        True, False, False, False, False, False, False, False,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True, False,  True,
       False,  True,  True, False,  True,  True,  True, False, False,
        True,  True,

In [None]:
# The loop collected the predictions in a plain Python list, not a NumPy array.
type(y_pred)

list

In [None]:
# Accuracy "by hand": comparing the NumPy array y_np with the list y_pred is done
# element by element, giving one True/False per flower ...
is_correct_prediction = y_np == y_pred
# ... and the mean of booleans (True counts as 1, False as 0) is the fraction that is correct.
acc = is_correct_prediction.mean()

# Show the value so it can be compared with the accuracy_score result computed earlier.
acc


In [None]:
# Vectorized alternative to the for loop that built y_pred: let np.select apply the rules
# to all rows at once.
import numpy as np

# One boolean per row of X_np for each rule.
cond1 = X_np[:, 2] < 3.0
cond2 = X_np[:, 1] < 3.05   # only matters where cond1 is False (see below)

# For every row np.select takes the choice of the FIRST condition that is True,
# which mirrors the if / elif order of the loop; rows matching neither get the default.
y_pred_vec = np.select([cond1, cond2], [0, 1], default=2)

# Both approaches must agree exactly.
assert np.array_equal(y_pred, y_pred_vec)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_np, y_pred)
print(cm)

# Per-class precision / recall / F1; the names are given in the order of the class codes 0, 1, 2.
report = classification_report(
    y_np,
    y_pred,
    target_names=["setosa", "versicolor", "virginica"],
    digits=3,   # 3 decimal places; 2–3 is typical for this simple/small dataset
)
print(report)


[[50  0  0]
 [ 0 42  8]
 [ 0 33 17]]
              precision    recall  f1-score   support

      setosa      1.000     1.000     1.000        50
  versicolor      0.560     0.840     0.672        50
   virginica      0.680     0.340     0.453        50

    accuracy                          0.727       150
   macro avg      0.747     0.727     0.708       150
weighted avg      0.747     0.727     0.708       150

