<a href="https://colab.research.google.com/github/olcaykursun/ML/blob/main/Fall2025/Week06/exam_practice1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ---------------------------------------------------------------
# pandas vs NumPy vs PyTorch (deep learning workflow)
#
# pandas:
#     - Use it for loading data (CSV, Excel) and exploring with column names.
#     - Great for cleaning, filtering, grouping — like a database table.
#     - But: you don't feed pandas DataFrames directly to deep learning.
#
# NumPy:
#     - Convert your cleaned pandas data to NumPy arrays.
#     - ML libraries (scikit-learn, etc.) expect NumPy.
#
# PyTorch / TensorFlow:
#     - For deep learning, convert NumPy arrays into tensors.
#     - These frameworks build on NumPy concepts but need tensors for GPU/gradient support.
#
# Rule of thumb for students:
#     - Invest in pandas enough to clean & explore.
#     - But for ML/deep learning, focus on NumPy and tensors.
#
# Note:
#     - Some tasks (like checking correlations or selecting subsets)
#       can be done in pandas or NumPy.
#     - I’ll show you both ways, but that doesn’t mean you need to learn
#       every possible way — pick the one that feels natural.
# ---------------------------------------------------------------

import numpy as np
import pandas as pd
import torch

# pandas first: a small labelled table with two features and one target column.
df = pd.DataFrame(
    [[1, 4, 0], [2, 5, 1], [3, 6, 0]],
    columns=["x1", "x2", "y"],
)
print("pandas DataFrame:\n", df)

# Hand the feature columns and the label column over to NumPy.
feature_cols = ["x1", "x2"]
X_np = df[feature_cols].to_numpy()
y_np = df["y"].to_numpy()

# NumPy offers linear-algebra helpers that pandas lacks, e.g. vector norms.
# Broadcasting subtracts the query point from every row; the norm is then
# taken row by row (axis=1), giving one Euclidean distance per example.
test = [4, 7]
offsets = X_np - test
distances = np.linalg.norm(offsets, axis=1)
print("Distances from the test point:", distances)

closest_idx = distances.argmin()
print("The closest point's index:", closest_idx)

# For neural networks later on: wrap the arrays as tensors
# (float32 features, integer class labels).
X = torch.tensor(X_np, dtype=torch.float32)
y = torch.tensor(y_np, dtype=torch.long)

print("\nTorch tensors:")
for tensor in (X, y):
    print(tensor)


pandas DataFrame:
    x1  x2  y
0   1   4  0
1   2   5  1
2   3   6  0
Distances from the test point: [4.24264069 2.82842712 1.41421356]
The closest point's index: 2

Torch tensors:
tensor([[1., 4.],
        [2., 5.],
        [3., 6.]])
tensor([0, 1, 0])


In [2]:
# Q1. Create and Use a Synthetic Dataset
# Step 1: Create synthetic dataset y = a - 3b + noise
# Step 2: Save the dataset to CSV
# Step 3: Read the dataset back from CSV
# Step 4: Compute and display correlations (NumPy + pandas)

import numpy as np
import pandas as pd

# Fix the random stream so that Restart & Run All reproduces the same numbers.
# Change SEED (or N) to see how much the correlations move between draws.
SEED = 0
np.random.seed(SEED)

N = 100  # Play with N to see how sensitive the correlations can be with few examples
a = np.random.rand(N)               # uniform draws
b = np.random.rand(N)               # uniform draws
noise = np.random.randn(N) * 0.1    # normal draws; 0.1 controls the standard deviation of the noise

y = a - 3 * b + noise

# Round-trip through a CSV file (index=False keeps the row index out of the file).
df = pd.DataFrame({'a': a, 'b': b, 'y': y})
df.to_csv('synthetic_dataset.csv', index=False)

df = pd.read_csv('synthetic_dataset.csv')

df.corr()  # pairwise correlations of all columns, using pandas here

Unnamed: 0,a,b,y
a,1.0,0.121647,0.221712
b,0.121647,1.0,-0.935144
y,0.221712,-0.935144,1.0


In [3]:
# Two 1-D inputs give a 2x2 correlation matrix, not a single number.
np.corrcoef(a, y).shape  #using numpy here

(2, 2)

In [4]:
# Full matrix: the diagonal is each variable with itself (1.0),
# the off-diagonal entries are the correlation between a and y.
np.corrcoef(a, y)

array([[1.       , 0.2217125],
       [0.2217125, 1.       ]])

In [5]:
# Pairwise correlations, in order: a vs y, b vs y, a vs b.
# These correlations are NOT the weights of a regression model, because the
# predictors can be collinear (correlated with each other).
# Class example: adding both honey and sugar to make a sweet -- we like them
# both, but they also correlate with each other.
print(np.corrcoef(a, y)[0, 1])
print(np.corrcoef(b, y)[0, 1])
print(np.corrcoef(a, b)[0, 1])


0.22171249933898626
-0.9351435703775537
0.12164714467699772


In [6]:
# Q2. Load the Iris dataset; report the number of examples and the feature names.

# Alternative sources, if you would rather read a CSV:
#   from a local file:
#       iris = pd.read_csv('iris.csv')
#   from the UCI repository:
#       iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)

# Here we use the copy that ships with scikit-learn.
from sklearn.datasets import load_iris

iris = load_iris()
X, class_labels = iris.data, iris.target

n_examples = X.shape[0]
print(X.shape)            # (number of examples, number of features)
print(n_examples)
print(iris.feature_names)

(150, 4)
150
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [7]:
# Q3. Print number of examples for each class
# return_counts=True makes np.unique return the distinct labels together with
# how many times each one occurs.
class_ids, class_counts = np.unique(iris.target, return_counts=True)
print(class_ids, class_counts, sep="\n")
# Trailing expression: show the same pair as the cell's output value.
(class_ids, class_counts)

[0 1 2]
[50 50 50]


(array([0, 1, 2]), array([50, 50, 50]))

In [8]:
# Optional aside about the built-in "zip" -- it is not a priority.
# Skip this if you do not yet feel comfortable with the more important topics.

# Step 1. Pair every class ID with its count and turn the pairs into a dict.
id_count_pairs = zip(class_ids, class_counts)
d = dict(id_count_pairs)
print(d)
# Example output: {0: 50, 1: 50, 2: 50}
# Note that keys and values may still be NumPy ints (np.int64).

# Step 2. If seeing "numpy.int64" is annoying, convert to Python's
# built-in types first. A few alternatives:

# --- Option A: plain Python lists for both IDs and counts ---
ids_py = class_ids.tolist()
counts_py = class_counts.tolist()
d = dict(zip(ids_py, counts_py))
print(d)

# --- Option B: class names as keys instead of numeric IDs ---
# (the names are still NumPy strings here)
names_np = iris.target_names[class_ids]
d = dict(zip(names_np, counts_py))
print(d)

# --- Option C: class names converted to a list as well (pure Python objects) ---
names_py = names_np.tolist()
d = dict(zip(names_py, counts_py))
print(d)


{np.int64(0): np.int64(50), np.int64(1): np.int64(50), np.int64(2): np.int64(50)}
{0: 50, 1: 50, 2: 50}
{np.str_('setosa'): 50, np.str_('versicolor'): 50, np.str_('virginica'): 50}
{'setosa': 50, 'versicolor': 50, 'virginica': 50}


In [9]:
# Sometimes features and labels arrive together in one table, like a single CSV file.
# as_frame=True exposes that combined table as iris.frame.
iris = load_iris(as_frame=True)

# All-in-one DataFrame: the feature columns plus the class-label column.
X = iris.frame

print(X.shape, iris.feature_names, sep="\n")

(150, 5)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [10]:
# Peek at the first rows; the class label sits in the 'target' column.
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [11]:
# Split the all-in-one table into labels (y) and features (X).
# Both are derived from iris.frame WITHOUT modifying it, so the cell can be
# re-run safely. Dropping in place would strip 'target' from iris.frame itself
# and make a second run fail with KeyError.
y = iris.frame['target']
X = iris.frame.drop(columns='target')   # returns a new frame; iris.frame keeps its label column
print(X.shape)
print(X.corr())      # feature-to-feature correlations
y.value_counts()     # examples per class


(150, 4)
                   sepal length (cm)  sepal width (cm)  petal length (cm)  \
sepal length (cm)           1.000000         -0.117570           0.871754   
sepal width (cm)           -0.117570          1.000000          -0.428440   
petal length (cm)           0.871754         -0.428440           1.000000   
petal width (cm)            0.817941         -0.366126           0.962865   

                   petal width (cm)  
sepal length (cm)          0.817941  
sepal width (cm)          -0.366126  
petal length (cm)          0.962865  
petal width (cm)           1.000000  


Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,50
1,50
2,50


In [12]:
# Q4. Correlate sepal length with every other feature and report the largest correlation.
# First attempt: plain NumPy. Column 0 of the array is sepal length.
X_np = X.to_numpy()
sepal_length = X_np[:, 0]

cors = []
for i in (1, 2, 3):
  cor = np.corrcoef(sepal_length, X_np[:, i])[0, 1]   # off-diagonal entry of the 2x2 matrix
  print(iris.feature_names[i], ': ', cor)
  cors.append(cor)

print("\nAfter the loop:")
best = np.argmax(cors)
best_cor = np.max(cors)
print(cors)
print(best)
print(best_cor)

# cors[0] belongs to feature 1, hence the +1 when mapping back to a feature name.
print(iris.feature_names[best + 1], ': ', best_cor)

sepal width (cm) :  -0.11756978413300208
petal length (cm) :  0.8717537758865831
petal width (cm) :  0.8179411262715757

After the loop:
[np.float64(-0.11756978413300208), np.float64(0.8717537758865831), np.float64(0.8179411262715757)]
1
0.8717537758865831
petal length (cm) :  0.8717537758865831


In [13]:
# Sepal length's correlation with each other feature, plus the largest one.
# This time the DataFrame X is indexed positionally with .iloc
# (instead of slicing the NumPy copy X_np).
sepal_length = X.iloc[:, 0]
cors = [np.corrcoef(sepal_length, X.iloc[:, i])[0, 1] for i in range(1, 4)]

for i, cor in zip(range(1, 4), cors):
  print(iris.feature_names[i], ': ', cor)

print("\nAfter the loop:")
best = np.argmax(cors)
print(cors)
print(best)
print(np.max(cors))

# Position 0 in cors is feature 1, so shift by one to recover the name.
print(iris.feature_names[best + 1], ': ', np.max(cors))

sepal width (cm) :  -0.11756978413300208
petal length (cm) :  0.8717537758865831
petal width (cm) :  0.8179411262715757

After the loop:
[np.float64(-0.11756978413300208), np.float64(0.8717537758865831), np.float64(0.8179411262715757)]
1
0.8717537758865831
petal length (cm) :  0.8717537758865831


In [14]:
# Column labels of the feature DataFrame; they can be used to select columns by name.
X.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [15]:
# Just another way using Pandas (X is a data frame): select the columns by name.
reference_col = 'sepal length (cm)'
other_cols = X.columns[1:4]   # every feature except the reference column

cors = []
for col_name in other_cols:
  cor = np.corrcoef(X[reference_col], X[col_name])[0,1]
  cors.append(cor)
  print(col_name, ': ', cor)

print("\nAfter the loop:")
print(cors)
print(np.argmax(cors))
print(np.max(cors))

# Name the winner from the same sequence the loop iterated over, so the lookup
# cannot drift from the loop (no separate name list, no manual +1 offset).
print(other_cols[np.argmax(cors)],': ', np.max(cors))

sepal width (cm) :  -0.11756978413300208
petal length (cm) :  0.8717537758865831
petal width (cm) :  0.8179411262715757

After the loop:
[np.float64(-0.11756978413300208), np.float64(0.8717537758865831), np.float64(0.8179411262715757)]
1
0.8717537758865831
petal length (cm) :  0.8717537758865831


In [16]:
# Q5. Print the average sepal length for versicolors (class = 1)
# Reload without "as_frame", so iris.data is a NumPy array again.
iris = load_iris()
X = iris.data  #this is a numpy array now that we load_iris without "as_frame"
print(type(X))
# Column-wise mean over ALL rows: one value per feature. This is not the answer
# yet, because it mixes every class and reports every feature; we still need to
# restrict to the versicolor rows and to the sepal-length column.
X.mean(axis=0) #is not what i want because it uses all the features and all the instances


<class 'numpy.ndarray'>


array([5.84333333, 3.05733333, 3.758     , 1.19933333])

In [17]:
# Boolean mask: True for the rows whose class id is 1 (versicolor).
mask = iris.target == 1  #pull the versicolors
# Keep only the masked rows, take column 0 (sepal length), and average it.
X[mask, 0].mean()

np.float64(5.936)

In [18]:
# Q6. Print the average sepal length for each class
# enumerate() hands out the class id together with its name. Looping over
# [0, 1, 2], np.unique(iris.target) or range(3) would visit the same ids.
for cls, cls_name in enumerate(iris.target_names):
    mask = iris.target == cls          # rows belonging to this class
    sepal_lengths = X[mask, 0]         # column 0 = sepal length
    print(cls_name, sepal_lengths.mean())

setosa 5.006
versicolor 5.936
virginica 6.587999999999998


In [19]:
# Q7. Apply linear regression to the full set of examples
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Target: sepal length (column 0).
# Predictors: sepal width, petal length, petal width (columns 1-3).
y = iris.data[:, 0]
X = iris.data[:, 1:4]

# fit() returns the estimator itself, so creation and fitting can be chained.
model = LinearRegression().fit(X, y)

# The values shown are regression coefficients. They differ from the simple
# correlations because each weight is a partial effect (the other predictors
# held constant). The more collinear the predictors are, the larger the gap
# between correlations and weights.
model.coef_


array([ 0.65083716,  0.70913196, -0.55648266])

In [20]:
# Optional discussion:
# With independent predictors there is a neat link between regression weights
# and correlations: the ratios among the weights match the ratios among the
# correlations. In that setting the weights can be read as feature importance,
# i.e. which feature matters more for the prediction.
import numpy as np
from sklearn.linear_model import LinearRegression

np.random.seed(0)
n = 100000

# Three independent predictors with the same variance. The weight/correlation
# link relies on equal variances: multiplying X1 by 2 changes its weight but
# leaves its correlation untouched.
X1 = np.random.randn(n)
X2 = np.random.randn(n)
X3 = np.random.randn(n)

noise = 0.1 * np.random.randn(n)
y = 0.5*X1 + 0.3*X2 + 0.2*X3 + noise

#X1 = 2*X1
# Regression weights depend on the scale (variance) of the features.
# If we multiply X1 by 2, its regression coefficient is cut in half.
# Correlations, however, are scale-invariant, so they stay the same.

X = np.column_stack([X1, X2, X3])
model = LinearRegression()
model.fit(X, y)
coefs = model.coef_

# One correlation per predictor column (X.T iterates over the columns of X).
corrs = [np.corrcoef(column, y)[0, 1] for column in X.T]

ref = 1  # express everything relative to the second predictor
print("Regression coefs:", np.round(coefs, 3))
print("Correlations:", np.round(corrs, 3))
print("Ratios coefs:", np.round(coefs / coefs[ref], 3))
print("Ratios corrs:", np.round(np.array(corrs) / corrs[ref], 3))


Regression coefs: [0.5 0.3 0.2]
Correlations: [0.802 0.479 0.323]
Ratios coefs: [1.669 1.    0.667]
Ratios corrs: [1.673 1.    0.673]


In [21]:
# Warm-up for the leave-one-out loop: list.remove(value) deletes the first
# element EQUAL to the given value (here 4) -- it is not "delete index 4".
lst = [1, 4, 56]
print(lst)
lst.remove(4)
print(lst)

[1, 4, 56]
[1, 56]


In [22]:
# Q8. To see how well the model generalizes we need held-out predictions,
# so let's do Leave-One-Out (LOO): train on all samples but one, predict that one.
from sklearn.metrics import mean_squared_error, r2_score

iris = load_iris()
y = iris.data[:, 0]      # target: sepal length
X = iris.data[:, 1:4]    # predictors: the remaining three features

model = LinearRegression()
n_samples = len(X)

y_preds = []
for i in range(n_samples):
    # Every index except i forms the training set.
    training_indices = list(range(n_samples))
    training_indices.remove(i)

    X_train, y_train = X[training_indices], y[training_indices]
    # predict() expects a 2-D array, so the single held-out row becomes shape (1, n_features).
    X_test, y_test = X[i].reshape(1, -1), y[i]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_preds.append(y_pred[0])

# Score the held-out predictions against the true targets.
mse, r2 = mean_squared_error(y, y_preds), r2_score(y, y_preds)
print(mse)
print(r2)

0.10176343380434634
0.8505944594314747


In [23]:
# y_pred still holds the prediction from the LAST loop iteration: a length-1 array.
y_pred

array([6.42926088])

In [24]:
# One held-out prediction was collected per sample.
len(y_preds)

150

In [25]:
# Q9. Why use LOO (or any other cross-validation)? Because scoring a model on
# the very data it was trained on only measures memorization.
# K-Nearest Neighbors (KNN) is mostly used for classification (predicting
# categories), but it can also do regression. Both are shown below, each
# evaluated -- on purpose -- on its own training data.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier

X, y = iris.data, iris.target

# 1-NN regression, predicting on the training set itself.
model = KNeighborsRegressor(n_neighbors=1).fit(X, y)
y_preds = model.predict(X)
print('MSE of nearest neighbor regression with knn: ', mean_squared_error(y, y_preds))

# KNN is more commonly used for classification: same experiment with a classifier.
model = KNeighborsClassifier(n_neighbors=1).fit(X, y)
y_preds = model.predict(X)

acc = (y == y_preds).mean()   # fraction of matching labels
print(f'{acc=}')

MSE of nearest neighbor regression with knn:  0.0
acc=np.float64(1.0)


In [26]:
# Q10. The hand-written leave-one-out loop works for classification as well:
# run 1-NN on the dataset and use LOO to measure its predictive performance.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

X, y = iris.data, iris.target
model = KNeighborsClassifier(n_neighbors=1)

n_samples = len(X)
y_preds = []
for i in range(n_samples):
    # Train on every sample except i ...
    training_indices = list(range(n_samples))
    training_indices.remove(i)
    X_train, y_train = X[training_indices], y[training_indices]

    # ... and predict the single held-out sample (reshaped to a 1-row 2-D array).
    X_test, y_test = X[i].reshape(1, -1), y[i]
    y_pred = model.fit(X_train, y_train).predict(X_test)
    y_preds.append(y_pred[0])

# Two equivalent ways to get the accuracy: by hand ...
acc = (y == y_preds).mean()
print(acc)

# ... and with scikit-learn's helper.
acc = accuracy_score(y, y_preds)
print(acc)

0.96
0.96


In [27]:
# Same leave-one-out evaluation, with scikit-learn generating the train/test indices.
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
model = KNeighborsClassifier(n_neighbors=1)

y_preds = []
for train_index, test_index in loo.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    y_pred = model.fit(X_train, y_train).predict(X_test)
    y_preds.append(y_pred[0])

# Predictions are scored directly against y. This relies on the splitter
# visiting the samples in their original order; collecting each y_test next to
# its prediction and scoring those two lists would remove that assumption.
accuracy_score(y, y_preds)


0.96

In [29]:
# list.append adds its argument as ONE element, so appending a list nests it
# (see the printed result). To add the items individually, use list.extend.
l = [3,4,5]
l.append([1,2,3])
print(l)


[3, 4, 5, [1, 2, 3]]


In [28]:
# Q11. We can perform K-fold cross validation instead of LOO.
from sklearn.model_selection import KFold

# Deliberately bad setup, to show what goes wrong: with shuffle=False the folds
# are consecutive blocks of rows. Iris is stored class by class (50 rows each),
# so with 3 folds every test fold is a class the model never saw in training,
# and the accuracy collapses to 0.
cv = KFold(n_splits=3, shuffle=False) #dont do this :)
model = KNeighborsClassifier(n_neighbors=1)

y_tests, y_preds = [], []
for train_index, test_index in cv.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    y_pred = model.fit(X_train, y_train).predict(X_test)
    # extend (not append): add the fold's labels/predictions item by item.
    y_tests.extend(y_test)
    y_preds.extend(y_pred)
    print(len(y_preds))   # running total of predictions collected so far

# Score against the labels collected fold by fold, and against y directly.
print(accuracy_score(y_tests, y_preds))
print(accuracy_score(y, y_preds))


50
100
150
0.0
0.0


In [30]:
# K-fold done properly: shuffle the rows before splitting so that every fold
# contains a mix of classes.
from sklearn.model_selection import KFold

# random_state pins the shuffle, so the reported accuracy is the same on every
# run; change the value to see how much the estimate varies between splits.
cv = KFold(n_splits=3, shuffle=True, random_state=0)
model = KNeighborsClassifier(n_neighbors=1)

y_preds = []
y_tests = []
for train_index, test_index in cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # After shuffling, the test rows are no longer in their original order, so
    # the true labels are collected alongside the predictions and the two
    # lists are scored against each other (not against y).
    y_tests.extend(y_test)
    y_preds.extend(y_pred)

print(accuracy_score(y_tests, y_preds))


0.9533333333333334
