In [3]:
import numpy as np
import pandas as pd

# Step 1: Create dataset
data = pd.DataFrame({
    'Feature1': [2.5, 0.5, 2.2, 1.9, 3.1],
    'Feature2': [2.4, 0.7, 2.9, 2.2, 3.0],
    'Class': ['A', 'A', 'B', 'B', 'A']
})

print("Original Data:\n", data, "\n")

# Step 2: Center the data (subtract the mean of each feature)
features = data[['Feature1', 'Feature2']]
mean_vec = features.mean()
centered = features - mean_vec
print("Centered Data:\n", centered, "\n")

# Step 3: Compute the covariance matrix
cov_matrix = np.cov(centered.T)
print("Covariance Matrix:\n", cov_matrix, "\n")

# Step 4: Find eigenvalues and eigenvectors
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)
print("Eigenvalues:\n", eig_vals)
print("Eigenvectors:\n", eig_vecs, "\n")

# Step 5: Sort eigenvectors by descending eigenvalues
sorted_indices = np.argsort(eig_vals)[::-1]
eig_vals = eig_vals[sorted_indices]
eig_vecs = eig_vecs[:, sorted_indices]
print("Sorted Eigenvectors (Principal Components):\n", eig_vecs, "\n")

# -------- Make it GENERIC --------
n = 2   # Number of principal components you want to keep (1 or 2 for this dataset)

# Step 6: Project data onto top 'n' principal components
pcs = eig_vecs[:, :n]                     # Select top-n eigenvectors
projected = centered.dot(pcs)             # Project data
projected_df = pd.DataFrame(projected, columns=[f'PC{i+1}' for i in range(n)])
data = pd.concat([data, projected_df], axis=1)

print(f"Data projected onto top {n} principal components:\n", data[[*projected_df.columns, 'Class']], "\n")

# Step 7: Project a new sample onto the same PCs
new_sample = np.array([2.0, 2.5])
new_centered = new_sample - mean_vec
new_proj = new_centered.dot(pcs)
print(f"Projected new sample onto top {n} PCs:", new_proj, "\n")

# Step 8: Predict class using nearest neighbor (in nD space)
if n == 1:
    # For 1D case, just compare PC1 values
    closest_idx = np.abs(data['PC1'] - new_proj[0]).idxmin()
else:
    # For multi-dimensional case, use Euclidean distance
    distances = np.linalg.norm(projected - new_proj, axis=1)
    closest_idx = np.argmin(distances)

predicted_class = data.loc[closest_idx, 'Class']
print("Predicted class of new sample:", predicted_class)


Original Data:
    Feature1  Feature2 Class
0       2.5       2.4     A
1       0.5       0.7     A
2       2.2       2.9     B
3       1.9       2.2     B
4       3.1       3.0     A 

Centered Data:
    Feature1  Feature2
0      0.46      0.16
1     -1.54     -1.54
2      0.16      0.66
3     -0.14     -0.04
4      1.06      0.76 

Covariance Matrix:
 [[0.938  0.8405]
 [0.8405 0.853 ]] 

Eigenvalues:
 [1.73707382 0.05392618]
Eigenvectors:
 [[ 0.72474155 -0.68902082]
 [ 0.68902082  0.72474155]] 

Sorted Eigenvectors (Principal Components):
 [[ 0.72474155 -0.68902082]
 [ 0.68902082  0.72474155]] 

Data projected onto top 2 principal components:
    PC1  PC2 Class
0  NaN  NaN     A
1  NaN  NaN     A
2  NaN  NaN     B
3  NaN  NaN     B
4  NaN  NaN     A 

Projected new sample onto top 2 PCs: [0.15015575 0.21599364] 

Predicted class of new sample: B


In [1]:
import numpy as np
import pandas as pd

# Step 1: Create a small dataset
data = np.array([
    [2.5, 2.4],
    [0.5, 0.7],
    [2.2, 2.9],
    [1.9, 2.2],
    [3.1, 3.0],
    [2.3, 2.7],
    [2.0, 1.6],
    [1.0, 1.1],
    [1.5, 1.6],
    [1.1, 0.9]
])

# Step 2: Standardize the data
mean = np.mean(data, axis=0)
data_centered = data - mean

# Step 3: Compute covariance matrix
cov_matrix = np.cov(data_centered.T)
print("Covariance Matrix:\n", cov_matrix)

# Step 4: Compute eigenvalues and eigenvectors
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)

# Step 5: Sort by eigenvalue (descending order)
sorted_indices = np.argsort(eig_vals)[::-1]
eig_vals = eig_vals[sorted_indices]
eig_vecs = eig_vecs[:, sorted_indices]

print("\nEigenvalues:", eig_vals)
print("Eigenvectors:\n", eig_vecs)

# Step 6: Choose top k components (let’s take k=1)
k = 1
W = eig_vecs[:, :k]

# Step 7: Transform data to new subspace
X_reduced = np.dot(data_centered, W)
print("\nReduced Data (1D):\n", X_reduced)

print("\nOriginal shape:", data.shape)
print("Reduced shape:", X_reduced.shape)


Covariance Matrix:
 [[0.61655556 0.61544444]
 [0.61544444 0.71655556]]

Eigenvalues: [1.28402771 0.0490834 ]
Eigenvectors:
 [[-0.6778734  -0.73517866]
 [-0.73517866  0.6778734 ]]

Reduced Data (1D):
 [[-0.82797019]
 [ 1.77758033]
 [-0.99219749]
 [-0.27421042]
 [-1.67580142]
 [-0.9129491 ]
 [ 0.09910944]
 [ 1.14457216]
 [ 0.43804614]
 [ 1.22382056]]

Original shape: (10, 2)
Reduced shape: (10, 1)


In [2]:
import numpy as np
import pandas as pd

# -------------------------------
# Step 1: Create a sample dataset
# -------------------------------
# Each row = [Feature1, Feature2, Feature3], last column = Class label
data = np.array([
    [2.5, 2.4, 1.8, 0],  # Class 0
    [0.5, 0.7, 0.3, 0],
    [2.2, 2.9, 1.6, 0],
    [1.9, 2.2, 1.4, 0],
    [3.1, 3.0, 2.0, 0],
    [2.3, 2.7, 1.5, 1],
    [2.0, 1.6, 1.2, 1],
    [1.0, 1.1, 0.6, 1],
    [1.5, 1.6, 0.9, 1],
    [1.1, 0.9, 0.5, 1]
])

X = data[:, :-1]   # features
y = data[:, -1]    # labels

# -----------------------------------
# Step 2: Standardize (mean normalization)
# -----------------------------------
mean = np.mean(X, axis=0)
X_centered = X - mean

# -----------------------------------
# Step 3: Compute PCA manually
# -----------------------------------
cov_matrix = np.cov(X_centered.T)
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)

# Sort by descending eigenvalues
sorted_indices = np.argsort(eig_vals)[::-1]
eig_vals = eig_vals[sorted_indices]
eig_vecs = eig_vecs[:, sorted_indices]

# Choose top k principal components
k = 2
W = eig_vecs[:, :k]

# Transform data
X_pca = np.dot(X_centered, W)
print("Original shape:", X.shape)
print("After PCA shape:", X_pca.shape)

# -----------------------------------
# Step 4: Define KNN (using Manhattan distance)
# -----------------------------------
def manhattan_distance(x1, x2):
    return np.sum(np.abs(x1 - x2))

def knn_predict(X_train, y_train, X_test, k):
    predictions = []
    for test_point in X_test:
        distances = []
        for i, x_train in enumerate(X_train):
            dist = manhattan_distance(test_point, x_train)
            distances.append((dist, y_train[i]))
        distances.sort(key=lambda x: x[0])  # sort by distance
        neighbors = [label for (_, label) in distances[:k]]
        # majority vote
        prediction = max(set(neighbors), key=neighbors.count)
        predictions.append(prediction)
    return np.array(predictions)

# -----------------------------------
# Step 5: Test data (also apply PCA)
# -----------------------------------
X_test = np.array([[1.8, 2.0, 1.0],
                   [2.9, 3.1, 1.8]])
X_test_centered = X_test - mean
X_test_pca = np.dot(X_test_centered, W)

# -----------------------------------
# Step 6: Predict using KNN
# -----------------------------------
k_value = 3
predictions = knn_predict(X_pca, y, X_test_pca, k_value)

print("\nPredictions for test points:", predictions)


Original shape: (10, 3)
After PCA shape: (10, 2)

Predictions for test points: [1. 0.]


## QUESTION NUMBER 1

In [None]:
import numpy as np
import pandas as pd

# data = feature1, feature2
data = np.array([[2.5, 2.4],
                 [0.5, 0.7],
                 [2.2, 2.9],
                 [1.9, 2.2],
                 [3.1, 3.0]])

classes = np.array(['A', 'A', 'B', 'B', 'A'])

df = pd.DataFrame(data, columns=['Feature 1', 'Feature 2'],index=[1,2,3,4,5])

df['Class'] = classes
print("Original Dataset:")
print(df)

Original Dataset:
   Feature 1  Feature 2 Class
1        2.5        2.4     A
2        0.5        0.7     A
3        2.2        2.9     B
4        1.9        2.2     B
5        3.1        3.0     A


In [15]:
#calculate mean of each feature
meanF1 = np.mean(data[:, 0])
meanF2 = np.mean(data[:, 1])
print("Mean of feature 1 - ",meanF1,"\tMean of feature 2 - ", meanF2)

# Center the data by subtracting the mean
centered_data = data - np.array([meanF1, meanF2])
print("\nCentered Data:\n",centered_data)

Mean of feature 1 -  2.04 	Mean of feature 2 -  2.2399999999999998

Centered Data:
 [[ 0.46  0.16]
 [-1.54 -1.54]
 [ 0.16  0.66]
 [-0.14 -0.04]
 [ 1.06  0.76]]


In [16]:
# covariance matrix
cov_matrix = np.cov(centered_data, rowvar=False)
print(cov_matrix)

[[0.938  0.8405]
 [0.8405 0.853 ]]


In [24]:
Eval,Evec = np.linalg.eig(cov_matrix)
print("1st E.Value = ",Eval[0],"\t2nd E.Value = ",Eval[1])
# print(Evec)
print("1st E.Vector = ",Evec[:,0],"\t2nd E.Vector = ",Evec[:,1])

1st E.Value =  1.737073823262107 	2nd E.Value =  0.05392617673789313
1st E.Vector =  [0.72474155 0.68902082] 	2nd E.Vector =  [-0.68902082  0.72474155]


In [25]:
# Sorting eigenvalues and eigenvectors
sorted_indices = np.argsort(Eval)[::-1]
sorted_Eval = Eval[sorted_indices]
sorted_Evec = Evec[:, sorted_indices]  
print(sorted_indices)
print(sorted_Eval)
print(sorted_Evec)

[0 1]
[1.73707382 0.05392618]
[[ 0.72474155 -0.68902082]
 [ 0.68902082  0.72474155]]


In [None]:
# principle components

firstComponent = sorted_Evec[:, 0]
secondComponent = sorted_Evec[:, 1]
print("\n1st Principle Component:\n",firstComponent)
print("\n2nd Principle Component:\n",secondComponent)


1st Principle Component:
 [0.72474155 0.68902082]

2nd Principle Component:
 [-0.68902082  0.72474155]


In [None]:
# Project data onto PC1
pc1_values = centered_data.dot(firstComponent)
print("PC1 Values (projection onto first principal component):")
for i, val in enumerate(pc1_values):
    print(f"Sample {i+1}: {val:.4f}")

# Create DataFrame showing the projection
projection_df = pd.DataFrame({
    'Sample': range(1, 6),
    'Original_F1': data[:, 0],
    'Original_F2': data[:, 1],
    'PC1_Value': pc1_values,
    'Class': classes
})
print("\nProjection Results:")
print(projection_df)

PC1 Values (projection onto first principal component):
Sample 1: 0.4436
Sample 2: -2.1772
Sample 3: 0.5707
Sample 4: -0.1290
Sample 5: 1.2919

Projection Results:
   Sample  Original_F1  Original_F2  PC1_Value Class
0       1          2.5          2.4   0.443624     A
1       2          0.5          0.7  -2.177194     A
2       3          2.2          2.9   0.570712     B
3       4          1.9          2.2  -0.129025     B
4       5          3.1          3.0   1.291882     A


In [None]:
print("Original 2D data shape:", data.shape)
print("Reduced 1D data shape:", pc1_values.shape)
print("Dimensionality Reduction: 2D → 1D:\n")
for i in range(len(data)):
    print(f"Sample {i+1}: ({data[i,0]}, {data[i,1]}) → {pc1_values[i]}")

Original 2D data shape: (5, 2)
Reduced 1D data shape: (5,)
Dimensionality Reduction: 2D → 1D:

Sample 1: (2.5, 2.4) → 0.44362444280870694
Sample 2: (0.5, 0.7) → -2.1771940447906886
Sample 3: (2.2, 2.9) → 0.5707123885045612
Sample 4: (1.9, 2.2) → -0.1290246493796061
Sample 5: (3.1, 3.0) → 1.2918818628570272


In [43]:
# New sample
new_sample = np.array([2.0, 2.5])
print(f"New sample: {new_sample}")
new_sample_centered = new_sample - np.array([meanF1, meanF2])
print("New Zero-Centered sample",new_sample_centered)

# Project onto PC1
new_sample_pc1 = new_sample_centered.dot(firstComponent)
print(f"New sample PC1 value: {new_sample_pc1:.4f}")

# Simple classification using nearest neighbor in PC1 space
distances = np.abs(pc1_values - new_sample_pc1)
nearest_idx = np.argmin(distances)
predicted_class = classes[nearest_idx]

print("\nClassification based on PC1:")
# print("Distances to existing samples in PC1 space:")
# for i, dist in enumerate(distances):
#     print(f"  Sample {i+1} (Class {classes[i]}): {dist:.4f}")

print(f"\nNearest sample: Sample {nearest_idx + 1}")
print("Sample ",nearest_idx+1 ,"=", data[nearest_idx]," of class ",predicted_class)
print(f"Predicted class: {predicted_class}")

New sample: [2.  2.5]
New Zero-Centered sample [-0.04  0.26]
New sample PC1 value: 0.1502

Classification based on PC1:

Nearest sample: Sample 4
Sample  4 = [1.9 2.2]  of class  B
Predicted class: B


## Question Number 2

# PART A:

In [52]:
from sklearn.datasets import load_wine


In [53]:
wine = load_wine()
X = wine.data  # Features (13 chemical properties)
y = wine.target  # Target classes (0, 1, 2)

print("Dataset shape:", X.shape)
print("Number of classes:", len(np.unique(y)))
print("Feature names:", wine.feature_names)
print("Class names:", wine.target_names)
print("Sample data:\n")
sample=pd.DataFrame(X,columns=wine.feature_names)
sample.head()

Dataset shape: (178, 13)
Number of classes: 3
Feature names: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
Class names: ['class_0' 'class_1' 'class_2']
Sample data:



Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [63]:
print("Before standardization: \n",X[:1])
standardizedData = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
print("After: \n",standardizedData[:1])

Before standardization: 
 [[1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00
  2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]]
After: 
 [[ 1.51861254 -0.5622498   0.23205254 -1.16959318  1.91390522  0.80899739
   1.03481896 -0.65956311  1.22488398  0.25171685  0.36217728  1.84791957
   1.01300893]]


In [65]:
newSample= np.array([13.0, 2.0, 2.5, 16.0, 100.0, 2.5, 2.0, 0.3, 2.0, 5.0, 1.0, 3.0, 1000.0])
standardizedNewSample = (newSample - np.mean(X, axis=0)) / np.std(X, axis=0)
print("New Sample before standardization: \n",newSample)
print("New Sample after standardization: \n",standardizedNewSample)

New Sample before standardization: 
 [1.3e+01 2.0e+00 2.5e+00 1.6e+01 1.0e+02 2.5e+00 2.0e+00 3.0e-01 2.0e+00
 5.0e+00 1.0e+00 3.0e+00 1.0e+03]
New Sample after standardization: 
 [-7.63365990e-04 -3.01927486e-01  4.87926405e-01 -1.04947918e+00
  1.81450206e-02  3.28297930e-01 -2.93857675e-02 -4.98406993e-01
  7.16779586e-01 -2.51279393e-02  1.86683727e-01  5.48472176e-01
  8.06016834e-01]


In [73]:
def EuclideanDistance(a, b):
    return np.sqrt(np.sum((a - b) ** 2))


In [None]:
distances = np.array([EuclideanDistance(standardizedNewSample, standardizedData[i]) for i in range(len(standardizedData))])

# Find the closest point
sorted_indices = np.argsort(distances)
closest_point_index = sorted_indices[0]

# Get the prediction
pred = y[closest_point_index]
print("Predicted class for test sample:", pred)
print(f"Distance to closest point: {distances[closest_point_index]:.4f}")
print(f"Closest point index: {closest_point_index}")

Predicted class for test sample: 0
Distance to closest point: 1.6284
Closest point index: 37


# PART B

In [79]:
x_Centered = standardizedData - np.mean(standardizedData, axis=0)
cov_matrix = np.cov(x_Centered, rowvar=False)
print("Centered Data:\n",x_Centered[:1])
print("\nCovariance Matrix:\n", cov_matrix[:1])

Centered Data:
 [[ 1.51861254 -0.5622498   0.23205254 -1.16959318  1.91390522  0.80899739
   1.03481896 -0.65956311  1.22488398  0.25171685  0.36217728  1.84791957
   1.01300893]]

Covariance Matrix:
 [[ 1.00564972  0.09493026  0.21273976 -0.31198788  0.27232816  0.29073446
   0.23815287 -0.15681042  0.13747022  0.549451   -0.07215255  0.07275191
   0.64735687]]


In [85]:
Eval,Evec = np.linalg.eig(cov_matrix)
# print(Evec)  
sortedIndex=np.argsort(Eval)[::-1]
sorted_eigenvalues = Eval[sortedIndex]
sorted_eigenvectors = Evec[:, sortedIndex]

print("Top 4 Evals = ", sorted_eigenvalues[:4])
print("Top 4 Evecs = \n", sorted_eigenvectors[:,:4] )

Top 4 Evals =  [4.73243698 2.51108093 1.45424187 0.92416587]
Top 4 Evecs = 
 [[-0.1443294   0.48365155 -0.20738262  0.0178563 ]
 [ 0.24518758  0.22493093  0.08901289 -0.53689028]
 [ 0.00205106  0.31606881  0.6262239   0.21417556]
 [ 0.23932041 -0.0105905   0.61208035 -0.06085941]
 [-0.14199204  0.299634    0.13075693  0.35179658]
 [-0.39466085  0.06503951  0.14617896 -0.19806835]
 [-0.4229343  -0.00335981  0.1506819  -0.15229479]
 [ 0.2985331   0.02877949  0.17036816  0.20330102]
 [-0.31342949  0.03930172  0.14945431 -0.39905653]
 [ 0.0886167   0.52999567 -0.13730621 -0.06592568]
 [-0.29671456 -0.27923515  0.08522192  0.42777141]
 [-0.37616741 -0.16449619  0.16600459 -0.18412074]
 [-0.28675223  0.36490283 -0.12674592  0.23207086]]


In [88]:
n=4
PrincipleComponents = sorted_eigenvectors[:, :n]
DataPCA4D = np.dot(x_Centered, PrincipleComponents)
print("original data shape:", standardizedData.shape)
print("PCA matrix shape:", PrincipleComponents.shape)
print("\nTop 4 Samples in 4D PCA space:\n", DataPCA4D[:4])  


original data shape: (178, 13)
PCA matrix shape: (13, 4)

Top 4 Samples in 4D PCA space:
 [[-3.31675081  1.44346263 -0.16573904  0.21563119]
 [-2.20946492 -0.33339289 -2.02645737  0.29135832]
 [-2.51674015  1.0311513   0.98281867 -0.72490231]
 [-3.75706561  2.75637191 -0.17619184 -0.56798331]]


In [92]:
#transformation of new sample
transformedSampleCentered = standardizedNewSample - np.mean(standardizedData, axis=0)
transformedSamppleResult = np.dot(transformedSampleCentered, PrincipleComponents)
print("\nNew sample in 4D PCA space:\n", transformedSamppleResult)


New sample in 4D PCA space:
 [-1.31230811  0.27621539 -0.28713416  0.05651436]


In [94]:
#distance computation in 4D PCA space
distances_4d = np.array([EuclideanDistance(transformedSamppleResult, DataPCA4D[i]) for i in range(len(DataPCA4D))])
sorted_indices_pca = np.argsort(distances_4d)
closest_point_index_pca = sorted_indices_pca[0]
pred_pca_4d = y[closest_point_index_pca]

print(f"Predicted class (4D PCA): {pred_pca_4d}")
print(f"Distance to closest point (4D): {distances_4d[closest_point_index_pca]:.4f}")
print(f"Closest point index (4D): {closest_point_index_pca}")

Predicted class (4D PCA): 0
Distance to closest point (4D): 0.6736
Closest point index (4D): 26


# PART C

In [74]:

from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier


data = load_wine()
X = pd.DataFrame(data['data'])
y = data['target']

pca = PCA(n_components=4)
X_pca = pca.fit_transform(X)
print("PCA transformed data shape:", X_pca.shape)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_pca, y)

test_sample = np.array([13.0, 2.0, 2.5, 16.0, 100.0, 2.5, 2.0, 0.3, 2.0, 5.0, 1.0, 3.0, 1000.0])

test_sample_pca = pca.transform([test_sample])  
print("Transformed test sample shape:", test_sample_pca.shape)

pred = knn.predict(test_sample_pca)
print("Predicted label:", pred)


PCA transformed data shape: (178, 4)
Transformed test sample shape: (1, 4)
Predicted label: [0]
