# Univariate Linear Regression

In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [9]:
df = [
    [35.0, 179.0],
    [42.0, 200.0],
    [50.0, 221.0],
    [60.0, 263.0],
    [67.0, 280.0],
    [75.0, 314.0],
    [80.0, 327.0],
    [90.0, 360.0],
    [95.0, 377.0],
    [100.0, 391.0],
    [110.0, 425.0],
    [120.0, 462.0],
    [130.0, 493.0],
    [140.0, 521.0],
    [150.0, 552.0],
    [160.0, 582.0],
    [175.0, 631.0],
    [190.0, 675.0],
    [210.0, 740.0],
    [230.0, 804.0]
]


In [10]:
df = pd.DataFrame(df, columns=['feature', 'target'])

In [11]:
print(df.head())

   feature  target
0     35.0   179.0
1     42.0   200.0
2     50.0   221.0
3     60.0   263.0
4     67.0   280.0


In [12]:
print(df.shape)

(20, 2)


In [13]:
X = df['feature']
y = df['target']

In [14]:
print(X.head())
print(y.head())

0    35.0
1    42.0
2    50.0
3    60.0
4    67.0
Name: feature, dtype: float64
0    179.0
1    200.0
2    221.0
3    263.0
4    280.0
Name: target, dtype: float64


In [16]:
X_mean = X.mean()
X_min = X.min()
X_max = X.max()
X_std = X.std(ddof=0)

y_mean =y.mean()
y_min = y.min()
y_max = y.max()
y_std = y.std(ddof=0)

print(f"{X_mean:.2f} {X_min:.2f} {X_max:.2f} {X_std:.2f}")
print(f"{y_mean:.2f} {y_min:.2f} {y_max:.2f} {y_std:.2f}")

115.45 35.00 230.00 55.22
439.85 179.00 804.00 177.34


In [17]:
alpha = 0.01
epochs = 1000

In [18]:
X_norm = (X - X_mean) / (X_std)

In [19]:
print(X_norm.head())

0   -1.456972
1   -1.330200
2   -1.185317
3   -1.004215
4   -0.877443
Name: feature, dtype: float64


In [28]:
def gradient_descent(X, y, alpha=0.01, epochs=1000):
    theta0, theta1 = 0, 0
    
    n = len(X)

    for epoch in range(epochs):
        y_cap = theta0 + (theta1 * X)
        grad0 = sum((y_cap - y)) / n
        grad1 = sum(((y_cap - y) * X)) / n

        theta0 -= alpha * grad0
        theta1 -= alpha * grad1

    return theta0, theta1

In [29]:
theta0, theta1 = gradient_descent(X_norm, y)

In [30]:
print(theta0, theta1)

439.83101112682647 177.29315298210062


In [31]:
def mse(X, y):
    y_cap = theta0 + (theta1 * X)
    n = len(X)
    return sum((y_cap - y) ** 2) / (2 * n)

In [32]:
error = mse(X_norm, y)

In [33]:
print(error)

7.57582866937003


In [38]:
def predict(val):
    X = (val - X_mean) / X_std
    return theta0 + (theta1 * X)

In [39]:
print(predict(150))
print(predict(200))

550.7651054617813
711.3064865543353


# Multi Variate Linear Regression

In [40]:
df = [
    [35.0, 1.0, 20.0, 179.0],
    [42.0, 2.0, 15.0, 200.0],
    [50.0, 2.0, 18.0, 221.0],
    [60.0, 3.0, 10.0, 263.0],
    [67.0, 3.0, 8.0, 280.0],
    [75.0, 3.0, 12.0, 314.0],
    [80.0, 4.0, 5.0, 327.0],
    [90.0, 4.0, 5.0, 360.0],
    [95.0, 4.0, 6.0, 377.0],
    [100.0, 5.0, 5.0, 391.0],
    [110.0, 5.0, 3.0, 425.0],
    [120.0, 5.0, 2.0, 462.0],
    [130.0, 6.0, 2.0, 493.0],
    [140.0, 6.0, 1.0, 521.0],
    [150.0, 6.0, 1.0, 552.0],
    [160.0, 7.0, 1.0, 582.0],
    [175.0, 7.0, 2.0, 631.0],
    [190.0, 8.0, 2.0, 675.0],
    [210.0, 8.0, 1.0, 740.0],
    [230.0, 9.0, 1.0, 804.0]
]


In [41]:
df = pd.DataFrame(df, columns=['size', 'rooms', 'age', 'price'])

In [42]:
print(df.head())

   size  rooms   age  price
0  35.0    1.0  20.0  179.0
1  42.0    2.0  15.0  200.0
2  50.0    2.0  18.0  221.0
3  60.0    3.0  10.0  263.0
4  67.0    3.0   8.0  280.0


In [43]:
print(df.shape)

(20, 4)


In [45]:
mean_vals = df.mean()
min_vals = df.min()
max_vals = df.max()
std_vals = df.std(ddof=0)

In [46]:
print(mean_vals, min_vals, max_vals, std_vals)

size     115.45
rooms      4.90
age        6.00
price    439.85
dtype: float64 size      35.0
rooms      1.0
age        1.0
price    179.0
dtype: float64 size     230.0
rooms      9.0
age       20.0
price    804.0
dtype: float64 size      55.217275
rooms      2.165641
age        5.822371
price    177.343530
dtype: float64


In [47]:
alpha = 0.01
epochs = 300

In [48]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [49]:
print(X.head())
print(y.head())

   size  rooms   age
0  35.0    1.0  20.0
1  42.0    2.0  15.0
2  50.0    2.0  18.0
3  60.0    3.0  10.0
4  67.0    3.0   8.0
0    179.0
1    200.0
2    221.0
3    263.0
4    280.0
Name: price, dtype: float64


In [50]:
X_mean = mean_vals[:-1]
X_std = std_vals[:-1]
X_norm = (X - X_mean) / X_std

In [52]:
print(X_norm.head())

       size     rooms       age
0 -1.456972 -1.800853  2.404519
1 -1.330200 -1.339096  1.545762
2 -1.185317 -1.339096  2.061016
3 -1.004215 -0.877338  0.687005
4 -0.877443 -0.877338  0.343503


In [54]:
X_aug = np.c_[np.ones(X_norm.shape[0]), X_norm]


In [69]:
def gradient_descent(X, y, theta, alpha=0.01, epochs=300):
    theta = np.zeros(X.shape[1])
    n = len(X)

    for epoch in range(epochs):
        y_cap = X.dot(theta)
        grad = X.T.dot(y_cap - y) / n
        theta -= alpha * grad
    
    return theta

In [70]:
theta = 0
theta = gradient_descent(X_aug, y, theta)

In [71]:
print(theta)

[418.27936274  82.59776628  69.70922778 -25.24441416]


In [72]:
def mse(X, y, theta):
    n = len(X)
    y_cap = X.dot(theta)
    error = np.sum((y_cap - y) ** 2) / (2 * n)
    return error

In [73]:
error = mse(X_aug, y, theta)

In [74]:
print(error)

507.4749084772522


In [75]:
theta_norm = np.linalg.inv(X_aug.T.dot(X_aug)).dot(X_aug.T).dot(y)

In [76]:
print(theta_norm, theta)

[439.85       178.33638311  -5.88264569  -5.86457497] [418.27936274  82.59776628  69.70922778 -25.24441416]


In [77]:
mse_norm = mse(X_aug, y, theta_norm)

In [78]:
print(mse_norm)

3.7089239283979465


In [80]:
print(f"MSE Difference={round(abs(error - mse_norm), 5)}")

MSE Difference=503.76598


In [87]:
new = pd.DataFrame([[150, 3, 5], [200, 4, 2]], columns=['size', 'rooms', 'age'])
new_norm = (new - X_mean) / (X_std)
new_aug = np.c_[np.ones(new_norm.shape[0]), new_norm]

In [88]:
pred_gd = new_aug.dot(theta)

for price in pred_gd:
    print(round(price, 2))

413.14
533.13


# Univariate Logisitic Regression

In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = [
    [35.0, 0.0],
    [42.0, 0.0],
    [50.0, 0.0],
    [60.0, 0.0],
    [67.0, 1.0],
    [75.0, 1.0],
    [80.0, 1.0],
    [90.0, 1.0],
    [95.0, 1.0],
    [100.0, 1.0],
    [110.0, 1.0],
    [120.0, 1.0],
    [130.0, 1.0],
    [140.0, 1.0],
    [150.0, 1.0],
    [160.0, 1.0],
    [175.0, 1.0],
    [190.0, 1.0],
    [210.0, 1.0],
    [230.0, 1.0]
]


In [3]:
df = pd.DataFrame(df, columns=['exam_score', 'admitted'])

In [4]:
print(df.head())

   exam_score  admitted
0        35.0       0.0
1        42.0       0.0
2        50.0       0.0
3        60.0       0.0
4        67.0       1.0


In [5]:
print(df.shape)

(20, 2)


In [6]:
X = df['exam_score']
y = df['admitted']

In [7]:
print(X.head())
print(y.head())

0    35.0
1    42.0
2    50.0
3    60.0
4    67.0
Name: exam_score, dtype: float64
0    0.0
1    0.0
2    0.0
3    0.0
4    1.0
Name: admitted, dtype: float64


In [8]:
X_mean = X.mean()
X_min = X.min()
X_max = X.max()
X_std = X.std(ddof=0)

In [9]:
print(X_mean, X_min, X_max, X_std)

115.45 35.0 230.0 55.21727537646167


In [10]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [11]:
def loss_fn(X, y, theta0, theta1):
    ep = 1e-8
    n = len(X)
    z = theta0 + (theta1 * X)
    y_cap = sigmoid(z)

    grad = np.sum((y * np.log(y_cap + ep)) + ((1 - y) * np.log(1 - y_cap + ep)))
    return (-1 / n) * grad

In [12]:
def gradient_descent(X, y, alpha=0.01, epochs=1000):
    theta0, theta1 = 0, 0
    n = len(X)
    for epoch in range(epochs):
        z = theta0 + (theta1 * X)
        y_cap = sigmoid(z)

        grad0 = np.sum(y_cap - y) / n
        grad1 = np.sum((y_cap - y) * X) / n

        theta0 -= alpha * grad0
        theta1 -= alpha * grad1

    return theta0, theta1

In [13]:
theta0, theta1 = gradient_descent(X, y)

In [14]:
print(theta0, theta1)

-1.176757571288268 0.19101263702684323


In [15]:
final_loss = loss_fn(X, y, theta0, theta1)

In [16]:
print(final_loss)

1.5508684160835362


In [17]:
def predict(val):
    z = theta0 + (theta1 * val)
    y_cap = sigmoid(z)
    return y_cap

In [18]:
print(predict(65))
print(predict(155))

0.9999868498446497
0.9999999999995504


# Multivariate Logistic Regression

In [19]:
df = [
    [35, 40, 5, 0],
    [42, 50, 6, 0],
    [50, 52, 7, 0],
    [60, 65, 8, 0],
    [67, 70, 9, 1],
    [75, 78, 10, 1],
    [80, 85, 12, 1],
    [90, 88, 14, 1],
    [95, 90, 15, 1],
    [100, 92, 16, 1],
    [110, 100, 17, 1],
    [120, 105, 18, 1],
    [130, 110, 19, 1],
    [140, 115, 20, 1],
    [150, 118, 22, 1],
    [160, 120, 24, 1],
    [175, 125, 25, 1],
    [190, 128, 26, 1],
    [210, 130, 28, 1],
    [230, 135, 30, 1]
]

df = pd.DataFrame(df, columns=['exam1', 'exam2', 'hours_study', 'admitted'])


In [20]:
print(df.head())

   exam1  exam2  hours_study  admitted
0     35     40            5         0
1     42     50            6         0
2     50     52            7         0
3     60     65            8         0
4     67     70            9         1


In [21]:
print(df.shape)

(20, 4)


In [22]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [23]:
print(X.head())
print(y.head())

   exam1  exam2  hours_study
0     35     40            5
1     42     50            6
2     50     52            7
3     60     65            8
4     67     70            9
0    0
1    0
2    0
3    0
4    1
Name: admitted, dtype: int64


In [27]:
X_mean = X.mean()
X_min = X.min()
X_max = X.max()
X_std = X.std(ddof=1)

In [28]:
print(X_mean, X_min, X_max, X_std)

exam1          115.45
exam2           94.80
hours_study     16.55
dtype: float64 exam1          35
exam2          40
hours_study     5
dtype: int64 exam1          230
exam2          135
hours_study     30
dtype: int64 exam1          56.651729
exam2          28.666136
hours_study     7.667258
dtype: float64


In [29]:
X_std = X.std(ddof=0)

In [30]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [31]:
def loss_fn(X, y, theta):
    z = X.dot(theta)
    y_cap = sigmoid(z)
    ep = 1e-8
    n = len(X)

    return -np.sum((y * np.log(y_cap + ep)) + ((1 - y) * np.log(1 - y_cap + ep))) / n

In [32]:
def gradient_descent(X, y, alpha=0.01, epochs=1500):
    theta = np.zeros(X.shape[1])
    n = len(X)
    

    for epoch in range(epochs):
        z = X.dot(theta)
        y_cap = sigmoid(z)

        grad = X.T.dot(y_cap - y) / n
        theta -= alpha * grad

    return theta

In [33]:
X_norm = (X - X_mean) / X_std

In [34]:
X_aug = np.c_[np.ones(X_norm.shape[0]), X_norm]

In [35]:
print(X_aug)

[[ 1.         -1.45697156 -1.96132514 -1.54553952]
 [ 1.         -1.33019964 -1.60341909 -1.41172658]
 [ 1.         -1.18531745 -1.53183788 -1.27791363]
 [ 1.         -1.00421471 -1.06656002 -1.14410069]
 [ 1.         -0.87744279 -0.887607   -1.01028774]
 [ 1.         -0.73256059 -0.60128216 -0.8764748 ]
 [ 1.         -0.64200922 -0.35074793 -0.6088489 ]
 [ 1.         -0.46090648 -0.24337611 -0.34122301]
 [ 1.         -0.37035511 -0.1717949  -0.20741007]
 [ 1.         -0.27980374 -0.10021369 -0.07359712]
 [ 1.         -0.09870099  0.18611114  0.06021583]
 [ 1.          0.08240175  0.36506417  0.19402877]
 [ 1.          0.26350449  0.54401719  0.32784172]
 [ 1.          0.44460723  0.72297022  0.46165466]
 [ 1.          0.62570998  0.83034203  0.72928055]
 [ 1.          0.80681272  0.90192324  0.99690645]
 [ 1.          1.07846683  1.08087626  1.13071939]
 [ 1.          1.35012094  1.18824808  1.26453234]
 [ 1.          1.71232643  1.25982929  1.53215823]
 [ 1.          2.07453191  1.43

In [36]:
theta = gradient_descent(X_aug, y)

In [37]:
print(theta)

[1.85554877 0.44588511 1.06200287 0.60765865]


In [38]:
final_loss = loss_fn(X_aug, y, theta)

In [39]:
print(final_loss)

0.19543268196377778


In [40]:
def predict(X):
    X_std = (X - mean_vals) / std_vals
    X_aug = np.c_[np.ones(X_std.shape[0]), X_std]
    p = sigmoid(X_aug.dot(theta))
    return p

In [42]:
new = pd.DataFrame([[72, 80, 11], [150, 118, 20]], columns=['exam1', 'exam2', 'hours_study'])
new_norm = (new - X_mean) / (X_std)
new_aug = np.c_[np.ones(new_norm.shape[0]), new_norm]

In [43]:

pred = new_aug.dot(theta)
pred = sigmoid(pred)

for price in pred:
    print(round(price, 2))

0.62
0.96


# KNN with euclidean distance

In [14]:
import pandas as pd
import numpy as np
import math
from collections import Counter

In [2]:
df = [
    [5.1, 3.5, 1.4, 0.2, "setosa"],
    [4.9, 3.0, 1.4, 0.2, "setosa"],
    [4.7, 3.2, 1.3, 0.2, "setosa"],
    [4.6, 3.1, 1.5, 0.2, "setosa"],
    [5.0, 3.6, 1.4, 0.2, "setosa"],
    [5.4, 3.9, 1.7, 0.4, "setosa"],
    [5.8, 4.0, 1.2, 0.2, "setosa"],
    [6.0, 2.2, 4.0, 1.5, "versicolor"],
    [6.1, 2.8, 4.7, 1.4, "versicolor"],
    [5.9, 3.0, 4.2, 1.5, "versicolor"],
    [6.7, 3.1, 4.4, 1.4, "versicolor"],
    [6.3, 2.5, 4.9, 1.5, "versicolor"],
    [6.5, 3.0, 5.1, 2.0, "virginica"],
    [6.2, 2.8, 4.5, 1.5, "versicolor"],
    [6.4, 2.9, 4.3, 1.3, "versicolor"],
    [5.5, 2.4, 4.0, 1.3, "versicolor"],
    [5.7, 2.8, 4.1, 1.3, "versicolor"],
    [5.8, 2.7, 5.1, 1.9, "virginica"],
    [6.9, 3.1, 5.4, 2.3, "virginica"],
    [6.0, 2.2, 5.0, 1.5, "virginica"],
    [6.3, 2.3, 5.6, 2.4, "virginica"],
    [6.1, 2.8, 5.6, 2.4, "virginica"],
    [5.6, 2.9, 3.6, 1.3, "versicolor"],
    [5.8, 2.7, 4.1, 1.0, "versicolor"],
    [6.0, 2.9, 4.5, 1.5, "versicolor"],
    [6.1, 2.6, 4.7, 1.4, "versicolor"],
    [6.5, 3.0, 5.2, 2.0, "virginica"],
    [6.2, 2.9, 5.4, 2.3, "virginica"],
    [5.9, 3.0, 5.1, 1.8, "virginica"],
    [6.3, 2.7, 5.6, 2.1, "virginica"]
]


In [3]:
df = pd.DataFrame(df, columns=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWdith', 'Class'])

In [4]:
print(df.head())

   SepalLength  SepalWidth  PetalLength  PetalWdith   Class
0          5.1         3.5          1.4         0.2  setosa
1          4.9         3.0          1.4         0.2  setosa
2          4.7         3.2          1.3         0.2  setosa
3          4.6         3.1          1.5         0.2  setosa
4          5.0         3.6          1.4         0.2  setosa


In [5]:
print(df.shape)

(30, 5)


In [7]:
def distance(p1, p2):
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(p1, p2)))

In [None]:
def k_fold_split(df, k):
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    return np.array_split(df, k)

In [15]:
def knn(train_X, train_y, test_x, k):
    distances = []
    for i in range(len(train_X)):
        dist = distance(train_X.iloc[i], test_x)
        distances.append((dist, train_y.iloc[i]))
    distances.sort(key=lambda x: x[0])
    top_k = [label for _, label in distances[:k]]
    return Counter(top_k).most_common(1)[0][0]


In [16]:
def cross_validate(df, k_values, folds=5):
    results = {}
    split_data = k_fold_split(df, folds)
    for k in k_values:
        acc_list = []
        for i in range(folds):
            test_df = split_data[i]
            train_df = pd.concat([split_data[j] for j in range(folds) if j != i])
            train_X, train_y = train_df.iloc[:, :-1], train_df.iloc[:, -1]
            test_X, test_y = test_df.iloc[:, :-1], test_df.iloc[:, -1]
            correct = 0
            for idx in range(len(test_df)):
                pred = knn(train_X, train_y, test_X.iloc[idx], k)
                if pred == test_y.iloc[idx]:
                    correct += 1
            acc_list.append(correct / len(test_df))
        results[k] = round(np.mean(acc_list), 4)
    
    return results
                

In [17]:
def final_test(df, best_k, test_ratio=0.2):
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    split_idx = int(len(df) * (1 - test_ratio))
    train_df, test_df = df[:split_idx], df[split_idx:]
    train_X, train_y = train_df.iloc[:, :-1], train_df.iloc[:, -1]
    test_X, test_y = test_df.iloc[:, :-1], test_df.iloc[:, -1]
    correct = 0
    for idx in range(len(test_df)):
        pred = knn(train_X, train_y, test_X.iloc[idx], best_k)
        if pred == test_y.iloc[idx]:
            correct += 1
        
    accuracy = round(correct / len(test_df), 2)
    print(f"k={best_k} Accuracy:{accuracy:.2f}")

In [19]:
k_vals = [1, 3, 5, 7, 9]
cv_results = cross_validate(df, k_vals)
best_k = max(cv_results, key=cv_results.get)

  return bound(*args, **kwds)


In [21]:
print(cv_results)
print(best_k)

{1: 0.9667, 3: 0.9667, 5: 0.9667, 7: 0.9333, 9: 0.8333}
1


In [27]:
final_test(df, best_k)

k=1 Accuracy:0.83


# KNN with manhatten and normalization

In [28]:
df = [
    [5.2, 3.4, 1.5, 0.2, "setosa"],
    [4.8, 3.1, 1.6, 0.3, "setosa"],
    [5.0, 3.2, 1.2, 0.2, "setosa"],
    [5.3, 3.7, 1.4, 0.3, "setosa"],
    [4.9, 3.0, 1.5, 0.1, "setosa"],
    [5.1, 3.5, 1.3, 0.3, "setosa"],
    [5.4, 3.4, 1.7, 0.2, "setosa"],
    [5.0, 3.3, 1.4, 0.2, "setosa"],
    [6.0, 2.7, 4.2, 1.3, "versicolor"],
    [6.2, 2.9, 4.3, 1.3, "versicolor"],
    [5.7, 2.6, 3.5, 1.0, "versicolor"],
    [5.8, 2.7, 4.1, 1.2, "versicolor"],
    [6.1, 3.0, 4.6, 1.4, "versicolor"],
    [5.6, 2.8, 4.0, 1.3, "versicolor"],
    [6.3, 2.5, 4.9, 1.5, "versicolor"],
    [6.0, 3.4, 4.5, 1.6, "versicolor"],
    [5.9, 3.0, 4.2, 1.5, "versicolor"],
    [6.4, 2.8, 5.0, 1.7, "versicolor"],
    [5.5, 2.5, 4.0, 1.2, "versicolor"],
    [6.2, 2.2, 4.8, 1.8, "versicolor"],
    [6.5, 3.0, 5.2, 2.0, "virginica"],
    [6.9, 3.1, 5.4, 2.1, "virginica"],
    [6.7, 3.0, 5.8, 2.2, "virginica"],
    [7.1, 3.0, 5.9, 2.1, "virginica"],
    [6.3, 2.9, 5.6, 1.8, "virginica"],
    [6.6, 2.8, 5.3, 2.0, "virginica"],
    [7.0, 3.2, 5.7, 2.3, "virginica"],
    [6.5, 3.2, 5.1, 2.0, "virginica"],
    [6.8, 3.0, 5.5, 2.1, "virginica"],
    [6.4, 2.9, 5.6, 2.2, "virginica"],
    [6.2, 3.4, 5.4, 2.3, "virginica"],
    [6.9, 3.1, 5.1, 2.3, "virginica"],
    [7.2, 3.2, 6.0, 2.2, "virginica"],
    [6.3, 2.8, 5.7, 1.9, "virginica"],
    [6.1, 3.0, 5.5, 1.8, "virginica"],
    [6.7, 3.3, 5.7, 2.1, "virginica"],
    [6.4, 3.1, 5.5, 1.8, "virginica"],
    [6.8, 3.2, 5.9, 2.3, "virginica"],
    [7.3, 2.9, 6.1, 2.5, "virginica"],
    [6.5, 3.0, 5.8, 2.2, "virginica"]
]


In [29]:
df = pd.DataFrame(df, columns=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Class'])

In [30]:
print(df.head())

   SepalLength  SepalWidth  PetalLength  PetalWidth   Class
0          5.2         3.4          1.5         0.2  setosa
1          4.8         3.1          1.6         0.3  setosa
2          5.0         3.2          1.2         0.2  setosa
3          5.3         3.7          1.4         0.3  setosa
4          4.9         3.0          1.5         0.1  setosa


In [31]:
print(df.shape)

(40, 5)


In [32]:
def manhatten(p1, p2):
    return np.sum(np.abs(p1 - p2))

In [33]:
def knn(train_X, train_y, test_x, k):
    distances = []
    for i in range(len(train_X)):
        dist = manhatten(train_X.iloc[i], test_x)
        distances.append((dist, train_y.iloc[i]))
    distances.sort(key=lambda x: x[0])
    best_k = [label for _, label in distances[:k]]
    return Counter(best_k).most_common(1)[0][0]

In [34]:
def normalize(df):
    df_norm = df.copy()
    for col in df.columns[:-1]:
        df_norm[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
    return df_norm

In [35]:
def k_fold(df, k=5):
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    return np.array_split(df, k)

In [41]:
def cross_validate(df, k_values, folds=5):
    results = {}
    split_data = k_fold(df, folds)
    for k in k_values:
        acc_list = []
        for i in range(folds):
            test_df = split_data[i]
            train_df = pd.concat([split_data[j] for j in range(folds) if j != i])
            train_X, train_y = train_df.iloc[:, :-1], train_df.iloc[:, -1]
            test_X, test_y = test_df.iloc[:, :-1], test_df.iloc[:, -1]
            correct = 0
            for idx in range(len(test_df)):
                pred = knn(train_X, train_y, test_X.iloc[idx], k)
                if pred == test_y.iloc[idx]:
                    correct += 1

            acc_list.append(correct / len(test_df))
        results[k] = round(np.mean(acc_list), 4)

    return results

In [54]:
def final_test(df, best_k, test_ratio=0.2):
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    split_idx = int(len(df) * (1 - test_ratio))
    train_df, test_df = df[:split_idx], df[split_idx:]
    train_X, train_y = train_df.iloc[:, :-1], train_df.iloc[:, -1]
    test_X, test_y = test_df.iloc[:, :-1], test_df.iloc[:, -1]
    correct = 0
    for idx in range(len(test_df)):
        pred = knn(train_X, train_y, test_X.iloc[idx], best_k)
        if pred == test_y.iloc[idx]:
            correct += 1
    accuracy = round(correct / len(test_df), 2)
    print(f"k: {best_k} Accuracy: {accuracy:.2f}")

In [55]:
df = normalize(df)

In [56]:
print(df.head())

   SepalLength  SepalWidth  PetalLength  PetalWidth   Class
0         0.16    0.800000     0.061224    0.041667  setosa
1         0.00    0.600000     0.081633    0.083333  setosa
2         0.08    0.666667     0.000000    0.041667  setosa
3         0.20    1.000000     0.040816    0.083333  setosa
4         0.04    0.533333     0.061224    0.000000  setosa


In [57]:
k_vals = [1, 3, 5, 7, 9]
cv_results = cross_validate(df, k_vals)
best_k = max(cv_results, key=cv_results.get)

  return bound(*args, **kwds)


In [58]:
print(cv_results)
print(best_k)

{1: 0.95, 3: 0.95, 5: 0.925, 7: 0.925, 9: 0.925}
1


In [59]:
final_test(df, best_k)

k: 1 Accuracy: 1.00


# Naive Bayes

In [104]:
import pandas as pd
import numpy as np

In [105]:
df = [
    [4, 122, 61, 7, 52, 27.82219568, 0.635762173, 48, 0, 0],
    [1, 104, 56, 11, 0, 24.24503026, 0.089848385, 60, 0, 0],
    [3, 109, 62, 23, 0, 18.87690204, 0.332317171, 61, 0, 0],
    [3, 117, 87, 21, 108, 30.4921571, 0.405407555, 25, 1, 0],
    [2, 114, 56, 28, 10, 23.1479042, 0.120255598, 25, 0, 0],
    [3, 120, 95, 13, 165, 32.39493617, 0.211376261, 26, 1, 0],
    [2, 99, 59, 20, 3, 44.38536126, 0.219177313, 43, 0, 0],
    [3, 150, 78, 13, 116, 33.39655129, 0.309157203, 23, 0, 0],
    [0, 122, 68, 27, 19, 45.98096352, 0.298969424, 57, 0, 0],
    [2, 104, 66, 31, 110, 25.12408723, 0.494678442, 53, 0, 0],
    [4, 130, 54, 24, 0, 23.0334144, 0.340692543, 63, 1, 0],
    [2, 117, 68, 30, 43, 24.81089088, 0.260540091, 41, 0, 0],
    [1, 67, 56, 20, 12, 31.54429623, 0.116002259, 40, 0, 0],
    [3, 130, 49, 28, 150, 51.06064081, 0.081957536, 21, 0, 0],
    [4, 89, 71, 20, 116, 20.36519701, 0.105820288, 59, 1, 0],
    [4, 148, 62, 9, 0, 36.33764183, 0.559926645, 67, 1, 0],
    [1, 112, 90, 22, 63, 27.10024265, 0.679502021, 37, 1, 0],
    [2, 143, 82, 10, 18, 30.43337864, 0.145064816, 49, 0, 0],
    [3, 101, 90, 43, 156, 23.19386062, 0.165994, 28, 1, 0],
    [2, 122, 61, 16, 181, 38.15396827, 0.491571652, 46, 1, 0],
    [3, 156, 71, 26, 0, 31.69252407, 0.039152253, 48, 0, 0],
    [4, 111, 38, 19, 146, 39.61217485, 0.278828256, 54, 0, 0],
    [1, 102, 66, 7, 14, 33.22671636, 0.462085911, 46, 0, 0],
    [3, 139, 71, 7, 59, 18.47863216, 0.090168205, 33, 0, 0],
    [3, 139, 71, 27, 82, 39.67579957, 0.238599802, 64, 0, 0],
    [3, 103, 69, 35, 225, 17.90899557, 0.501099654, 57, 0, 0],
    [5, 60, 70, 12, 0, 18.91068122, 0.155102847, 33, 1, 0],
    [0, 101, 37, 7, 146, 25.57958124, 0.096417867, 58, 0, 0],
    [1, 140, 73, 16, 0, 25.67301121, 0.255078432, 45, 1, 0],
    [4, 96, 75, 10, 0, 41.77214956, 0.376397231, 67, 0, 0],
]


In [106]:
df = pd.DataFrame(df, columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
    "Outlier"
]
)

In [107]:
print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin        BMI  \
0            4      122             61              7       52  27.822196   
1            1      104             56             11        0  24.245030   
2            3      109             62             23        0  18.876902   
3            3      117             87             21      108  30.492157   
4            2      114             56             28       10  23.147904   

   DiabetesPedigreeFunction  Age  Outcome  Outlier  
0                  0.635762   48        0        0  
1                  0.089848   60        0        0  
2                  0.332317   61        0        0  
3                  0.405408   25        1        0  
4                  0.120256   25        0        0  


In [108]:
print(df.shape)

(30, 10)


In [None]:
test_ratio = 0.3
split_idx = int(len(df) * (1 - test_ratio))
print(split_idx)
train_df = df[:split_idx]
test_df = df[split_idx:]

21


In [110]:
train_X = train_df.iloc[:, :-2]
train_y = train_df.iloc[:, -2]

In [111]:
def gaussian_prob(X, mean, std):
    return (1 / (std * np.sqrt(2 * np.pi))) * np.exp(-0.5 * ((X - mean) / std) ** 2)

In [112]:
def train_bayes(train_df):
    n_samples, n_features = train_df.shape[0], train_df.shape[1] - 2
    class_0_data = train_df[train_df.iloc[:, -2] == 0]
    class_1_data = train_df[train_df.iloc[:, -2] == 1]

    class_0_prior = len(class_0_data) / n_samples
    class_1_prior = len(class_1_data) / n_samples

    means_0 = class_0_data.iloc[:, :-2].mean()
    stds_0 = class_0_data.iloc[:, :-2].std(ddof=0)

    means_1 = class_1_data.iloc[:, :-2].mean()
    stds_1 = class_1_data.iloc[:, :-2].std(ddof=0)

    return class_0_prior, class_1_prior, means_0, stds_0, means_1, stds_1




In [113]:
class_0_prior, class_1_prior, means_0, stds_0, means_1, stds_1 = train_bayes(train_df)

In [114]:
test_X = test_df.iloc[:, :-2]
test_y = test_df.iloc[:, -2]

In [115]:
def predict(test_data, class_0_prior, class_1_prior, means_0, stds_0, means_1, stds_1):
    sample_data = test_data.iloc[:, :-2]
    probs_0 = gaussian_prob(sample_data, means_0, stds_0)
    like_0 = probs_0.prod(axis=1) * class_0_prior

    probs_1 = gaussian_prob(sample_data, means_1, stds_1)
    like_1 = probs_1.prod(axis=1) * class_1_prior

    pred = (like_1 > like_0).astype(int)
    return pred

In [116]:
def metrics(predictions, actual_labels):
    accuracy = np.sum(np.array(predictions) == np.array(actual_labels)) / len(actual_labels)
    precision = np.sum((np.array(predictions) == 1) & (np.array(actual_labels) == 1)) / np.sum(np.array(predictions) == 1) if np.sum(np.array(predictions) == 1) > 0 else 0
    recall = np.sum((np.array(predictions) == 1) & (np.array(actual_labels) == 1)) / np.sum(np.array(actual_labels) == 1) if np.sum(np.array(actual_labels) == 1) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return accuracy, precision, recall, f1_score

In [117]:
predictions = predict(test_df, class_0_prior, class_1_prior, means_0, stds_0, means_1, stds_1)

In [118]:
accuracy, precision, recall, f1_score = metrics(predictions, test_y)

In [121]:
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Class 0 Prior: {class_0_prior:.2f}")
print(f"Class 1 Prior: {class_1_prior:.2f}")
print(f"Predictions: {[int(x) for x in predictions.tolist()]}")
print(f"Actual: {[int(x) for x in test_y.tolist()]}")
print(f"Accuracy={accuracy:.2f}")
print(f"Precision={precision:.2f}")
print(f"Recall={recall:.2f}")
print(f"F1={f1_score:.2f}")

Training samples: 21
Test samples: 9
Class 0 Prior: 0.62
Class 1 Prior: 0.38
Predictions: [1, 0, 0, 0, 1, 0, 0, 0, 0]
Actual: [0, 0, 0, 0, 0, 1, 0, 1, 0]
Accuracy=0.56
Precision=0.00
Recall=0.00
F1=0.00


# Multinomial Naive Bayes

In [165]:
import math

In [166]:
df = [
    ["I love this movie", 1],
    ["This film was great", 1],
    ["Amazing acting and story", 1],
    ["What a fantastic performance", 1],
    ["The direction was brilliant", 1],
    ["Highly enjoyable and emotional", 1],
    ["I really liked the soundtrack", 1],
    ["The characters were inspiring", 1],
    ["This was a boring movie", 0],
    ["I did not like the film", 0],
    ["Terrible acting and weak story", 0],
    ["The performance was disappointing", 0],
    ["It was a waste of time", 0],
    ["Not enjoyable at all", 0],
    ["I hated the soundtrack", 0],
    ["The characters were dull", 0],
    ["Amazing visuals but boring story", 0],
    ["Loved the emotional moments", 1],
    ["Brilliant soundtrack and acting", 1],
    ["Terrible direction and slow pace", 0],
    ["Enjoyable family film", 1],
    ["Weak characters and dull ending", 0],
    ["Fantastic film overall", 1],
    ["Not worth watching", 0]
]


In [167]:
df = pd.DataFrame(df, columns=['Text', 'Label'])

In [168]:
print(df.head())

                           Text  Label
0             I love this movie      1
1           This film was great      1
2      Amazing acting and story      1
3  What a fantastic performance      1
4   The direction was brilliant      1


In [169]:
print(df.shape)

(24, 2)


In [170]:
test_ratio = 0.3
split_idx = int(len(df) * (1 - test_ratio))
train_df = df[:split_idx]
test_df = df[split_idx:]

In [171]:
counts = {0: 0, 1: 0}
wfreq = {0: {}, 1: {}}
words_set = set()

In [172]:
for sen, cat in train_df.values:
    counts[cat] += 1
    for word in sen.split():
        words_set.add(word)
        wfreq[cat][word] = wfreq[cat].get(word, 0) + 1
doc_count = len(train_df)
p = {x: counts[x] / doc_count for x in [0, 1]}
total_words = len(words_set)
pword = {0: {}, 1: {}}
for x in [0, 1]:
    total_word_count = sum(wfreq[x].values())
    for word in words_set:
        pword[x][word] = (wfreq[x].get(word, 0) + 1) / (total_word_count + total_words)

print(p)

{0: 0.5, 1: 0.5}


In [173]:
def classify(sen):
    tokens = sen.split()
    results = {}
    for x in [0, 1]:
        score = math.log(p[x])
        for token in tokens:
            if token in words_set:
                score += math.log(pword[x][token])
        results[x] = score
    return max(results, key=results.get)

In [174]:
predicted_labels = []
true_labels = []
for sen, cat in test_df.values:
    pred = classify(sen.lower())
    predicted_labels.append(pred)
    true_labels.append(cat)

In [175]:
true_pos = sum((p == 1 and a == 1) for p, a in zip(predicted_labels, true_labels))
true_neg = sum((p == 0 and a == 0) for p, a in zip(predicted_labels, true_labels))
false_pos = sum((p == 1 and a == 0) for p, a in zip(predicted_labels, true_labels))
false_neg = sum((p == 0 and a == 1) for p, a in zip(predicted_labels, true_labels))

In [176]:
acc = (true_pos + true_neg) / len(true_labels)
prec = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
rec = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
f1_score = (2 * prec * rec) / (prec + rec) if (prec + rec) > 0 else 0

In [177]:
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Class 0 Prior: {p[0]:.2f}")
print(f"Class 1 Prior: {p[1]:.2f}")
print("Predictions:", predicted_labels)
print("Actual:", true_labels)
print(f"Accuracy={acc:.2f}")
print(f"Precision={prec:.2f}")
print(f"Recall={rec:.2f}")
print(f"F1={f1_score:.2f}")

Training samples: 16
Test samples: 8
Class 0 Prior: 0.50
Class 1 Prior: 0.50
Predictions: [0, 1, 1, 1, 1, 0, 1, 0]
Actual: [0, 1, 1, 0, 1, 0, 1, 0]
Accuracy=0.88
Precision=0.80
Recall=1.00
F1=0.89
