<a href="https://colab.research.google.com/github/matiahasmasan/TSP---berlin52/blob/main/tema4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KNN

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import plotly.express as px
import plotly.graph_objects as go

In [12]:
!pip install -q gdown

!gdown "https://drive.google.com/uc?id=1WO3yoK_Fd-v3JBLBCVNvZVZHfS5Un_Em"

Downloading...
From: https://drive.google.com/uc?id=1WO3yoK_Fd-v3JBLBCVNvZVZHfS5Un_Em
To: /content/strength_training_data.csv
  0% 0.00/3.17k [00:00<?, ?B/s]100% 3.17k/3.17k [00:00<00:00, 9.10MB/s]


## Clasificare folosind KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [14]:
df = pd.read_csv('strength_training_data.csv')
df

Unnamed: 0,Bench_Press_kg,Squat_kg,Deadlift_kg,Body_Weight_kg,Strength_Level
0,83.4,91.6,90.9,75.0,Newbie
1,47.0,50.2,55.1,66.2,Newbie
2,69.7,70.8,74.7,60.8,Newbie
3,51.9,58.2,60.7,67.3,Newbie
4,55.8,65.3,71.0,84.5,Newbie
...,...,...,...,...,...
95,167.1,256.1,274.7,110.8,Elite
96,149.0,177.4,190.9,78.7,Elite
97,181.5,245.4,251.1,97.9,Elite
98,179.1,223.7,237.9,88.1,Elite


In [15]:
X = df[['Bench_Press_kg', 'Squat_kg', 'Deadlift_kg', 'Body_Weight_kg']]
y = df['Strength_Level']
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [16]:
fig = px.scatter_3d(df,
                    x='Bench_Press_kg',
                    y='Squat_kg',
                    z='Deadlift_kg',
                    color='Strength_Level',
                    title='Strength Training Data Distribution',
                    labels={'Bench_Press_kg': 'Bench Press (kg)',
                            'Squat_kg': 'Squat (kg)',
                            'Deadlift_kg': 'Deadlift (kg)'})

fig.show()

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.25, random_state=42)

In [18]:
knn = KNeighborsClassifier(n_neighbors=3, weights='uniform', metric='euclidean')
knn.fit(X_train, y_train)

In [19]:
y_pred = knn.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.88


In [20]:
y_pred_full = knn.predict(X)

cm_full = confusion_matrix(y_encoded, y_pred_full)
labels = le.classes_

fig = px.imshow(cm_full,
                labels=dict(x="Predicted Label", y="True Label", color="Count"),
                x=labels,
                y=labels,
                text_auto=True,
                title="Confusion Matrix on Whole Dataset")
fig.show()

In [21]:
new_user = np.array([[110, 160, 180, 92]]) # Bench Press = 110kg, Squat = 160kg, Deadlift = 180kg, Body Weight = 92kg
pred = knn.predict(new_user)
predicted_label = le.inverse_transform(pred)[0]
print("Predicted Level:", predicted_label)

Predicted Level: Advanced



X does not have valid feature names, but KNeighborsClassifier was fitted with feature names



In [22]:
fig = px.scatter_3d(df,
                    x='Bench_Press_kg',
                    y='Squat_kg',
                    z='Deadlift_kg',
                    color='Strength_Level',
                    title='Strength Training Data with New User',
                    labels={'Bench_Press_kg': 'Bench Press (kg)',
                            'Squat_kg': 'Squat (kg)',
                            'Deadlift_kg': 'Deadlift (kg)'})

fig.add_trace(go.Scatter3d(x=[new_user[0][0]],
                           y=[new_user[0][1]],
                           z=[new_user[0][2]],
                           mode='markers',
                           marker=dict(color='red', size=10, symbol='diamond'),
                           name=f'New User ({predicted_label})'))

fig.show()

In [23]:
df['Average_Lift_kg'] = df[['Bench_Press_kg', 'Squat_kg', 'Deadlift_kg']].mean(axis=1)

fig = px.scatter(df,
                 x='Average_Lift_kg',
                 y='Body_Weight_kg',
                 color='Strength_Level',
                 title='Average Lift vs. Body Weight',
                 labels={'Average_Lift_kg': 'Average Lift (kg)',
                         'Body_Weight_kg': 'Body Weight (kg)'})

new_user_avg_lift = new_user[0][:3].mean()
new_user_body_weight = new_user[0][3]

fig.add_trace(go.Scatter(x=[new_user_avg_lift],
                         y=[new_user_body_weight],
                         mode='markers',
                         marker=dict(color='red', size=10, symbol='diamond'),
                         name=f'New User ({predicted_label})'))

fig.show()

## Regresie folosind KNN

In [24]:
from sklearn.neighbors import KNeighborsRegressor

In [25]:
df = pd.read_csv('strength_training_data.csv')
df

Unnamed: 0,Bench_Press_kg,Squat_kg,Deadlift_kg,Body_Weight_kg,Strength_Level
0,83.4,91.6,90.9,75.0,Newbie
1,47.0,50.2,55.1,66.2,Newbie
2,69.7,70.8,74.7,60.8,Newbie
3,51.9,58.2,60.7,67.3,Newbie
4,55.8,65.3,71.0,84.5,Newbie
...,...,...,...,...,...
95,167.1,256.1,274.7,110.8,Elite
96,149.0,177.4,190.9,78.7,Elite
97,181.5,245.4,251.1,97.9,Elite
98,179.1,223.7,237.9,88.1,Elite


In [26]:
X = df[['Bench_Press_kg', 'Squat_kg', 'Deadlift_kg']]
y = df['Body_Weight_kg']

In [27]:
fig = px.scatter_3d(df,
                    x='Bench_Press_kg',
                    y='Squat_kg',
                    z='Deadlift_kg',
                    color='Body_Weight_kg',
                    title='Strength Training Data Distribution',
                    labels={'Bench_Press_kg': 'Bench Press (kg)',
                            'Squat_kg': 'Squat (kg)',
                            'Deadlift_kg': 'Deadlift (kg)'})

fig.show()

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [29]:
knn = KNeighborsRegressor(n_neighbors=3, weights='uniform', metric='euclidean')
knn.fit(X_train, y_train)

In [30]:
new_user = np.array([[110, 160, 180]]) # Bench Press = 110kg, Squat = 160kg, Deadlift = 180kg
pred = knn.predict(new_user)
print("Predicted Weight:", pred[0])
predicted_body_weight = round(pred[0], 2)

Predicted Weight: 85.13333333333333



X does not have valid feature names, but KNeighborsRegressor was fitted with feature names



In [31]:
fig = px.scatter_3d(df,
                    x='Bench_Press_kg',
                    y='Squat_kg',
                    z='Deadlift_kg',
                    color='Body_Weight_kg',
                    title='Strength Training Data with New User',
                    labels={'Bench_Press_kg': 'Bench Press (kg)',
                            'Squat_kg': 'Squat (kg)',
                            'Deadlift_kg': 'Deadlift (kg)'})

fig.add_trace(go.Scatter3d(x=[new_user[0][0]],
                           y=[new_user[0][1]],
                           z=[new_user[0][2]],
                           mode='markers',
                           marker=dict(color=[predicted_body_weight], size=10, symbol='diamond'),
                           name=f'New User ({predicted_body_weight})'))

fig.show()

In [32]:
df['Average_Lift_kg'] = df[['Bench_Press_kg', 'Squat_kg', 'Deadlift_kg']].mean(axis=1)

fig = px.scatter(df,
                 x='Average_Lift_kg',
                 y='Body_Weight_kg',
                 color='Body_Weight_kg',
                 title='Average Lift vs. Body Weight',
                 labels={'Average_Lift_kg': 'Average Lift (kg)',
                         'Body_Weight_kg': 'Body Weight (kg)'})

new_user_avg_lift = new_user[0][:3].mean()

fig.add_trace(go.Scatter(x=[new_user_avg_lift],
                         y=[predicted_body_weight],
                         mode='markers',
                         marker=dict(color=[predicted_body_weight], size=10, symbol='diamond'),
                         name=f'New User ({predicted_body_weight})'))

fig.show()

## Implementare KNN manuală

In [33]:
from collections import Counter

In [34]:
def euclidean_distance(x1, x2):
    distance = np.sqrt(np.sum((x1-x2)**2))
    return distance

In [35]:
class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return predictions

    def _predict(self, x):
        # compute the distance
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]

        # get the closest k
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]

        # majority votes
        # most_common = Counter(k_nearest_labels).most_common()
        # return most_common[0][0]
        return np.mean(k_nearest_labels)

In [36]:
classifier = KNN(k=3)
classifier.fit(X_train.values, y_train.values)
manual_prediction = classifier.predict(new_user)
manual_prediction[0]

np.float64(85.13333333333333)

## Temă

Testați modelele de clasificare și regresie folosind diferite valori pentru K. Comparați rezultatele și explicați ce diferențe apar.

In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

df = pd.read_csv("strength_training_data.csv")

X = df[['Bench_Press_kg', 'Squat_kg', 'Deadlift_kg', 'Body_Weight_kg']]
y = df['Strength_Level']

# Beginner -> 0, Intermediate -> 1
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 0.25 - 0.75
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.25, random_state=42
)

k_values = [1, 2, 3, 4, 5, 6, 7, 8, 9]

# Classification
print(f"{'KNN CLASSIFICATION RESULTS'}")
print(f"{'K-Value':<10} | {'Accuracy Score':<20}")
print("-" * 32)

for k in k_values:
    # Initialize and fit
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)

    # Predict and evaluate
    y_pred_class = knn_clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred_class)

    print(f"K = {k:<6} | {acc:.4f}")

print("\n")

# --- 3. Regression Task (KNeighborsRegressor) ---
# Note: Regression on labels implies an ordinal relationship (distance between classes matters)



print(f"{'KNN REGRESSION RESULTS'}")
print(f"{'K-Value':<10} | {'MSE (Mean Squared Error)':<20}")
print("-" * 32)

for k in k_values:
    # Initialize and fit
    knn_reg = KNeighborsRegressor(n_neighbors=k)
    knn_reg.fit(X_train, y_train)

    # Predict and evaluate
    y_pred_reg = knn_reg.predict(X_test)
    mse = mean_squared_error(y_test, y_pred_reg)

    print(f"K = {k:<6} | {mse:.4f}")

KNN CLASSIFICATION RESULTS
K-Value    | Accuracy Score      
--------------------------------
K = 1      | 0.7600
K = 2      | 0.7600
K = 3      | 0.8800
K = 4      | 0.7200
K = 5      | 0.7200
K = 6      | 0.7200
K = 7      | 0.7200
K = 8      | 0.7200
K = 9      | 0.6800


KNN REGRESSION RESULTS
K-Value    | MSE (Mean Squared Error)
--------------------------------
K = 1      | 0.3600
K = 2      | 0.3100
K = 3      | 0.2089
K = 4      | 0.2350
K = 5      | 0.2640
K = 6      | 0.2744
K = 7      | 0.2914
K = 8      | 0.2775
K = 9      | 0.3062
