# 1. Import data set

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('weight-height.csv')
df.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [3]:
df['Gender'] = df['Gender'].astype('category')
df['Gender'] = df['Gender'].cat.codes
df.head(10)


Unnamed: 0,Gender,Height,Weight
0,1,73.847017,241.893563
1,1,68.781904,162.310473
2,1,74.110105,212.740856
3,1,71.730978,220.04247
4,1,69.881796,206.349801
5,1,67.253016,152.212156
6,1,68.785081,183.927889
7,1,68.348516,167.971111
8,1,67.01895,175.92944
9,1,63.456494,156.399676


In [4]:
df.isnull().sum()

Gender    0
Height    0
Weight    0
dtype: int64

# 2. Separate X (Gender, Height) and Y (y=Weight).

In [5]:
x = df.drop(columns=['Weight'], axis=1)
x.head()

Unnamed: 0,Gender,Height
0,1,73.847017
1,1,68.781904
2,1,74.110105
3,1,71.730978
4,1,69.881796


In [6]:
y = df['Weight']
y.head()

0    241.893563
1    162.310473
2    212.740856
3    220.042470
4    206.349801
Name: Weight, dtype: float64

# 3. Train = 70%, Test = 30%

In [7]:
x.mean()

Gender     0.584454
Height    66.809925
dtype: float64

In [8]:
y.mean()

165.6327353266768

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=42)

In [11]:
df.shape

(8555, 3)

In [12]:
x_train.shape

(5988, 2)

In [13]:
x_test.shape

(2567, 2)

# 4. Apply Linear Regression

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
reg = LinearRegression()

In [16]:
reg.fit(x_train,y_train)

In [17]:
reg.coef_

array([19.54151693,  5.9562333 ])

In [18]:
reg.intercept_

-243.79306041624102

# 5. Evaluate the Model (Testing and training Accuracy, MSE for testing)

In [19]:
y_pred_test = reg.predict(x_test)
y_pred_test

array([142.44868114, 186.0517281 , 198.42283706, ..., 102.94080643,
       190.83713617, 143.34786753])

In [20]:
y_test.head(10)

6006    149.668369
1197    197.642244
2862    178.551191
6497    141.343095
2860    195.322675
7401    108.174020
6680    183.287278
4220    177.403382
1046    211.737442
5292    115.228166
Name: Weight, dtype: float64

In [21]:
from sklearn.metrics import r2_score

In [22]:
test_score = r2_score(y_test, y_pred_test)
test_score

0.9059112424422658

In [23]:
y_pred_train = reg.predict(x_train)
y_pred_train

array([178.35427565, 202.22486558, 129.22191775, ..., 139.34406368,
       187.17224616, 158.78183812])

In [24]:
y_train.head(10)

553     186.751417
1397    211.031652
7934    143.768451
8367    151.814648
3320    196.505814
1760    165.763926
2858    136.672643
297     210.369909
443     165.404545
777     179.370856
Name: Weight, dtype: float64

In [25]:
train_score = r2_score(y_pred_train,y_train)
train_score

0.8856440156287799

In [26]:
from sklearn.metrics import mean_squared_error

In [27]:
MSE = mean_squared_error(y_pred_test, y_test)
MSE

96.83734437830613

# 6. Apply KNN Regressor: Scikit-Learn Link

In [28]:
from sklearn.neighbors import KNeighborsRegressor

In [29]:
knn_model = KNeighborsRegressor(n_neighbors=3)

In [30]:
knn_model.fit(x_train, y_train)

# 7. Evaluate the Model (Testing and training Accuracy, MSE for testing)

In [31]:
knn_train_accuracy = knn_model.score(x_train,y_train)
knn_train_accuracy

0.9304521916751347

In [32]:
knn_test_accuracy = knn_model.score(x_test, y_test)
knn_test_accuracy

0.8679879688589832

In [33]:
knn_y_pred_test = knn_model.predict(x_test)
knn_y_pred_test

array([142.5130416 , 173.90630823, 184.6686197 , ...,  95.44597987,
       182.99507227, 147.37662317])

In [34]:
knn_mse = mean_squared_error(y_test, knn_y_pred_test)
knn_mse

135.8684592453893

# 8. Compare KNN & Linear Regression with the KNN Model and Linear regression as well

''' Both the mean squared error (MSE) and the coefficient of determination (R-squared or R2) are important metrics for evaluating regression models, but they provide slightly different insights into the model's performance. Let's analyze the R2 scores along with the previously mentioned MSE values:

Linear Regression:

Train R2 score: 0.8856440156287799
Test R2 score: 0.9059112424422658
MSE: 96.83734437830613
KNN Regressor:

Train R2 score: 0.9304521916751347
Test R2 score: 0.8679879688589832
MSE: 135.8684592453893

Here's what we can interpret from these metrics:

MSE Comparison: The Linear Regression model has a lower MSE (96.837) compared to the KNN Regressor (135.868). Lower MSE indicates better performance, as it signifies that the model's predictions are closer to the actual values.

R-squared (R2) Comparison: R2 is a measure of how well the model explains the variability in the target variable. Higher R2 values indicate that the model explains a larger portion of the variability. Both models have relatively high R2 scores, but the Linear Regression model has slightly higher R2 scores for both training and testing sets.

Considering these metrics:

The Linear Regression model has better MSE and slightly higher R2 scores for both training and testing sets. This indicates that it is able to make predictions that are closer to the actual values and that it explains a greater proportion of the variability in the target variable compared to the KNN Regressor.'''