# 1. Import data set

In [1]:
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# Read the height/weight records from a CSV resolved relative to the working
# directory, then preview the first rows.
df = pd.read_csv('weight-height.csv')
df.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [3]:
# (rows, columns) of the loaded frame.
df.shape


(8555, 3)

In [4]:
# Count missing values per column.
df.isnull().sum()

Gender    0
Height    0
Weight    0
dtype: int64

In [5]:
# Replace the Gender strings with their integer category codes so the column
# can be used as a numeric target. The preview below shows 'Male' encoded as 1.
# NOTE(review): this overwrites the original labels in `df`; the other class is
# presumably 'Female' -> 0 — confirm against the raw file.
df['Gender'] = df['Gender'].astype('category').cat.codes
df.head()

Unnamed: 0,Gender,Height,Weight
0,1,73.847017,241.893563
1,1,68.781904,162.310473
2,1,74.110105,212.740856
3,1,71.730978,220.04247
4,1,69.881796,206.349801


# 2. Separate X and y (y = Gender)


In [6]:
# Feature matrix: every column except the target. `columns=` already selects
# the axis, so no separate `axis` argument is needed.
x = df.drop(columns='Gender')
x.head()

Unnamed: 0,Height,Weight
0,73.847017,241.893563
1,68.781904,162.310473
2,74.110105,212.740856
3,71.730978,220.04247
4,69.881796,206.349801


In [7]:
# Target vector: the integer-coded Gender column.
y = df['Gender']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Gender, dtype: int8

# 3. Split the Data: Train = 70%, Test = 30%

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# 4. Apply KNN Classifier

In [10]:
from sklearn.neighbors import KNeighborsClassifier

In [11]:
# k-nearest-neighbours classifier using the 3 closest training samples.
# NOTE(review): the features are passed unscaled. In the preview Weight takes
# much larger numeric values than Height, so it may dominate the distance —
# consider standardising the features before fitting.
knn_model = KNeighborsClassifier(n_neighbors=3)

In [12]:
# Fit on the training split only, so the test split stays unseen for evaluation.
knn_model.fit(x_train, y_train)

# 5. Evaluate the Model Using Accuracy Only

In [13]:
# Accuracy on the same rows the model was fitted on; expect this to be
# optimistic compared with the held-out score.
train_accuracy = knn_model.score(x_train, y_train)
train_accuracy

0.9367067468269873

In [14]:
# Accuracy on the held-out 30% split.
test_accuracy = knn_model.score(x_test, y_test)
test_accuracy

0.901441371250487

In [15]:
# Predicted labels for the training rows.
y_pred_train = knn_model.predict(x_train)

In [16]:
# Display the training predictions.
y_pred_train

array([1, 1, 0, ..., 1, 1, 0], dtype=int8)

In [17]:
# First ten true training labels, for a visual comparison with the predictions.
# The index holds the original row labels, in the shuffled order of the split.
y_train.head(10)

553     1
1397    1
7934    0
8367    0
3320    1
1760    1
2858    1
297     1
443     1
777     1
Name: Gender, dtype: int8

In [18]:
# Predicted labels for the held-out rows.
y_pred_test = knn_model.predict(x_test)

In [19]:
# Display the test predictions.
y_pred_test

array([0, 1, 1, ..., 0, 1, 0], dtype=int8)

In [20]:
# First ten true test labels, for a visual comparison with the predictions.
y_test.head(10)

6006    0
1197    1
2862    1
6497    0
2860    1
7401    0
6680    0
4220    1
1046    1
5292    0
Name: Gender, dtype: int8

In [21]:
from sklearn.metrics import mean_squared_error

In [22]:
# Mean squared error between true and predicted training labels. The labels are
# coded 0/1, so every misclassification contributes exactly 1 and the result is
# the misclassification rate, i.e. 1 - train_accuracy.
mse_train = mean_squared_error(y_train, y_pred_train)
mse_train

0.0632932531730127

In [23]:
# Mean squared error between true and predicted test labels. The labels are
# coded 0/1, so this is the misclassification rate, i.e. 1 - test_accuracy.
mse_test = mean_squared_error(y_test, y_pred_test)
mse_test

0.09855862874951304