In [16]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [17]:
# Read the raw dataset (path is relative to the current working directory)
# and preview its first rows.
DATASET_PATH = './Sleep_health_and_lifestyle_dataset.csv'
data = pd.read_csv(DATASET_PATH)
data.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [20]:
# Count how often each BMI label occurs. The recorded output lists "Normal"
# and "Normal Weight" as two separate labels.
bmi_labels = data["BMI Category"]
bmi_labels.value_counts()

Normal           195
Overweight       148
Normal Weight     21
Obese             10
Name: BMI Category, dtype: int64

## Preprocessing

In [3]:
# Remove the identifier and the two columns that are not used as features.
unused_columns = ['Person ID', 'Occupation', 'Blood Pressure']
data.drop(columns=unused_columns, inplace=True)

In [4]:
# Gender: swap the two text labels for integer codes (Male -> 0, Female -> 1).
gender_codes = {"Male": 0, "Female": 1}
data["Gender"] = data["Gender"].replace(gender_codes)

In [5]:
# BMI Category: one-hot encode the remaining categorical feature
# ('Occupation' was already dropped in the earlier cell).

from sklearn.preprocessing import OneHotEncoder

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Try the current name first and fall back only on releases
# that do not know it yet, so the cell runs on both.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe.fit(data[["BMI Category"]])

# Reuse the frame's own index so the column-wise concat below lines up row by
# row. The encoded columns keep their default integer names (0, 1, ...), one
# per distinct BMI label seen during fit.
data_ohe = pd.DataFrame(ohe.transform(data[["BMI Category"]]), index=data.index)

data.drop(["BMI Category"], axis=1, inplace=True)
data = pd.concat([data, data_ohe], axis=1)

In [6]:
# Sleep Disorder: encode the target as integers.
#
# Rows without a disorder can show up either as missing values or as the
# literal string "None", depending on whether pandas treats "None" as NaN when
# reading the CSV. Every "no disorder" spelling, including the "Normal" fill
# value used below, maps to class 0 so that no row is left as a string.
sleep_disorder_codes = {
    "None": 0,
    "Normal": 0,
    "Insomnia": 1,
    "Sleep Apnea": 2
}

# Assign the result back instead of calling fillna(inplace=True) on a column
# selection, which is not guaranteed to modify the parent frame.
# Labels missing from the mapping pass through unchanged.
data["Sleep Disorder"] = (
    data["Sleep Disorder"]
    .fillna("Normal")
    .map(lambda label: sleep_disorder_codes.get(label, label))
)

In [7]:
# Preview the first five rows of the encoded frame.
data.head(5)

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps,Sleep Disorder,0,1,2,3
0,0,27,6.1,6,42,6,77,4200,0,0.0,0.0,0.0,1.0
1,0,28,6.2,6,60,8,75,10000,0,1.0,0.0,0.0,0.0
2,0,28,6.2,6,60,8,75,10000,0,1.0,0.0,0.0,0.0
3,0,28,5.9,4,30,8,85,3000,2,0.0,0.0,1.0,0.0
4,0,28,5.9,4,30,8,85,3000,2,0.0,0.0,1.0,0.0


## Prediksi

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Separate the target column from the feature matrix.
target_column = "Sleep Disorder"
y = data[target_column]
X = data.drop(columns=[target_column])
# The one-hot columns carry integer names; cast every column name to str so
# all feature names share a single type.
X.columns = X.columns.astype(str)

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [9]:
# Baseline: 3-nearest-neighbours classifier scored on the held-out split.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

y_pred = knn.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred)
print(baseline_accuracy)

0.88


In [10]:
# Try k = 1..19 and remember the best (accuracy, k) pair.
# NOTE(review): k is selected using accuracy on the test split itself, so the
# reported score is not an independent estimate — consider a validation split
# or cross-validation.
best_acc = (0, 0)  # (score, k)

for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Only a strictly better score replaces the current best, so on ties the
    # smaller k that was seen first is kept.
    if acc <= best_acc[0]:
        continue
    best_acc = (acc, k)

best_acc

(0.88, 2)

Kesimpulannya, nilai k terbaik adalah 2 (akurasi 0.88 pada data uji).

In [11]:
# Refit on the complete dataset using the k reported as best by the search.
FINAL_K = 2
knn_final = KNeighborsClassifier(n_neighbors=FINAL_K)
knn_final.fit(X, y)

## Saving Model

In [12]:
import pickle

In [13]:
# Persist the final classifier. The context manager closes the file handle as
# soon as the dump finishes, instead of leaving it open.
with open("model1.pkl", "wb") as model_file:
    pickle.dump(knn_final, model_file)

In [14]:
# Persist the fitted one-hot encoder so the same BMI encoding can be reused
# later. The context manager closes the file handle after the dump.
with open("ohe1.pkl", "wb") as encoder_file:
    pickle.dump(ohe, encoder_file)

In [15]:
def _encode_labels(labels, codes):
    """Return `labels` with every key of `codes` swapped for its integer code.

    Labels that are not keys of `codes` pass through unchanged.
    """
    return labels.map(lambda label: codes.get(label, label))


def preprocessing(data, ohe):
    """Encode a raw sleep-health frame the way the training cells of this notebook do.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw rows. Must contain "Gender" and "BMI Category". The columns
        'Person ID', 'Occupation' and 'Blood Pressure' are removed when
        present, and "Sleep Disorder" is encoded only when present, so frames
        without a target column are accepted too.
    ohe : fitted encoder
        Object whose ``transform`` turns the single-column "BMI Category"
        frame into a dense 2-D array (the encoder fitted in this notebook).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is never modified. "Gender" is coded
        Male -> 0 / Female -> 1, "BMI Category" is replaced by one-hot columns
        named "0", "1", ... and "Sleep Disorder" (if present) is coded
        None/Normal -> 0, Insomnia -> 1, Sleep Apnea -> 2.

    Raises
    ------
    KeyError
        If "Gender" or "BMI Category" is missing from `data`.
    """
    # Work on a copy so the caller's frame is left untouched.
    data = data.copy()
    data = data.drop(
        columns=['Person ID', 'Occupation', 'Blood Pressure'], errors='ignore'
    )

    data["Gender"] = _encode_labels(data["Gender"], {"Male": 0, "Female": 1})

    # Build the one-hot frame on the input's own index; otherwise the concat
    # below would misalign rows whenever the index is not 0..n-1.
    encoded_bmi = ohe.transform(data[["BMI Category"]])
    data_ohe = pd.DataFrame(encoded_bmi, index=data.index)
    # The classifier in this notebook is fitted on positional one-hot column
    # names cast to str ("0", "1", ...), so the same names are produced here.
    data_ohe.columns = data_ohe.columns.astype(str)
    data = pd.concat([data.drop(columns=["BMI Category"]), data_ohe], axis=1)

    if "Sleep Disorder" in data.columns:
        # Missing values and both "no disorder" spellings all map to class 0.
        data["Sleep Disorder"] = _encode_labels(
            data["Sleep Disorder"].fillna("Normal"),
            {"None": 0, "Normal": 0, "Insomnia": 1, "Sleep Apnea": 2},
        )

    return data