In [1]:
import pandas as pd
import warnings
from sklearn import preprocessing

warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('./Sleep_health_and_lifestyle_dataset.csv')
data.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


## Preprocessing

In [6]:
# Person ID

data.drop(['Person ID'], axis=1, inplace=True)

In [7]:
# Gender

data["Gender"] = data["Gender"].replace({
    "Male": 0,
    "Female": 1
})

In [8]:
# Occupation, BMI Category

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe.fit(data[["Occupation", "BMI Category"]])

data_ohe = pd.DataFrame(ohe.transform(data[["Occupation", "BMI Category"]]))

data.drop(["Occupation", "BMI Category"], axis=1, inplace=True)
data = pd.concat([data, data_ohe], axis=1)

In [9]:
int("125/83".split("/")[0])

125

In [10]:
int("125/83".split("/")[1])

83

In [9]:
# Blood Pressure

data["systolic"] = data["Blood Pressure"].apply(lambda x: int(x.split("/")[0]))
data["diastolic"] = data["Blood Pressure"].apply(lambda x: int(x.split("/")[1]))

data.drop(["Blood Pressure"], axis=1, inplace=True)

In [10]:
# Sleep Disorder

data["Sleep Disorder"] = data["Sleep Disorder"].replace({
    "None": 0,
    "Insomnia": 1,
    "Sleep Apnea": 2
})

In [11]:
data.head()

Unnamed: 0,Gender,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps,Sleep Disorder,0,...,7,8,9,10,11,12,13,14,systolic,diastolic
0,0,27,6.1,6,42,6,77,4200,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,126,83
1,0,28,6.2,6,60,8,75,10000,0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,125,80
2,0,28,6.2,6,60,8,75,10000,0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,125,80
3,0,28,5.9,4,30,8,85,3000,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,140,90
4,0,28,5.9,4,30,8,85,3000,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,140,90


In [3]:
df1 = data.copy()
label_encoder = preprocessing.LabelEncoder()
df1['Gender'] = label_encoder.fit_transform(df1['Gender'])
df1['Occupation'] = label_encoder.fit_transform(df1['Occupation'])
df1['BMI Category'] = label_encoder.fit_transform(df1['BMI Category'])
df1['Sleep Disorder'] = label_encoder.fit_transform(df1['Sleep Disorder'])
df1.drop('Blood Pressure', axis = 1, inplace = True)
df1.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder
0,1,1,27,9,6.1,6,42,6,3,77,4200,1
1,2,1,28,1,6.2,6,60,8,0,75,10000,1
2,3,1,28,1,6.2,6,60,8,0,75,10000,1
3,4,1,28,6,5.9,4,30,8,2,85,3000,2
4,5,1,28,6,5.9,4,30,8,2,85,3000,2


In [4]:
import plotly.express as px
fig = px.imshow(df1.corr())
fig.show()

## Prediksi

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

X = data.drop(["Sleep Disorder"], axis=1)
X.columns = X.columns.astype(str)
y = data["Sleep Disorder"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.88


In [16]:
best_acc = (0, 0) # skor, k

for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    if acc > best_acc[0]:
        best_acc = (acc, k)

best_acc

(0.88, 2)

kesimpulannya, k terbaik adalah sama dengan dua.

In [17]:
knn_final = KNeighborsClassifier(n_neighbors=2)
knn_final.fit(X, y)

## Saving Model

In [18]:
import pickle

In [19]:
pickle.dump(knn_final, open("model.pkl", "wb"))

In [20]:
pickle.dump(ohe, open("ohe.pkl", "wb"))

In [21]:
def preprocessing(data, ohe):
    data.drop(['Person ID'], axis=1, inplace=True)
    
    data["Gender"] = data["Gender"].replace({
    "Male": 0,
    "Female": 1
    })
    
    data_ohe = pd.DataFrame(ohe.transform(data[["Occupation", "BMI Category"]]), columns=ohe.get_feature_names_out())
    data.drop(["Occupation", "BMI Category"], axis=1, inplace=True)
    data = pd.concat([data, data_ohe], axis=1)
    
    data["systolic"] = data["Blood Pressure"].apply(lambda x: int(x.split("/")[0]))
    data["diastolic"] = data["Blood Pressure"].apply(lambda x: int(x.split("/")[1]))

    data.drop(["Blood Pressure"], axis=1, inplace=True)
    
    data["Sleep Disorder"].fillna("Normal", inplace=True)

    data["Sleep Disorder"] = data["Sleep Disorder"].replace({
        "Normal": 0,
        "Insomnia": 1,
        "Sleep Apnea": 2
    })
    
    return data

Pertanyaan :

1. Blood pressure itu kan harus cek dulu, solusinya apakah dipertahankan atau dihilangkan saja.
2. untuk pekerjaan kita terkendala karena datset yang kita dapat itu jenis pekerjaannya hanya sedikit itu baiknya gimana, karena sudah coba cari data yang serupa tidak ada yang memiiki kolom untuk sleep disorder.