# Importing

In [344]:
# Importing
    
import numpy as np
import pandas as pd
import os
import plotly.graph_objs as go
import plotly.express as px
import warnings
import pickle
warnings.filterwarnings('ignore')

# Load the dataset

In [345]:

df = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')

# Preprocessing - Divide 'Blood Pressure' to highest and lowest

In [346]:
df['Blood Pressure'].unique()

array(['126/83', '125/80', '140/90', '120/80', '132/87', '130/86',
       '117/76', '118/76', '128/85', '131/86', '128/84', '115/75',
       '135/88', '129/84', '130/85', '115/78', '119/77', '121/79',
       '125/82', '122/80', '135/90', '140/95', '139/91', '118/75'],
      dtype=object)

In [347]:
df1 = pd.concat([df, df['Blood Pressure'].str.split('/', expand=True)], axis=1).drop('Blood Pressure', axis=1)

In [348]:
df1.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,0,1
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,77,4200,,126,83
1,2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
2,3,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90


In [349]:
df1 = df1.rename(columns={0: 'BloodPressure_Upper_Value', 1: 'BloodPressure_Lower_Value'})

In [350]:
df1['BloodPressure_Upper_Value'] = df1['BloodPressure_Upper_Value'].astype(float)
df1['BloodPressure_Lower_Value'] = df1['BloodPressure_Lower_Value'].astype(float)


# Handling Categorical Variables

In [351]:
#import label encoder
from sklearn import preprocessing 
#make an instance of Label Encoder


label_encoder = preprocessing.LabelEncoder()
df1['Gender'] = label_encoder.fit_transform(df1['Gender'])
df1['Occupation'] = label_encoder.fit_transform(df1['Occupation'])
df1['BMI Category'] = label_encoder.fit_transform(df1['BMI Category'])
df1['Sleep Disorder'] = label_encoder.fit_transform(df1['Sleep Disorder'])



In [352]:
# # Make an instance of LabelEncoder
# label_encoder = preprocessing.LabelEncoder()

# # Dictionary to hold the mappings
# mappings = {}

# # Encode 'Gender' column
# df1['Gender'] = label_encoder.fit_transform(df1['Gender'])
# mappings['Gender'] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# # Encode 'Occupation' column
# df1['Occupation'] = label_encoder.fit_transform(df1['Occupation'])
# mappings['Occupation'] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# # Encode 'BMI Category' column
# df1['BMI Category'] = label_encoder.fit_transform(df1['BMI Category'])
# mappings['BMI Category'] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# # Encode 'Sleep Disorder' column
# df1['Sleep Disorder'] = label_encoder.fit_transform(df1['Sleep Disorder'])
# mappings['Sleep Disorder'] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# # Display the DataFrame and the mappings
# df1.head()
# for column, mapping in mappings.items():
#     print(f"{column} mapping: {mapping}")


In [353]:
# Outlier Removal
num_col = ['Age', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level',
           'Heart Rate', 'Daily Steps', 'BloodPressure_Upper_Value', 'BloodPressure_Lower_Value']

Q1 = df1[num_col].quantile(0.25)
Q3 = df1[num_col].quantile(0.75)
IQR = Q3 - Q1

df1 = df1[~((df1[num_col] < (Q1 - 1.5 * IQR)) | (df1[num_col] > (Q3 + 1.5 * IQR))).any(axis=1)]


In [354]:

# # Dictionary for renaming columns
# new_column_names = {'Gender': 'gender', 'Age': 'age', 'Occupation': 'occupation', 'Sleep Duration': 'sleepduration', 'Quality of sleep': 'sleepquality' 'sleepquality', 'Physical Activity Level': 'physicalactivity', 'Stress Level': 'stresslevel', 'BMI Category': 'bmi', 'Heart Rate': 'heartrate', "Daily Steps": 'dailysteps', 'BloodPressure_Upper_Value':'bloodpressureuv', 'BloodPressure_Lower_Value': 'bloodpressurelv'}


# # Rename columns
# df1 = df1.rename(columns=new_column_names)
# print(df1)


# Machine Learning - Multi-Classification Prediction

In [355]:
# Machine Learning - Multi-Classification Prediction
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


# Prepare the data

In [356]:
# Define feature names
FEATURE_NAMES = ['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep',
                 'Physical Activity Level', 'Stress Level', 'BMI Category', 'Heart Rate',
                 'Daily Steps', 'Sleep Disorder', 'BloodPressure_Upper_Value', 'BloodPressure_Lower_Value']

In [357]:
df1 = df1.reindex(columns=FEATURE_NAMES)

In [358]:
# Encode categorical variables
label_encoders = {}
for col in ['Gender', 'Occupation', 'BMI Category', 'Sleep Disorder']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [359]:
# Prepare the data
X = df1.drop(['Person ID', 'Sleep Disorder'], axis=1)
y = df1['Sleep Disorder']


In [360]:
# feature_names = df1.columns.tolist()

# # Extract the first row of the dataset
# first_row = df1.iloc[0]

# print("Feature Names:", feature_names)
# print("First Row:", first_row)

# Split the data into train and test sets

In [361]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Create a pipeline

In [362]:
# Create a pipeline with data preprocessing and classification model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', GradientBoostingClassifier())
])

In [363]:
# Define parameter grids for hyperparameter tuning
param_grid = [
    {
        'clf': [GradientBoostingClassifier()],
        'clf__n_estimators': [100, 200, 300,400],
        'clf__learning_rate': [0.01, 0.1, 1],
    }
]

# Perform grid search for hyperparameter tuning

In [364]:
# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

#Calculate predictions for each model
models = [
    ('Gradient Boosting', GradientBoostingClassifier())
]

predictions = {}
for name, model in models:
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    predictions[name] = y_pred

y_pred

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0])

In [365]:
with open('model.pkl', 'wb') as file:
    pickle.dump({'model': best_model, 'label_encoders': label_encoders}, file)