# Sleep Disorder Prediction - Creating Pipeline

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
warnings.filterwarnings('ignore')

### Load The Dataset

In [2]:
sleep_dis_df = pd.read_csv("Sleep_health_and_lifestyle_dataset.csv")

In [3]:
sleep_dis_df.head(5)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,none
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,none
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,none
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [4]:
sleep_dis_df[['Systolic','Diastolic'] ]= sleep_dis_df['Blood Pressure'].str.split('/', expand=True)
sleep_dis_df.drop(['Blood Pressure'],axis=1, inplace= True)
sleep_dis_df.drop(['Person ID'],axis=1, inplace= True)

In [5]:
# Converting to Integer
sleep_dis_df['Systolic']=sleep_dis_df['Systolic'].astype(int)
sleep_dis_df['Diastolic']=sleep_dis_df['Diastolic'].astype(int)

In [6]:
sleep_dis_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender                   374 non-null    object 
 1   Age                      374 non-null    int64  
 2   Occupation               374 non-null    object 
 3   Sleep Duration           374 non-null    float64
 4   Quality of Sleep         374 non-null    int64  
 5   Physical Activity Level  374 non-null    int64  
 6   Stress Level             374 non-null    int64  
 7   BMI Category             374 non-null    object 
 8   Heart Rate               374 non-null    int64  
 9   Daily Steps              374 non-null    int64  
 10  Sleep Disorder           374 non-null    object 
 11  Systolic                 374 non-null    int32  
 12  Diastolic                374 non-null    int32  
dtypes: float64(1), int32(2), int64(6), object(4)
memory usage: 35.2+ KB


### Categorical Features
- Gender
- Occupation
- BMI Category

### Numerical Features
- Age
- Sleep Duration
- Quality of Sleep
- Physical Activity Level
- Stress Level
- Heart Rate
- Daily Steps
- Systolic
- Diastolic

In [7]:
x_features = ['Gender','Age','Occupation','Sleep Duration',
              'Quality of Sleep','Physical Activity Level',
              'Stress Level','BMI Category','Heart Rate','Daily Steps',
              'Systolic','Diastolic']

In [8]:
cat_vars = ['Gender',
            'Occupation',
            'BMI Category']

In [9]:
num_vars = list(set(x_features) - set(cat_vars))

In [10]:
num_vars

['Physical Activity Level',
 'Diastolic',
 'Age',
 'Stress Level',
 'Systolic',
 'Daily Steps',
 'Quality of Sleep',
 'Heart Rate',
 'Sleep Duration']

### Need for Data Transformation

1. Categorical columns
    - OHE Encoding
2. Numerical Columns
    - No Transformation Required

### Setting X and y variables

In [11]:
X = sleep_dis_df[x_features]
y = sleep_dis_df['Sleep Disorder']

#### Data Spliting

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size = 0.7,
                                                    random_state = 20)

In [14]:
(X_train.shape,X_test.shape)

((261, 12), (113, 12))

## Creating Pipelines

In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [16]:
one_hot_encoding = OneHotEncoder(handle_unknown='ignore')  
cat_transform = Pipeline(steps= [('oheencoder',one_hot_encoding)]) 

In [17]:
preprocesser = ColumnTransformer(
            transformers=[('numerical','passthrough',num_vars),
                          ('categorical', cat_transform,cat_vars)]
)

### SVC Model

In [18]:
from sklearn.svm import SVC

In [19]:
sv = SVC()

In [20]:
svc_pipeline = Pipeline(steps= [('preprocessing',preprocesser),
                                 ('regression', sv)])

In [21]:
svc_pipeline

In [22]:
svc_pipeline.fit(X_train,y_train)

### Predict on Test Set

In [23]:
y_pred = svc_pipeline.predict(X_test)

### Predicting on new data

In [24]:
data = {"Gender":"Male",                             
          "Age":27, 
          "Occupation" : "Software Engineer",                                      
          "Sleep Duration":6.1,                 
          "Quality of Sleep":6,              
          "Physical Activity Level":42,          
          "Stress Level":6,                     
          "BMI Category":"Overweight",                     
          "Heart Rate":77,                        
          "Daily Steps":4200,                                       
          "Systolic":126,                    
          "Diastolic":83}

In [25]:
data_df = pd.DataFrame(data , index=[0])

In [26]:
data_df

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Systolic,Diastolic
0,Male,27,Software Engineer,6.1,6,42,6,Overweight,77,4200,126,83


In [27]:
disorder_pred = svc_pipeline.predict(data_df)[0]
disorder_pred

'none'

## Save the Pipeline

In [28]:
from joblib import dump

In [29]:
dump(svc_pipeline,'Sleep_Disorder_Prediction.pkl')

['Sleep_Disorder_Prediction.pkl']