<a href="https://colab.research.google.com/github/manusmriti31/Sleep-anomality-detector/blob/main/DS_Project_sleep_anomality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Exection Part 1

In [None]:
# Step 1: Load the dataset
import pandas as pd
import requests
from pathlib import Path

if Path("dataset.csv").is_file():
  print("Dataset already exists, skipping download")
else:
  print("Downloading helper function....")
  request = requests.get("https://github.com/lisstasy/sleep_disorder_prediction/raw/refs/heads/main/data.csv")
  with open("dataset.csv", "wb") as f:
    f.write(request.content)


# Replace 'your_dataset_link.csv' with the actual link to download the dataset
data = pd.read_csv('dataset.csv')
print(data.head())


Downloading helper function....
   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   

  Blood Pressure  Heart Rate  Daily Steps Sleep Disorder  
0         126/83          77         4200            NaN  
1 

In [None]:
# Step 1: Load and preprocess the dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_csv('dataset.csv')

# Correcting the inconsistency in 'BMI Category'
data['BMI Category'].replace({'Normal Weight': 'Normal'}, inplace=True)

# Splitting the 'Blood Pressure' column into 'Systolic' and 'Diastolic' columns
data['Systolic'] = data['Blood Pressure'].str.split('/').str[0].astype(int)
data['Diastolic'] = data['Blood Pressure'].str.split('/').str[1].astype(int)

# Dropping unnecessary columns
data.drop(['Blood Pressure', 'Person ID'], axis=1, inplace=True)

# Encoding categorical variables
label_encoders = {}
cat_columns = ['Gender', 'Occupation', 'BMI Category', 'Sleep Disorder']

for col in cat_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Features and target variable
X = data.drop(['Sleep Disorder'], axis=1)
y = data['Sleep Disorder']

# Encoding the target variable
le_target = LabelEncoder()  # Define le_target globally
y = le_target.fit_transform(y)

# Display the first few rows of the modified dataset
print(data.head())


   Gender  Age  Occupation  Sleep Duration  Quality of Sleep  \
0       1   27           9             6.1                 6   
1       1   28           1             6.2                 6   
2       1   28           1             6.2                 6   
3       1   28           6             5.9                 4   
4       1   28           6             5.9                 4   

   Physical Activity Level  Stress Level  BMI Category  Heart Rate  \
0                       42             6             2          77   
1                       60             8             0          75   
2                       60             8             0          75   
3                       30             8             1          85   
4                       30             8             1          85   

   Daily Steps  Sleep Disorder  Systolic  Diastolic  
0         4200               2       126         83  
1        10000               2       125         80  
2        10000               2  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['BMI Category'].replace({'Normal Weight': 'Normal'}, inplace=True)


In [None]:
# Step 2: Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

# Features and target variable
X = data.drop(['Sleep Disorder'], axis=1)
y = data['Sleep Disorder']

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Step 3: Train the model
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier()

# Fit the model on the training data
model.fit(X_train, y_train)


In [None]:
# Step 4: Predicting with individual input data
def predict_sleep_disorder(input_data):
    # Convert input_data dictionary to DataFrame
    input_df = pd.DataFrame([input_data])

    # Ensure all required columns are present and in the correct order
    required_columns = X_train.columns.tolist()

    # Add missing columns with default values (0 or appropriate defaults)
    for col in required_columns:
        if col not in input_df.columns:
            input_df[col] = 0  # Use a default value that makes sense for your context

    # Reorder columns to match training data
    input_df = input_df[required_columns]

    # Transform categorical variables using the same label encoders as used for training
    for col in label_encoders.keys():
        if col in input_df.columns:
            input_df[col] = label_encoders[col].transform(input_df[col])

    # Make prediction
    prediction = model.predict(input_df)

    # Decode the prediction back to original label
    return le_target.inverse_transform(prediction)

# Example usage:
input_data = {
    'Gender': 'Male',  # Example input; replace with actual values
    'Age': 30,
    'Occupation': 'Teacher',
    'Sleep Duration': 7.5,
    'Quality of Sleep': 8,
    'Physical Activity Level': 60,
    'Stress Level': 3,
    'BMI Category': 'Normal',
    'Daily Steps': 8000,
    'Systolic': 120,
    'Diastolic': 80,
    'Heart Rate': 70  # Ensure this field is included if it's part of your model
}

prediction = predict_sleep_disorder(input_data)
print(f'Predicted Sleep Disorder: {prediction[0]}')


Predicted Sleep Disorder: 2


## Execution Part 2

### 1. Importing the `dataset.csv`

In [None]:
# Step 1: Load the dataset
import pandas as pd
import requests
from pathlib import Path

if Path("dataset.csv").is_file():
  print("Dataset already exists, skipping download")
else:
  print("Downloading helper function....")
  request = requests.get("https://github.com/lisstasy/sleep_disorder_prediction/raw/refs/heads/main/data.csv")
  with open("dataset.csv", "wb") as f:
    f.write(request.content)


# Replace 'your_dataset_link.csv' with the actual link to download the dataset
data = pd.read_csv('dataset.csv')
data.head()
data.info()


Dataset already exists, skipping download
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
mem

### 2. Data Analysis

In [None]:
import pprint

# descriptive statistics
describe_stats = data.describe()

# Unique values in each categorical colum
unique_values = {}
cat_columns = ['Gender', 'Occupation', 'BMI Category', 'Sleep Disorder']
for col in cat_columns:
  unique_values[col] = data[col].unique()

display(describe_stats)

print()
for key, value in unique_values.items():
  print(f"{key}: {value}\n")


Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps
count,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0
mean,187.5,42.184492,7.132086,7.312834,59.171123,5.385027,70.165775,6816.84492
std,108.108742,8.673133,0.795657,1.196956,20.830804,1.774526,4.135676,1617.915679
min,1.0,27.0,5.8,4.0,30.0,3.0,65.0,3000.0
25%,94.25,35.25,6.4,6.0,45.0,4.0,68.0,5600.0
50%,187.5,43.0,7.2,7.0,60.0,5.0,70.0,7000.0
75%,280.75,50.0,7.8,8.0,75.0,7.0,72.0,8000.0
max,374.0,59.0,8.5,9.0,90.0,8.0,86.0,10000.0



Gender: ['Male' 'Female']

Occupation: ['Software Engineer' 'Doctor' 'Sales Representative' 'Teacher' 'Nurse'
 'Engineer' 'Accountant' 'Scientist' 'Lawyer' 'Salesperson' 'Manager']

BMI Category: ['Overweight' 'Normal' 'Obese' 'Normal Weight']

Sleep Disorder: [nan 'Sleep Apnea' 'Insomnia']



In [None]:
data['Blood Pressure'][:5]

Unnamed: 0,Blood Pressure
0,126/83
1,125/80
2,125/80
3,140/90
4,140/90


Key changes in the data
1. In the `BMI category` making sure that the ***Normal Weight*** and the ***Normal*** are merged together
2. In the `Blood Pressure` column would like to seperate the ***systolic*** and ***Diastolic*** into two differnent colums so that the model can leverage the data seperately

In [None]:
# Correcting the inconsistency in 'BMI Category'
data['BMI Category'].replace({'Normal Weight':'Normal'})

# Splitting the 'Blood Pressure' column into 'Systolic' and 'Diastolic' colums
data['Systolic'] = data['Blood Pressure'].str.split('/').str[0].astype(int)
data['Dystolic'] = data['Blood Pressure'].str.split('/').str[1].astype(int)

data.drop(['Blood Pressure', 'Person ID'], axis = 1)

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,Systolic,Dystolic
0,Male,27,Software Engineer,6.1,6,42,6,Overweight,77,4200,,126,83
1,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
3,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90
4,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,59,Nurse,8.1,9,75,3,Overweight,68,7000,Sleep Apnea,140,95
370,Female,59,Nurse,8.0,9,75,3,Overweight,68,7000,Sleep Apnea,140,95
371,Female,59,Nurse,8.1,9,75,3,Overweight,68,7000,Sleep Apnea,140,95
372,Female,59,Nurse,8.1,9,75,3,Overweight,68,7000,Sleep Apnea,140,95


### 3. Data Preprocessing

We are converting the named variables i.e `Gender`, `Occupation`, `BMI Category`, and `Sleep Disorder` into numeric format because the machine learning model better understands the numeric inputs

In [None]:
# installing required libraries
from sklearn.preprocessing import LabelEncoder, StandardScaler

X = data.drop(['Sleep Disorder'], axis = 1)
y = data['Sleep Disorder']

# Label encoding for categorical variables in X
label_encoder = {} # To store the encoder objects for potential inverse transformation later

for col in X.select_dtypes(include = ['object']).columns:
  le = LabelEncoder()
  X[col] = le.fit_transform(X[col])
  label_encoders[col] = le

# Encoding the target variable
le_target = LabelEncoder()
y = le_target.fit_transform(y)

X.head(), y[:5]

(   Person ID  Gender  Age  Occupation  Sleep Duration  Quality of Sleep  \
 0          1       1   27           9             6.1                 6   
 1          2       1   28           1             6.2                 6   
 2          3       1   28           1             6.2                 6   
 3          4       1   28           6             5.9                 4   
 4          5       1   28           6             5.9                 4   
 
    Physical Activity Level  Stress Level  BMI Category  Blood Pressure  \
 0                       42             6             3              11   
 1                       60             8             0               9   
 2                       60             8             0               9   
 3                       30             8             2              22   
 4                       30             8             2              22   
 
    Heart Rate  Daily Steps  Systolic  Dystolic  
 0          77         4200       126   

### 4. Model Building

#### 4.1 Regularization
Inorder to regularize the numeric features as the model better understands the regularized data in a much better way.

Thus we are applying `StandardScaler()` to the numeric fetures for the regularization purpose

In [None]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, stratify = y, random_state = 42)

# Applying Standard to Numeric variables
scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
y_train_scaled = y_train.copy()
y_test_scaled = y_test.copy()

# List of key numerical variables
num_vars = data.select_dtypes(include=['int64', 'float64']).columns.tolist()

X_train_scaled[num_vars] = scaler.fit_transform(X_train[num_vars])
X_test_scaled[num_vars] = scaler.fit_transform(X_test[num_vars])
y_train_scaled[num_vars] = scaler.fit_transform(y_train[num_vars])
y_test_scaled[num_vars] = scaler.fit_transform(y_test[num_vars])

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

#### 4.2 Trainig Function
We will train the model on the training data with `cross-validation`. As cross-validation provides an estimate of the model's performance based on multiple splits of the training data, which is more robust because it reduces the risk that the model is only tuned to a specific portion of the data

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Function to train and validate models
def train_and_cross_validate(model, X_train, y_train, cv = 5):
  """
  Function to train the models using cross-validation and return the average score
  """
  scores = cross_val_score(model, X_train, y_train, cv = cv, scoring = 'f1_weight')
  print(f"{model.__class__.__name__} Cross-Validation F1_weighted: {np.mean(scores):.2f} +/- {np.std(scores):.2f}")
  model.fit(X_train, y_train)
  return model

In [None]:
train_and_cross_validate(RandomForestClassifier, X_train_scaled, y_train_scaled)

NameError: name 'y_train_scaled' is not defined