In [1]:
# Prints a fixed greeting; presumably a leftover kernel sanity check.
print("Hello World")

Hello World


In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler,LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
import os
import warnings
import pickle
# Blanket filter: from this point on every warning is hidden, including
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [3]:
# Directory holding the input CSV (the preprocessed CSV is written here too).
# Set the SLEEP_DATASET_PATH environment variable to run the notebook on another
# machine; the original absolute path is kept as the default.
DATASET_PATH = os.environ.get(
    'SLEEP_DATASET_PATH',
    '/Users/lokeshnagasaidarla/Developer/webdev/sleep-health-cardio-prediction/sleep-health-detection/dataset/',
)
# Load the raw sleep/lifestyle table.
df = pd.read_csv(os.path.join(DATASET_PATH, "sleep_lifestyle.csv"))

In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [5]:
# Rows with no recorded disorder are missing in the loaded frame; label them explicitly.
# The result is assigned back rather than using fillna(inplace=True) on the column
# selection: the in-place form is chained assignment, which pandas warns about and
# which no longer updates `df` once Copy-on-Write is active.
df['Sleep Disorder'] = df['Sleep Disorder'].fillna('No Disorder')
# Remove the per-row identifier column.
df = df.drop(columns=['Person ID'])

In [6]:
# Re-inspect the frame after filling the missing labels and dropping 'Person ID'.
df.head()

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,No Disorder
1,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,No Disorder
2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,No Disorder
3,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [7]:
# Create age groups.
# Bin edges are right-inclusive so each label matches the ages it contains
# (30 -> '21-30', 40 -> '31-40', ...). With right=False an age of 30 was filed
# under '31-40', contradicting the label text.
# include_lowest=True keeps age 0 inside the first bin; that bin also receives age 20.
# Ages above 70 fall outside the bins and become NaN.
bins = [0, 20, 30, 40, 50, 60, 70]
labels = ['<20', '21-30', '31-40', '41-50', '51-60', '61-70']
df['Age Group'] = pd.cut(
    df['Age'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)
# Drop the 'Age' column now that it has been replaced by 'Age Group'
df = df.drop(columns=['Age'])

In [8]:
# Break the "systolic/diastolic" text into two integer columns,
# then discard the combined text column.
pressure_parts = df['Blood Pressure'].str.split('/', expand=True)
df[['Systolic', 'Diastolic']] = pressure_parts.astype(int)
df = df.drop(columns=['Blood Pressure'])

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder

class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
    """Integer-encode selected columns using one ``LabelEncoder`` per column.

    Parameters
    ----------
    columns : list of str
        Names of the columns to encode. The input ``X`` must support
        ``X[column]`` lookups and ``X.copy()`` (e.g. a pandas DataFrame).

    Attributes
    ----------
    label_encoders : dict
        Maps each column name to the ``LabelEncoder`` fitted on it.
        Empty until ``fit`` has been called.
    """

    def __init__(self, columns):
        self.columns = columns
        self.label_encoders = {}

    def fit(self, X, y=None):
        """Fit a fresh ``LabelEncoder`` on every configured column.

        Parameters
        ----------
        X : table-like supporting ``X[column]``
            Data containing every name in ``columns``.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self
        """
        # Build a new mapping on every call so that a refit (for example after
        # changing ``columns``) cannot leave encoders behind for columns that
        # are no longer configured; ``transform`` iterates over this mapping.
        encoders = {}
        for column in self.columns:
            le = LabelEncoder()
            le.fit(X[column])
            encoders[column] = le
        self.label_encoders = encoders
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each fitted column replaced by its codes.

        The input is not modified. Columns without a fitted encoder are left
        as they are; encoding itself is delegated to ``LabelEncoder.transform``.
        """
        X = X.copy()
        for column, le in self.label_encoders.items():
            X[column] = le.transform(X[column])
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the configured column names as an array (``input_features`` is ignored)."""
        return np.array(self.columns)

In [10]:
# Blood-pressure readings: fill gaps with the column mean, then standardise.
blood_pressure_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns that are turned into integer codes.
label_encoded_columns = ['Occupation', 'BMI Category', 'Age Group']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', blood_pressure_steps, ['Systolic', 'Diastolic']),
        ('gender', OneHotEncoder(drop='first'), ['Gender']),
        (
            'label_encoder',
            LabelEncoderTransformer(columns=list(label_encoded_columns)),
            label_encoded_columns,
        ),
    ],
    # Every column not listed above is forwarded unchanged.
    remainder='passthrough',
)


In [11]:
# Display the ColumnTransformer's repr (it has not been fitted at this point).
preprocessor

In [12]:
# Separate the target label from the feature columns
y = df['Sleep Disorder']
X = df.drop(columns=['Sleep Disorder'])

# Learn the preprocessing on the features and apply it in one pass
X_processed = preprocessor.fit_transform(X)

# Wrap the transformed values in a DataFrame labelled with the transformer's output names
feature_names = preprocessor.get_feature_names_out()
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

# Append the target (re-indexed from 0 so rows align) and write everything to CSV
target = y.reset_index(drop=True)
preprocessed_df = pd.concat([X_processed_df, target], axis=1)
output_csv = os.path.join(DATASET_PATH, "preprocessed_data.csv")
preprocessed_df.to_csv(output_csv, index=False)

In [13]:
# NOTE(review): this cell repeats the concatenation and CSV export that the
# previous cell already performs, so the same file is written a second time.
# Concatenate the processed features with the target variable
preprocessed_df = pd.concat([X_processed_df, y.reset_index(drop=True)], axis=1)

# Save the preprocessed data along with the target variable to a CSV file
preprocessed_df.to_csv(os.path.join(DATASET_PATH, "preprocessed_data.csv"), index=False)

In [14]:
# Directory where serialized artifacts are written.
# Set the SLEEP_MODELS_PATH environment variable to redirect the output on another
# machine; the original absolute path is kept as the default.
MODELS_PATH = os.environ.get(
    'SLEEP_MODELS_PATH',
    '/Users/lokeshnagasaidarla/Developer/webdev/sleep-health-cardio-prediction/sleep-health-detection/models/',
)

In [15]:
# Save the preprocessor pipeline as a .pkl file
# Create the target directory first so the write does not fail with
# FileNotFoundError when the models folder has not been created yet.
os.makedirs(MODELS_PATH, exist_ok=True)
# NOTE: pickle stores classes by reference (module + name). The pipeline contains
# LabelEncoderTransformer, which is defined in this notebook, so any program that
# loads this file must make that class available under the same module path.
with open(os.path.join(MODELS_PATH, "preprocessor.pkl"), 'wb') as file:
    pickle.dump(preprocessor, file)
