In [60]:
import pandas as pd
import numpy as np
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

## Data Extraction and Initial Loading

In [61]:
# Name of the CSV file inside the Kaggle dataset that should be loaded
file_path = "sleep_deprivation_dataset_detailed.csv"

# Fetch the latest version of the dataset and read the CSV into a pandas
# DataFrame through kagglehub's pandas adapter.
# NOTE(review): recent kagglehub releases appear to name this entry point
# `dataset_load` — confirm against the installed version before switching.
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "sacramentotechnology/sleep-deprivation-and-cognitive-performance",
  file_path,
)

# Print the label on its own line: passing the frame as a second print()
# argument glues the label onto the header row and pushes the column names
# out of alignment with the data rows below.
print("First 5 records:")
print(df.head())

First 5 records:   Participant_ID  Sleep_Hours  Sleep_Quality_Score  Daytime_Sleepiness  \
0             P1         5.25                   15                  12   
1             P2         8.70                   12                  14   
2             P3         7.39                   17                  10   
3             P4         6.59                   14                   3   
4             P5         3.94                   20                  12   

   Stroop_Task_Reaction_Time  N_Back_Accuracy  Emotion_Regulation_Score  \
0                       1.60            64.20                        12   
1                       2.54            65.27                        21   
2                       3.40            74.28                        35   
3                       3.54            72.42                        25   
4                       3.09            99.72                        60   

   PVT_Reaction_Time  Age  Gender    BMI  Caffeine_Intake  \
0             365.85   35 

In [62]:
# Snapshot the loaded frame as an independent (deep) copy
df_copy = df.copy(deep=True)

## Data Exploration and Initial Cleaning

In [64]:
# Preview the top five rows of the dataset
df.head(n=5)

Unnamed: 0,Participant_ID,Sleep_Hours,Sleep_Quality_Score,Daytime_Sleepiness,Stroop_Task_Reaction_Time,N_Back_Accuracy,Emotion_Regulation_Score,PVT_Reaction_Time,Age,Gender,BMI,Caffeine_Intake,Physical_Activity_Level,Stress_Level
0,P1,5.25,15,12,1.6,64.2,12,365.85,35,Female,30.53,2,1,33
1,P2,8.7,12,14,2.54,65.27,21,288.95,20,Male,27.28,3,8,37
2,P3,7.39,17,10,3.4,74.28,35,325.93,18,Male,30.0,1,2,32
3,P4,6.59,14,3,3.54,72.42,25,276.86,18,Male,34.47,5,0,23
4,P5,3.94,20,12,3.09,99.72,60,383.45,36,Male,29.7,3,4,14


In [65]:
# Preview the bottom five rows of the dataset
df.tail(n=5)

Unnamed: 0,Participant_ID,Sleep_Hours,Sleep_Quality_Score,Daytime_Sleepiness,Stroop_Task_Reaction_Time,N_Back_Accuracy,Emotion_Regulation_Score,PVT_Reaction_Time,Age,Gender,BMI,Caffeine_Intake,Physical_Activity_Level,Stress_Level
55,P56,8.53,16,16,3.51,64.76,41,391.48,34,Male,23.01,2,5,8
56,P57,3.53,19,16,3.24,66.13,36,355.01,24,Female,34.93,2,1,25
57,P58,4.18,3,1,2.62,92.43,29,397.13,41,Male,29.85,0,1,21
58,P59,3.27,4,1,4.32,56.83,33,330.7,40,Female,24.53,4,1,29
59,P60,4.95,6,21,4.42,85.45,21,419.01,22,Male,30.53,0,10,16


In [66]:
# Check the dimensions of the dataset, reported as a (rows, columns) tuple
df.shape

(60, 14)

In [67]:
# Summarize the frame: index range, then each column's name, non-null count,
# and data type
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Participant_ID             60 non-null     object 
 1   Sleep_Hours                60 non-null     float64
 2   Sleep_Quality_Score        60 non-null     int64  
 3   Daytime_Sleepiness         60 non-null     int64  
 4   Stroop_Task_Reaction_Time  60 non-null     float64
 5   N_Back_Accuracy            60 non-null     float64
 6   Emotion_Regulation_Score   60 non-null     int64  
 7   PVT_Reaction_Time          60 non-null     float64
 8   Age                        60 non-null     int64  
 9   Gender                     60 non-null     object 
 10  BMI                        60 non-null     float64
 11  Caffeine_Intake            60 non-null     int64  
 12  Physical_Activity_Level    60 non-null     int64  
 13  Stress_Level               60 non-null     int64  
dtypes: float64(5), int64(7), object(2)

In [68]:
# Count the missing entries in every column
df.isna().sum()

Participant_ID               0
Sleep_Hours                  0
Sleep_Quality_Score          0
Daytime_Sleepiness           0
Stroop_Task_Reaction_Time    0
N_Back_Accuracy              0
Emotion_Regulation_Score     0
PVT_Reaction_Time            0
Age                          0
Gender                       0
BMI                          0
Caffeine_Intake              0
Physical_Activity_Level      0
Stress_Level                 0
dtype: int64

In [69]:
# Generate descriptive statistics (count, mean, std, min, quartiles, max) for
# the numerical columns; the object columns (Participant_ID, Gender) do not
# appear in the result
df.describe()

Unnamed: 0,Sleep_Hours,Sleep_Quality_Score,Daytime_Sleepiness,Stroop_Task_Reaction_Time,N_Back_Accuracy,Emotion_Regulation_Score,PVT_Reaction_Time,Age,BMI,Caffeine_Intake,Physical_Activity_Level,Stress_Level
count,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0
mean,5.8055,8.316667,12.0,3.245,75.005833,38.15,332.539,29.516667,27.329333,2.383333,4.116667,17.866667
std,1.832357,5.63732,7.577845,0.827368,13.671457,17.1334,87.800288,8.168753,4.541382,1.688362,3.108717,11.015346
min,3.12,0.0,0.0,1.6,50.9,10.0,201.56,18.0,18.74,0.0,0.0,0.0
25%,4.1075,4.0,6.0,2.595,64.62,25.0,257.535,21.75,23.58,1.0,1.0,8.75
50%,5.69,8.0,11.5,3.26,74.27,37.0,327.21,28.5,27.365,2.5,4.0,17.5
75%,7.285,13.0,19.0,3.965,85.6,54.25,402.845,36.0,30.7475,4.0,6.0,26.25
max,8.82,20.0,24.0,4.49,99.73,67.0,494.55,43.0,34.93,5.0,10.0,40.0


In [70]:
# Count the rows that exactly repeat an earlier row
df.duplicated(keep="first").sum()

0

In [71]:
# Check the data type of every column (the same dtypes are also listed in the
# df.info() summary)
df.dtypes

Participant_ID                object
Sleep_Hours                  float64
Sleep_Quality_Score            int64
Daytime_Sleepiness             int64
Stroop_Task_Reaction_Time    float64
N_Back_Accuracy              float64
Emotion_Regulation_Score       int64
PVT_Reaction_Time            float64
Age                            int64
Gender                        object
BMI                          float64
Caffeine_Intake                int64
Physical_Activity_Level        int64
Stress_Level                   int64
dtype: object

## Data Transformation and Feature Engineering