# The Impact of Sleep Quality on Daily Cognitive Performance
## DSA 210 Project Analysis

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the dataset.
# Only the read itself is guarded, so the handler below can only be triggered
# by the CSV being absent, not by an error in the cleaning/display steps.
try:
    df = pd.read_csv("Dsa210_Project_Data.csv")
except FileNotFoundError:
    # NOTE: `df` stays undefined on this path, so the cells below will fail
    # with a NameError until the file path is fixed.
    print("Error: CSV file not found. Please check the file path.")
else:
    print("Data loaded successfully.")
    print(f"Shape before cleaning: {df.shape}")

    # Data cleaning: drop every row that has a missing value in any column.
    df = df.dropna()
    print(f"Shape after cleaning: {df.shape}")

    # Display first few rows. `display` is not imported in this file; it is
    # expected to be supplied by the IPython/Jupyter runtime.
    display(df.head())

# Step 1: Exploratory Data Analysis (EDA)
### Addressing feedback: "The visualization is very limited"
This section adds a correlation heatmap and a pairplot to give a better picture of the data before any statistical testing.

In [None]:
# ==========================================
# STEP 1: EXPLORATORY DATA ANALYSIS (EDA)
# ==========================================
import seaborn as sns
import matplotlib.pyplot as plt

# --- 1. Correlation Matrix (Heatmap) ---
# Pairwise correlations are computed over the numeric columns only.
plt.figure(figsize=(12, 10))
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

sns.heatmap(
    corr_matrix,
    annot=True,        # write the coefficient inside each cell
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
)
plt.title(
    "Correlation Matrix: Sleep Features vs. Cognitive Scores",
    fontsize=16,
)
plt.show()

# --- 2. Pairplot Analysis ---
# Only the requested columns that are actually present in the data are plotted.
selected_features = [
    'SleepDuration',
    'SleepEfficiency',
    'WASO',
    'ReactionTime',
    'AttentionScore',
]
existing_features = [
    name for name in selected_features if name in df.columns
]

if not existing_features:
    print("Warning: Selected features for pairplot not found in the dataset.")
else:
    sns.pairplot(df[existing_features], diag_kind='kde', corner=True)
    # y slightly above 1 places the title above the grid of axes.
    plt.suptitle(
        "Pairplot of Key Sleep & Cognitive Metrics",
        y=1.02,
        fontsize=16,
    )
    plt.show()

# Step 2: Statistical Hypothesis Testing
Here we run Pearson correlation tests to test our hypotheses about the relationship between sleep metrics and cognitive scores. A relationship is treated as significant when the p-value is below 0.05.

In [None]:
def _report_pearson(data, x_col, y_col, title, alpha=0.05):
    """Run a Pearson correlation test between two columns and print a report.

    Args:
        data: DataFrame expected to contain both columns.
        x_col: Name of the first column.
        y_col: Name of the second column.
        title: Heading printed above the results.
        alpha: Threshold the p-value is compared against.

    Returns:
        The ``(r, p_value)`` pair produced by ``stats.pearsonr``, or ``None``
        when the test was skipped (a column is missing, or there are fewer
        than two rows).
    """
    # A missing column skips the test without any output.
    if x_col not in data.columns or y_col not in data.columns:
        return None

    # pearsonr rejects inputs shorter than two observations; report the skip
    # instead of letting the whole cell fail.
    if len(data) < 2:
        print(f"\n{title}")
        print("   Skipped: at least two observations are required.")
        return None

    r, p_value = stats.pearsonr(data[x_col], data[y_col])
    print(f"\n{title}")
    print(f"   Pearson Correlation (r): {r:.3f}")
    print(f"   P-value: {p_value:.5f}")
    if p_value < alpha:
        print("   Result: Reject Null Hypothesis. Significant relationship found.")
    else:
        print("   Result: Fail to reject Null Hypothesis. No significant relationship.")
    return r, p_value


# Each result pair is bound only when its test actually ran, so the
# module-level names are defined exactly when the test produced values.

# Test 1: Sleep Duration vs. Reaction Time
_result = _report_pearson(
    df, 'SleepDuration', 'ReactionTime',
    "1. Hypothesis: Sleep Duration vs. Reaction Time",
)
if _result is not None:
    r_duration, p_duration = _result

# Test 2: Deep Sleep vs. Memory Recall
_result = _report_pearson(
    df, 'DeepSleep', 'MemoryRecall',
    "2. Hypothesis: Deep Sleep vs. Memory Recall",
)
if _result is not None:
    r_deep, p_deep = _result

# Test 3: WASO (Wake After Sleep Onset) vs. Attention Score
_result = _report_pearson(
    df, 'WASO', 'AttentionScore',
    "3. Hypothesis: WASO vs. Attention Score",
)
if _result is not None:
    r_attention, p_attention = _result

print("\n" + "=" * 50)

# Step 3: Machine Learning Analysis
### Addressing feedback: "Technical analysis seems limited"
This section trains a Random Forest Regressor to predict Reaction Time from the sleep metrics, then evaluates it on a held-out test set.

In [None]:
# ==========================================
# STEP 3: MACHINE LEARNING ANALYSIS
# ==========================================

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# --- 1. Feature Selection ---
target_col = 'ReactionTime'
feature_cols = ['SleepDuration', 'SleepEfficiency', 'DeepSleep', 'REM', 'WASO']

# Keep only the requested predictors that exist in the loaded data.
available_features = [col for col in feature_cols if col in df.columns]

if target_col in df.columns and available_features:
    X = df[available_features]
    y = df[target_col]

    # --- 2. Train-Test Split ---
    # 20% of the rows are held out; the fixed seed makes the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # --- 3. Model Training (Random Forest) ---
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # --- 4. Prediction & Evaluation ---
    # Both metrics are computed on the held-out rows only.
    y_pred = rf_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("-" * 40)
    print("Model Performance: Random Forest Regressor")
    print("-" * 40)
    print(f"Target Variable : {target_col}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    # The label previously contained a mis-encoded character ("RÂ²").
    print(f"R² Score        : {r2:.2f}")
    print("-" * 40)

    # --- 5. Feature Importance ---
    feature_importances = pd.Series(rf_model.feature_importances_, index=available_features)
    print("\nFeature Importance (Top Predictors):")
    print(feature_importances.sort_values(ascending=False))

    # --- 6. Visualization: Actual vs. Predicted ---
    plt.figure(figsize=(9, 7))
    plt.scatter(y_test, y_pred, color='#2c3e50', alpha=0.7, s=100, label='Test Data Points')

    # The y = x reference line spans the plotted values (test targets and
    # their predictions), so the axes are not stretched by training-only data.
    min_val = min(y_test.min(), y_pred.min())
    max_val = max(y_test.max(), y_pred.max())
    plt.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2, label='Perfect Prediction Line')

    plt.xlabel(f"Actual {target_col}", fontsize=12)
    plt.ylabel(f"Predicted {target_col}", fontsize=12)
    plt.title(f"Model Evaluation: Actual vs. Predicted {target_col}", fontsize=14)
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.show()

else:
    print(f"Error: Target '{target_col}' or features not found in dataset.")