In [1]:
# This is your existing data-loading cell.
# (Your data-loading code is assumed to be here.)


Data loaded successfully.
Shape before cleaning: (30, 11)
Shape after cleaning: (30, 11)


# Step 1: Exploratory Data Analysis (EDA)
### Addressing feedback: "The visualization is very limited"
This section adds a correlation heatmap and pairplots to better understand the data before statistical testing.

In [None]:
# ==========================================
# STEP 1: EXPLORATORY DATA ANALYSIS (EDA)
# ==========================================
# Purpose: Visualize relationships between sleep metrics and cognitive scores.

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# --- 1. Correlation Matrix (Heatmap) ---
# Annotated heatmap of the pairwise correlation coefficients between every
# numeric column of `df`.

# Restrict to numeric columns first; non-numeric columns cannot be correlated.
numeric_df = df.select_dtypes(include="number")

# Pairwise correlations, computed with DataFrame.corr() defaults.
corr_matrix = numeric_df.corr()

# Draw on an explicitly created figure/axes pair rather than the implicit
# "current figure".
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,        # write the coefficient inside each cell
    fmt=".2f",         # two decimals keep the annotations readable
    cmap='coolwarm',
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Matrix: Sleep Features vs. Cognitive Scores", fontsize=16)
plt.show()

# --- 2. Pairplot Analysis ---
# A pairplot shows the distribution of each variable (KDE on the diagonal)
# together with scatter plots of every pair of variables.

selected_features = ['SleepDuration', 'SleepEfficiency', 'WASO', 'ReactionTime', 'AttentionScore']

# Plot only the requested columns that are actually present in `df`.
existing_features = [col for col in selected_features if col in df.columns]

# Report requested columns that are absent instead of dropping them silently,
# so a renamed or misspelled column does not go unnoticed.
missing_features = [col for col in selected_features if col not in df.columns]
if existing_features and missing_features:
    print(f"Note: pairplot skips columns not found in the dataset: {missing_features}")

if existing_features:
    # corner=True draws only the lower triangle, avoiding mirrored duplicates.
    sns.pairplot(df[existing_features], diag_kind='kde', corner=True)
    # y > 1 places the figure title just above the top row of panels.
    plt.suptitle("Pairplot of Key Sleep & Cognitive Metrics", y=1.02, fontsize=16)
    plt.show()
else:
    print("Warning: Selected features for pairplot not found in the dataset.")

# Step 2: Statistical Hypothesis Testing
*(Keep your existing p-value tests and statistical analysis code here / Mevcut istatistiksel test kodların bu arada kalmalı)*

# Step 3: Machine Learning Analysis
### Addressing feedback: "Technical analysis seems limited"
This section implements a Random Forest Regressor to predict Reaction Time based on sleep metrics.

In [None]:
# ==========================================
# STEP 3: MACHINE LEARNING ANALYSIS
# ==========================================
# Purpose: Predict cognitive performance (Reaction Time) based on sleep metrics.

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt

# --- 1. Feature Selection ---
# Target: ReactionTime (the variable we want to predict)
# Features: sleep metrics (the variables used to make the prediction)
target_col = 'ReactionTime'
feature_cols = ['SleepDuration', 'SleepEfficiency', 'DeepSleep', 'REM', 'WASO']

# Keep only the candidate features that are actually present in `df`.
available_features = [col for col in feature_cols if col in df.columns]

if target_col in df.columns and available_features:
    # Report candidate features that are absent, so the model is never
    # silently trained on a smaller feature set than intended.
    missing_features = [col for col in feature_cols if col not in df.columns]
    if missing_features:
        print(f"Warning: features not found in dataset and skipped: {missing_features}")

    X = df[available_features]
    y = df[target_col]

    # --- 2. Train-Test Split ---
    # 80% of the rows train the model, 20% are held out to evaluate it.
    # random_state fixes the shuffle so the split is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # --- 3. Model Training (Random Forest) ---
    # Ensemble of 100 trees; random_state makes the fitted model reproducible.
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # --- 4. Prediction & Evaluation ---
    # Metrics are computed on the held-out rows only. On a small dataset (the
    # load cell reports 30 rows) the test split holds only a handful of
    # samples, so treat MSE and R² as rough estimates.
    y_pred = rf_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("-" * 40)
    print("Model Performance: Random Forest Regressor")
    print("-" * 40)
    print(f"Target Variable : {target_col}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"R² Score        : {r2:.2f}")
    print("-" * 40)

    # --- 5. Feature Importance ---
    # Importances reported by the fitted forest, labelled by feature name and
    # listed from most to least important.
    feature_importances = pd.Series(rf_model.feature_importances_, index=available_features)
    print("\nFeature Importance (Top Predictors):")
    print(feature_importances.sort_values(ascending=False))

    # --- 6. Visualization: Actual vs. Predicted ---
    plt.figure(figsize=(9, 7))
    plt.scatter(y_test, y_pred, color='#2c3e50', alpha=0.7, s=100, label='Test Data Points')

    # Perfect prediction line (Predicted == Actual). Its range comes from the
    # plotted values (y_test and y_pred) only, so training targets that are
    # not shown in the scatter cannot stretch the axes.
    min_val = min(y_test.min(), y_pred.min())
    max_val = max(y_test.max(), y_pred.max())
    plt.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2, label='Perfect Prediction Line')

    plt.xlabel(f"Actual {target_col}", fontsize=12)
    plt.ylabel(f"Predicted {target_col}", fontsize=12)
    plt.title(f"Model Evaluation: Actual vs. Predicted {target_col}", fontsize=14)
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.show()

else:
    print(f"Error: Target '{target_col}' or features not found in dataset.")