In [6]:
import glob
import pandas as pd

files = glob.glob("feature-data/*_feature.csv")
df_all = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
print(df_all.shape)


(144000, 35)


In [17]:
import glob
import pandas as pd

# Path to your CSV files
files = glob.glob("feature-data/*_feature.csv")

# Loop through each file and print label counts
for file in files:
    df = pd.read_csv(file)
    
    # You can change 'avg_heart_rate' to the actual label column if different (e.g., 'label' or 'drowsy')
    label_col = 'label'
    
    if label_col in df.columns:
        counts = df[label_col].value_counts().sort_index()
        print(f"\nFile: {file}")
        for label, count in counts.items():
            print(f"  Label {label}: {count} samples")
    else:
        print(f"\nFile: {file} — Column '{label_col}' not found.")



File: feature-data\01M_1_feature.csv
  Label 0: 7176 samples
  Label 1: 24 samples

File: feature-data\01M_2_feature.csv
  Label 0: 7144 samples
  Label 1: 56 samples

File: feature-data\02F_1_feature.csv
  Label 0: 7165 samples
  Label 1: 35 samples

File: feature-data\02F_2_feature.csv
  Label 0: 7171 samples
  Label 1: 29 samples

File: feature-data\03F_1_feature.csv
  Label 0: 7115 samples
  Label 1: 85 samples

File: feature-data\03F_2_feature.csv
  Label 0: 7108 samples
  Label 1: 92 samples

File: feature-data\04M_1_feature.csv
  Label 0: 7164 samples
  Label 1: 36 samples

File: feature-data\04M_2_feature.csv
  Label 0: 7155 samples
  Label 1: 45 samples

File: feature-data\05M_1_feature.csv
  Label 0: 7176 samples
  Label 1: 24 samples

File: feature-data\05M_2_feature.csv
  Label 0: 7162 samples
  Label 1: 38 samples

File: feature-data\06M_1_feature.csv
  Label 0: 7165 samples
  Label 1: 35 samples

File: feature-data\06M_2_feature.csv
  Label 0: 7183 samples
  Label 1: 17 

In [2]:
import glob
import pandas as pd
import numpy as np

# Load all the CSV files
files = glob.glob("feature-data/*_feature.csv")
df_all = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

# Ensure the column exists
if 'avg_heart_rate' not in df_all.columns:
    raise ValueError("Column 'avg_heart_rate' not found in the dataset.")

# Get min and max
min_hr = df_all['avg_heart_rate'].min()
max_hr = df_all['avg_heart_rate'].max()

print(f"Min avg_heart_rate: {min_hr}")
print(f"Max avg_heart_rate: {max_hr}")

# Define bins (windows of 10)
bins = np.arange(start=np.floor(min_hr), stop=np.ceil(max_hr) + 10, step=10)

# Use pandas cut to assign each value to a bin
df_all['hr_bin'] = pd.cut(df_all['avg_heart_rate'], bins=bins)

# Count samples per bin
bin_counts = df_all['hr_bin'].value_counts().sort_index()

# Display the results
print("\nSample count in each heart rate window (10 bpm steps):")
for bin_range, count in bin_counts.items():
    print(f"{bin_range}: {count} samples")


Min avg_heart_rate: 0
Max avg_heart_rate: 360

Sample count in each heart rate window (10 bpm steps):
(0.0, 10.0]: 0 samples
(10.0, 20.0]: 0 samples
(20.0, 30.0]: 0 samples
(30.0, 40.0]: 0 samples
(40.0, 50.0]: 0 samples
(50.0, 60.0]: 102346 samples
(60.0, 70.0]: 0 samples
(70.0, 80.0]: 0 samples
(80.0, 90.0]: 0 samples
(90.0, 100.0]: 0 samples
(100.0, 110.0]: 0 samples
(110.0, 120.0]: 36815 samples
(120.0, 130.0]: 0 samples
(130.0, 140.0]: 0 samples
(140.0, 150.0]: 0 samples
(150.0, 160.0]: 0 samples
(160.0, 170.0]: 0 samples
(170.0, 180.0]: 3250 samples
(180.0, 190.0]: 0 samples
(190.0, 200.0]: 0 samples
(200.0, 210.0]: 0 samples
(210.0, 220.0]: 0 samples
(220.0, 230.0]: 0 samples
(230.0, 240.0]: 1062 samples
(240.0, 250.0]: 0 samples
(250.0, 260.0]: 0 samples
(260.0, 270.0]: 0 samples
(270.0, 280.0]: 0 samples
(280.0, 290.0]: 0 samples
(290.0, 300.0]: 449 samples
(300.0, 310.0]: 0 samples
(310.0, 320.0]: 0 samples
(320.0, 330.0]: 0 samples
(330.0, 340.0]: 0 samples
(340.0, 350.0]: 0

In [11]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [12]:

# -----------------------------
# Load all feature CSV files
# -----------------------------
DATA_PATH = "feature-data"
feature_files = [f for f in os.listdir(DATA_PATH) if f.endswith("_feature.csv")]

dfs = []
for f in feature_files:
    df = pd.read_csv(os.path.join(DATA_PATH, f))
    df["source_file"] = f  # keep track of origin
    dfs.append(df)

data = pd.concat(dfs, ignore_index=True)
print("✅ Data loaded:", data.shape)

# -----------------------------
# Handle NaN / Inf
# -----------------------------
data = data.replace([np.inf, -np.inf], np.nan).dropna()

# -----------------------------
# Split features and labels
# -----------------------------
X = data.drop(columns=["label", "source_file"])
y = data["label"]

# Standardize features (important for SVM, KNN, Logistic)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -----------------------------
# Train-test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print(X_scaled.shape,X_train.shape,X_test.shape)
print(y.shape, y_train.shape, y_test.shape)






# -----------------------------
# Define classifiers
# -----------------------------
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Support Vector Machine": SVC(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# -----------------------------
# Train & Evaluate
# -----------------------------
for name, clf in classifiers.items():
    print(f"\n=== {name} ===")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Data loaded: (144000, 36)
(143954, 34) (115163, 34) (28791, 34)
(143954,) (115163,) (28791,)

=== Logistic Regression ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Classification Report:
               precision    recall  f1-score   support

           0     0.9945    1.0000    0.9972     28633
           1     0.0000    0.0000    0.0000       158

    accuracy                         0.9945     28791
   macro avg     0.4973    0.5000    0.4986     28791
weighted avg     0.9891    0.9945    0.9918     28791

Confusion Matrix:
 [[28633     0]
 [  158     0]]

=== K-Nearest Neighbors ===
Classification Report:
               precision    recall  f1-score   support

           0     0.9946    1.0000    0.9973     28633
           1     0.6667    0.0127    0.0248       158

    accuracy                         0.9945     28791
   macro avg     0.8306    0.5063    0.5111     28791
weighted avg     0.9928    0.9945    0.9919     28791

Confusion Matrix:
 [[28632     1]
 [  156     2]]

=== Decision Tree ===
Classification Report:
               precision    recall  f1-score   support

           0     0.9946    0.9938    0.9942     28633
           1 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Classification Report:
               precision    recall  f1-score   support

           0     0.9945    1.0000    0.9972     28633
           1     0.0000    0.0000    0.0000       158

    accuracy                         0.9945     28791
   macro avg     0.4973    0.5000    0.4986     28791
weighted avg     0.9891    0.9945    0.9918     28791

Confusion Matrix:
 [[28633     0]
 [  158     0]]

=== Support Vector Machine ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Classification Report:
               precision    recall  f1-score   support

           0     0.9945    1.0000    0.9972     28633
           1     0.0000    0.0000    0.0000       158

    accuracy                         0.9945     28791
   macro avg     0.4973    0.5000    0.4986     28791
weighted avg     0.9891    0.9945    0.9918     28791

Confusion Matrix:
 [[28633     0]
 [  158     0]]

=== Gradient Boosting ===
Classification Report:
               precision    recall  f1-score   support

           0     0.9945    0.9995    0.9970     28633
           1     0.0000    0.0000    0.0000       158

    accuracy                         0.9940     28791
   macro avg     0.4973    0.4997    0.4985     28791
weighted avg     0.9891    0.9940    0.9915     28791

Confusion Matrix:
 [[28618    15]
 [  158     0]]


In [13]:
import numpy as np
from imblearn.over_sampling import SMOTE

# SMOTE Oversampling on train set only
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_res, y_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", np.bincount(y_train))
print("After SMOTE :", np.bincount(y_res))

# Updated classifiers with class weights
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    "Support Vector Machine": SVC(random_state=42, class_weight='balanced'),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

for name, clf in classifiers.items():
    print(f"\n=== {name} ===")
    clf.fit(X_res, y_res)
    y_pred = clf.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Before SMOTE: [114529    634]
After SMOTE : [114529 114529]

=== Logistic Regression ===
Classification Report:
               precision    recall  f1-score   support

           0     0.9974    0.7481    0.8550     28633
           1     0.0139    0.6456    0.0273       158

    accuracy                         0.7476     28791
   macro avg     0.5057    0.6968    0.4411     28791
weighted avg     0.9920    0.7476    0.8504     28791

Confusion Matrix:
 [[21421  7212]
 [   56   102]]

=== K-Nearest Neighbors ===
Classification Report:
               precision    recall  f1-score   support

           0     0.9953    0.9434    0.9687     28633
           1     0.0188    0.1962    0.0343       158

    accuracy                         0.9393     28791
   macro avg     0.5070    0.5698    0.5015     28791
weighted avg     0.9900    0.9393    0.9635     28791

Confusion Matrix:
 [[27012  1621]
 [  127    31]]

=== Decision Tree ===
Classification Report:
               precision    recall

### **For 5s Window CSV FILES same Logic**

In [15]:
import glob
import pandas as pd

files = glob.glob("feature-data-5second/*_feature5s.csv")
df_all = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
print(df_all.shape)

(28800, 35)


In [18]:
import glob
import pandas as pd

# Path to your CSV files
files = glob.glob("feature-data-5second/*_feature5s.csv")

# Loop through each file and print label counts
for file in files:
    df = pd.read_csv(file)
    
    # You can change 'avg_heart_rate' to the actual label column if different (e.g., 'label' or 'drowsy')
    label_col = 'label'
    
    if label_col in df.columns:
        counts = df[label_col].value_counts().sort_index()
        print(f"\nFile: {file}")
        for label, count in counts.items():
            print(f"  Label {label}: {count} samples")
    else:
        print(f"\nFile: {file} — Column '{label_col}' not found.")



File: feature-data-5second\01M_1_feature5s.csv
  Label 0: 1416 samples
  Label 1: 24 samples

File: feature-data-5second\01M_2_feature5s.csv
  Label 0: 1384 samples
  Label 1: 56 samples

File: feature-data-5second\02F_1_feature5s.csv
  Label 0: 1406 samples
  Label 1: 34 samples

File: feature-data-5second\02F_2_feature5s.csv
  Label 0: 1411 samples
  Label 1: 29 samples

File: feature-data-5second\03F_1_feature5s.csv
  Label 0: 1355 samples
  Label 1: 85 samples

File: feature-data-5second\03F_2_feature5s.csv
  Label 0: 1349 samples
  Label 1: 91 samples

File: feature-data-5second\04M_1_feature5s.csv
  Label 0: 1404 samples
  Label 1: 36 samples

File: feature-data-5second\04M_2_feature5s.csv
  Label 0: 1395 samples
  Label 1: 45 samples

File: feature-data-5second\05M_1_feature5s.csv
  Label 0: 1417 samples
  Label 1: 23 samples

File: feature-data-5second\05M_2_feature5s.csv
  Label 0: 1403 samples
  Label 1: 37 samples

File: feature-data-5second\06M_1_feature5s.csv
  Label 0: 1

In [20]:
import glob
import pandas as pd
import numpy as np

# Load all the CSV files
files = glob.glob("feature-data-5second/*_feature5s.csv")
df_all = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

# Ensure the column exists
if 'avg_heart_rate' not in df_all.columns:
    raise ValueError("Column 'avg_heart_rate' not found in the dataset.")

# Get min and max
min_hr = df_all['avg_heart_rate'].min()
max_hr = df_all['avg_heart_rate'].max()

print(f"Min avg_heart_rate: {min_hr}")
print(f"Max avg_heart_rate: {max_hr}")

# Define bins (windows of 10)
bins = np.arange(start=np.floor(min_hr), stop=np.ceil(max_hr) + 10, step=10)

# Use pandas cut to assign each value to a bin
df_all['hr_bin'] = pd.cut(df_all['avg_heart_rate'], bins=bins)

# Count samples per bin
bin_counts = df_all['hr_bin'].value_counts().sort_index()

# Display the results
print("\nSample count in each heart rate window (10 bpm steps):")
for bin_range, count in bin_counts.items():
    print(f"{bin_range}: {count} samples")


Min avg_heart_rate: 24.0
Max avg_heart_rate: 288.0

Sample count in each heart rate window (10 bpm steps):
(24.0, 34.0]: 0 samples
(34.0, 44.0]: 16 samples
(44.0, 54.0]: 871 samples
(54.0, 64.0]: 6677 samples
(64.0, 74.0]: 11337 samples
(74.0, 84.0]: 6144 samples
(84.0, 94.0]: 0 samples
(94.0, 104.0]: 1583 samples
(104.0, 114.0]: 857 samples
(114.0, 124.0]: 532 samples
(124.0, 134.0]: 323 samples
(134.0, 144.0]: 206 samples
(144.0, 154.0]: 0 samples
(154.0, 164.0]: 110 samples
(164.0, 174.0]: 48 samples
(174.0, 184.0]: 24 samples
(184.0, 194.0]: 11 samples
(194.0, 204.0]: 5 samples
(204.0, 214.0]: 0 samples
(214.0, 224.0]: 1 samples
(224.0, 234.0]: 3 samples
(234.0, 244.0]: 6 samples
(244.0, 254.0]: 16 samples
(254.0, 264.0]: 16 samples
(264.0, 274.0]: 0 samples
(274.0, 284.0]: 11 samples
(284.0, 294.0]: 2 samples


In [19]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix


# -----------------------------
# Load all feature CSV files
# -----------------------------
DATA_PATH = "feature-data-5second"
feature_files = [f for f in os.listdir(DATA_PATH) if f.endswith("_feature5s.csv")]

dfs = []
for f in feature_files:
    df = pd.read_csv(os.path.join(DATA_PATH, f))
    df["source_file"] = f  # keep track of origin
    dfs.append(df)

data = pd.concat(dfs, ignore_index=True)
print("✅ Data loaded:", data.shape)

# -----------------------------
# Handle NaN / Inf
# -----------------------------
data = data.replace([np.inf, -np.inf], np.nan).dropna()

# -----------------------------
# Split features and labels
# -----------------------------
X = data.drop(columns=["label", "source_file"])
y = data["label"]

# Standardize features (important for SVM, KNN, Logistic)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -----------------------------
# Train-test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print(X_scaled.shape,X_train.shape,X_test.shape)
print(y.shape, y_train.shape, y_test.shape)


import numpy as np
from imblearn.over_sampling import SMOTE

# SMOTE Oversampling on train set only
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_res, y_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", np.bincount(y_train))
print("After SMOTE :", np.bincount(y_res))

# Updated classifiers with class weights
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    "Support Vector Machine": SVC(random_state=42, class_weight='balanced'),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

for name, clf in classifiers.items():
    print(f"\n=== {name} ===")
    clf.fit(X_res, y_res)
    y_pred = clf.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




✅ Data loaded: (28800, 36)
(28800, 34) (23040, 34) (5760, 34)
(28800,) (23040,) (5760,)
Before SMOTE: [22412   628]
After SMOTE : [22412 22412]

=== Logistic Regression ===
Classification Report:
               precision    recall  f1-score   support

           0     0.9892    0.7660    0.8634      5603
           1     0.0774    0.7006    0.1394       157

    accuracy                         0.7642      5760
   macro avg     0.5333    0.7333    0.5014      5760
weighted avg     0.9643    0.7642    0.8437      5760

Confusion Matrix:
 [[4292 1311]
 [  47  110]]

=== K-Nearest Neighbors ===
Classification Report:
               precision    recall  f1-score   support

           0     0.9818    0.8738    0.9246      5603
           1     0.0854    0.4204    0.1419       157

    accuracy                         0.8615      5760
   macro avg     0.5336    0.6471    0.5333      5760
weighted avg     0.9573    0.8615    0.9033      5760

Confusion Matrix:
 [[4896  707]
 [  91   66]]

===