In [2]:
# Import + C·∫•u h√¨nh ƒë∆∞·ªùng d·∫´n
import os, re
from glob import glob
import numpy as np

In [3]:
# Ch·ªët Data_Root + l·∫•y danh s√°ch File
DATA_ROOT = "/kaggle/input/sis-fall-original-dataset/SisFall_dataset"

all_files = glob(DATA_ROOT + "/SA*/*.txt")
print("Total files:", len(all_files))
print("Example file:", all_files[0] if len(all_files) > 0 else "No files")

Total files: 3537
Example file: /kaggle/input/sis-fall-original-dataset/SisFall_dataset/SA01/D09_SA01_R02.txt


In [4]:
# H√†m g√°n nh√£n (FALL/ADL) t·ª´ t√™n File
def label_from_filename(path):
    name = os.path.basename(path).upper()
    if re.match(r"F\d{2}_", name):
        return 1  # FALL
    if re.match(r"D\d{2}_", name):
        return 0  # ADL
    return None

In [5]:
# H√†m ƒë·ªçc File SisFall
_num_re = re.compile(r"[-+]?\d+(?:[.,]\d+)?")

def load_sisfall_file(path):
    rows = []
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            toks = _num_re.findall(line)
            if len(toks) < 6:
                continue

            vals = [t.replace(",", ".") for t in toks[:6]]
            try:
                rows.append([float(v) for v in vals])
            except:
                continue

    if len(rows) == 0:
        raise ValueError("No numeric rows parsed")
    return np.asarray(rows, dtype=np.float32)

In [6]:
# Load to√†n b·ªô File -> x_raw, y_raw + th·ªëng k√™
X_raw, y_raw = [], []
bad_files = []

for p in all_files:
    y = label_from_filename(p)
    if y is None:
        continue
    try:
        x = load_sisfall_file(p)   # (T, 6)
        X_raw.append(x)
        y_raw.append(y)
    except Exception as e:
        bad_files.append((p, str(e)))

print("Loaded signals:", len(X_raw))
print("Bad files:", len(bad_files))
print("FALL count:", int(np.sum(y_raw)))
print("ADL count:", len(y_raw) - int(np.sum(y_raw)))

if len(X_raw) > 0:
    print("Sample shape:", X_raw[0].shape)
else:
    print("No samples loaded yet.")

# in t·ªëi ƒëa 5 file l·ªói (n·∫øu c√≥)
for p, err in bad_files[:5]:
    print("BAD:", p, "|", err)

Loaded signals: 3537
Bad files: 0
FALL count: 1723
ADL count: 1814
Sample shape: (2400, 6)


In [7]:
# Windowing (C·∫Øt c·ª≠a s·ªï)
def sliding_window(x, win_size, step):
    T, C = x.shape
    if T < win_size:
        return np.empty((0, win_size, C), dtype=np.float32)

    windows = []
    for start in range(0, T - win_size + 1, step):
        windows.append(x[start:start + win_size])

    return np.stack(windows).astype(np.float32)

In [8]:
# T·∫°o dataset window: X_windows, Y_windows
WIN = 400
STEP = 200

X_windows = []
y_windows = []

for x, y in zip(X_raw, y_raw):
    W = sliding_window(x, WIN, STEP)
    if len(W) == 0:
        continue
    X_windows.append(W)
    y_windows.append(np.full((W.shape[0],), y, dtype=np.int32))

X_windows = np.concatenate(X_windows, axis=0)
y_windows = np.concatenate(y_windows, axis=0)

print("X_windows shape:", X_windows.shape)
print("y_windows shape:", y_windows.shape)
print("Positive rate:", float(y_windows.mean()))


X_windows shape: (56233, 400, 6)
y_windows shape: (56233,)
Positive rate: 0.42496398911671085


In [9]:
# Tr√≠ch ƒë·∫∑c tr∆∞ng (Features) cho RandomForest
def features_from_window(w):
    feats = np.concatenate([
        w.mean(axis=0),
        w.std(axis=0),
        w.max(axis=0),
        w.min(axis=0),
    ]).astype(np.float32)
    return feats

X_feat = np.stack([features_from_window(w) for w in X_windows])
print("X_feat shape:", X_feat.shape)  # (N, 24)

X_feat shape: (56233, 24)


In [10]:
# Train/Test split + Train RandomForest + ƒê√°nh gi√°
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

X_tr, X_te, y_tr, y_te = train_test_split(
    X_feat, y_windows,
    test_size=0.2,
    random_state=42,
    stratify=y_windows
)

rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced"
)

rf.fit(X_tr, y_tr)
y_pred = rf.predict(X_te)

print(classification_report(y_te, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_te, y_pred))

              precision    recall  f1-score   support

           0     0.9056    0.9688    0.9361      6467
           1     0.9533    0.8634    0.9061      4780

    accuracy                         0.9240     11247
   macro avg     0.9295    0.9161    0.9211     11247
weighted avg     0.9259    0.9240    0.9234     11247

Confusion matrix:
 [[6265  202]
 [ 653 4127]]


In [11]:
# L∆∞u Model
import joblib

joblib.dump(rf, "sisfall_random_forest.joblib")
print("Saved: sisfall_random_forest.joblib")

Saved: sisfall_random_forest.joblib


In [12]:
# Feature n√¢ng cao
import numpy as np

def advanced_features(w):
    # w: (WIN,6) -> ax ay az gx gy gz
    a = w[:, :3]
    g = w[:, 3:6]
    amag = np.sqrt((a*a).sum(axis=1))  # (WIN,)
    gmag = np.sqrt((g*g).sum(axis=1))

    feats = []

    # stats 6 k√™nh g·ªëc
    feats += list(w.mean(axis=0))
    feats += list(w.std(axis=0))
    feats += list(w.max(axis=0))
    feats += list(w.min(axis=0))

    # stats magnitude
    feats += [amag.mean(), amag.std(), amag.max(), amag.min()]
    feats += [gmag.mean(), gmag.std(), gmag.max(), gmag.min()]

    # energy (mean squared)
    feats += [(w*w).mean()]
    feats += [(amag*amag).mean()]
    feats += [(gmag*gmag).mean()]

    return np.array(feats, dtype=np.float32)

X_feat2 = np.stack([advanced_features(w) for w in X_windows])
print("X_feat2 shape:", X_feat2.shape)

X_feat2 shape: (56233, 35)


In [13]:
# Train l·∫°i RF tr√™n feature m·ªõi + th·ª≠ threshold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

X_tr, X_te, y_tr, y_te = train_test_split(
    X_feat2, y_windows, test_size=0.2, random_state=42, stratify=y_windows
)

rf2 = RandomForestClassifier(
    n_estimators=600,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced_subsample",
    max_depth=None
)

rf2.fit(X_tr, y_tr)

proba = rf2.predict_proba(X_te)[:, 1]

for thr in [0.5, 0.45, 0.4, 0.35]:
    y_pred = (proba >= thr).astype(int)
    print("\n==== threshold =", thr, "====")
    print(classification_report(y_te, y_pred, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_te, y_pred))


==== threshold = 0.5 ====
              precision    recall  f1-score   support

           0     0.9110    0.9702    0.9396      6467
           1     0.9557    0.8718    0.9118      4780

    accuracy                         0.9283     11247
   macro avg     0.9334    0.9210    0.9257     11247
weighted avg     0.9300    0.9283    0.9278     11247

Confusion matrix:
 [[6274  193]
 [ 613 4167]]

==== threshold = 0.45 ====
              precision    recall  f1-score   support

           0     0.9293    0.9592    0.9440      6467
           1     0.9423    0.9013    0.9213      4780

    accuracy                         0.9346     11247
   macro avg     0.9358    0.9302    0.9326     11247
weighted avg     0.9348    0.9346    0.9344     11247

Confusion matrix:
 [[6203  264]
 [ 472 4308]]

==== threshold = 0.4 ====
              precision    recall  f1-score   support

           0     0.9432    0.9417    0.9424      6467
           1     0.9213    0.9232    0.9223      4780

    accu

In [14]:
import joblib

joblib.dump(
    {
        "model": rf2,
        "threshold": 0.4,
        "features": "advanced_features",
        "win": WIN,
        "step": STEP
    },
    "sisfall_rf_final.joblib"
)

print("Saved final model: sisfall_rf_final.joblib")

Saved final model: sisfall_rf_final.joblib


## üîπ Support Vector Machine (SVM)

In [15]:
# Feature extraction
import numpy as np

def extract_features(window):
    """
    window: (WIN, 6)
    return: (30,) feature vector
    """
    feats = []
    for ch in range(window.shape[1]):
        sig = window[:, ch]
        feats.extend([
            np.mean(sig),
            np.std(sig),
            np.max(sig),
            np.min(sig),
            np.mean(sig ** 2)  # energy
        ])
    return np.array(feats, dtype=np.float32)

# Tr√≠ch feature cho to√†n b·ªô dataset
X_svm = np.array([extract_features(w) for w in X_windows])
y_svm = y_windows.copy()

print("X_svm shape:", X_svm.shape)
print("y_svm shape:", y_svm.shape)
print("Sample feature vector length:", X_svm.shape[1])

X_svm shape: (56233, 30)
y_svm shape: (56233,)
Sample feature vector length: 30


In [16]:
# Scale + split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Chia train / test
X_train, X_test, y_train, y_test = train_test_split(
    X_svm, y_svm,
    test_size=0.2,
    random_state=42,
    stratify=y_svm
)

# Chu·∫©n h√≥a feature
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Train shape:", X_train_scaled.shape)
print("Test shape :", X_test_scaled.shape)
print("Mean (train, first feature):", X_train_scaled[:, 0].mean())
print("Std  (train, first feature):", X_train_scaled[:, 0].std())

Train shape: (44986, 30)
Test shape : (11247, 30)
Mean (train, first feature): 0.0
Std  (train, first feature): 1.0


In [17]:
# Train + Evaluate
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, confusion_matrix
import time

# ===== 1) SVM Linear (NHANH) =====
t0 = time.time()
svm_linear = LinearSVC(
    C=1.0,
    class_weight="balanced",
    max_iter=20000,
    random_state=42
)
svm_linear.fit(X_train_scaled, y_train)
t1 = time.time()

pred_linear = svm_linear.predict(X_test_scaled)

print("=== SVM Linear ===")
print(f"Train time: {t1 - t0:.2f}s")
print(classification_report(y_test, pred_linear, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, pred_linear))

# ===== 2) SVM RBF (M·∫†NH nh∆∞ng c√≥ th·ªÉ CH·∫¨M) =====
# N·∫øu b·∫°n mu·ªën th·ª≠ lu√¥n, b·ªè comment 2 d√≤ng d∆∞·ªõi:
# run_rbf = True
run_rbf = False

if run_rbf:
    t0 = time.time()
    svm_rbf = SVC(
        kernel="rbf",
        C=3.0,
        gamma="scale",
        class_weight="balanced"
    )
    svm_rbf.fit(X_train_scaled, y_train)
    t1 = time.time()

    pred_rbf = svm_rbf.predict(X_test_scaled)

    print("\n=== SVM RBF ===")
    print(f"Train time: {t1 - t0:.2f}s")
    print(classification_report(y_test, pred_rbf, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_test, pred_rbf))

=== SVM Linear ===
Train time: 0.78s
              precision    recall  f1-score   support

           0     0.7835    0.8967    0.8363      6467
           1     0.8263    0.6649    0.7368      4780

    accuracy                         0.7982     11247
   macro avg     0.8049    0.7808    0.7866     11247
weighted avg     0.8017    0.7982    0.7940     11247

Confusion matrix:
 [[5799  668]
 [1602 3178]]


In [18]:
 # SVM Threshold Tuning 
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# L·∫•y decision score
scores = svm_linear.decision_function(X_test_scaled)

thresholds = [-1.0, -0.5, 0.0, 0.5, 1.0]

for th in thresholds:
    y_pred_th = (scores > th).astype(int)
    print(f"\n=== Threshold = {th} ===")
    print(classification_report(y_test, y_pred_th, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_th))


=== Threshold = -1.0 ===
              precision    recall  f1-score   support

           0     0.9379    0.0724    0.1344      6467
           1     0.4418    0.9935    0.6117      4780

    accuracy                         0.4639     11247
   macro avg     0.6899    0.5329    0.3730     11247
weighted avg     0.7271    0.4639    0.3372     11247

Confusion matrix:
 [[ 468 5999]
 [  31 4749]]

=== Threshold = -0.5 ===
              precision    recall  f1-score   support

           0     0.9312    0.3620    0.5213      6467
           1     0.5275    0.9638    0.6819      4780

    accuracy                         0.6178     11247
   macro avg     0.7294    0.6629    0.6016     11247
weighted avg     0.7596    0.6178    0.5896     11247

Confusion matrix:
 [[2341 4126]
 [ 173 4607]]

=== Threshold = 0.0 ===
              precision    recall  f1-score   support

           0     0.7835    0.8967    0.8363      6467
           1     0.8263    0.6649    0.7368      4780

    accuracy 

In [19]:
# SVM Save
import joblib

# 1) Baseline config
joblib.dump(
    {
        "model": svm_linear,
        "scaler": scaler,
        "features": "statistical_30 (mean,std,max,min,energy) on 6 channels",
        "threshold": 0.0,
        "win": WIN,
        "step": STEP
    },
    "svm_linear_baseline.joblib"
)

# 2) High-recall config (∆∞u ti√™n b·∫Øt ng√£)
joblib.dump(
    {
        "model": svm_linear,
        "scaler": scaler,
        "features": "statistical_30 (mean,std,max,min,energy) on 6 channels",
        "threshold": -0.5,
        "win": WIN,
        "step": STEP
    },
    "svm_linear_highrecall.joblib"
)

print("Saved:")
print("- svm_linear_baseline.joblib")
print("- svm_linear_highrecall.joblib")

Saved:
- svm_linear_baseline.joblib
- svm_linear_highrecall.joblib
