In [4]:
# Import + Cấu hình đường dẫn
import os, re
from glob import glob
import numpy as np

In [5]:
# Chốt Data_Root + lấy danh sách File
DATA_ROOT = "/kaggle/input/sis-fall-original-dataset/SisFall_dataset"

all_files = glob(DATA_ROOT + "/SA*/*.txt")
print("Total files:", len(all_files))
print("Example file:", all_files[0] if len(all_files) > 0 else "No files")

Total files: 3537
Example file: /kaggle/input/sis-fall-original-dataset/SisFall_dataset/SA01/D09_SA01_R02.txt


In [6]:
# Hàm gán nhãn (FALL/ADL) từ tên File
def label_from_filename(path):
    name = os.path.basename(path).upper()
    if re.match(r"F\d{2}_", name):
        return 1  # FALL
    if re.match(r"D\d{2}_", name):
        return 0  # ADL
    return None

In [7]:
# Hàm đọc File SisFall
_num_re = re.compile(r"[-+]?\d+(?:[.,]\d+)?")

def load_sisfall_file(path):
    rows = []
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            toks = _num_re.findall(line)
            if len(toks) < 6:
                continue

            vals = [t.replace(",", ".") for t in toks[:6]]
            try:
                rows.append([float(v) for v in vals])
            except:
                continue

    if len(rows) == 0:
        raise ValueError("No numeric rows parsed")
    return np.asarray(rows, dtype=np.float32)

In [8]:
# Load toàn bộ File -> x_raw, y_raw + thống kê
X_raw, y_raw = [], []
bad_files = []

for p in all_files:
    y = label_from_filename(p)
    if y is None:
        continue
    try:
        x = load_sisfall_file(p)   # (T, 6)
        X_raw.append(x)
        y_raw.append(y)
    except Exception as e:
        bad_files.append((p, str(e)))

print("Loaded signals:", len(X_raw))
print("Bad files:", len(bad_files))
print("FALL count:", int(np.sum(y_raw)))
print("ADL count:", len(y_raw) - int(np.sum(y_raw)))

if len(X_raw) > 0:
    print("Sample shape:", X_raw[0].shape)
else:
    print("No samples loaded yet.")

# in tối đa 5 file lỗi (nếu có)
for p, err in bad_files[:5]:
    print("BAD:", p, "|", err)

Loaded signals: 3537
Bad files: 0
FALL count: 1723
ADL count: 1814
Sample shape: (2400, 6)


In [9]:
# Windowing (Cắt cửa sổ)
def sliding_window(x, win_size, step):
    T, C = x.shape
    if T < win_size:
        return np.empty((0, win_size, C), dtype=np.float32)

    windows = []
    for start in range(0, T - win_size + 1, step):
        windows.append(x[start:start + win_size])

    return np.stack(windows).astype(np.float32)

In [10]:
# Tạo dataset window: X_windows, Y_windows
WIN = 400
STEP = 200

X_windows = []
y_windows = []

for x, y in zip(X_raw, y_raw):
    W = sliding_window(x, WIN, STEP)
    if len(W) == 0:
        continue
    X_windows.append(W)
    y_windows.append(np.full((W.shape[0],), y, dtype=np.int32))

X_windows = np.concatenate(X_windows, axis=0)
y_windows = np.concatenate(y_windows, axis=0)

print("X_windows shape:", X_windows.shape)
print("y_windows shape:", y_windows.shape)
print("Positive rate:", float(y_windows.mean()))


X_windows shape: (56233, 400, 6)
y_windows shape: (56233,)
Positive rate: 0.42496398911671085


In [11]:
# Trích đặc trưng (Features) cho RandomForest
def features_from_window(w):
    feats = np.concatenate([
        w.mean(axis=0),
        w.std(axis=0),
        w.max(axis=0),
        w.min(axis=0),
    ]).astype(np.float32)
    return feats

X_feat = np.stack([features_from_window(w) for w in X_windows])
print("X_feat shape:", X_feat.shape)  # (N, 24)

X_feat shape: (56233, 24)


In [12]:
# Train/Test split + Train RandomForest + Đánh giá
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

X_tr, X_te, y_tr, y_te = train_test_split(
    X_feat, y_windows,
    test_size=0.2,
    random_state=42,
    stratify=y_windows
)

rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced"
)

rf.fit(X_tr, y_tr)
y_pred = rf.predict(X_te)

print(classification_report(y_te, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_te, y_pred))

              precision    recall  f1-score   support

           0     0.9056    0.9688    0.9361      6467
           1     0.9533    0.8634    0.9061      4780

    accuracy                         0.9240     11247
   macro avg     0.9295    0.9161    0.9211     11247
weighted avg     0.9259    0.9240    0.9234     11247

Confusion matrix:
 [[6265  202]
 [ 653 4127]]


In [13]:
# Lưu Model
import joblib

joblib.dump(rf, "sisfall_random_forest.joblib")
print("Saved: sisfall_random_forest.joblib")

Saved: sisfall_random_forest.joblib


In [14]:
# Feature nâng cao
import numpy as np

def advanced_features(w):
    # w: (WIN,6) -> ax ay az gx gy gz
    a = w[:, :3]
    g = w[:, 3:6]
    amag = np.sqrt((a*a).sum(axis=1))  # (WIN,)
    gmag = np.sqrt((g*g).sum(axis=1))

    feats = []

    # stats 6 kênh gốc
    feats += list(w.mean(axis=0))
    feats += list(w.std(axis=0))
    feats += list(w.max(axis=0))
    feats += list(w.min(axis=0))

    # stats magnitude
    feats += [amag.mean(), amag.std(), amag.max(), amag.min()]
    feats += [gmag.mean(), gmag.std(), gmag.max(), gmag.min()]

    # energy (mean squared)
    feats += [(w*w).mean()]
    feats += [(amag*amag).mean()]
    feats += [(gmag*gmag).mean()]

    return np.array(feats, dtype=np.float32)

X_feat2 = np.stack([advanced_features(w) for w in X_windows])
print("X_feat2 shape:", X_feat2.shape)

X_feat2 shape: (56233, 35)


In [15]:
# Train lại RF trên feature mới + thử threshold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

X_tr, X_te, y_tr, y_te = train_test_split(
    X_feat2, y_windows, test_size=0.2, random_state=42, stratify=y_windows
)

rf2 = RandomForestClassifier(
    n_estimators=600,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced_subsample",
    max_depth=None
)

rf2.fit(X_tr, y_tr)

proba = rf2.predict_proba(X_te)[:, 1]

for thr in [0.5, 0.45, 0.4, 0.35]:
    y_pred = (proba >= thr).astype(int)
    print("\n==== threshold =", thr, "====")
    print(classification_report(y_te, y_pred, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_te, y_pred))


==== threshold = 0.5 ====
              precision    recall  f1-score   support

           0     0.9110    0.9702    0.9396      6467
           1     0.9557    0.8718    0.9118      4780

    accuracy                         0.9283     11247
   macro avg     0.9334    0.9210    0.9257     11247
weighted avg     0.9300    0.9283    0.9278     11247

Confusion matrix:
 [[6274  193]
 [ 613 4167]]

==== threshold = 0.45 ====
              precision    recall  f1-score   support

           0     0.9293    0.9592    0.9440      6467
           1     0.9423    0.9013    0.9213      4780

    accuracy                         0.9346     11247
   macro avg     0.9358    0.9302    0.9326     11247
weighted avg     0.9348    0.9346    0.9344     11247

Confusion matrix:
 [[6203  264]
 [ 472 4308]]

==== threshold = 0.4 ====
              precision    recall  f1-score   support

           0     0.9432    0.9417    0.9424      6467
           1     0.9213    0.9232    0.9223      4780

    accu

In [16]:
import joblib

joblib.dump(
    {
        "model": rf2,
        "threshold": 0.4,
        "features": "advanced_features",
        "win": WIN,
        "step": STEP
    },
    "sisfall_rf_final.joblib"
)

print("Saved final model: sisfall_rf_final.joblib")

Saved final model: sisfall_rf_final.joblib
