In [None]:
!pip install pandas scikit-learn



In [None]:
# Load the sensor recordings from CSV and take a first look at them.
import pandas as pd

# Location of the CSV file; assumes a file such as 'mhealth.csv' has been uploaded
# next to the notebook.
DATA_PATH = 'mhealth.csv'

df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Report (rows, columns), then render the first five rows with the notebook's rich display.
print(f"Shape: {df.shape}")
display(df.head(5))

Shape: (1048575, 25)


Unnamed: 0,acc_chest_x,acc_chest_y,acc_chest_z,ecg_1,ecg_2,acc_ankle_x,acc_ankle_y,acc_ankle_z,gyro_ankle_x,gyro_ankle_y,...,acc_wrist_y,acc_wrist_z,gyro_wrist_x,gyro_wrist_y,gyro_wrist_z,mag_wrist_x,mag_wrist_y,mag_wrist_z,activity_label,subject
0,-9.8184,0.009971,0.29563,0.004186,0.004186,2.1849,-9.6967,0.63077,0.1039,-0.84053,...,-4.5781,0.18776,-0.44902,-1.0103,0.034483,-2.35,-1.6102,-0.030899,0,1
1,-9.8489,0.52404,0.37348,0.004186,0.016745,2.3876,-9.508,0.68389,0.085343,-0.83865,...,-4.3198,0.023595,-0.44902,-1.0103,0.034483,-2.1632,-0.88254,0.32657,0,1
2,-9.6602,0.18185,0.43742,0.016745,0.037677,2.4086,-9.5674,0.68113,0.085343,-0.83865,...,-4.2772,0.27572,-0.44902,-1.0103,0.034483,-1.6175,-0.16562,-0.030693,0,1
3,-9.6507,0.21422,0.24033,0.07954,0.11722,2.1814,-9.4301,0.55031,0.085343,-0.83865,...,-4.3163,0.36752,-0.45686,-1.0082,0.025862,-1.0771,0.006945,-0.38262,0,1
4,-9.703,0.30389,0.31156,0.22187,0.20513,2.4173,-9.3889,0.71098,0.085343,-0.83865,...,-4.1459,0.40729,-0.45686,-1.0082,0.025862,-0.53684,0.1759,-1.0955,0,1


In [None]:
import numpy as np

# Binary risk flag (0 or 1) assigned to every activity code found in 'activity_label'.
activity_to_risk = {
    0: 0,   # Nothing
    1: 0,   # Standing still
    2: 0,   # Sitting and relaxing
    3: 0,   # Lying down
    4: 1,   # Walking
    5: 1,   # Climbing stairs
    6: 1,   # Waist bends forward
    7: 0,   # Frontal elevation of arms
    8: 1,   # Knees bending (crouching)
    9: 1,   # Cycling
    10: 1,  # Jogging
    11: 1,  # Running
    12: 1,  # Jumping front & back
}

# Fail fast on activity codes that have no entry in the mapping: Series.map would
# otherwise turn them into NaN labels without any warning.
unmapped_codes = sorted(set(df['activity_label'].unique()) - set(activity_to_risk))
if unmapped_codes:
    raise ValueError(
        f"activity_label contains codes with no risk mapping: {unmapped_codes}"
    )

# Apply the activity-to-risk mapping to create the risk_label column.
df["risk_label"] = df['activity_label'].map(activity_to_risk)


In [None]:
# 3. Split into features and labels
target_col = 'risk_label'

# Columns that must not be fed to the models as predictors:
#   - the target itself, and 'activity_label' from which the target is derived
#   - 'subject', which is excluded from the feature matrix
#   - 'Unnamed: 0', the 0, 1, 2, ... row counter that appears as the first CSV column;
#     it encodes row order rather than a sensor reading, so keeping it would let a
#     model key on a row's position in the file
non_feature_cols = [target_col, 'subject', 'activity_label']
if 'Unnamed: 0' in df.columns:
    non_feature_cols.append('Unnamed: 0')

X = df.drop(columns=non_feature_cols)
y = df[target_col]


In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Oversample the training part only; the balanced copy is stored under its own
# names, so X_train / y_train still hold the unbalanced split at this point.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Summarise the split: sizes first, then the class proportions of each part.
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
for caption, labels in (
    ("Label distribution, train:", y_train),
    ("Label distribution, test :", y_test),
):
    print(caption, labels.value_counts(normalize=True).round(2))

Train shape: (838860, 23) Test shape: (209715, 23)
Label distribution, train: risk_label
0    0.82
1    0.18
Name: proportion, dtype: float64
Label distribution, test : risk_label
0    0.82
1    0.18
Name: proportion, dtype: float64


In [None]:
# Switch the training data to its SMOTE-balanced version (the test set is left untouched).
# The split cell has already run SMOTE(random_state=42).fit_resample on exactly this
# (X_train, y_train) and kept the result as X_train_bal / y_train_bal. Reusing it avoids
# repeating the same oversampling, and re-running this cell never resamples data that
# has already been resampled.
X_train, y_train = X_train_bal, y_train_bal

# Check new distribution
print("After SMOTE, Train label distribution:")
print(y_train.value_counts(normalize=True).round(2))


After SMOTE, Train label distribution:
risk_label
0    0.5
1    0.5
Name: proportion, dtype: float64


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Build and train a random forest: 200 trees with no depth limit, using all CPU cores.
# class_weight='balanced' is kept as a safeguard against any residual class imbalance
# in X_train / y_train as left by the preceding cells.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_leaf=1,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out test set.
y_pred_rf = rf.predict(X_test)

# Precision, recall and F1 use scikit-learn's default positive label (1).
f1 = f1_score(y_test, y_pred_rf)
rec = recall_score(y_test, y_pred_rf)
prec = precision_score(y_test, y_pred_rf)
acc = accuracy_score(y_test, y_pred_rf)

# Report the metrics, followed by the confusion matrix (rows: true class, columns: predicted).
print("\nRandom Forest Evaluation Metrics:")
print(f"Accuracy  : {acc:.2f}")
print(f"Precision : {prec:.2f}")
print(f"Recall    : {rec:.2f}")
print(f"F1 Score  : {f1:.2f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))



Random Forest Evaluation Metrics:
Accuracy  : 0.97
Precision : 0.86
Recall    : 0.98
F1 Score  : 0.91

Confusion Matrix:
[[165532   6264]
 [   803  37116]]


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Subsample the training data for the grid search so that tuning stays fast.
# The rows are drawn from a seeded generator: an unseeded draw would pick a different
# subset on every run and could therefore select a different C each time.
subset_size = min(100000, len(X_train))  # never ask for more rows than exist
rng = np.random.default_rng(42)
rnd_idx = rng.choice(X_train.index.to_numpy(), size=subset_size, replace=False)
X_gs = X_train.loc[rnd_idx]
y_gs = y_train.loc[rnd_idx]

# Pipeline: standardise the features, then fit a class-weighted logistic regression.
logreg_pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
)

# Hyper-parameters explored by the grid search.
param_grid = {
    'logisticregression__C': [0.1, 1, 10],
    'logisticregression__solver': ['lbfgs']  # lbfgs is good for L2 regularization
}

# 3-fold cross-validated search, selecting on F1 rather than accuracy.
# NOTE(review): the folds are cut from X_train as it stands when this cell runs; if it
# has been SMOTE-resampled, the CV score is measured on a balanced mix and is not
# directly comparable to the test-set score below.
grid = GridSearchCV(
    logreg_pipeline,
    param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)
grid.fit(X_gs, y_gs)

# Best Params
best_C = grid.best_params_['logisticregression__C']
best_solver = grid.best_params_['logisticregression__solver']

# Final model: same pipeline with the selected hyper-parameters, refit on the full
# training set rather than on the grid-search subsample.
final_pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        class_weight='balanced',
        random_state=42,
        max_iter=1000,
        C=best_C,
        solver=best_solver
    )
)
final_pipeline.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred_lr = final_pipeline.predict(X_test)

# Metrics (precision / recall / F1 use scikit-learn's default positive label, 1).
acc = accuracy_score(y_test, y_pred_lr)
prec = precision_score(y_test, y_pred_lr)
rec = recall_score(y_test, y_pred_lr)
f1 = f1_score(y_test, y_pred_lr)

print("\nLogistic Regression Evaluation Metrics:")
print(f"Accuracy  : {acc:.2f}")
print(f"Precision : {prec:.2f}")
print(f"Recall    : {rec:.2f}")
print(f"F1 Score  : {f1:.2f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))


Fitting 3 folds for each of 3 candidates, totalling 9 fits

Logistic Regression Evaluation Metrics:
Accuracy  : 0.70
Precision : 0.34
Recall    : 0.70
F1 Score  : 0.45

Confusion Matrix:
[[119120  52676]
 [ 11258  26661]]


In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

# Subsample the training data for the grid search so that tuning stays fast.
# The rows are drawn from a seeded generator: an unseeded draw would pick a different
# subset on every run and could therefore select a different C each time.
subset_size = min(100000, len(X_train))  # never ask for more rows than exist
rng = np.random.default_rng(42)
rnd_idx = rng.choice(X_train.index.to_numpy(), size=subset_size, replace=False)
X_gs = X_train.loc[rnd_idx]
y_gs = y_train.loc[rnd_idx]

# Build SVM pipeline (LinearSVC prefers scaled features)
svm_pipeline = make_pipeline(
    StandardScaler(),
    LinearSVC(class_weight='balanced', random_state=42, max_iter=2000, dual=False)
)

# Hyperparameter grid
param_grid = {
    'linearsvc__C': [0.01, 0.1, 1, 10]
}

# 3-fold cross-validated search, selecting on F1 rather than accuracy.
# NOTE(review): the folds are cut from X_train as it stands when this cell runs; if it
# has been SMOTE-resampled, the CV score is measured on a balanced mix and is not
# directly comparable to the test-set score below.
grid = GridSearchCV(
    svm_pipeline,
    param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)
grid.fit(X_gs, y_gs)

print("Best SVM Parameters:", grid.best_params_)
print("Best SVM CV F1 Score: {:.3f}".format(grid.best_score_))

# Refit the best configuration on the full training data rather than on the
# grid-search subsample.
best_C = grid.best_params_['linearsvc__C']

final_svm = make_pipeline(
    StandardScaler(),
    LinearSVC(
        class_weight='balanced',
        random_state=42,
        max_iter=2000,
        dual=False,
        C=best_C
    )
)
final_svm.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred_svm = final_svm.predict(X_test)

# Metrics (precision / recall / F1 use scikit-learn's default positive label, 1).
acc = accuracy_score(y_test, y_pred_svm)
prec = precision_score(y_test, y_pred_svm)
rec = recall_score(y_test, y_pred_svm)
f1 = f1_score(y_test, y_pred_svm)

print("\nSVM Evaluation Metrics:")
print(f"Accuracy  : {acc:.2f}")
print(f"Precision : {prec:.2f}")
print(f"Recall    : {rec:.2f}")
print(f"F1 Score  : {f1:.2f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))


Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best SVM Parameters: {'linearsvc__C': 0.01}
Best SVM CV F1 Score: 0.703

SVM Evaluation Metrics:
Accuracy  : 0.70
Precision : 0.34
Recall    : 0.70
F1 Score  : 0.46

Confusion Matrix:
[[119225  52571]
 [ 11248  26671]]
