In [None]:
# 1. Setup and Library Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import random

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report # Added classification_report

# Set up environment for reproducibility and plotting
random.seed(42)
np.random.seed(42)
warnings.filterwarnings('ignore')
%matplotlib inline
sns.set_theme(style="whitegrid")

print("Libraries imported and environment set up.")

Libraries imported and environment set up.


In [None]:
# 2. Loading and Initial Data Preparation

# --- File paths ---
# The CSVs are expected at these absolute paths; edit them if the files are
# stored elsewhere.
train_file_path = "/content/Train.csv"
test_file_path = "/content/Test.csv"

# Read both datasets. A missing file is reported and the exception re-raised,
# so execution stops here rather than continuing without data.
try:
    train_df = pd.read_csv(train_file_path)
    test_df = pd.read_csv(test_file_path)
except FileNotFoundError as e:
    print(f"Error loading files: {e}. Please ensure 'Train.csv' and 'Test.csv' are accessible.")
    raise

# Remove the leftover CSV index column when it is present
# (errors='ignore' makes the drop a no-op when the column is absent).
INDEX_COLUMN = 'Unnamed: 0'
train_df = train_df.drop(columns=[INDEX_COLUMN], errors='ignore')
test_df = test_df.drop(columns=[INDEX_COLUMN], errors='ignore')

# --- Multi-class setup ---
# The target is the 'label' column. Both 'label' and 'fall' are excluded
# from the feature matrices.
NON_FEATURE_COLUMNS = ['fall', 'label']

X_train = train_df.drop(columns=NON_FEATURE_COLUMNS)
y_train = train_df['label']
X_test = test_df.drop(columns=NON_FEATURE_COLUMNS)
y_test = test_df['label']

# Quick sanity check of what was loaded.
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print("\nFirst 5 rows of Training Features (X_train):")
print(X_train.head())

Training data shape: (1428, 11)
Test data shape: (356, 11)

First 5 rows of Training Features (X_train):
     acc_max   gyro_max  acc_kurtosis  gyro_kurtosis    lin_max  acc_skewness  \
0  26.039919   7.309797     20.378162       2.782476  11.131080      3.891361   
1  25.864500   6.511954     14.187190       5.324864   7.945561      3.022175   
2  27.524501  12.944099     31.855926      22.891186  14.454818      4.849024   
3  30.647705  11.694868     23.608764       9.287735  15.228303      3.921537   
4  26.373917  11.168424     14.318453      15.983202  10.007396      3.087975   

   gyro_skewness  post_gyro_max  post_lin_max  
0       1.592927       7.086618     10.790400  
1       2.376939       6.325522      7.719352  
2       4.283890      12.888111     14.368784  
3       2.794609      11.549971     14.944151  
4       3.363557      11.057636      9.753058  


In [None]:
#3. Data Cleaning and Feature Retention (9 Features)
# --- Action: KEEPING ALL 9 FEATURES ---

# Convert every feature column to numeric; values that cannot be parsed become NaN.
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Treat +/- infinity as missing so it is imputed together with the NaNs
# (the StandardScaler step that follows cannot handle non-finite values).
#
# The cleaning is done on the whole frame and the result is reassigned.
# In-place calls on a column selection (X_train[col].replace(..., inplace=True))
# are chained assignment: under pandas Copy-on-Write they update a temporary
# object and leave the frame unchanged.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Per-column means are computed from the training data only (after infinities
# were turned into NaN) and reused for the test set, so no test-set statistics
# leak into preprocessing. fillna with a Series fills each column with the
# value whose index label matches the column name.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

print(f"Features used for training: {list(X_train.columns)} (9 features)")


Features used for training: ['acc_max', 'gyro_max', 'acc_kurtosis', 'gyro_kurtosis', 'lin_max', 'acc_skewness', 'gyro_skewness', 'post_gyro_max', 'post_lin_max'] (9 features)


In [None]:
# 4. Feature Normalization using StandardScaler

# Learn the scaling statistics from the training features only.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the fitted scaler to both splits; transform() returns NumPy arrays.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the arrays back into DataFrames so the column labels are preserved for
# the following cells.
# NOTE: X_train / X_test are overwritten here, so re-running this cell without
# re-running the loading/cleaning cells would refit the scaler on data that is
# already scaled.
train_feature_names = X_train.columns
test_feature_names = X_test.columns
X_train = pd.DataFrame(X_train_scaled, columns=train_feature_names)
X_test = pd.DataFrame(X_test_scaled, columns=test_feature_names)

# Sanity check on the first feature. pandas' .std() defaults to the sample
# standard deviation (ddof=1) while StandardScaler divides by the population
# standard deviation, so the printed value can sit slightly above 1.
first_col_name = X_train.columns[0]
first_feature = X_train[first_col_name]
first_feature_mean = round(first_feature.mean(), 4)
first_feature_std = round(first_feature.std(), 4)

print("Features normalized.")
print("Mean of first feature after scaling (should be close to 0):", first_feature_mean)
print("Standard Deviation of first feature after scaling (should be 1):", first_feature_std)


Features normalized.
Mean of first feature after scaling (should be close to 0): -0.0
Standard Deviation of first feature after scaling (should be 1): 1.0004


In [None]:
# Hyperparameter search space for the Random Forest (narrow fine-tuning ranges).
# 3 * 3 * 3 * 3 = 81 distinct combinations; max_features and bootstrap are fixed.
random_grid = dict(
    n_estimators=[350, 450, 550],
    max_features=['sqrt'],
    bootstrap=[True],
    max_depth=[None, 60, 80],
    min_samples_split=[6, 9, 12],
    min_samples_leaf=[2, 3, 4],
)

# Base estimator; the fixed random_state keeps the forests reproducible.
rf = RandomForestClassifier(random_state=42)

# Randomized search: 50 sampled candidates, each scored with 5-fold
# cross-validation, running on all available cores (n_jobs=-1).
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=50,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

print("Starting Targeted Optimal Search (Training 50 combinations across 5 folds)...")
rf_random.fit(X_train, y_train)

# Report the winning configuration.
best_params = rf_random.best_params_
print("\nTargeted Search Complete.")
print("Best parameters found:")
print(best_params)


Starting Targeted Optimal Search (Training 50 combinations across 5 folds)...
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Targeted Search Complete.
Best parameters found:
{'n_estimators': 450, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': True}


In [None]:
# Evaluate the tuned model on the test split.
# X_test already holds the scaled features at this point.
best_estimator = rf_random.best_estimator_
predictions = best_estimator.predict(X_test)

# Overall accuracy expressed as a percentage.
optimal_accuracy = accuracy_score(y_test, predictions) * 100

divider = "-" * 50
print(divider)
print("Multi-Class Model Performance on Test Data (TARGETED OPTIMIZATION):")
print(f"Overall Accuracy = {optimal_accuracy:0.2f}%.")
print("\nDetailed Classification Report:")
# Per-class precision, recall and F1-score for every label.
print(classification_report(y_test, predictions))
print(divider)

--------------------------------------------------
Multi-Class Model Performance on Test Data (TARGETED OPTIMIZATION):
Overall Accuracy = 72.47%.

Detailed Classification Report:
              precision    recall  f1-score   support

         BSC       0.76      0.84      0.79        37
         CSI       0.60      0.55      0.57        22
         CSO       0.79      0.73      0.76        26
         FKL       0.51      0.53      0.52        34
         FOL       0.61      0.61      0.61        44
         JOG       0.89      0.74      0.81        23
         JUM       0.81      0.92      0.86        24
         SCH       1.00      1.00      1.00        20
         SDL       0.50      0.50      0.50        38
         STD       1.00      1.00      1.00        23
         STN       0.66      0.86      0.75        22
         STU       0.90      0.75      0.82        24
         WAL       0.81      0.68      0.74        19

    accuracy                           0.72       356
   macro 

In [None]:
import joblib

# --- 1. Define Filenames ---
MODEL_FILENAME = "activity_rf_model.joblib"
SCALER_FILENAME = "activity_scaler.joblib"

# --- 2. Get Your Trained Objects ---
# 'rf_random.best_estimator_' is the best Random Forest found by the search.
best_model = rf_random.best_estimator_

# 'scaler' is the StandardScaler fitted in the normalization cell; it must
# still be in memory. The model was trained on features transformed by this
# scaler, so both objects are persisted together.

# --- 3. Save the Objects to Files ---
# Saving happens before anything Colab-specific is imported, so the artifacts
# are written even when the notebook runs outside Google Colab.
joblib.dump(best_model, MODEL_FILENAME)
joblib.dump(scaler, SCALER_FILENAME)

print(f"Model saved to: {MODEL_FILENAME}")
print(f"Scaler saved to: {SCALER_FILENAME}")

# --- 4. Download the Files (Google Colab only) ---
# google.colab exists only inside Colab. Elsewhere the browser download is
# skipped and the files stay in the current working directory.
try:
    from google.colab import files  # To download the files from Colab
except ImportError:
    print("google.colab is not available; skipping download. "
          "The saved files remain in the current working directory.")
else:
    print("Downloading files to your local computer...")
    files.download(MODEL_FILENAME)
    files.download(SCALER_FILENAME)

Model saved to: activity_rf_model.joblib
Scaler saved to: activity_scaler.joblib
Downloading files to your local computer...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>