In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset path
# used in the following cells is readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Root folder of the competition data on the mounted Drive. Kept as a plain
# string because later cells build sub-paths from it by concatenation.
directory = '/content/drive/MyDrive/Colab Notebooks/ML_Kaggle2/ai-4-ba-ml-2024-har'

# Sanity check: print each entry of the dataset folder on its own line.
files = os.listdir(directory)
for entry_name in files:
    print(entry_name)

demo.ipynb
sample_submission.csv
Train
Test
Accuracy.gdoc
Har.ipynb


In [None]:
import glob
import os

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

# Dataset sub-folders, derived from the dataset root `directory`.
# The trailing slash is kept so the resulting strings are unchanged.
TRAIN_DIR = f"{directory}/Train/"
TEST_DIR = f"{directory}/Test/"

def load_data(folder_path, num_classes=18):
    """Load every labelled CSV under ``folder_path`` as one feature vector each.

    The folder is scanned for sub-folders named ``0`` .. ``num_classes - 1``;
    the sub-folder name is used as the integer label of every ``*.csv`` file
    inside it. Each file is read with pandas and reduced to a feature vector
    by ``extract_features``.

    Args:
        folder_path: Directory that contains the per-label sub-folders.
        num_classes: Number of label sub-folders to scan (labels are
            ``range(num_classes)``). Defaults to 18.

    Returns:
        ``(X, y)`` as numpy arrays: one row of features per successfully
        processed file, and the matching integer labels.

    Raises:
        ValueError: If ``folder_path`` does not exist, or if no file could be
            loaded at all.
    """
    all_data = []
    all_labels = []

    print(f"Attempting to load data from: {folder_path}")

    if not os.path.exists(folder_path):
        raise ValueError(f"The folder path {folder_path} does not exist.")

    for label in range(num_classes):
        # glob returns files in arbitrary (filesystem-dependent) order; sort so
        # the sample order, and therefore any seeded split downstream, is
        # reproducible from run to run.
        csv_files = sorted(glob.glob(os.path.join(folder_path, str(label), "*.csv")))
        print(f"Found {len(csv_files)} CSV files for label {label}")

        for file in csv_files:
            # Best effort: a single unreadable/malformed file is reported and
            # skipped rather than aborting the whole load.
            try:
                df = pd.read_csv(file)
                features = extract_features(df)
                all_data.append(features)
                all_labels.append(label)
            except Exception as e:
                print(f"Error processing file {file}: {str(e)}")

    if len(all_data) == 0:
        raise ValueError("No data was loaded. Please check the folder structure and file paths.")

    print(f"Total samples loaded: {len(all_data)}")
    return np.array(all_data), np.array(all_labels)

def extract_features(df):
    """Reduce one sensor recording to a flat list of summary features.

    The input frame is NOT modified; all derived signals are computed as
    local Series.

    Args:
        df: DataFrame for a single recording. It must contain the columns
            ``Ax``, ``Ay``, ``Az``, ``Gx``, ``Gy`` and ``Gz``. Every column
            present is summarised, so all columns are assumed to be numeric
            (TODO confirm against the CSV schema).

    Returns:
        A list of scalars, in this fixed order:
        1. mean, std, min, max, median of every column of ``df`` (in column
           order);
        2. mean, std, max of the accelerometer vector magnitude;
        3. mean of the per-row average and of the per-row maximum over
           ``Ax``/``Ay``/``Az``;
        4. mean of the cumulative sum of each accelerometer axis;
        5. mean of the first difference of each accelerometer axis;
        6. mean, std, max of the gyroscope vector magnitude;
        7. mean of the 5-sample rolling mean of each accelerometer axis.

        Entries can be NaN for very short recordings (e.g. ``std`` of a single
        row, or the rolling mean when there are fewer than 5 rows).
    """
    acc_axes = ['Ax', 'Ay', 'Az']
    features = []

    # Basic statistical features of every raw column.
    for col in df.columns:
        series = df[col]
        features.extend([
            series.mean(),
            series.std(),
            series.min(),
            series.max(),
            series.median()
        ])

    # Accelerometer features: Euclidean norm of the 3-axis vector per row.
    acc_magnitude = np.sqrt(df['Ax']**2 + df['Ay']**2 + df['Az']**2)
    features.extend([
        acc_magnitude.mean(),
        acc_magnitude.std(),
        acc_magnitude.max()
    ])

    # Per-row average and maximum across the three accelerometer axes.
    avg_acc = df[acc_axes].mean(axis=1)
    max_acc = df[acc_axes].max(axis=1)
    features.extend([
        avg_acc.mean(),
        max_acc.mean()
    ])

    # "Velocity" features: running sum of each acceleration axis (no time step
    # is applied, so this is only proportional to an integrated signal).
    features.extend(df[axis].cumsum().mean() for axis in acc_axes)

    # "Jerk" features: sample-to-sample difference of each acceleration axis.
    # The leading NaN produced by diff() is skipped by mean().
    features.extend(df[axis].diff().mean() for axis in acc_axes)

    # Gyroscope features: Euclidean norm of the 3-axis vector per row.
    gyro_magnitude = np.sqrt(df['Gx']**2 + df['Gy']**2 + df['Gz']**2)
    features.extend([
        gyro_magnitude.mean(),
        gyro_magnitude.std(),
        gyro_magnitude.max()
    ])

    # Smoothed acceleration: mean of a 5-sample rolling mean (a simple moving
    # average; this is not the "signal magnitude area" statistic).
    window_size = 5
    features.extend(
        df[axis].rolling(window=window_size).mean().mean() for axis in acc_axes
    )

    return features

def train_model():
    """Train and evaluate a tuned random forest on the training folder.

    Steps:
      1. Load features/labels from ``TRAIN_DIR`` and hold out a stratified
         20% validation split.
      2. Run a randomized hyperparameter search (3-fold CV, macro F1) over a
         pipeline of ``StandardScaler -> SMOTE -> RandomForestClassifier``.
         Keeping the scaler and SMOTE inside the pipeline means they are
         re-fit on the training part of every CV fold, so oversampled
         (synthetic) points never leak into the fold used for scoring.
      3. Report macro / weighted / per-class F1 and a classification report on
         the untouched validation split.

    Returns:
        ``(best_model, scaler)``: the fitted ``RandomForestClassifier`` and the
        fitted ``StandardScaler`` from the best pipeline. New data must be
        passed through ``scaler.transform`` before ``best_model.predict``.
    """
    # Load and preprocess data
    X, y = load_data(TRAIN_DIR)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=101, stratify=y)
    print("==================Train data===================")
    # Total number of NaN feature values; SMOTE and the scaler require 0.
    print(np.isnan(X).sum())
    print("===============================================")

    # Scaling comes first so SMOTE's nearest-neighbour search is not dominated
    # by large-valued features; SMOTE is seeded so runs are reproducible.
    # imblearn's Pipeline applies the sampler during fit only, never at predict.
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=101)),
        ('clf', RandomForestClassifier(random_state=101)),
    ])

    # Hyperparameter distribution for tuning (``clf__`` routes to the forest)
    param_dist = {
        'clf__n_estimators': list(range(100, 1001, 100)),
        'clf__max_depth': list(range(5, 51, 5)),
        'clf__min_samples_split': list(range(2, 11)),
        'clf__min_samples_leaf': list(range(1, 6)),
        'clf__max_features': ['sqrt', 'log2', None],
        'clf__criterion': ['gini', 'entropy'],
        'clf__bootstrap': [True, False]
    }

    # Set up RandomizedSearchCV
    random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_dist,
                                       n_iter=50, cv=3, n_jobs=-1, verbose=1, random_state=101,
                                       scoring='f1_macro')

    # Fit model with hyperparameter tuning (raw features; the pipeline scales)
    random_search.fit(X_train, y_train)

    # Unpack the refitted best pipeline into the (model, scaler) pair that
    # callers expect.
    best_pipeline = random_search.best_estimator_
    best_model = best_pipeline.named_steps['clf']
    scaler = best_pipeline.named_steps['scaler']
    best_params = {name.split('__', 1)[-1]: value
                   for name, value in random_search.best_params_.items()}
    print(f"Best parameters: {best_params}")

    # Evaluate best model on the held-out validation split
    X_val_scaled = scaler.transform(X_val)
    y_pred = best_model.predict(X_val_scaled)

    # Calculate F1 score
    f1_macro = f1_score(y_val, y_pred, average='macro')
    f1_weighted = f1_score(y_val, y_pred, average='weighted')
    f1_per_class = f1_score(y_val, y_pred, average=None)

    print(f"Validation F1 Score (Macro): {f1_macro}")
    print(f"Validation F1 Score (Weighted): {f1_weighted}")
    print("\nF1 Score per class:")
    # f1_score(average=None) reports one score per label present in y_val or
    # y_pred, in sorted order; pair each score with its real label rather than
    # with a positional index.
    class_labels = np.unique(np.concatenate([y_val, y_pred]))
    for class_label, f1 in zip(class_labels, f1_per_class):
        print(f"Class {class_label}: {f1}")

    print("\nDetailed Classification Report:")
    print(classification_report(y_val, y_pred))

    return best_model, scaler

def predict_test_data(model, scaler):
    """Predict a label for every CSV file directly inside ``TEST_DIR``.

    Args:
        model: Fitted classifier exposing ``predict``.
        scaler: Fitted transformer exposing ``transform``; applied to the
            extracted features before prediction.

    Returns:
        A list of ``(file_name, predicted_label)`` tuples, ordered by file
        path. ``file_name`` is the base name including the ``.csv`` extension.
        Returns an empty list when the folder holds no CSV files.
    """
    # Sort so the submission row order is deterministic (glob order is not).
    test_files = sorted(glob.glob(os.path.join(TEST_DIR, "*.csv")))
    if not test_files:
        return []

    # Build the whole feature matrix first, then scale and predict once:
    # one batched call is much cheaper than one call per file.
    feature_rows = [extract_features(pd.read_csv(path)) for path in test_files]
    features_scaled = scaler.transform(feature_rows)
    predictions = model.predict(features_scaled)

    return [
        (os.path.basename(path), prediction)
        for path, prediction in zip(test_files, predictions)
    ]

def write_submission_file(predictions, output_path='submission.csv'):
    """Write predictions to a two-column CSV (``id``, ``label``).

    Args:
        predictions: Iterable of ``(id, label)`` pairs.
        output_path: Destination CSV path. Defaults to ``'submission.csv'``
            in the current working directory.
    """
    submission_df = pd.DataFrame(predictions, columns=['id', 'label'])
    # index=False: the row index is not part of the submission format.
    submission_df.to_csv(output_path, index=False)

# End-to-end run: train, predict the test folder, write the submission file.
try:
    model, scaler = train_model()
    test_predictions = predict_test_data(model, scaler)
    write_submission_file(test_predictions)
except Exception as e:
    print(f"An error occurred: {str(e)}")
    # Re-raise so the traceback is shown and later cells do not silently run
    # against missing or stale `model` / `test_predictions` variables.
    raise

Attempting to load data from: /content/drive/MyDrive/Colab Notebooks/ML_Kaggle2/ai-4-ba-ml-2024-har/Train/
Found 1500 CSV files for label 0
Found 1500 CSV files for label 1
Found 1200 CSV files for label 2
Found 1000 CSV files for label 3
Found 1500 CSV files for label 4
Found 80 CSV files for label 5
Found 90 CSV files for label 6
Found 70 CSV files for label 7
Found 60 CSV files for label 8
Found 100 CSV files for label 9
Found 70 CSV files for label 10
Found 500 CSV files for label 11
Found 60 CSV files for label 12
Found 70 CSV files for label 13
Found 60 CSV files for label 14
Found 60 CSV files for label 15
Found 60 CSV files for label 16
Found 60 CSV files for label 17
Total samples loaded: 8040
0
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters: {'n_estimators': 1000, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 25, 'criterion': 'entropy', 'bootstrap': False}
Validation F1 Score (Macro): 0.8736517868630231
Vali

In [None]:
# Re-writes the submission CSV from `test_predictions`, which is only defined
# if the previous cell ran successfully (that cell already writes the same file).
write_submission_file(test_predictions)