In [None]:
!pip install -q autogluon

import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
from autogluon.tabular import TabularPredictor
from scipy.stats import iqr, skew, kurtosis  # Import the required functions

In [None]:
# Define the path to the train and test folders
train_path = "/kaggle/input/super-ai-engineer-5-human-activity-recognition/HAR/train"
test_path = "/kaggle/input/super-ai-engineer-5-human-activity-recognition/HAR/test"

# Get a list of all CSV files in the train directory.
# Every entry of train_path is treated as an activity folder whose name is the label;
# both levels are sorted so the row order of the combined frame is reproducible.
csv_files = []
for activity_folder in sorted(os.listdir(train_path)):
    activity_path = os.path.join(train_path, activity_folder)
    for file in sorted(glob.glob(f"{activity_path}/*.csv")):
        csv_files.append((file, activity_folder))  # Store file path and activity label

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not csv_files:
    raise FileNotFoundError(f"No training CSV files found under {train_path}")

# Read and concatenate all CSV files
df_list = []
for file, label in csv_files:
    file_df = pd.read_csv(file)
    file_df['Activity'] = label  # Appended last, so the label is the final column
    df_list.append(file_df)

df = pd.concat(df_list, ignore_index=True)  # Combine into a single DataFrame

# Display basic info
print(df.shape)
df.info()  # info() prints its report itself and returns None, so it is not wrapped in print()
print(df.head(2))  # A bare expression in the middle of a cell displays nothing

# `df` is expected to hold the feature columns followed by the target column
# ('Activity') as its last column; process_segments relies on that ordering.
# NOTE(review): the original note said "6 features + 1 target" (i.e. 7 columns) —
# confirm the feature count against the CSV files.
window_size = 500  # Number of consecutive rows aggregated into one window

# Function to process data in segments
def process_segments(df, window_size):
    """Aggregate fixed-size row windows of ``df`` into one feature row each.

    The last column of ``df`` is treated as the label and every other column as
    a numeric signal. Windows are cut inside each contiguous run of identical
    label values, so a window never mixes rows that carry different labels
    (for example rows of two different files or activities that ended up next
    to each other after concatenation). Rows at the end of a run that do not
    fill a complete window are dropped.

    Args:
        df: DataFrame whose last column is the label and whose remaining
            columns are numeric.
        window_size: Number of consecutive rows per window; must be positive.

    Returns:
        DataFrame with one row per window. For every signal column ``c`` it
        contains ``c_mean``, ``c_std``, ``c_mad``, ``c_max``, ``c_min``,
        ``c_sma``, ``c_energy``, ``c_iqr``, ``c_skewness`` and ``c_kurtosis``,
        followed by an ``Activity`` column holding the window's label. The
        frame is empty when no run contains a complete window.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be a positive integer, got {window_size}")

    feature_cols = list(df.columns[:-1])  # Exclude the label column
    labels = df.iloc[:, -1].to_numpy()
    n_rows = len(labels)

    # Positions where the label differs from the previous row mark the start of a new run
    breaks = (np.flatnonzero(labels[1:] != labels[:-1]) + 1).tolist()
    run_starts = [0, *breaks]
    run_stops = [*breaks, n_rows]

    # Start index of every complete window, never crossing a run boundary
    window_starts = [
        start
        for run_start, run_stop in zip(run_starts, run_stops)
        for start in range(run_start, run_stop - window_size + 1, window_size)
    ]

    # Extract each signal once instead of re-slicing the DataFrame per window
    signals = {col: df[col].values for col in feature_cols}

    processed_data = []
    for start in tqdm(window_starts, desc="Processing Segments", unit="window"):
        stop = start + window_size

        # Dictionary to store aggregated features
        feature_dict = {}

        for col in feature_cols:
            data = signals[col][start:stop]

            # Time-domain features
            feature_dict[f'{col}_mean'] = np.mean(data)
            feature_dict[f'{col}_std'] = np.std(data)
            feature_dict[f'{col}_mad'] = np.median(np.abs(data - np.median(data)))
            feature_dict[f'{col}_max'] = np.max(data)
            feature_dict[f'{col}_min'] = np.min(data)
            feature_dict[f'{col}_sma'] = np.sum(np.abs(data))  # Signal Magnitude Area
            feature_dict[f'{col}_energy'] = np.sum(np.square(data))
            feature_dict[f'{col}_iqr'] = iqr(data)  # Interquartile range
            feature_dict[f'{col}_skewness'] = skew(data)  # Skewness
            feature_dict[f'{col}_kurtosis'] = kurtosis(data)  # Kurtosis

        # All rows of the window share one label, so the first row's label is the window's label.
        # The key stays 'Activity' regardless of the label column's name; callers rely on it.
        feature_dict['Activity'] = labels[start]

        processed_data.append(feature_dict)

    # Convert to a DataFrame
    return pd.DataFrame(processed_data)

# Collapse the raw rows into one aggregated feature row per window
reduced_df = process_segments(df, window_size)

# Compare the table size before and after the reduction
print(f"Original shape: {df.shape}")
print(f"Reduced shape: {reduced_df.shape}")
print(reduced_df.head())

# Keep the target (y) and the feature matrix (X) available as separate objects
y = reduced_df['Activity']
X = reduced_df.drop('Activity', axis=1)

# Train AutoGluon on the full reduced table; the label is read from its 'Activity' column
predictor = TabularPredictor(label='Activity', eval_metric='accuracy')
predictor = predictor.fit(train_data=reduced_df)

# Load test data: one frame per CSV, tagged with its file name so every row can
# be traced back to the file it came from
test_files = sorted(glob.glob(f"{test_path}/*.csv"))
test_data = []

for file in test_files:
    # Dedicated name so the training DataFrame `df` is not overwritten by a test file
    test_file_df = pd.read_csv(file)
    test_file_df['ID'] = os.path.basename(file)  # Add ID column (appended last)
    test_data.append(test_file_df)

# Concatenate all test data into a single DataFrame
test_df = pd.concat(test_data, ignore_index=True)

# Apply feature engineering (reuse the process_segments function).
# Use the same window_size as for training so train and test features are built identically.
test_reduced_df = process_segments(test_df, window_size=window_size)

# Check new test data shape
print(f"Processed Test Data Shape: {test_reduced_df.shape}")
print(test_reduced_df.head())

# For test data the 'Activity' column produced by process_segments carries the file ID
ID_column = test_reduced_df['Activity'].rename('ID')
X_test = test_reduced_df.drop('Activity', axis=1)

# Same object, under the name used for prediction
X_test_final = X_test

# Predict one activity label per window
test_predictions = predictor.predict(X_test_final)

# Store the predictions next to the features and dump the full table
test_reduced_df["Predicted_Activity"] = test_predictions
test_reduced_df.to_csv("predicted_activities.csv", index=False)

print("Predictions saved!")

# Switch to the submission column names in one step
test_reduced_df.rename(
    columns={'Activity': 'id', 'Predicted_Activity': 'class'},
    inplace=True,
)

# The submission holds only the ID and the predicted class
final_df = test_reduced_df[['id', 'class']]
final_df.to_csv("/kaggle/working/submission_har_autogluon.csv", index=False)

# Report the saved file
print("CSV file saved as submission_har_autogluon.csv")