# CMI Competition: Data Exploration

This notebook explores the sensor data for body-focused repetitive behavior detection.

In [None]:
import sys

sys.path.append('../src')

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from bfrb.kaggle_utils import KaggleCompetition, setup_kaggle_credentials

# Set up plotting style
# NOTE(review): the 'seaborn-v0_8' style name presumably exists only in newer
# matplotlib releases — confirm against the project's pinned matplotlib version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Configure pandas display
# max_columns=None lifts the column limit so wide frames are shown in full;
# width=1000 widens the text repr so long rows are less likely to wrap.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("Packages imported successfully!")

## Setup Kaggle Credentials and Download Data

In [None]:
# Setup Kaggle credentials
# NOTE(review): setup_kaggle_credentials is imported from bfrb.kaggle_utils; where
# it reads the credentials from is not visible in this notebook — confirm there.
setup_kaggle_credentials()

# Initialize competition handler (the data-loading cells below call its load_* methods)
competition = KaggleCompetition()

# Download data (this will skip if data already exists)
competition.download_data()

## Load and Examine Data

In [None]:
# Load training data.
# Only the load call sits inside the try: the else branch keeps a
# FileNotFoundError raised by anything other than the loader from being
# misreported as "training data not found".
try:
    train_df = competition.load_train_data()
except FileNotFoundError:
    print("Training data not found. Please ensure Kaggle credentials are set up correctly.")
    # Later cells test `train_df is not None` before using the frame.
    train_df = None
else:
    print(f"Training data shape: {train_df.shape}")
    print(f"\nColumns: {list(train_df.columns)}")
    print("\nFirst few rows:")
    display(train_df.head())

In [None]:
# Load test data.
# Only the load call sits inside the try: the else branch keeps a
# FileNotFoundError raised by anything other than the loader from being
# misreported as "test data not found".
try:
    test_df = competition.load_test_data()
except FileNotFoundError:
    print("Test data not found.")
    # The summary cell tests `test_df is not None` before reporting its shape.
    test_df = None
else:
    print(f"Test data shape: {test_df.shape}")
    print(f"\nColumns: {list(test_df.columns)}")
    print("\nFirst few rows:")
    display(test_df.head())

In [None]:
# Load sample submission.
# Only the load call sits inside the try: the else branch keeps a
# FileNotFoundError raised by anything other than the loader from being
# misreported as "sample submission not found".
try:
    sample_sub = competition.load_sample_submission()
except FileNotFoundError:
    print("Sample submission not found.")
    sample_sub = None
else:
    print(f"Sample submission shape: {sample_sub.shape}")
    print(f"\nColumns: {list(sample_sub.columns)}")
    print("\nFirst few rows:")
    display(sample_sub.head())

## Data Information and Basic Statistics

In [None]:
# Structural overview of the training frame: schema, dtype mix, missing
# values and summary statistics. Skipped entirely when loading failed.
if train_df is not None:
    print("=== TRAINING DATA INFO ===")
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # wrapping it in print() would append a stray "None" line to the output.
    train_df.info()
    print("\n=== DATA TYPES ===")
    print(train_df.dtypes.value_counts())
    print("\n=== MISSING VALUES ===")
    missing_values = train_df.isnull().sum()
    # Show only the columns that actually contain missing values.
    print(missing_values[missing_values > 0])
    print("\n=== BASIC STATISTICS ===")
    display(train_df.describe())

## Target Variable Analysis

In [None]:
# Pick the label column: 'behavior' is preferred, 'target' is the fallback.
# target_col stays None when the frame is missing or has neither column.
target_col = next(
    (
        candidate
        for candidate in ('behavior', 'target')
        if train_df is not None and candidate in train_df.columns
    ),
    None,
)
if target_col is None:
    print("Target column not found in the data.")

if target_col is not None:
    labels = train_df[target_col]
    # Absolute class frequencies, reused by the printout and both charts.
    class_counts = labels.value_counts()

    print(f"=== TARGET VARIABLE: {target_col} ===")
    print(f"Unique values: {labels.unique()}")
    print("\nValue counts:")
    print(class_counts)
    print("\nPercentage distribution:")
    print(labels.value_counts(normalize=True) * 100)

    # Plot target distribution: counts on the left, shares on the right.
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    bar_ax, pie_ax = axes[0], axes[1]

    # Count plot
    class_counts.plot(kind='bar', ax=bar_ax)
    bar_ax.set_title('Target Variable Distribution')
    bar_ax.set_xlabel('Behavior')
    bar_ax.set_ylabel('Count')

    # Pie chart
    class_counts.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%')
    pie_ax.set_title('Target Variable Percentage')
    # Blank out the y-label that pandas would otherwise put beside the pie.
    pie_ax.set_ylabel('')

    plt.tight_layout()
    plt.show()

## Sensor Data Analysis

In [None]:
# Detect sensor columns by name and summarise them. `sensor_columns` is reused
# by the correlation, visualization and summary cells.
if train_df is not None:
    # Longer keywords are matched as substrings of the lower-cased column name.
    sensor_keywords = ['accel', 'gyro', 'mag', 'sensor']
    # Axis letters must match a whole token of the name (split on '_', '-' or
    # space). As bare substrings, 'x'/'y'/'z' would flag any column that merely
    # contains the letter (e.g. 'type', 'index', 'size').
    sensor_axis_tokens = {'x', 'y', 'z'}

    def _looks_like_sensor(column_name):
        """Return True when the name contains a sensor keyword or an axis token."""
        lowered = str(column_name).lower()
        if any(keyword in lowered for keyword in sensor_keywords):
            return True
        tokens = lowered.replace('-', '_').replace(' ', '_').split('_')
        return any(token in sensor_axis_tokens for token in tokens)

    # Keep numeric columns only: the later statistics, correlation and scatter
    # cells treat sensor columns as numbers. A zero-row slice preserves dtypes,
    # so the dtype lookup does not copy the data.
    numeric_columns = set(train_df.head(0).select_dtypes(include='number').columns)
    sensor_columns = [
        col for col in train_df.columns
        if col in numeric_columns and _looks_like_sensor(col)
    ]

    print("=== SENSOR COLUMNS DETECTED ===")
    print(f"Total sensor columns: {len(sensor_columns)}")
    for col in sensor_columns[:20]:  # Show first 20
        print(f"  - {col}")
    if len(sensor_columns) > 20:
        print(f"  ... and {len(sensor_columns) - 20} more")

    # Statistical summary of sensor data
    if sensor_columns:
        print("\n=== SENSOR DATA STATISTICS ===")
        sensor_stats = train_df[sensor_columns].describe()
        display(sensor_stats)

## Time Series Analysis (if timestamp available)

In [None]:
# Look for time-like columns by name and try to parse the first one.
if train_df is not None:
    # Check for timestamp columns (substring match on the lower-cased name)
    time_columns = [col for col in train_df.columns if any(keyword in col.lower() for keyword in ['time', 'date', 'timestamp'])]

    if time_columns:
        print("=== TIME COLUMNS DETECTED ===")
        for col in time_columns:
            print(f"  - {col}: {train_df[col].dtype}")
            print(f"    Sample values: {train_df[col].head(3).tolist()}")

        # Try to convert to datetime
        time_col = time_columns[0]
        try:
            # Parse into a separate Series instead of overwriting the column:
            # the name match is only a guess, and pd.to_datetime also accepts
            # plain numbers, so writing back could silently replace a numeric
            # column that later cells still need. It also keeps this cell
            # idempotent on re-run.
            parsed_times = pd.to_datetime(train_df[time_col])
        except (ValueError, TypeError, OverflowError) as exc:
            # Only conversion failures are handled; anything else (including
            # KeyboardInterrupt) propagates instead of being swallowed.
            print(f"Could not convert {time_col} to datetime")
            print(f"  Reason: {type(exc).__name__}: {exc}")
        else:
            print(f"\nSuccessfully converted {time_col} to datetime")
            print(f"Time range: {parsed_times.min()} to {parsed_times.max()}")
            print(f"Duration: {parsed_times.max() - parsed_times.min()}")
    else:
        print("No timestamp columns detected.")

## Correlation Analysis

In [None]:
# Correlation of sensor features with the target and with each other.
if train_df is not None and len(sensor_columns) > 0:
    # Correlation is only meaningful for numeric data. Passing string columns
    # (for example a label-valued target) to corr() either raises or drops
    # them, depending on the pandas version, so filter explicitly. A zero-row
    # slice preserves dtypes, so the lookup does not copy the data.
    numeric_names = set(train_df.head(0).select_dtypes(include='number').columns)
    # Exclude the target from the feature list so it is never selected twice.
    numeric_features = [
        col for col in sensor_columns
        if col in numeric_names and col != target_col
    ]
    target_is_numeric = bool(target_col) and target_col in numeric_names

    if target_col and not target_is_numeric:
        print(f"Target '{target_col}' is not numeric; skipping correlations with target.")

    # Calculate correlations with target if available
    if target_is_numeric and numeric_features:
        correlations = (
            train_df[numeric_features + [target_col]]
            .corr()[target_col]
            .drop(target_col)   # remove the target's self-correlation by name
            .abs()
            .dropna()           # constant columns yield NaN correlations
            .sort_values(ascending=False)
        )
        print("=== TOP CORRELATIONS WITH TARGET ===")
        print(correlations.head(20))

        # Plot top correlations (the target itself was already dropped, so
        # the chart really shows up to 15 features)
        top_corr = correlations.head(15)
        if not top_corr.empty:
            plt.figure(figsize=(10, 6))
            top_corr.plot(kind='bar')
            plt.title('Top 15 Feature Correlations with Target')
            plt.xlabel('Features')
            plt.ylabel('Absolute Correlation')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()

    # Correlation heatmap for a subset of features
    if len(numeric_features) > 5:
        sample_features = numeric_features[:10]  # Take first 10 for visualization
        if target_is_numeric:
            sample_features.append(target_col)

        corr_matrix = train_df[sample_features].corr()

        plt.figure(figsize=(12, 10))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                   square=True, fmt='.2f')
        plt.title('Correlation Heatmap (Sample Features)')
        plt.tight_layout()
        plt.show()

## Data Visualization

In [None]:
# Scatter/line view of the first few sensor columns on a random row sample.
if train_df is not None and len(sensor_columns) > 0:
    # Fixed-seed sample of at most 1000 rows keeps the plot light and repeatable.
    sample_size = min(1000, len(train_df))
    sample_data = train_df.sample(n=sample_size, random_state=42)

    # One panel per column; only the first four sensor columns are drawn.
    viz_columns = sensor_columns[:4]

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()

    for ax, col in zip(axes, viz_columns):
        if not target_col:
            # No label column: draw the sampled values as a single line.
            ax.plot(sample_data[col], alpha=0.7)
        else:
            # One scatter series per target class, each indexed from zero.
            for target_val in sample_data[target_col].unique():
                in_class = sample_data[target_col] == target_val
                class_values = sample_data.loc[in_class, col]
                ax.scatter(
                    range(len(class_values)),
                    class_values,
                    alpha=0.6,
                    label=f'Target {target_val}',
                    s=10,
                )

        ax.set(title=f'{col}', xlabel='Sample Index', ylabel='Value')
        if target_col:
            ax.legend()

    plt.tight_layout()
    plt.show()

## Data Quality Assessment

In [None]:
# Quick data-quality report: duplicates, constant columns, near-unique
# object columns, memory footprint and dtype mix.
if train_df is not None:
    print("=== DATA QUALITY ASSESSMENT ===")

    # Fully duplicated rows.
    duplicates = train_df.duplicated().sum()
    print(f"Duplicate rows: {duplicates}")

    # Columns holding at most one distinct (non-missing) value.
    constant_cols = [
        col for col in train_df.columns
        if train_df[col].nunique() <= 1
    ]
    print(f"Constant columns: {len(constant_cols)}")
    if constant_cols:
        print(f"  - {constant_cols}")

    # Object columns whose distinct-value count exceeds 80% of the row count.
    cardinality_threshold = len(train_df) * 0.8
    high_card_cols = [
        col for col in train_df.columns
        if train_df[col].dtype == 'object'
        and train_df[col].nunique() > cardinality_threshold
    ]
    print(f"High cardinality columns: {len(high_card_cols)}")
    if high_card_cols:
        print(f"  - {high_card_cols}")

    # Deep memory usage, converted from bytes to MiB.
    total_bytes = train_df.memory_usage(deep=True).sum()
    memory_usage = total_bytes / 1024**2
    print(f"\nMemory usage: {memory_usage:.2f} MB")

    # How many columns there are of each dtype.
    print("\nData types summary:")
    print(train_df.dtypes.value_counts())

## Summary and Next Steps

In [None]:
# Recap of what was loaded and found, followed by the planned next steps.
print("=== DATA EXPLORATION SUMMARY ===")
if train_df is None:
    print("✗ Training data not loaded - check Kaggle credentials")
else:
    print(f"✓ Training data loaded: {train_df.shape}")
    print(f"✓ Sensor columns identified: {len(sensor_columns)}")
    if target_col:
        class_distribution = dict(train_df[target_col].value_counts())
        print(f"✓ Target variable: {target_col}")
        print(f"✓ Class distribution: {class_distribution}")
    total_missing = train_df.isnull().sum().sum()
    total_duplicates = train_df.duplicated().sum()
    print(f"✓ Missing values: {total_missing}")
    print(f"✓ Duplicate rows: {total_duplicates}")

# The f-string is only evaluated when the test frame exists.
print(f"✓ Test data loaded: {test_df.shape}" if test_df is not None else "✗ Test data not loaded")

print("\n=== NEXT STEPS ===")
for step in (
    "1. Data preprocessing and feature engineering",
    "2. Model development and training",
    "3. Model evaluation and selection",
    "4. Submission generation",
):
    print(step)