<a href="https://colab.research.google.com/github/marzia272/AI4ALL_Project/blob/marzia/AI4ALL_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Data handling
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning and preprocessing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Signal processing
from scipy.signal import welch, butter, filtfilt

# Warnings control
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning, message='divide by zero encountered in log10')

# KaggleHub for loading datasets (if using Kaggle directly)
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Dimensionality reduction (for visualization)
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Saving output
import os



SyntaxError: invalid syntax (ipython-input-20-1462611536.py, line 35)

In [32]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
import kagglehub
import re

# Emotion labels based on game number
emotion_map = {
    'G1': 'Excited',
    'G2': 'Calm',
    'G3': 'Fearful',
    'G4': 'Bored'
}

# Subjects S01 to S28
subjects = []
for i in range(1, 29):
    subject = f"S{str(i).zfill(2)}"
    subjects.append(subject)

# Games G1 to G4
games = []
for j in range(1, 5):
    game = f"G{j}"
    games.append(game)

# All data storage
all_data = []

# Load dataset context (ONE TIME for all files)
with kagglehub.load_dataset("wajahat1064/emotion-recognition-using-eeg-and-computer-games",".","PANDAS") as dataset:

    for subject in subjects:
        for game in games:
            base_name = subject + game + "AllChannels"
            file_name = base_name + ".csv"

            try:
                # Step 1: Get full path to file
                file_path = dataset.path(file_name)

                # Step 2: Load with pandas
                df = pd.read_csv(file_path)

                # Step 3: Drop unnamed/index columns
                columns_to_keep = []
                for col in df.columns:
                    if not col.startswith("Unnamed"):
                        columns_to_keep.append(col)
                df = df[columns_to_keep]

                # Step 4: Drop all-NaN columns
                df = df.dropna(axis=1, how='all')

                # Step 5: Impute
                imputer = KNNImputer(n_neighbors=5)
                df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

                # Step 6: Normalize
                scaler = StandardScaler()
                df[df.columns] = scaler.fit_transform(df[df.columns])

                # Step 7: Add metadata
                df["Subject"] = subject
                df["Game"] = game
                df["Emotion"] = emotion_map[game]

                all_data.append(df)
                print("✅ Loaded:", file_name)

            except Exception as e:
                print("❌ Error loading", file_name + ":", str(e))

# Combine all data
combined_df = pd.concat(all_data, ignore_index=True)

# Encode emotion for ML
label_encoder = LabelEncoder()
combined_df['Emotion_encoded'] = label_encoder.fit_transform(combined_df['Emotion'])

# Sort by Subject and Game
combined_df = combined_df.sort_values(by=['Subject', 'Game'])

# Final summary
print("📊 Final shape:", combined_df.shape)
print("🎯 Emotions:", label_encoder.classes_)
print(combined_df[['Subject', 'Game', 'Emotion']].drop_duplicates().head())

# Optional: Save to CSV
combined_df.to_csv("All_EEG_Combined.csv", index=False)
print("💾 Saved to All_EEG_Combined.csv")

  with kagglehub.load_dataset("wajahat1064/emotion-recognition-using-eeg-and-computer-games",".","PANDAS") as dataset:


KeyError: 'wajahat1064/emotion-recognition-using-eeg-and-computer-games'

In [None]:
# Visualize distributions to find outliers
plt.figure(figsize=(12, 6))
sns.boxplot(data=df)
plt.xticks(rotation=90)
plt.title("Boxplots to visualize outliers")
plt.show()

# Optionally: remove rows with outliers using IQR method
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

# Filter out outliers
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]


In [8]:
# Check for object or categorical columns
print("Non-numeric columns:", df.select_dtypes(include=['object', 'category']).columns)

# If any categorical columns are present, encode them
# Example: df['Emotion'] = df['Emotion'].map({'Happy': 0, 'Sad': 1, 'Angry': 2, ...})
# Or use one-hot encoding:
# df = pd.get_dummies(df, drop_first=True)


Non-numeric columns: Index([], dtype='object')


In [7]:
# Example: mean signal per row (across channels)
df['Signal_Mean'] = df.mean(axis=1)
df['Signal_Std'] = df.std(axis=1)

# Optional: Feature interaction
df['Mean_to_Std_Ratio'] = df['Signal_Mean'] / (df['Signal_Std'] + 1e-5)

# Drop unnamed columns with all NaN values
df = df.loc[:, df.columns[~df.columns.str.contains('^Unnamed')]]

print("Shape after all prep:", df.shape)
print(df.head())
