<a href="https://colab.research.google.com/github/mcurry572/4540/blob/main/HW7_4540_Michael_Curry.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The EEG dataset contains brain-activity data recorded from different subjects. Each row is a single EEG recording, with multiple feature columns describing different EEG signal parameters. The final column is the target variable, which can indicate a specific mental state, emotion, or classification category.

# Inputs and Outputs:
- **Inputs:** EEG signal features (numerical values representing brain activity)
- **Output:** A categorical variable indicating a specific class or mental state

# Classification or Regression problem?
In this dataset, the target variable has a limited number of unique values, which makes this a classification problem.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from google.colab import drive
import os

# Mount Google Drive so files stored there are reachable from the Colab runtime
drive.mount('/content/drive')

# Location of the EEG CSV inside the mounted drive
data_path = '/content/drive/My Drive/EEG_data.csv'

# Load the EEG dataset
df = pd.read_csv(data_path)

# Display basic information about the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Dataset Info:")
df.info()
print("\nFirst few rows:")
print(df.head())

# Identify input features and target variable
# Assuming the last column is the target (adjust if necessary)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Determine if it is a classification or regression problem.
# Class labels may be stored as floats (e.g. 0.0 / 1.0), so the dtype alone
# cannot decide this: a numeric target is treated as continuous only when it
# actually contains fractional values. Counting distinct values then separates
# binary from multi-class, and a target with fewer than two distinct values is
# reported as degenerate instead of being mislabelled as regression.
y_observed = y.dropna()
n_unique = y_observed.nunique()
has_fractional_values = (
    pd.api.types.is_numeric_dtype(y_observed)
    and bool((y_observed % 1 != 0).any())
)

if has_fractional_values:
    print("This is a regression problem.")
elif n_unique > 2:
    print("This is a multi-class classification problem.")
elif n_unique == 2:
    print("This is a binary classification problem.")
else:
    print("The target has fewer than two distinct values, so there is nothing to learn.")

# Hold out 20% of the rows as a test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scale the features using statistics learned from the training rows only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression baseline, scored on the held-out rows
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)
y_pred_lr = log_reg.predict(X_test_scaled)
print("Logistic Regression Performance:")
print(classification_report(y_test, y_pred_lr))

# Convert the scaled feature matrices to float32 tensors
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train_scaled, X_test_scaled)
)
# Convert the label Series to int64 tensors
y_train_tensor, y_test_tensor = (
    torch.tensor(labels.values, dtype=torch.long)
    for labels in (y_train, y_test)
)

# Wrap the tensors in datasets and mini-batch loaders
# (training batches are reshuffled every epoch; test order is kept fixed)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define Neural Network Model
class NeuralNet(nn.Module):
    """Feed-forward classifier with one hidden ReLU layer that outputs logits.

    ``forward`` returns raw, unnormalized class scores. ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it already-softmaxed outputs
    squashes the gradients and slows or stalls training. Use
    ``predict_proba`` when class probabilities are needed. The arg-max over
    logits and over probabilities is the same, so predicted classes are
    unaffected.

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of output classes (default 2, i.e. binary).
        hidden_size: Width of the hidden layer (default 16).
    """

    def __init__(self, input_size, num_classes=2, hidden_size=16):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
        # Not used in forward(); kept so probabilities remain available.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for input ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

    def predict_proba(self, x):
        """Return per-class probabilities (each row sums to 1) for ``x``."""
        return self.softmax(self.forward(x))

# Build the network, the loss function, and the optimizer
_, input_size = X_train.shape  # one input unit per feature column
model = NeuralNet(input_size=input_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

def train_model(model, train_loader, criterion, optimizer, epochs=10):
    """Train ``model`` and report the mean mini-batch loss of every epoch.

    Args:
        model: Module to optimize; it is switched to training mode.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Callable mapping ``(outputs, labels)`` to a scalar loss.
        optimizer: Optimizer bound to ``model``'s parameters.
        epochs: Number of full passes over ``train_loader``.

    Returns:
        A list with one entry per epoch: the mean of that epoch's batch
        losses, or ``nan`` when the loader produced no batches.
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        batch_losses = []
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        # Average over the epoch: the last batch alone is a noisy estimate and
        # does not exist at all when the loader is empty.
        if batch_losses:
            epoch_loss = sum(batch_losses) / len(batch_losses)
        else:
            epoch_loss = float("nan")
        epoch_losses.append(epoch_loss)
        print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")
    return epoch_losses

# Run 10 training epochs over the shuffled training batches
train_model(model, train_loader, criterion, optimizer, epochs=10)

# Evaluate the Neural Network Model
def evaluate_model(model, test_loader):
    """Print a classification report for ``model`` over ``test_loader``.

    The model is put in evaluation mode and run without gradient tracking.
    For every batch the predicted class is the index of the largest output
    score; predictions and true labels from all batches are pooled before
    the report is printed.
    """
    model.eval()
    predictions = []
    targets = []
    with torch.no_grad():
        for batch_inputs, batch_targets in test_loader:
            scores = model(batch_inputs)
            best_class = torch.max(scores, 1)[1]
            predictions.extend(best_class.numpy())
            targets.extend(batch_targets.numpy())
    print("Neural Network Performance:")
    print(classification_report(targets, predictions))

# Score the trained network on the held-out test batches
evaluate_model(model, test_loader)

# Feature Engineering
# 1. Normalization is already done using StandardScaler
# 2. Remove outliers using the IQR method.
#    Only the signal columns are screened. Identifier and label columns hold
#    categorical codes, not measurements; applying the IQR rule to them can
#    drop whole subjects or videos, or every row of a minority class when the
#    column's IQR collapses to 0. If none of these columns exist, every column
#    is screened, as before.
non_signal_cols = ['SubjectID', 'VideoID', 'predefinedlabel', 'user-definedlabeln']
signal_cols = [col for col in df.columns if col not in non_signal_cols]
signals = df[signal_cols]
Q1 = signals.quantile(0.25)
Q3 = signals.quantile(0.75)
IQR = Q3 - Q1
is_outlier = (signals < (Q1 - 1.5 * IQR)) | (signals > (Q3 + 1.5 * IQR))
# Keep a row only if none of its signal values falls outside the IQR fences
df_cleaned = df[~is_outlier.any(axis=1)]

# 3. Select a subset of features (assuming first 5 features for simplicity)
# NOTE(review): per the printed schema the first five columns are SubjectID,
# VideoID, Attention, Mediation and Raw, so this keeps two identifier columns
# and leaves out the target — confirm that is intended.
df_selected = df_cleaned.iloc[:, :5]

# 4. Select a subset of SubjectIDs (if applicable): the first 10 distinct IDs
#    in order of appearance
if 'SubjectID' in df.columns:
    df_subjects = df_cleaned[df_cleaned['SubjectID'].isin(df_cleaned['SubjectID'].unique()[:10])]
else:
    df_subjects = df_cleaned  # No SubjectID column found

Mounted at /content/drive
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12811 entries, 0 to 12810
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   SubjectID           12811 non-null  float64
 1   VideoID             12811 non-null  float64
 2   Attention           12811 non-null  float64
 3   Mediation           12811 non-null  float64
 4   Raw                 12811 non-null  float64
 5   Delta               12811 non-null  float64
 6   Theta               12811 non-null  float64
 7   Alpha1              12811 non-null  float64
 8   Alpha2              12811 non-null  float64
 9   Beta1               12811 non-null  float64
 10  Beta2               12811 non-null  float64
 11  Gamma1              12811 non-null  float64
 12  Gamma2              12811 non-null  float64
 13  predefinedlabel     12811 non-null  float64
 14  user-definedlabeln  12811 non-null  float64
dtypes: float64(15