In [1]:
import pandas as pd

# Load CSVs
eeg = pd.read_csv("../data/EEG_features_engineered.csv")
eye = pd.read_csv("../data/EYE_features_engineered.csv")
gsr = pd.read_csv("../data/GSR_features_engineered.csv")
ivt = pd.read_csv("../data/IVT_features_engineered.csv")
tiva = pd.read_csv("../data/TIVA_features_engineered.csv")
psy = pd.read_csv("../data/PSY_features_engineered.csv")  # labels

# 👉 Check what columns exist
print("EEG cols:", eeg.columns[:5])
print("PSY cols:", psy.columns[:5])


EEG cols: Index(['Key', 'Participant_ID', 'Start_ms', 'End_ms', 'eeg_mean_delta'], dtype='object')
PSY cols: Index(['Key', 'Participant_ID', 'Category', 'Difficulty', 'ResponseTime'], dtype='object')


In [2]:
import pandas as pd

# Load all CSVs
eeg = pd.read_csv("../data/EEG_features_engineered.csv")
eye = pd.read_csv("../data/EYE_features_engineered.csv")
gsr = pd.read_csv("../data/GSR_features_engineered.csv")
ivt = pd.read_csv("../data/IVT_features_engineered.csv")
tiva = pd.read_csv("../data/TIVA_features_engineered.csv")
psy = pd.read_csv("../data/PSY_features_engineered.csv")  # has 'Category' as labels

# Merge on Key + Participant_ID
data = eeg.merge(eye, on=["Key","Participant_ID"]) \
          .merge(gsr, on=["Key","Participant_ID"]) \
          .merge(ivt, on=["Key","Participant_ID"]) \
          .merge(tiva, on=["Key","Participant_ID"])

# Add labels
data["label"] = psy["Category"]

# Save combined dataset
data.to_csv("../data/combined.csv", index=False)

print("✅ Combined dataset created and saved in data/combined.csv")
print("Shape:", data.shape)
print("Label distribution:\n", data["label"].value_counts())


MergeError: Passing 'suffixes' which cause duplicate columns {'Start_ms_x', 'End_ms_x'} is not allowed.

In [3]:
import pandas as pd

# Load CSVs
eeg = pd.read_csv("../data/EEG_features_engineered.csv")
eye = pd.read_csv("../data/EYE_features_engineered.csv")
gsr = pd.read_csv("../data/GSR_features_engineered.csv")
ivt = pd.read_csv("../data/IVT_features_engineered.csv")
tiva = pd.read_csv("../data/TIVA_features_engineered.csv")
psy = pd.read_csv("../data/PSY_features_engineered.csv")  # labels

# Drop duplicate/time columns before merge (keep only feature columns)
# Keep 'Key' and 'Participant_ID' for merging
eeg_features = eeg.drop(columns=['Start_ms','End_ms'], errors='ignore')
eye_features = eye.drop(columns=['Start_ms','End_ms'], errors='ignore')
gsr_features = gsr.drop(columns=['Start_ms','End_ms'], errors='ignore')
ivt_features = ivt.drop(columns=['Start_ms','End_ms'], errors='ignore')
tiva_features = tiva.drop(columns=['Start_ms','End_ms'], errors='ignore')

# Merge all features on Key + Participant_ID
data = eeg_features.merge(eye_features, on=['Key','Participant_ID'], how='inner') \
                   .merge(gsr_features, on=['Key','Participant_ID'], how='inner') \
                   .merge(ivt_features, on=['Key','Participant_ID'], how='inner') \
                   .merge(tiva_features, on=['Key','Participant_ID'], how='inner')

# Add labels from PSY
data['label'] = psy['Category']

# Optional: convert text labels to numbers
label_map = {'Neutral':0, 'Positive':1, 'Negative':2}
data['label'] = data['label'].map(label_map)

# Save combined dataset
data.to_csv("../data/combined.csv", index=False)
print("✅ Combined dataset saved as combined.csv")
print("Shape:", data.shape)
print("Label distribution:\n", data['label'].value_counts())


✅ Combined dataset saved as combined.csv
Shape: (1508, 50)
Label distribution:
 Series([], Name: count, dtype: int64)


In [4]:
# Merge all features
data = eeg_features.merge(eye_features, on=['Key','Participant_ID'], how='inner') \
                   .merge(gsr_features, on=['Key','Participant_ID'], how='inner') \
                   .merge(ivt_features, on=['Key','Participant_ID'], how='inner') \
                   .merge(tiva_features, on=['Key','Participant_ID'], how='inner')

# Merge labels from PSY
data = data.merge(psy[['Key','Participant_ID','Category']], on=['Key','Participant_ID'], how='inner')

# Rename 'Category' to 'label' and convert to numbers
label_map = {'Neutral':0, 'Positive':1, 'Negative':2}
data['label'] = data['Category'].map(label_map)

# Drop the original 'Category' column
data = data.drop(columns=['Category'])

# Save
data.to_csv("../data/combined.csv", index=False)
print("✅ Combined dataset saved as combined.csv")
print("Shape:", data.shape)
print("Label distribution:\n", data['label'].value_counts())


✅ Combined dataset saved as combined.csv
Shape: (1572, 50)
Label distribution:
 Series([], Name: count, dtype: int64)


In [5]:
# Check types
print(eeg_features.dtypes[['Key','Participant_ID']])
print(psy.dtypes[['Key','Participant_ID']])

# Check unique values
print("EEG keys sample:", eeg_features['Key'].unique()[:10])
print("PSY keys sample:", psy['Key'].unique()[:10])
print("EEG Participant_ID sample:", eeg_features['Participant_ID'].unique()[:10])
print("PSY Participant_ID sample:", psy['Participant_ID'].unique()[:10])


Key               object
Participant_ID     int64
dtype: object
Key               object
Participant_ID     int64
dtype: object
EEG keys sample: ['1spl1' '1spl2' '1Item1' '1Item2' '1Item3' '1Item4' '1Item5' '1Item6'
 '1Item7' '1Item8']
PSY keys sample: ['1spl1' '1spl2' '1Item1' '1Item2' '1Item3' '1Item4' '1Item5' '1Item6'
 '1Item7' '1Item8']
EEG Participant_ID sample: [ 1  2  3  4  5  6  7  8  9 10]
PSY Participant_ID sample: [ 1  2  3  4  5  6  7  8  9 10]


In [6]:
print(psy['Category'].unique())
print(psy['Category'].isna().sum())


[1 2 3]
0


In [7]:
# If you want to remap 1,2,3 → 0,1,2
label_map = {1:0, 2:1, 3:2}  
psy['label'] = psy['Category'].map(label_map)

# Merge with features
data = eeg_features.merge(eye_features, on=['Key','Participant_ID'], how='inner') \
                   .merge(gsr_features, on=['Key','Participant_ID'], how='inner') \
                   .merge(ivt_features, on=['Key','Participant_ID'], how='inner') \
                   .merge(tiva_features, on=['Key','Participant_ID'], how='inner') \
                   .merge(psy[['Key','Participant_ID','label']], on=['Key','Participant_ID'], how='inner')

# Save
data.to_csv("../data/combined.csv", index=False)

print("✅ Combined dataset saved as combined.csv")
print("Shape:", data.shape)
print("Label distribution:\n", data['label'].value_counts())


✅ Combined dataset saved as combined.csv
Shape: (1572, 50)
Label distribution:
 label
2    670
1    455
0    447
Name: count, dtype: int64


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Load dataset
df = pd.read_csv("../data/combined.csv")

# Features and labels
X = df.drop(columns=["label"])
y = df["label"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Random Forest
model = RandomForestClassifier(class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "../models/randomforest_sentiment.pkl")
print("✅ Model saved at models/randomforest_sentiment.pkl")


ValueError: could not convert string to float: '3Item1'

In [9]:
X = df.drop(columns=["label", "Key", "Participant_ID"])


In [12]:
X = data.drop(columns=['label', 'Key', 'Participant_ID'])
y = data['label']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)





In [13]:
model = RandomForestClassifier(class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
model = RandomForestClassifier(class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
joblib.dump(model, "../models/randomforest_sentiment.pkl")
print("✅ Model saved at models/randomforest_sentiment.pkl")

FileNotFoundError: [Errno 2] No such file or directory: '../models/randomforest_sentiment.pkl'

In [16]:
import os
import joblib

# Make sure models/ folder exists
os.makedirs("../models", exist_ok=True)

# Save model
joblib.dump(model, "../models/randomforest_sentiment.pkl")
print("✅ Model saved at models/randomforest_sentiment.pkl")


✅ Model saved at models/randomforest_sentiment.pkl
