In [None]:
# 🧹 Clear any leftover Drive mount first (safe)
!fusermount -u /content/drive 2>/dev/null || echo "Not mounted yet"
!rm -rf /content/drive


Not mounted yet


In [None]:
# 🔗 1. Mount Drive
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
# 📂 2. Setup paths
import os
ROOT_IN  = '/content/drive/MyDrive/PHQ8_Project/transcripts'
ROOT_OUT = '/content/drive/MyDrive/PHQ8_Project/embeddings'
FINAL_CSV = '/content/drive/MyDrive/PHQ8_Project/bert_embeddings_all.csv'
os.makedirs(ROOT_OUT, exist_ok=True)

# 🧪 3. Sanity check: list 5 transcript files
import glob
files = sorted(glob.glob(f'{ROOT_IN}/*_transcript.csv'))
print(f'Found {len(files)} files:')
for f in files[:5]:
    print('-', os.path.basename(f))


Found 265 files:
- 300_transcript.csv
- 301_transcript.csv
- 302_transcript.csv
- 303_transcript.csv
- 304_transcript.csv


In [None]:
!pip install -q sentence-transformers tqdm

from sentence_transformers import SentenceTransformer
import pandas as pd, numpy as np
import re
from tqdm import tqdm

model = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dim

def pid_from_path(path):
    m = re.search(r'/(\d+)_', path)
    return m.group(1) if m else None

def find_text_col(df):
    candidates = ['text', 'utterance', 'sentence', 'transcript']
    cols = [c.lower() for c in df.columns]
    for name in candidates:
        if name in cols:
            return df.columns[cols.index(name)]
    return df.columns[0]

records = []
for path in tqdm(files):
    pid = pid_from_path(path)
    if not pid:
        print(f'Skipping: {path} (no ID found)')
        continue

    try:
        df = pd.read_csv(path, dtype=str)
        text_col = find_text_col(df)
        text = " ".join(df[text_col].dropna().astype(str))
        vec = model.encode(text, convert_to_numpy=True)
    except Exception as e:
        print(f"Error with {path}: {e}")
        continue

    cols = [f'bert_{i}' for i in range(vec.size)]
    records.append({'participant_id': pid, **dict(zip(cols, vec))})

# 💾 Save final embedding table
master = pd.DataFrame(records).set_index('participant_id').sort_index()
master.to_csv(FINAL_CSV)
print(f'✅ Saved: {FINAL_CSV} with shape {master.shape}')


100%|██████████| 265/265 [00:09<00:00, 26.62it/s]


✅ Saved: /content/drive/MyDrive/PHQ8_Project/bert_embeddings_all.csv with shape (265, 384)


In [None]:
import glob, os, re
from collections import defaultdict

# 🔧 Adjust folder paths
FOLDERS = {
    'eGeMAPS':   '/content/drive/MyDrive/PHQ8_Project/BoAW_eGeMAPS/',
    'MFCC':      '/content/drive/MyDrive/PHQ8_Project/BoAW_MFCC/',
    'OpenFace':  '/content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/',
    'BERT':      '/content/drive/MyDrive/PHQ8_Project/transcripts/',
}

# 🔎 Helper: extract participant ID (e.g., 300 from 300_transcript.csv)
def extract_id(filename):
    m = re.search(r'(\d+)', os.path.basename(filename))
    return m.group(1) if m else None

# 📊 Track participants per folder
participants_per_folder = {}
all_ids = set()

for name, path in FOLDERS.items():
    files = glob.glob(f"{path}/*.csv") + glob.glob(f"{path}/*.npy")
    ids = set(filter(None, [extract_id(f) for f in files]))
    participants_per_folder[name] = ids
    all_ids.update(ids)
    print(f"📂 {name}: {len(ids)} participants")

# 🔍 Optionally: check who is missing from each folder
print("\n🔍 Participants missing from folders:")
for name, ids in participants_per_folder.items():
    missing = sorted(all_ids - ids)
    if missing:
        print(f"❌ {name} is missing {len(missing)} participants: {missing[:10]}{'...' if len(missing) > 10 else ''}")
    else:
        print(f"✅ {name} has all participants.")


📂 eGeMAPS: 219 participants
📂 MFCC: 275 participants
📂 OpenFace: 275 participants
📂 BERT: 265 participants

🔍 Participants missing from folders:
❌ eGeMAPS is missing 56 participants: ['600', '602', '604', '605', '606', '607', '609', '615', '618', '619']...
✅ MFCC has all participants.
✅ OpenFace has all participants.
❌ BERT is missing 10 participants: ['673', '677', '680', '684', '692', '695', '697', '702', '703', '707']


In [None]:
import glob, os, re
from collections import defaultdict

# 🔧 Updated FOLDER paths
FOLDERS = {
    'eGeMAPS':   '/content/drive/MyDrive/PHQ8_Project/BoAW_eGeMAPS/**',
    'MFCC':      '/content/drive/MyDrive/PHQ8_Project/BoAW_MFCC/',
    'OpenFace':  '/content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/',
    'BERT':      '/content/drive/MyDrive/PHQ8_Project/transcripts/',
}

# 🔍 Extract participant ID from filename
def extract_id(filename):
    m = re.search(r'(\d+)', os.path.basename(filename))
    return m.group(1) if m else None

# 📊 Collect participants per folder
participants_per_folder = {}
all_ids = set()

print("🔍 Counting participants per folder...\n")

for name, path in FOLDERS.items():
    # 🪄 Match all .csv/.npy recursively if path ends with **
    pattern = f"{path}/*.csv" if not path.endswith('**') else f"{path}/**/*.csv"
    files = glob.glob(pattern, recursive=True) + glob.glob(pattern.replace('.csv', '.npy'), recursive=True)

    ids = set(filter(None, [extract_id(f) for f in files]))
    participants_per_folder[name] = ids
    all_ids.update(ids)

    print(f"📂 {name}: {len(ids)} participants")

# 🧼 Summary of missing participants
print("\n🔍 Missing participant check:")
for name, ids in participants_per_folder.items():
    missing = sorted(all_ids - ids)
    if missing:
        print(f"❌ {name} is missing {len(missing)} participants: {missing[:10]}{'...' if len(missing) > 10 else ''}")
    else:
        print(f"✅ {name} has all participants.")


🔍 Counting participants per folder...

📂 eGeMAPS: 275 participants
📂 MFCC: 275 participants
📂 OpenFace: 275 participants
📂 BERT: 265 participants

🔍 Missing participant check:
✅ eGeMAPS has all participants.
✅ MFCC has all participants.
✅ OpenFace has all participants.
❌ BERT is missing 10 participants: ['673', '677', '680', '684', '692', '695', '697', '702', '703', '707']


In [None]:
import pandas as pd, numpy as np, os, re, glob

# Folder structure as defined earlier
FOLDERS = {
    'eGeMAPS':   '/content/drive/MyDrive/PHQ8_Project/BoAW_eGeMAPS',
    'MFCC':      '/content/drive/MyDrive/PHQ8_Project/BoAW_MFCC',
    'OpenFace':  '/content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs',
}

def extract_pid(path):
    m = re.search(r'(\d+)', os.path.basename(path))
    return m.group(1) if m else None

def summarize_numeric_df(df, prefix):
    df = df.select_dtypes(include=[np.number])
    summary = {}
    for i, col in enumerate(df.columns):
        base = f"{prefix}_f{i}"
        summary[f"{base}_mean"] = df[col].mean()
        summary[f"{base}_std"] = df[col].std()
        summary[f"{base}_min"] = df[col].min()
        summary[f"{base}_max"] = df[col].max()
    return summary

# Step 1: collect all paths by participant
participant_files = {}

for modality, folder in FOLDERS.items():
    pattern = f"{folder}/**/*.csv" if modality == 'eGeMAPS' else f"{folder}/*.csv"
    paths = glob.glob(pattern, recursive=True)
    for path in paths:
        pid = extract_pid(path)
        if not pid:
            continue
        participant_files.setdefault(pid, {})[modality] = path

# Step 2: summarize each participant
rows = []

for pid, file_dict in participant_files.items():
    row = {}
    has_valid_data = False

    for modality, path in file_dict.items():
        try:
            df = pd.read_csv(path, header=None)
            stats = summarize_numeric_df(df, modality)
            row.update(stats)
            has_valid_data = True
        except Exception as e:
            print(f"⚠️ Failed to process {modality} for {pid}: {e}")

    if has_valid_data:
        row['participant_id'] = pid
        rows.append(row)

# Step 3: finalize
final_df = pd.DataFrame(rows)
final_df = final_df.set_index('participant_id')
output_path = '/content/drive/MyDrive/PHQ8_Project/final_modality_summary.csv'
final_df.to_csv(output_path)

print(f"✅ Final summary saved to {output_path}, shape: {final_df.shape}")


✅ Final summary saved to /content/drive/MyDrive/PHQ8_Project/final_modality_summary.csv, shape: (275, 1212)


# The later Part of transcripts


In [None]:
import pandas as pd

# Paths
modality_summary_path = '/content/drive/MyDrive/PHQ8_Project/final_modality_summary.csv'
bert_path             = '/content/drive/MyDrive/PHQ8_Project/bert_embeddings_all.csv'
output_path           = '/content/drive/MyDrive/PHQ8_Project/final_fused_summary.csv'

# Load both
modality_df = pd.read_csv(modality_summary_path).set_index('participant_id')
bert_df     = pd.read_csv(bert_path).set_index('participant_id')

# Merge
fused_df = modality_df.join(bert_df, how='inner')  # inner: only participants in both
fused_df.to_csv(output_path)

print(f"✅ Final fused dataset saved to {output_path}, shape: {fused_df.shape}")


✅ Final fused dataset saved to /content/drive/MyDrive/PHQ8_Project/final_fused_summary.csv, shape: (265, 1596)


In [None]:
# 📁 2. File paths
import glob, os
TRANSCRIPT_DIR = '/content/drive/MyDrive/PHQ8_Project/transcripts'
transcript_files = sorted(glob.glob(f'{TRANSCRIPT_DIR}/*_transcript.csv'))

print(f'📂 Found {len(transcript_files)} transcript files')

# 🧹 3. Basic sanity check
import pandas as pd

bad_files = []
for path in transcript_files:
    try:
        df = pd.read_csv(path)
        assert 'text' in df.columns and 'start' in df.columns and 'end' in df.columns
        assert not df['text'].isnull().all()
        assert len(df) > 0
    except Exception as e:
        bad_files.append((os.path.basename(path), str(e)))

if bad_files:
    print(f"⚠️ Found {len(bad_files)} problematic files:")
    for name, err in bad_files:
        print(f" - {name}: {err}")
else:
    print("✅ All transcript files passed basic sanity check.")

📂 Found 275 transcript files
✅ All transcript files passed basic sanity check.


In [None]:
import pandas as pd, numpy as np, re
from tqdm import tqdm

# 🔑 Define useful sets
FIRST_PERSON = {'i','me','my','mine','myself'}
NEGATIONS    = {'no','not',"don't","didn't","won't","can't","never"}

# 🛠️ Core extraction function
def extract_timing_lexical(path):
    df = pd.read_csv(path)
    pid = re.search(r'(\d+)', os.path.basename(path)).group(1)

    # text & timestamps
    texts = df['text'].dropna().astype(str).tolist()
    starts = df['start'].astype(float).tolist()
    ends   = df['end'].astype(float).tolist()

    features = {}
    features['participant_id'] = pid
    features['total_utterances'] = len(texts)
    features['total_speaking_time'] = max(ends) - min(starts) if ends else 0

    # pause durations
    pauses = [starts[i] - ends[i-1] for i in range(1, len(starts))]
    features['avg_pause_duration'] = np.mean(pauses) if pauses else 0
    features['max_pause_duration'] = max(pauses) if pauses else 0

    # tokenization
    all_text = " ".join(texts).lower()
    words = re.findall(r'\b\w+\b', all_text)

    features['total_words'] = len(words)
    features['avg_words_per_utterance'] = len(words) / len(texts) if texts else 0
    features['avg_word_length'] = np.mean([len(w) for w in words]) if words else 0
    features['lexical_diversity'] = len(set(words)) / len(words) if words else 0

    features['first_person_pronoun_count'] = sum(w in FIRST_PERSON for w in words)
    features['negation_count'] = sum(w in NEGATIONS for w in words)

    return features

# 🌀 Run over all participants
summary_rows = []
for path in tqdm(transcript_files):
    try:
        feats = extract_timing_lexical(path)
        summary_rows.append(feats)
    except Exception as e:
        print(f"❌ Failed on {path}: {e}")

# 🧾 Save as CSV
df_summary = pd.DataFrame(summary_rows).set_index('participant_id').sort_index()
OUT_CSV = '/content/drive/MyDrive/PHQ8_Project/transcript_summary_step2.csv'
df_summary.to_csv(OUT_CSV)
print(f"✅ Saved: {OUT_CSV} with shape {df_summary.shape}")


100%|██████████| 275/275 [00:02<00:00, 130.46it/s]

✅ Saved: /content/drive/MyDrive/PHQ8_Project/transcript_summary_step2.csv with shape (275, 10)





In [None]:
!pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer, util

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
transcript_files = sorted(glob.glob(f'{TRANSCRIPT_DIR}/*_transcript.csv'))
OUT_SEMANTIC = '/content/drive/MyDrive/PHQ8_Project/transcript_summary_step3.csv'

# Topics with seed words
SEED_TOPICS = {
    'sleep':    ['sleep','insomnia','nap','tired','dream'],
    'appetite': ['hunger','hungry','eat','snack','appetite'],
    'depression': ['sad','hopeless','worthless','down','empty'],
    'anxiety':  ['anxious','nervous','worried','panic','fear'],
    'stress':   ['stress','stressed','pressure','overwhelmed'],
    'ptsd':     ['trauma','flashback','nightmare'],
    'harm':     ['suicide','hurt','kill','cut','harm'],
}
SIM_THRESHOLD = 0.6

# Load MiniLM encoder
model = SentenceTransformer('all-MiniLM-L6-v2')

# Pre-encode seed words per topic
seed_embeddings = {
    topic: model.encode(words, convert_to_tensor=True)
    for topic, words in SEED_TOPICS.items()
}

# Featurization function
def extract_semantic_counts(path):
    df = pd.read_csv(path)
    pid = re.search(r'(\d+)', os.path.basename(path)).group(1)
    texts = df['text'].dropna().astype(str).tolist()

    words = list(set(re.findall(r'\b\w+\b', " ".join(texts).lower())))
    if not words:
        return {'participant_id': pid, **{f'{t}_word_count': 0 for t in SEED_TOPICS}}

    word_vecs = model.encode(words, convert_to_tensor=True)

    topic_counts = {}
    for topic, seed_vecs in seed_embeddings.items():
        sim = util.cos_sim(word_vecs, seed_vecs)  # shape (len(words), len(seed_words))
        counts = (sim.max(dim=1).values > SIM_THRESHOLD).sum().item()
        topic_counts[f'{topic}_word_count'] = counts

    return {'participant_id': pid, **topic_counts}

# Loop over all files
semantic_rows = []
for path in tqdm(transcript_files):
    try:
        row = extract_semantic_counts(path)
        semantic_rows.append(row)
    except Exception as e:
        print(f"❌ Error with {path}: {e}")

# Save to CSV
df_semantic = pd.DataFrame(semantic_rows).set_index('participant_id').sort_index()
df_semantic.to_csv(OUT_SEMANTIC)
print(f"✅ Saved: {OUT_SEMANTIC} with shape {df_semantic.shape}")

100%|██████████| 275/275 [00:32<00:00,  8.34it/s]

✅ Saved: /content/drive/MyDrive/PHQ8_Project/transcript_summary_step3.csv with shape (275, 7)





In [None]:
import pandas as pd

# 📍 Paths to Step 2 and Step 3 outputs
PATH_STEP2 = '/content/drive/MyDrive/PHQ8_Project/transcript_summary_step2.csv'
PATH_STEP3 = '/content/drive/MyDrive/PHQ8_Project/transcript_summary_step3.csv'
FINAL_OUT  = '/content/drive/MyDrive/PHQ8_Project/transcript_summary_all.csv'

# 🧾 Load both feature tables
df_timing = pd.read_csv(PATH_STEP2).set_index('participant_id')
df_semantic = pd.read_csv(PATH_STEP3).set_index('participant_id')

# 🔗 Join on participant_id
df_combined = df_timing.join(df_semantic, how='outer')  # outer to catch all even if mismatch

# 💾 Save final combined summary
df_combined.to_csv(FINAL_OUT)
print(f"✅ Final summary saved: {FINAL_OUT} — shape: {df_combined.shape}")


✅ Final summary saved: /content/drive/MyDrive/PHQ8_Project/transcript_summary_all.csv — shape: (275, 17)


In [None]:
import pandas as pd

# 📍 Paths to both inputs
MODALITY_CSV = '/content/drive/MyDrive/PHQ8_Project/final_modality_summary.csv'
TRANSCRIPT_CSV = '/content/drive/MyDrive/PHQ8_Project/transcript_summary_all.csv'
FINAL_MERGED = '/content/drive/MyDrive/PHQ8_Project/full_participant_summary.csv'

# 🔃 Load and merge
df_modal = pd.read_csv(MODALITY_CSV).set_index('participant_id')
df_trans = pd.read_csv(TRANSCRIPT_CSV).set_index('participant_id')

df_merged = df_modal.join(df_trans, how='outer').sort_index()

# 🧼 Missing check
missing = df_merged.isnull().sum()
print("🧩 Missing values by column (non-zero only):")
print(missing[missing > 0].sort_values(ascending=False))

# 💾 Save final dataset
df_merged.to_csv(FINAL_MERGED)
print(f"✅ Final full summary saved: {FINAL_MERGED} — shape: {df_merged.shape}")


🧩 Missing values by column (non-zero only):
OpenFace_f100_max    56
OpenFace_f0_mean     56
OpenFace_f0_std      56
OpenFace_f0_min      56
OpenFace_f96_max     56
                     ..
OpenFace_f2_mean     56
OpenFace_f1_max      56
OpenFace_f1_min      56
OpenFace_f1_std      56
OpenFace_f1_mean     56
Length: 404, dtype: int64
✅ Final full summary saved: /content/drive/MyDrive/PHQ8_Project/full_participant_summary.csv — shape: (275, 1229)


In [None]:
import os
import pandas as pd
import numpy as np
import glob

# ✅ Define your actual paths here
paths = {
    'transcript': '/content/drive/MyDrive/PHQ8_Project/transcript_summary_all.csv',
    'eGeMAPS': '/content/drive/MyDrive/PHQ8_Project/BoAW_eGeMAPS',
    'MFCC': '/content/drive/MyDrive/PHQ8_Project/BoAW_MFCC',
    'OpenFace': '/content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs'
}

# 🧠 Load transcript summary (participant_id should be a column)
transcript_df = pd.read_csv(paths['transcript'])

# 🆔 Helper: Extract participant ID from file name (e.g., "300_...csv")
def extract_pid(filename):
    return filename.split("_")[0]

# 📊 Stats computation function
def compute_statistics(df):
    df = df.select_dtypes(include=[np.number])  # only numeric columns
    stats = df.agg(['mean', 'std', 'min', 'max']).T
    stats.columns = [f"{stat}" for stat in stats.columns]
    return stats.reset_index(drop=True).iloc[0]

# 🔁 General processor for any modality folder (recursive)
def process_modality_folder(folder_path, modality_name):
    summary_dict = {}

    # 🔍 Get all .csv files in folder and subfolders
    csv_files = glob.glob(os.path.join(folder_path, "**/*.csv"), recursive=True)

    for file_path in csv_files:
        filename = os.path.basename(file_path)
        pid = extract_pid(filename)

        try:
            df = pd.read_csv(file_path)
            stats = compute_statistics(df)
            stats.index = [f"{modality_name}_{col}" for col in stats.index]
            summary_dict[pid] = stats
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")

    return pd.DataFrame.from_dict(summary_dict, orient='index').reset_index().rename(columns={'index': 'participant_id'})

# 🔄 Process all modalities
ege_df = process_modality_folder(paths['eGeMAPS'], "eGeMAPS")
mfcc_df = process_modality_folder(paths['MFCC'], "MFCC")
openface_df = process_modality_folder(paths['OpenFace'], "OpenFace")

# 🧬 Merge all with transcript summary
merged = transcript_df.copy()
merged['participant_id'] = merged['participant_id'].astype(str)

for df in [ege_df, mfcc_df, openface_df]:
    df['participant_id'] = df['participant_id'].astype(str)
    merged = pd.merge(merged, df, on='participant_id', how='left')

# 💾 Save final output
output_path = '/content/drive/MyDrive/PHQ8_Project/final_multimodal_summary.csv'
merged.to_csv(output_path, index=False)

print(f"✅ Final multimodal summary saved at: {output_path}")


❌ Error processing 600_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv: No objects to concatenate
❌ Error processing 602_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv: No objects to concatenate
❌ Error processing 604_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv: No objects to concatenate
❌ Error processing 605_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv: No objects to concatenate
❌ Error processing 606_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv: No objects to concatenate
❌ Error processing 607_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv: No objects to concatenate
❌ Error processing 609_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv: No objects to concatenate
❌ Error processing 615_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv: No objects to concatenate
❌ Error processing 618_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv: No objects to concatenate
❌ Error processing 619_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv: No objects to concatenate
❌ Error processing 620_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv: No objects to concatenate
❌ Error processing 622_BoVW_open

In [None]:
import os
import pandas as pd
import numpy as np
import glob

# ✅ Path setup (adjust if needed)
paths = {
    'transcript': '/content/drive/MyDrive/PHQ8_Project/transcript_summary_all.csv',
    'eGeMAPS': '/content/drive/MyDrive/PHQ8_Project/BoAW_eGeMAPS',
    'MFCC': '/content/drive/MyDrive/PHQ8_Project/BoAW_MFCC',
    'OpenFace': '/content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs'
}

# 🧠 Load transcript summary
transcript_df = pd.read_csv(paths['transcript'])
transcript_df['participant_id'] = transcript_df['participant_id'].astype(str)

# 🆔 Extract participant ID from file name (e.g., 600_...)
def extract_pid(filename):
    return filename.split("_")[0]

# 🔍 Smart separator detector
def smart_read_csv(file_path):
    with open(file_path, 'r') as f:
        sample = f.readline()
        delimiter = ',' if sample.count(',') > sample.count(';') else ';'
    return pd.read_csv(file_path, sep=delimiter)

# 📊 Compute basic stats
def compute_statistics(df):
    df = df.select_dtypes(include=[np.number])
    if df.empty or df.dropna(how='all').empty:
        raise ValueError("Empty or all-NaN dataframe")
    stats = df.agg(['mean', 'std', 'min', 'max']).T
    stats.columns = [f"{stat}" for stat in stats.columns]
    return stats.reset_index(drop=True).iloc[0]

# 🔁 Process each modality folder recursively
def process_modality_folder(folder_path, modality_name):
    summary_dict = {}
    csv_files = glob.glob(os.path.join(folder_path, "**/*.csv"), recursive=True)

    for file_path in csv_files:
        filename = os.path.basename(file_path)
        pid = extract_pid(filename)

        try:
            df = smart_read_csv(file_path)
            stats = compute_statistics(df)
            stats.index = [f"{modality_name}_{col}" for col in stats.index]
            summary_dict[pid] = stats
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")

    return pd.DataFrame.from_dict(summary_dict, orient='index').reset_index().rename(columns={'index': 'participant_id'})

# 🔄 Process all three modalities
ege_df = process_modality_folder(paths['eGeMAPS'], "eGeMAPS")
mfcc_df = process_modality_folder(paths['MFCC'], "MFCC")
openface_df = process_modality_folder(paths['OpenFace'], "OpenFace")

# 🧬 Merge with transcript
merged = transcript_df.copy()
for df in [ege_df, mfcc_df, openface_df]:
    df['participant_id'] = df['participant_id'].astype(str)
    merged = pd.merge(merged, df, on='participant_id', how='left')

# 💾 Save output
output_path = '/content/drive/MyDrive/PHQ8_Project/final_multimodal_summary.csv'
merged.to_csv(output_path, index=False)

print(f"✅ Final multimodal summary saved at: {output_path}")


✅ Final multimodal summary saved at: /content/drive/MyDrive/PHQ8_Project/final_multimodal_summary.csv


In [None]:
import pandas as pd, numpy as np, os, glob, re
from tqdm import tqdm

# 📁 Set path
OPENFACE_DIR = '/content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs'
OUT_CSV = '/content/drive/MyDrive/PHQ8_Project/BoVW_openface_summary.csv'

# ✅ Safe CSV loader with semicolon fallback
def safe_read_openface(path):
    try:
        df = pd.read_csv(path)
        if df.shape[1] == 1:
            df = pd.read_csv(path, sep=';')
        if df.shape[1] < 10:
            raise ValueError("Too few columns")
        return df.select_dtypes(include=np.number)
    except Exception as e:
        print(f"❌ Skipping {os.path.basename(path)}: {e}")
        return None

# 📊 Summary function
def summarize_openface_file(path):
    pid = re.search(r'(\d+)', os.path.basename(path)).group(1)
    df = safe_read_openface(path)
    if df is None or df.empty:
        return None

    stats = {}
    for col in df.columns:
        stats[f'{col}_mean'] = df[col].mean()
        stats[f'{col}_std'] = df[col].std()
        stats[f'{col}_min'] = df[col].min()
        stats[f'{col}_max'] = df[col].max()
    stats['participant_id'] = pid
    return stats

# 🔄 Loop through all OpenFace files
paths = sorted(glob.glob(f'{OPENFACE_DIR}/*.csv'))
rows = [summarize_openface_file(p) for p in tqdm(paths)]
rows = [r for r in rows if r is not None]

# 💾 Save summary
df_summary = pd.DataFrame(rows).set_index('participant_id').sort_index()
df_summary.to_csv(OUT_CSV)
print(f"✅ BoVW summary saved: {OUT_CSV} with shape {df_summary.shape}")


100%|██████████| 276/276 [00:37<00:00,  7.30it/s]


✅ BoVW summary saved: /content/drive/MyDrive/PHQ8_Project/BoVW_openface_summary.csv with shape (276, 744)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load a clean file to use as column reference
ref_path = '/content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/301_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv'
ref_df = pd.read_csv(ref_path)
ref_columns = ref_df.select_dtypes(include='number').columns.tolist()


In [None]:
import os

def safe_read_openface(path, ref_columns=None):
    try:
        df = pd.read_csv(path)
        if df.shape[1] == 1:
            df = pd.read_csv(path, sep=';')

        # Drop filename string column if present
        first_col = df.columns[0]
        if df[first_col].astype(str).str.contains('.csv').all():
            df = df.drop(columns=[first_col])

        # Keep only numeric columns
        df = df.select_dtypes(include='number')

        # Rename to match reference if shape matches
        if ref_columns and len(df.columns) == len(ref_columns):
            df.columns = ref_columns

        return df if not df.empty else None

    except Exception as e:
        print(f"❌ Skipping {os.path.basename(path)}: {e}")
        return None


In [None]:
import numpy as np
import re

def summarize_openface_file(path, ref_columns):
    pid = re.search(r'(\d+)', os.path.basename(path)).group(1)
    df = safe_read_openface(path, ref_columns)
    if df is None:
        return None

    summary = {}
    for col in df.columns:
        summary[f'{col}_mean'] = df[col].mean()
        summary[f'{col}_std'] = df[col].std()
        summary[f'{col}_min'] = df[col].min()
        summary[f'{col}_max'] = df[col].max()

    summary['participant_id'] = pid
    return summary


In [None]:
import glob
from tqdm import tqdm

OPENFACE_DIR = '/content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs'
OUT_CSV = '/content/drive/MyDrive/PHQ8_Project/BoVW_openface_summary_all_cleaned.csv'

# List all OpenFace CSV files
all_files = sorted(glob.glob(f'{OPENFACE_DIR}/*.csv'))

# Build summary row for each file
rows = []
for path in tqdm(all_files):
    summary = summarize_openface_file(path, ref_columns)
    if summary:
        rows.append(summary)

# Create final dataframe
summary_df = pd.DataFrame(rows).set_index('participant_id').sort_index()
summary_df.to_csv(OUT_CSV)

print(f"✅ Final summary saved: {OUT_CSV} — Shape: {summary_df.shape}")


100%|██████████| 276/276 [00:47<00:00,  5.75it/s]

✅ Final summary saved: /content/drive/MyDrive/PHQ8_Project/BoVW_openface_summary_all_cleaned.csv — Shape: (276, 404)





In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Load the OpenFace summary CSV
summary_path = '/content/drive/MyDrive/PHQ8_Project/BoVW_openface_summary_all_cleaned.csv'
df = pd.read_csv(summary_path)

# Load a clean file (e.g., 301) for reference column names
ref_301 = pd.read_csv('/content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/301_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv')
ref_cols = ref_301.select_dtypes(include='number').columns.tolist()


In [None]:
# Dictionary for renaming known OpenFace features
pretty_map = {
    'pose_Tx': 'head_pos_x',
    'pose_Ty': 'head_pos_y',
    'pose_Tz': 'head_pos_z',
    'pose_Rx': 'head_rot_x',
    'pose_Ry': 'head_rot_y',
    'pose_Rz': 'head_rot_z',
    'gaze_angle_x': 'gaze_dir_x',
    'gaze_angle_y': 'gaze_dir_y',
    'AU01_r': 'inner_brow_raise',
    'AU02_r': 'outer_brow_raise',
    'AU04_r': 'brow_lower',
    'AU05_r': 'upper_lid_raise',
    'AU06_r': 'cheek_raise',
    'AU07_r': 'lid_tighten',
    'AU09_r': 'nose_wrinkle',
    'AU10_r': 'upper_lip_raise',
    'AU12_r': 'lip_corner_pull',
    'AU14_r': 'dimpler',
    'AU15_r': 'lip_corner_depress',
    'AU17_r': 'chin_raise',
    'AU20_r': 'lip_stretch',
    'AU23_r': 'lip_tighten',
    'AU25_r': 'lips_part',
    'AU26_r': 'jaw_drop',
    'AU28_r': 'lip_suck',
    'AU45_r': 'blink'
}


In [None]:
rename_dict = {}
for raw_col in ref_cols:
    readable = pretty_map.get(raw_col, raw_col)
    for stat in ['mean', 'std', 'min', 'max']:
        full_raw = f'{raw_col}_{stat}'
        full_new = f'{readable}_{stat}'
        rename_dict[full_raw] = full_new


In [None]:
# Filter columns: only those that exist in our renaming dict + participant_id
valid_cols = ['participant_id'] + [col for col in df.columns if col in rename_dict]

# Subset and rename
df_clean = df[valid_cols].rename(columns=rename_dict)


In [None]:
out_path = '/content/drive/MyDrive/PHQ8_Project/BoVW_openface_summary_cleaned_named.csv'
df_clean.to_csv(out_path, index=False)

print(f'✅ Saved cleaned summary to {out_path} — shape: {df_clean.shape}')


✅ Saved cleaned summary to /content/drive/MyDrive/PHQ8_Project/BoVW_openface_summary_cleaned_named.csv — shape: (276, 405)


# Clean Columns


In [None]:
# Use a clean file (e.g., 301) to extract reference columns
clean_path = '/content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/301_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv'
ref_df = pd.read_csv(clean_path)
ref_cols = ref_df.select_dtypes(include='number').columns.tolist()


In [None]:
def summarize_broken_file(path, ref_cols):
    try:
        # Load with correct delimiter
        df = pd.read_csv(path, sep=';', header=None)

        # Drop first column if it's metadata
        df = df.drop(columns=[0])

        # Assign correct column names
        df.columns = ref_cols[:df.shape[1]]

        # Compute summary statistics
        summary = df.describe().loc[['mean', 'std', 'min', 'max']]
        summary.columns = [f"{col}_{stat}" for col in summary.columns for stat in ['mean', 'std', 'min', 'max']]
        summary = summary.iloc[0:1]  # keep only one row per participant

        return summary
    except Exception as e:
        print(f"❌ Error: {path} — {e}")
        return None


In [None]:
# Path to broken OpenFace files
folder = '/content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/'

# Participant IDs that are broken
broken_ids = [
    '600', '602', '604', '605', '606', '607', '609', '615', '618', '619', '620', '622', '623', '624',
    '625', '626', '629', '631', '634', '635', '636', '637', '638', '640', '649', '650', '651', '652',
    '655', '656', '658', '659', '661', '663', '664', '666', '669', '676', '679', '682', '683', '688',
    '689', '691', '693', '696', '699', '705', '708', '709', '710', '712', '715', '716', '717', '718'
]  # Add all broken ones

all_rows = []

for pid in tqdm(broken_ids):
    file_path = os.path.join(folder, f'{pid}_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv')
    summary = summarize_broken_file(file_path, ref_cols)
    if summary is not None:
        summary.insert(0, 'participant_id', pid)
        all_rows.append(summary)

# Combine all
final_df = pd.concat(all_rows, ignore_index=True)
final_df.to_csv('/content/drive/MyDrive/BoVW_openface_fixed_summary.csv', index=False)
print("✅ Saved summary with corrected columns.")


  4%|▎         | 2/56 [00:00<00:10,  5.30it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/600_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/602_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


  5%|▌         | 3/56 [00:00<00:11,  4.81it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/604_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


  9%|▉         | 5/56 [00:01<00:10,  4.95it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/605_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/606_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 11%|█         | 6/56 [00:01<00:09,  5.12it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/607_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 12%|█▎        | 7/56 [00:01<00:10,  4.83it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/609_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 14%|█▍        | 8/56 [00:01<00:11,  4.36it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/615_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/618_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 18%|█▊        | 10/56 [00:02<00:10,  4.52it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/619_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 21%|██▏       | 12/56 [00:02<00:09,  4.77it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/620_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/622_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 25%|██▌       | 14/56 [00:02<00:09,  4.60it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/623_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/624_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 29%|██▊       | 16/56 [00:03<00:07,  5.02it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/625_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/626_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 32%|███▏      | 18/56 [00:03<00:07,  5.06it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/629_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/631_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 36%|███▌      | 20/56 [00:04<00:06,  5.43it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/634_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/635_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 38%|███▊      | 21/56 [00:04<00:07,  4.80it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/636_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 39%|███▉      | 22/56 [00:04<00:07,  4.53it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/637_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 41%|████      | 23/56 [00:04<00:07,  4.33it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/638_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 43%|████▎     | 24/56 [00:05<00:08,  3.99it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/640_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 45%|████▍     | 25/56 [00:05<00:08,  3.77it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/649_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 46%|████▋     | 26/56 [00:05<00:08,  3.53it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/650_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 48%|████▊     | 27/56 [00:06<00:07,  3.69it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/651_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 50%|█████     | 28/56 [00:06<00:07,  3.59it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/652_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 52%|█████▏    | 29/56 [00:06<00:07,  3.56it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/655_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 54%|█████▎    | 30/56 [00:06<00:07,  3.65it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/656_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 55%|█████▌    | 31/56 [00:07<00:07,  3.47it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/658_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 57%|█████▋    | 32/56 [00:07<00:07,  3.21it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/659_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 61%|██████    | 34/56 [00:08<00:06,  3.58it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/661_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/663_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 62%|██████▎   | 35/56 [00:08<00:05,  3.77it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/664_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 64%|██████▍   | 36/56 [00:08<00:05,  3.96it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/666_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 68%|██████▊   | 38/56 [00:08<00:04,  4.49it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/669_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/676_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 70%|██████▉   | 39/56 [00:09<00:03,  4.44it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/679_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/682_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 73%|███████▎  | 41/56 [00:09<00:03,  4.46it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/683_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 77%|███████▋  | 43/56 [00:10<00:02,  4.71it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/688_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/689_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 80%|████████  | 45/56 [00:10<00:02,  4.73it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/691_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/693_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 84%|████████▍ | 47/56 [00:10<00:01,  5.11it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/696_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/699_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 88%|████████▊ | 49/56 [00:11<00:01,  5.24it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/705_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/708_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 91%|█████████ | 51/56 [00:11<00:00,  5.11it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/709_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/710_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 93%|█████████▎| 52/56 [00:11<00:00,  4.92it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/712_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 96%|█████████▋| 54/56 [00:12<00:00,  4.83it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/715_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements
❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/716_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


 98%|█████████▊| 55/56 [00:12<00:00,  4.75it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/717_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements


100%|██████████| 56/56 [00:12<00:00,  4.41it/s]

❌ Error: /content/drive/MyDrive/PHQ8_Project/BoVW_Pose_Gaze_AUs/718_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv — Length mismatch: Expected axis has 101 elements, new values have 404 elements





ValueError: No objects to concatenate

In [None]:
import pandas as pd

# Step 1: Load both CSVs
full_df = pd.read_csv('full_participant_summary.csv')
bovw_df = pd.read_csv('BoVW_openface_summary_all_cleaned.csv')

# Step 2: Drop all columns related to 'openface' (case-insensitive just to be safe)
columns_to_drop = [col for col in full_df.columns if 'openface' in col.lower()]
full_df_cleaned = full_df.drop(columns=columns_to_drop)

# Step 3: Merge BoVW features
# Assuming both files have a common key like 'participant_id' or similar
# Adjust `on='participant_id'` below with your actual key column
merged_df = pd.merge(full_df_cleaned, bovw_df, on='participant_id', how='left')

# Step 4: Save the intermediate merged file
merged_df.to_csv('/content/updated_participant_summary.csv', index=False)

print("✅ Merging complete and openface features replaced!")


✅ Merging complete and openface features replaced!


In [None]:
import pandas as pd

# Load already updated summary
updated_df = pd.read_csv('updated_participant_summary.csv')

# Load transcript features
transcript_df = pd.read_csv('transcript_summary_all.csv')

# Merge on 'participant_id'
final_df = pd.merge(updated_df, transcript_df, on='participant_id', how='left')

# Save the final combined CSV
final_df.to_csv('/content/participant_summary.csv', index=False)

print("🎉 Final participant_summary.csv is ready with transcript features added!")


🎉 Final participant_summary.csv is ready with transcript features added!


In [None]:
import pandas as pd

# Load the CSV
df = pd.read_csv('participant_summary.csv')

# Rename columns starting with '0.0.' from 1 to 99 to openface_f{number}
new_columns = []
for col in df.columns:
    if col.startswith('0.0.'):
        suffix = col.split('0.0.')[-1]  # e.g. "1_mean", "23_max"
        parts = suffix.split('_')
        feature_num = parts[0]
        stat = parts[1] if len(parts) > 1 else ''
        new_col = f'openface_f{feature_num}_{stat}'
        new_columns.append(new_col)
    else:
        new_columns.append(col)

df.columns = new_columns

# Save the updated dataframe if you want
df.to_csv('participant_summary_renamed.csv', index=False)


# Raw openface files merging


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

def count_openface_files(folder_path):
    count = 0
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if 'openface' in file.lower():
                count += 1
    return count

# Replace this with your actual path to the phq8 folder
openface_folder = '/content/drive/MyDrive/PHQ8_Project/openface_files'
print("Number of OpenFace files:", count_openface_files(openface_folder))


Number of OpenFace files: 268


In [None]:
import pandas as pd
csv_path = 'participant_summary_.csv'
df = pd.read_csv(csv_path)

# Replace 'participant_id' with the actual column name in your CSV
participant_column = 'participant_id'  # adjust this as needed

# List of participant IDs from CSV
participants = df[participant_column].astype(str).tolist()

# Get the list of filenames in openface folder
openface_files = os.listdir(openface_folder)

# Check which participants are missing openface files
# Assuming openface files contain participant IDs in their filenames
missing_participants = []
for p in participants:
    # Check if any file contains participant ID string
    if not any(p in filename for filename in openface_files):
        missing_participants.append(p)

print(f"Total participants in CSV: {len(participants)}")
print(f"Participants missing OpenFace files: {len(missing_participants)}")
print("List of missing participants:")
print(missing_participants)

Total participants in CSV: 275
Participants missing OpenFace files: 0
List of missing participants:
[]


In [None]:
import os
import pandas as pd

openface_folder = '/content/drive/MyDrive/PHQ8_Project/openface_files'
exclude_columns = ['timestamp', 'confidence', 'frame', 'success']

def process_openface_file(file_path):
    df = pd.read_csv(file_path)

    if 'success' in df.columns:
        df = df[df['success'] == 1]

    feature_cols = [col for col in df.columns if col not in exclude_columns]

    stats = {}
    for col in feature_cols:
        stats[f'{col}_mean'] = df[col].mean()
        stats[f'{col}_std'] = df[col].std()
        stats[f'{col}_min'] = df[col].min()
        stats[f'{col}_max'] = df[col].max()

    return stats

summary_list = []

for filename in os.listdir(openface_folder):
    if filename.endswith('.csv'):
        participant_id = os.path.splitext(filename)[0]
        file_path = os.path.join(openface_folder, filename)
        stats = process_openface_file(file_path)
        stats['participant_id'] = participant_id
        summary_list.append(stats)

summary_df = pd.DataFrame(summary_list)
summary_df.to_csv('openface_summary_all_participants.csv', index=False)
print("Summary CSV created with shape:", summary_df.shape)


Summary CSV created with shape: (275, 197)


In [None]:
import pandas as pd

# Load CSVs
participant_summary_path = 'participant_summary_.csv'
openface_summary_path = 'openface_summary_cleaned.csv'

participants_df = pd.read_csv(participant_summary_path)
openface_summary_df = pd.read_csv(openface_summary_path)

# Assuming 'participant_id' is the column in both files (adjust if needed)
participant_ids = set(participants_df['participant_id'].astype(str))
openface_ids = set(openface_summary_df['participant_id'].astype(str))

# Check if all openface participants exist in participant summary
missing_in_summary = openface_ids - participant_ids

if missing_in_summary:
    print(f"Participants present in OpenFace summary but missing in participant summary: {len(missing_in_summary)}")
    print(missing_in_summary)
else:
    print("All participants in OpenFace summary are present in participant summary CSV.")


All participants in OpenFace summary are present in participant summary CSV.


In [None]:
import pandas as pd

# Load the CSV
file_path = 'openface_summary_all_participants.csv'
df = pd.read_csv(file_path)

# Show columns for confirmation
print("Original columns:", df.columns.tolist())

# Get the name of the last column
last_col = df.columns[-1]

# Extract the participant_id as the numeric part at the start of last_col's values
# For example, from "300_someOtherText" keep "300"
# Assuming participant_id is the digits at the start before any non-digit characters
df['participant_id'] = df[last_col].astype(str).str.extract(r'^(\d+)')

# Reorder columns: participant_id first, then all except the last column
cols = ['participant_id'] + list(df.columns[:-1])

df = df[cols]

# Optional: sort by participant_id (convert to int for sorting)
df['participant_id'] = df['participant_id'].astype(int)
df = df.sort_values(by='participant_id').reset_index(drop=True)

# Save cleaned CSV
df.to_csv('openface_summary_cleaned.csv', index=False)

print("Cleaned CSV saved with participant_id as first column.")
print(df.head())


Original columns: ['pose_Tx_mean', 'pose_Tx_std', 'pose_Tx_min', 'pose_Tx_max', 'pose_Ty_mean', 'pose_Ty_std', 'pose_Ty_min', 'pose_Ty_max', 'pose_Tz_mean', 'pose_Tz_std', 'pose_Tz_min', 'pose_Tz_max', 'pose_Rx_mean', 'pose_Rx_std', 'pose_Rx_min', 'pose_Rx_max', 'pose_Ry_mean', 'pose_Ry_std', 'pose_Ry_min', 'pose_Ry_max', 'pose_Rz_mean', 'pose_Rz_std', 'pose_Rz_min', 'pose_Rz_max', 'gaze_0_x_mean', 'gaze_0_x_std', 'gaze_0_x_min', 'gaze_0_x_max', 'gaze_0_y_mean', 'gaze_0_y_std', 'gaze_0_y_min', 'gaze_0_y_max', 'gaze_0_z_mean', 'gaze_0_z_std', 'gaze_0_z_min', 'gaze_0_z_max', 'gaze_1_x_mean', 'gaze_1_x_std', 'gaze_1_x_min', 'gaze_1_x_max', 'gaze_1_y_mean', 'gaze_1_y_std', 'gaze_1_y_min', 'gaze_1_y_max', 'gaze_1_z_mean', 'gaze_1_z_std', 'gaze_1_z_min', 'gaze_1_z_max', 'gaze_angle_x_mean', 'gaze_angle_x_std', 'gaze_angle_x_min', 'gaze_angle_x_max', 'gaze_angle_y_mean', 'gaze_angle_y_std', 'gaze_angle_y_min', 'gaze_angle_y_max', 'AU01_r_mean', 'AU01_r_std', 'AU01_r_min', 'AU01_r_max', 'AU02_

In [None]:
import pandas as pd

# Load the participant summary CSV
participant_path = 'participant_summary_.csv'
df = pd.read_csv(participant_path)

# Find the position/index of the first column starting with 'openface_f1_mean'
cols = df.columns.tolist()
start_index = None
for i, col in enumerate(cols):
    if col.startswith('openface_f1_mean'):
        start_index = i
        break

if start_index is not None:
    # Keep columns before this index only
    df_reduced = df.iloc[:, :start_index]
else:
    # If no such column found, keep all columns
    df_reduced = df

print(f"Original columns: {len(cols)}")
print(f"Columns after removing from 'openface_f1_mean' onwards: {df_reduced.shape[1]}")

# Save the reduced dataframe if needed
df_reduced.to_csv('participant_summary_reduced.csv', index=False)

print("Reduced participant summary saved.")
print(df_reduced.head())


Original columns: 1218
Columns after removing from 'openface_f1_mean' onwards: 801
Reduced participant summary saved.
   participant_id  eGeMAPS_f1_mean  eGeMAPS_f1_std  eGeMAPS_f1_min  \
0             300         0.339651        0.412770               0   
1             301         0.417114        0.383984               0   
2             302         0.431654        0.399046               0   
3             303         0.688838        0.391137               0   
4             304         0.594773        0.377326               0   

   eGeMAPS_f1_max  eGeMAPS_f2_mean  eGeMAPS_f2_std  eGeMAPS_f2_min  \
0        1.633468         0.309486        0.527131               0   
1        1.462398         0.508682        0.565141               0   
2        1.414973         0.008801        0.083450               0   
3        1.544068         1.028125        0.639955               0   
4        1.477121         0.658804        0.659056               0   

   eGeMAPS_f2_max  eGeMAPS_f3_mean  ... 

In [None]:
import pandas as pd

# Load CSVs
participant_reduced_path = 'participant_summary_reduced.csv'  # file from previous step
openface_cleaned_path = 'openface_summary_cleaned.csv'
transcript_summary_path = 'transcript_summary_all.csv'

participant_df = pd.read_csv(participant_reduced_path)
openface_df = pd.read_csv(openface_cleaned_path)
transcript_df = pd.read_csv(transcript_summary_path)

# Check participant_id column names (adjust if different)
print("Participant columns in each file:")
print("Participant reduced:", participant_df.columns[0])
print("OpenFace cleaned:", openface_df.columns[0])
print("Transcript summary:", transcript_df.columns[0])

# Merge participant reduced with openface features
merged_df = participant_df.merge(openface_df, on='participant_id', how='left')

# Merge the above with transcript features
merged_df = merged_df.merge(transcript_df, on='participant_id', how='left')

# Save final merged CSV
merged_df.to_csv('final_merged_summary.csv', index=False)

print("Final merged CSV saved with shape:", merged_df.shape)
print(merged_df.head())


Participant columns in each file:
Participant reduced: participant_id
OpenFace cleaned: participant_id
Transcript summary: participant_id
Final merged CSV saved with shape: (275, 1014)
   participant_id  eGeMAPS_f1_mean  eGeMAPS_f1_std  eGeMAPS_f1_min  \
0             300         0.339651        0.412770               0   
1             301         0.417114        0.383984               0   
2             302         0.431654        0.399046               0   
3             303         0.688838        0.391137               0   
4             304         0.594773        0.377326               0   

   eGeMAPS_f1_max  eGeMAPS_f2_mean  eGeMAPS_f2_std  eGeMAPS_f2_min  \
0        1.633468         0.309486        0.527131               0   
1        1.462398         0.508682        0.565141               0   
2        1.414973         0.008801        0.083450               0   
3        1.544068         1.028125        0.639955               0   
4        1.477121         0.658804        0.

In [None]:
import pandas as pd

# Load the final merged CSV
file_path = 'final_merged_summary.csv'
df = pd.read_csv(file_path)

# Check for missing values
missing_summary = df.isnull().sum()

# Count total missing values per column and overall
total_missing = missing_summary.sum()
columns_with_missing = missing_summary[missing_summary > 0]

print(f"Total missing values in dataset: {total_missing}")
print(f"Columns with missing values and their counts:")
print(columns_with_missing)

# Optionally, show percentage missing per column
missing_percent = (columns_with_missing / len(df)) * 100
print("\nPercentage of missing values per column:")
print(missing_percent)

# If you want, display the top few rows with missing values
rows_with_missing = df[df.isnull().any(axis=1)]
print(f"\nNumber of rows with missing values: {len(rows_with_missing)}")
print(rows_with_missing.head())


Total missing values in dataset: 0
Columns with missing values and their counts:
Series([], dtype: int64)

Percentage of missing values per column:
Series([], dtype: float64)

Number of rows with missing values: 0
Empty DataFrame
Columns: [participant_id, eGeMAPS_f1_mean, eGeMAPS_f1_std, eGeMAPS_f1_min, eGeMAPS_f1_max, eGeMAPS_f2_mean, eGeMAPS_f2_std, eGeMAPS_f2_min, eGeMAPS_f2_max, eGeMAPS_f3_mean, eGeMAPS_f3_std, eGeMAPS_f3_min, eGeMAPS_f3_max, eGeMAPS_f4_mean, eGeMAPS_f4_std, eGeMAPS_f4_min, eGeMAPS_f4_max, eGeMAPS_f5_mean, eGeMAPS_f5_std, eGeMAPS_f5_min, eGeMAPS_f5_max, eGeMAPS_f6_mean, eGeMAPS_f6_std, eGeMAPS_f6_min, eGeMAPS_f6_max, eGeMAPS_f7_mean, eGeMAPS_f7_std, eGeMAPS_f7_min, eGeMAPS_f7_max, eGeMAPS_f8_mean, eGeMAPS_f8_std, eGeMAPS_f8_min, eGeMAPS_f8_max, eGeMAPS_f9_mean, eGeMAPS_f9_std, eGeMAPS_f9_min, eGeMAPS_f9_max, eGeMAPS_f10_mean, eGeMAPS_f10_std, eGeMAPS_f10_min, eGeMAPS_f10_max, eGeMAPS_f11_mean, eGeMAPS_f11_std, eGeMAPS_f11_min, eGeMAPS_f11_max, eGeMAPS_f12_mean, eGe

In [None]:
import pandas as pd

# Load your file
df = pd.read_csv('final_merged_summary.csv')

# AU mapping dictionary
au_map = {
    'AU01': 'Inner_Brow_Raiser',
    'AU02': 'Outer_Brow_Raiser',
    'AU04': 'Brow_Lowerer',
    'AU05': 'Upper_Lid_Raiser',
    'AU06': 'Cheek_Raiser',
    'AU07': 'Lid_Tightener',
    'AU09': 'Nose_Wrinkler',
    'AU10': 'Upper_Lip_Raiser',
    'AU12': 'Lip_Corner_Puller',
    'AU14': 'Dimpler',
    'AU15': 'Lip_Corner_Depressor',
    'AU17': 'Chin_Raiser',
    'AU20': 'Lip_Stretcher',
    'AU23': 'Lip_Tightener',
    'AU25': 'Lips_Part',
    'AU26': 'Jaw_Drop',
    'AU28': 'Lip_Suck',
    'AU45': 'Blink'
}

def rename_openface_column(col):
    # Pose
    if col.startswith('pose_Tx'): return col.replace('pose_Tx', 'head_pos_x')
    if col.startswith('pose_Ty'): return col.replace('pose_Ty', 'head_pos_y')
    if col.startswith('pose_Tz'): return col.replace('pose_Tz', 'head_pos_z')
    if col.startswith('pose_Rx'): return col.replace('pose_Rx', 'head_rot_x')
    if col.startswith('pose_Ry'): return col.replace('pose_Ry', 'head_rot_y')
    if col.startswith('pose_Rz'): return col.replace('pose_Rz', 'head_rot_z')

    # Gaze
    if col.startswith('gaze_0_'): return col.replace('gaze_0_', 'left_gaze_')
    if col.startswith('gaze_1_'): return col.replace('gaze_1_', 'right_gaze_')

    # AU renaming
    for au_key, au_name in au_map.items():
        if col.startswith(au_key):
            # Remove '_r' or other suffixes, and append the statistic if any
            suffix = col[len(au_key):]  # e.g. '_mean', '_std'
            # Clean suffix for better naming if needed
            return au_name + suffix

    return col

# Rename columns
df.columns = [rename_openface_column(c) for c in df.columns]

# Save file with renamed columns
df.to_csv('final_merged_summary_renamed_AUs.csv', index=False)

print("Renamed columns with meaningful AU names. Sample:")
print(df.columns[:30])


Renamed columns with meaningful AU names. Sample:
Index(['participant_id', 'eGeMAPS_f1_mean', 'eGeMAPS_f1_std', 'eGeMAPS_f1_min',
       'eGeMAPS_f1_max', 'eGeMAPS_f2_mean', 'eGeMAPS_f2_std', 'eGeMAPS_f2_min',
       'eGeMAPS_f2_max', 'eGeMAPS_f3_mean', 'eGeMAPS_f3_std', 'eGeMAPS_f3_min',
       'eGeMAPS_f3_max', 'eGeMAPS_f4_mean', 'eGeMAPS_f4_std', 'eGeMAPS_f4_min',
       'eGeMAPS_f4_max', 'eGeMAPS_f5_mean', 'eGeMAPS_f5_std', 'eGeMAPS_f5_min',
       'eGeMAPS_f5_max', 'eGeMAPS_f6_mean', 'eGeMAPS_f6_std', 'eGeMAPS_f6_min',
       'eGeMAPS_f6_max', 'eGeMAPS_f7_mean', 'eGeMAPS_f7_std', 'eGeMAPS_f7_min',
       'eGeMAPS_f7_max', 'eGeMAPS_f8_mean'],
      dtype='object')
