### Prepare Train data

In [None]:
# Step 1: Download and extract MELD if not done already
!wget -q --show-progress https://huggingface.co/datasets/declare-lab/MELD/resolve/main/MELD.Raw.tar.gz -O MELD.Raw.tar.gz
!tar -xzf MELD.Raw.tar.gz

# Step 2: Extract train audio files
!tar -xzf /content/MELD.Raw/train.tar.gz -C /content/MELD.Raw/



In [None]:
!tar -xzf /content/MELD.Raw/train.tar.gz -C /content/MELD.Raw/


In [None]:
import os
from glob import glob

video_dir = "/content/MELD.Raw/train_splits"
audio_dir = "/content/MELD.Raw/train_wav"
os.makedirs(audio_dir, exist_ok=True)

mp4_files = glob(os.path.join(video_dir, "*.mp4"))

from tqdm import tqdm
for mp4_path in tqdm(mp4_files, desc="Converting MP4 to WAV"):
    base = os.path.basename(mp4_path).replace(".mp4", ".wav")
    wav_path = os.path.join(audio_dir, base)
    # Use ffmpeg to convert (overwrite if exists)
    os.system(f'ffmpeg -y -i "{mp4_path}" -ar 16000 -ac 1 "{wav_path}" > /dev/null 2>&1')


Converting MP4 to WAV: 100%|██████████| 9989/9989 [22:59<00:00,  7.24it/s]


In [None]:
import os
import pandas as pd
import numpy as np
import json
import librosa
import pyloudnorm as pyln
import parselmouth
from tqdm import tqdm

# Load the MELD training set
train_df = pd.read_csv("/content/MELD.Raw/train_sent_emo.csv")

# Limit to first 1000 rows for manageable fine-tuning
train_df = train_df.head(4000).copy()

# === Step 1: Extract Acoustic Features ===
def extract_features(path, transcript):
    try:
        y, sr = librosa.load(path, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)
        if duration < 0.25 or np.max(np.abs(y)) < 1e-4:
            return [np.nan, np.nan, np.nan]

        snd = parselmouth.Sound(path)
        pitch = snd.to_pitch()
        f0 = pitch.selected_array['frequency']
        f0 = f0[f0 != 0]
        mean_pitch = np.mean(f0) if len(f0) > 0 else np.nan

        meter = pyln.Meter(sr)
        loudness = meter.integrated_loudness(y)

        num_words = len(transcript.split())
        num_syllables = num_words * 1.5
        speaking_rate = num_syllables / duration

        return [mean_pitch, loudness, speaking_rate]

    except Exception as e:
        return [np.nan, np.nan, np.nan]

# Build audio path column
audio_dir = "/content/MELD.Raw/train_wav"
def get_audio_path(row):
    fname = f"dia{row['Dialogue_ID']}_utt{row['Utterance_ID']}.wav"
    fpath = os.path.join(audio_dir, fname)
    return fpath if os.path.exists(fpath) else None

train_df["audio_path"] = train_df.apply(get_audio_path, axis=1)

# Extract features
acoustic_features = []
for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
    feats = extract_features(row["audio_path"], row["Utterance"])
    acoustic_features.append(feats)

acoustic_df = pd.DataFrame(acoustic_features, columns=["pitch", "loudness", "speaking_rate"])
train_df = pd.concat([train_df.reset_index(drop=True), acoustic_df.reset_index(drop=True)], axis=1)

# === Step 2: Add Context Columns ===
def add_context_columns(df):
    df_sorted = df.sort_values(by=["Dialogue_ID", "Utterance_ID"]).reset_index(drop=True)
    df_sorted["prev_utt"] = df_sorted.groupby("Dialogue_ID")["Utterance"].shift(1).fillna("")
    df_sorted["next_utt"] = df_sorted.groupby("Dialogue_ID")["Utterance"].shift(-1).fillna("")
    df_sorted["prev_speaker"] = df_sorted.groupby("Dialogue_ID")["Speaker"].shift(1).fillna("")
    df_sorted["next_speaker"] = df_sorted.groupby("Dialogue_ID")["Speaker"].shift(-1).fillna("")
    return df_sorted

train_df = add_context_columns(train_df)

# === Step 3: Verbalize Features and Build Prompts ===
def verbalize_features(pitch, loudness, speaking_rate):
    thresholds = {
        "pitch": (194, 267),
        "loudness": (-31.50, -28.76),
        "speaking_rate": (2.86, 4.74),
    }
    features = {"pitch": pitch, "loudness": loudness, "speaking_rate": speaking_rate}
    phrases = []
    for k, v in features.items():
        if np.isnan(v):
            level = "unknown"
        elif v < thresholds[k][0]:
            level = "low"
        elif v < thresholds[k][1]:
            level = "medium"
        else:
            level = "high"
        if k == "pitch":
            phrases.append(f"a {level} pitch")
        elif k == "loudness":
            phrases.append(f"a {level} volume")
        elif k == "speaking_rate":
            phrases.append(f"a {level} pace")
    return "The speaker speaks with " + ", ".join(phrases) + "."

def build_prompt(row):
    context = ""
    if row["prev_utt"]:
        context += f'{row["prev_speaker"]}: "{row["prev_utt"]}"\n'
    context += f'Current speaker: "{row["Speaker"]}"\n'
    context += f'Utterance: "{row["Utterance"]}"\n'
    if row["next_utt"]:
        context += f'{row["next_speaker"]}: "{row["next_utt"]}"\n'

    acoustic = verbalize_features(row.get("pitch", np.nan), row.get("loudness", np.nan), row.get("speaking_rate", np.nan))

    prompt = f"""
You are an expert in detecting emotions from speech. Prioritize vocal cues such as pitch, loudness, and speaking rate, especially when the text is ambiguous.

Conversation Context:
{context.strip()}

Speech Features:
{acoustic}

Choose the dominant emotion of the current speaker from:
["joy", "sadness", "anger", "fear", "surprise", "disgust", "neutral"]

Emotion:
""".strip()
    return prompt

# Filter and build prompts
formatted_rows = []
for _, row in train_df.iterrows():
    if row["Emotion"] not in ["joy", "sadness", "anger", "fear", "surprise", "disgust", "neutral"]:
        continue
    prompt = build_prompt(row)
    response = row["Emotion"].strip().lower()
    formatted_rows.append({"prompt": prompt, "response": response})

# Save JSONL
jsonl_path = "/content/meld_train_1000_lora.jsonl"
with open(jsonl_path, "w", encoding="utf-8") as f:
    for row in formatted_rows:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

jsonl_path


100%|██████████| 4000/4000 [02:15<00:00, 29.45it/s]


'/content/meld_train_1000_lora.jsonl'

In [None]:
train_df

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,audio_path,pitch,loudness,speaking_rate,prev_utt,next_utt,prev_speaker,next_speaker
0,1,also I was the point person on my companys tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731",/content/MELD.Raw/train_wav/dia0_utt0.wav,123.795280,-25.099835,4.229308,,You mustve had your hands full.,,The Interviewer
1,2,You mustve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442",/content/MELD.Raw/train_wav/dia0_utt1.wav,131.118371,-23.772574,6.114130,also I was the point person on my companys tr...,That I did. That I did.,Chandler,Chandler
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389",/content/MELD.Raw/train_wav/dia0_utt2.wav,130.369968,-28.452166,3.057065,You mustve had your hands full.,So lets talk a little bit about your duties.,The Interviewer,The Interviewer
3,4,So lets talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572",/content/MELD.Raw/train_wav/dia0_utt3.wav,163.262921,-24.244412,4.867825,That I did. That I did.,My duties? All right.,Chandler,Chandler
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917",/content/MELD.Raw/train_wav/dia0_utt4.wav,228.372679,-25.027272,0.925167,So lets talk a little bit about your duties.,"Now youll be heading a whole division, so you...",The Interviewer,The Interviewer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,4213,"Umm, y'know how the other day you were talking...",Joey,neutral,neutral,431,3,5,2,"0:14:23,529","0:14:29,421",/content/MELD.Raw/train_wav/dia431_utt3.wav,112.856880,-34.299267,6.623641,Yeah?,Yeah?,Phoebe,Phoebe
3996,4214,Yeah?,Phoebe,neutral,neutral,431,4,5,2,"0:14:29,421","0:14:29,930",/content/MELD.Raw/train_wav/dia431_utt4.wav,261.572288,-36.518944,3.056935,"Umm, y'know how the other day you were talking...","All right, well, we felt really bad about that...",Joey,Joey
3997,4215,"All right, well, we felt really bad about that...",Joey,joy,positive,431,5,5,2,"00:14:30,077","00:14:36,624",/content/MELD.Raw/train_wav/dia431_utt5.wav,148.698456,-30.917141,4.580633,Yeah?,"Ohh, that's so nice! How great! Well, where? W...",Phoebe,Phoebe
3998,4216,"Ohh, that's so nice! How great! Well, where? W...",Phoebe,anger,negative,431,6,5,2,"00:14:37,042","00:14:41,045",/content/MELD.Raw/train_wav/dia431_utt6.wav,379.141480,-29.607524,4.114008,"All right, well, we felt really bad about that...","Well, we thought we would all go to a picnic ,...",Joey,Monica


### Prepare Test Data

In [None]:
# Step 1: Download and extract MELD dataset
!wget -q --show-progress https://huggingface.co/datasets/declare-lab/MELD/resolve/main/MELD.Raw.tar.gz -O MELD.Raw.tar.gz
!tar -xzf MELD.Raw.tar.gz
!tar -xzf /content/MELD.Raw/test.tar.gz -C /content/MELD.Raw/



In [None]:
# Step 2: Extract audio from MP4s
import os
from glob import glob
from tqdm import tqdm

input_dir = "/content/MELD.Raw/output_repeated_splits_test/"
mp4_files = glob(os.path.join(input_dir, "*.mp4"))

for mp4_path in tqdm(mp4_files):
    base = os.path.splitext(os.path.basename(mp4_path))[0]
    wav_path = os.path.join(input_dir, f"{base}.wav")
    os.system(f"ffmpeg -y -i '{mp4_path}' -ac 1 -ar 16000 '{wav_path}'")

100%|██████████| 2747/2747 [06:28<00:00,  7.07it/s]


In [None]:
# Step 3: Load test metadata and map to audio paths
import pandas as pd

test_df = pd.read_csv("/content/MELD.Raw/test_sent_emo.csv")
audio_dir = "/content/MELD.Raw/output_repeated_splits_test"

def get_audio_path(row):
    fname = f"dia{row['Dialogue_ID']}_utt{row['Utterance_ID']}.wav"
    fpath = os.path.join(audio_dir, fname)
    return fpath if os.path.exists(fpath) else None

test_df["audio_path"] = test_df.apply(get_audio_path, axis=1)

In [None]:
# Step 4: Add dialogue context (previous/next utterance)
def add_context_columns(df):
    df_sorted = df.sort_values(by=["Dialogue_ID", "Utterance_ID"]).reset_index(drop=True)
    df_sorted["prev_utt"] = df_sorted.groupby("Dialogue_ID")["Utterance"].shift(1)
    df_sorted["next_utt"] = df_sorted.groupby("Dialogue_ID")["Utterance"].shift(-1)
    df_sorted["prev_speaker"] = df_sorted.groupby("Dialogue_ID")["Speaker"].shift(1)
    df_sorted["next_speaker"] = df_sorted.groupby("Dialogue_ID")["Speaker"].shift(-1)
    return df_sorted.fillna("")

test_df = add_context_columns(test_df)

In [None]:
!pip install praat-parselmouth
!pip install pyloudnorm

Collecting praat-parselmouth
  Downloading praat_parselmouth-0.4.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.9 kB)
Downloading praat_parselmouth-0.4.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: praat-parselmouth
Successfully installed praat-parselmouth-0.4.6
Collecting pyloudnorm
  Downloading pyloudnorm-0.1.1-py3-none-any.whl.metadata (5.6 kB)
Downloading pyloudnorm-0.1.1-py3-none-any.whl (9.6 kB)
Installing collected packages: pyloudnorm
Successfully installed pyloudnorm-0.1.1


In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import pyloudnorm as pyln
import parselmouth
from tqdm import tqdm

# Step 1: Define feature extractor
def extract_features(path, transcript):
    try:
        y, sr = librosa.load(path, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)
        if duration < 0.25 or np.max(np.abs(y)) < 1e-4:
            print(f"Skip (short/silent): {path}")
            return [np.nan] * 3

        snd = parselmouth.Sound(path)

        # Pitch via to_pitch
        pitch = parselmouth.praat.call(snd, "To Pitch", 0.0, 75, 600)
        avg_pitch = parselmouth.praat.call(pitch, "Get mean", 0, 0, "Hertz")

        # Loudness (only if long enough)
        meter = pyln.Meter(sr)
        if len(y) < sr * 0.4:
            loudness = np.nan
        else:
            loudness = meter.integrated_loudness(y)

        # Speaking rate
        num_words = len(transcript.split())
        num_syllables = num_words * 1.5
        speaking_rate = num_syllables / duration

        return [avg_pitch, loudness, speaking_rate]

    except Exception as e:
        print(f"Error {path}: {e}")
        return [np.nan] * 3



# Apply extraction
feature_columns = ["pitch", "loudness", "speaking_rate"]
features = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    feats = extract_features(row["audio_path"], row["Utterance"])
    features.append(feats)

feature_df = pd.DataFrame(features, columns=feature_columns)
test_df = pd.concat([test_df, feature_df], axis=1)

print("MELD Acoustic Features", test_df)


  9%|▉         | 238/2610 [00:17<00:53, 44.08it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia25_utt4.wav


 10%|▉         | 250/2610 [00:17<00:54, 43.49it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia25_utt15.wav


 11%|█▏        | 297/2610 [00:18<00:57, 40.00it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia32_utt3.wav


 13%|█▎        | 334/2610 [00:19<00:52, 43.69it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia35_utt1.wav


 17%|█▋        | 444/2610 [00:23<00:41, 52.49it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia48_utt6.wav


 18%|█▊        | 471/2610 [00:24<00:34, 61.30it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia51_utt1.wav


 19%|█▉        | 494/2610 [00:24<00:48, 43.79it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia54_utt6.wav


 22%|██▏       | 583/2610 [00:26<00:36, 55.85it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia65_utt5.wav


 24%|██▍       | 630/2610 [00:27<00:42, 47.09it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia70_utt13.wav


 28%|██▊       | 731/2610 [00:30<00:35, 53.07it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia80_utt1.wav


 29%|██▊       | 745/2610 [00:30<00:36, 51.19it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia83_utt3.wav


 31%|███▏      | 820/2610 [00:32<00:33, 54.03it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia95_utt2.wav
Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia95_utt6.wav
Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia95_utt12.wav


 32%|███▏      | 843/2610 [00:33<00:29, 59.33it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia96_utt6.wav


 40%|████      | 1057/2610 [00:37<00:26, 59.01it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia116_utt4.wav


 43%|████▎     | 1121/2610 [00:39<00:29, 50.50it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia123_utt7.wav


 44%|████▎     | 1139/2610 [00:39<00:29, 49.24it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia124_utt8.wav


 45%|████▍     | 1170/2610 [00:39<00:23, 61.94it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia125_utt21.wav


 45%|████▌     | 1178/2610 [00:40<00:21, 66.55it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia126_utt15.wav


 48%|████▊     | 1260/2610 [00:42<00:26, 50.99it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia132_utt17.wav


 50%|█████     | 1311/2610 [00:43<00:20, 62.55it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia139_utt5.wav


 53%|█████▎    | 1386/2610 [00:44<00:19, 63.51it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia147_utt3.wav


 54%|█████▎    | 1401/2610 [00:44<00:19, 60.70it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia151_utt3.wav


 55%|█████▌    | 1445/2610 [00:45<00:24, 48.54it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia155_utt4.wav


 57%|█████▋    | 1499/2610 [00:46<00:19, 56.09it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia160_utt11.wav


 63%|██████▎   | 1639/2610 [00:49<00:16, 59.77it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia173_utt2.wav
Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia173_utt8.wav


 64%|██████▍   | 1683/2610 [00:50<00:15, 58.11it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia175_utt9.wav


 66%|██████▌   | 1715/2610 [00:50<00:14, 60.83it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia179_utt3.wav


 67%|██████▋   | 1738/2610 [00:51<00:13, 64.86it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia182_utt6.wav


 87%|████████▋ | 2266/2610 [01:03<00:05, 58.51it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia241_utt14.wav
Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia242_utt1.wav


 88%|████████▊ | 2307/2610 [01:04<00:08, 37.06it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia246_utt5.wav


 90%|█████████ | 2350/2610 [01:06<00:07, 37.14it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia252_utt2.wav


 95%|█████████▌| 2481/2610 [01:08<00:01, 65.49it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia262_utt13.wav
Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia263_utt3.wav


 97%|█████████▋| 2530/2610 [01:09<00:01, 47.57it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia268_utt5.wav


 99%|█████████▉| 2587/2610 [01:11<00:00, 51.26it/s]

Skip (short/silent): /content/MELD.Raw/output_repeated_splits_test/dia278_utt1.wav


100%|██████████| 2610/2610 [01:11<00:00, 36.51it/s]

MELD Acoustic Features       Sr No.                                          Utterance Speaker  \
0          1  Why do all youre coffee mugs have numbers on ...    Mark   
1          2  Oh. Thats so Monica can keep track. That way ...  Rachel   
2          3                                       Y'know what?  Rachel   
3         19                     Come on, Lydia, you can do it.    Joey   
4         20                                              Push!    Joey   
...      ...                                                ...     ...   
2605    2760  Yeah, I mean, come on Ross, no one will even n...  Rachel   
2606    2761                      They’re not listening too me?    Ross   
2607    2762  Of course they’re listening to you! Everybody ...  Rachel   
2608    2763  Monica you really think I should try this phas...    Ross   
2609    2764                             I think you look fine.  Monica   

       Emotion Sentiment  Dialogue_ID  Utterance_ID  Season  Episode  \
0   




In [None]:
test_df

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,audio_path,prev_utt,next_utt,prev_speaker,next_speaker,pitch,loudness,speaking_rate
0,1,Why do all youre coffee mugs have numbers on ...,Mark,surprise,positive,0,0,3,19,"00:14:38,127","00:14:40,378",/content/MELD.Raw/output_repeated_splits_test/...,,Oh. Thats so Monica can keep track. That way ...,,Rachel,168.779260,-33.651126,7.296647
1,2,Oh. Thats so Monica can keep track. That way ...,Rachel,anger,negative,0,1,3,19,"00:14:40,629","00:14:47,385",/content/MELD.Raw/output_repeated_splits_test/...,Why do all youre coffee mugs have numbers on ...,Y'know what?,Mark,Rachel,265.576333,-31.897259,4.879717
2,3,Y'know what?,Rachel,neutral,neutral,0,2,3,19,"00:14:56,353","00:14:57,520",/content/MELD.Raw/output_repeated_splits_test/...,Oh. Thats so Monica can keep track. That way ...,,Rachel,,228.093423,-38.097702,2.556864
3,19,"Come on, Lydia, you can do it.",Joey,neutral,neutral,1,0,1,23,"0:10:44,769","0:10:46,146",/content/MELD.Raw/output_repeated_splits_test/...,,Push!,,Joey,254.074486,-32.399092,5.236092
4,20,Push!,Joey,joy,positive,1,1,1,23,"0:10:46,146","0:10:46,833",/content/MELD.Raw/output_repeated_splits_test/...,"Come on, Lydia, you can do it.","Push 'em out, push 'em out, harder, harder.",Joey,Joey,360.002782,-30.861389,2.197199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2605,2760,"Yeah, I mean, come on Ross, no one will even n...",Rachel,neutral,neutral,279,11,6,4,"00:14:35,457","00:14:40,211",/content/MELD.Raw/output_repeated_splits_test/...,"Yeah, fade the accent out and people will thin...",They’re not listening too me?,Monica,Ross,325.463833,-29.843027,5.675473
2606,2761,They’re not listening too me?,Ross,surprise,negative,279,12,6,4,"00:14:42,256","00:14:43,840",/content/MELD.Raw/output_repeated_splits_test/...,"Yeah, I mean, come on Ross, no one will even n...",Of course they’re listening to you! Everybody ...,Rachel,Rachel,211.890060,-30.004593,4.687500
2607,2762,Of course they’re listening to you! Everybody ...,Rachel,neutral,neutral,279,13,6,4,"00:14:44,008","00:14:48,511",/content/MELD.Raw/output_repeated_splits_test/...,They’re not listening too me?,Monica you really think I should try this phas...,Ross,Ross,229.837320,-30.826437,3.316612
2608,2763,Monica you really think I should try this phas...,Ross,neutral,neutral,279,14,6,4,"00:14:48,138","00:14:52,390",/content/MELD.Raw/output_repeated_splits_test/...,Of course they’re listening to you! Everybody ...,I think you look fine.,Rachel,Monica,183.109861,-30.422399,3.867169


In [None]:
test_df.to_csv("MELD_test_features.csv", index=False)

In [None]:
# Filter the DataFrame to include only pitch, loudness, and speaking_rate
selected_features = ["pitch", "loudness", "speaking_rate"]

# Compute 25th and 75th percentiles for selected features
thresholds_selected = {}
for feature in selected_features:
    values = test_df[feature].dropna()
    low = values.quantile(0.33)
    print("hi")
    print(low)
    high = values.quantile(0.66)
    thresholds_selected[feature] = (low, high)

thresholds_selected_df = pd.DataFrame(thresholds_selected, index=["low_threshold", "high_threshold"]).T
thresholds_selected_df

hi
194.49673866596618
hi
-31.504471604658907
hi
2.8699551569506725


Unnamed: 0,low_threshold,high_threshold
pitch,194.496739,267.333902
loudness,-31.504472,-28.763915
speaking_rate,2.869955,4.738902


### LLama Model

In [None]:
!huggingface-cli login --token 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 #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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `LLM-token` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `LLM-token`


In [None]:
!huggingface-cli whoami

[1muser: [0m nargesgholami


### Fine-tunining (LoRA)

In [None]:
!pip install -q transformers datasets accelerate peft


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m89.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install --upgrade bitsandbytes accelerate transformers peft


In [None]:
import os
os.kill(os.getpid(), 9)  # 🔁 Force restart kernel to clear bitsandbytes errors


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from datasets import load_dataset, Dataset
import torch
import json
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

# Load dataset
with open("/content/meld_train_4000_lora.jsonl", "r") as f:
    lines = [json.loads(l) for l in f]
dataset = Dataset.from_list(lines)

# 🧠 Enable 4-bit quantization config
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "mistralai/Mistral-7B-Instruct-v0.3"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize
def tokenize(example):
    prompt = example["prompt"] + " " + example["response"]
    return tokenizer(prompt, truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=False)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True  # ⬅️ important!
)

PackageNotFoundError: No package metadata was found for bitsandbytes

In [None]:
# 📌 Apply LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

# Training args
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_steps=100,
    logging_steps=50,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    fp16=True,
    report_to="none"
)

# Data collator for causal LM
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ✅ Train
trainer.train()

# ✅ Save
model.save_pretrained("/content/finetuned_llama3_lora")
tokenizer.save_pretrained("/content/finetuned_llama3_lora")


NameError: name 'LoraConfig' is not defined

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Set your repo path
repo_path = "nargesgholami/SED"

# Save and push model
model.push_to_hub(repo_path)
tokenizer.push_to_hub(repo_path)


README.md: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/nargesgholami/SED/commit/03de319cfe7a35264dd00dd1da2b007beec8579e', commit_message='Upload tokenizer', commit_description='', oid='03de319cfe7a35264dd00dd1da2b007beec8579e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nargesgholami/SED', endpoint='https://huggingface.co', repo_type='model', repo_id='nargesgholami/SED'), pr_revision=None, pr_num=None)

### Prepare Prompt

In [None]:
import numpy as np
import librosa

def categorize(value, low, high):
    if np.isnan(value):
        return "unknown"
    elif value < low:
        return "low"
    elif value < high:
        return "medium"
    else:
        return "high"

def describe_feature(pitch, loudness, speaking_rate):
    levels = {}

    # Your learned thresholds
    thresholds = {
        "pitch": (194, 267),
        "loudness": (-31.504472, -28.763915),
        "speaking_rate": (2.869955, 4.738902),
    }

    # Categorize each
    levels["pitch"] = categorize(pitch, *thresholds["pitch"])
    levels["loudness"] = categorize(loudness, *thresholds["loudness"])
    levels["speaking_rate"] = categorize(speaking_rate, *thresholds["speaking_rate"])

    # Natural-language phrases
    pitch_phrase = {
        "low": "a low pitch",
        "medium": "a medium pitch",
        "high": "a high pitch",
        "unknown": "an unknown pitch"
    }[levels["pitch"]]

    loudness_phrase = {
        "low": "a low volume",
        "medium": "a moderate tone",
        "high": "a loud tone",
        "unknown": "an unknown tone"
    }[levels["loudness"]]

    rate_phrase = {
        "low": "a slow pace",
        "medium": "a medium pace",
        "high": "a fast pace",
        "unknown": "an unknown pace"
    }[levels["speaking_rate"]]

    return f"The speaker speaks with {pitch_phrase}, {loudness_phrase}, and {rate_phrase}."


# Step 6: Create prompt with SVFS-lite and context
def create_prompt_svfs_context(row):
    context = ""
    if pd.notna(row.get("next_utt")):
        context += f'{row["prev_speaker"]}: "{row["prev_utt"]}"\n'
    context += f'Current speaker: "{row["Speaker"]}"\n'
    context += f'Utterance: "{row["Utterance"]}"\n'
    if pd.notna(row.get("next_utt")):
        context += f'{row["next_speaker"]}: "{row["next_utt"]}"\n'

    return f"""
You are an expert in detecting emotions from speech. Prioritize vocal cues such as pitch, loudness, and speaking rate, especially when the text is ambiguous.

Conversation Context:
{context.strip()}

Speech Features:
{describe_feature(row['pitch'], row['loudness'], row['speaking_rate'])}

Note that fast speaking rate or raised pitch may indicate excitement or urgency, not necessarily anger.
Always choose a **single dominant** emotion from the list. Do not list multiple emotions.
["joy", "sadness", "anger", "fear", "surprise", "disgust", "neutral"]

Emotion:
""".strip()


In [None]:
# Step 8: Define prediction

def get_llm_prediction(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=10, pad_token_id=tokenizer.eos_token_id)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # print("💕💕", decoded)
    return decoded.split("Emotion:")[-1].strip().split("\n")[0]



In [None]:
import pandas as pd
from tqdm import tqdm
test_df = pd.read_csv("MELD_test_features.csv")

In [None]:
# Step 9: Evaluate and save

def evaluate_and_save(model, tokenizer, df_chunks, base_name="meld_predictions"):
    for i, chunk in enumerate(df_chunks):
        predictions, truths, transcripts, audio_paths = [], [], [], []
        print(f"Processing chunk {i+1}/{len(df_chunks)}...")
        for _, row in tqdm(chunk.iterrows(), total=len(chunk)):
            prompt = create_prompt_svfs_context(row)
            pred = get_llm_prediction(model, tokenizer, prompt)
            print("\n pred is:", pred)
            print("Truth is:", row['Emotion'].lower())
            predictions.append(pred.lower())
            truths.append(row['Emotion'].lower())
            transcripts.append(row['Utterance'])
            audio_paths.append(row['audio_path'])

        pd.DataFrame({
            "transcript": transcripts,
            "audio_path": audio_paths,
            "true_emotion": truths,
            "predicted_emotion": predictions
        }).to_csv(f"{base_name}_chunk{i+1}.csv", index=False)

# Step 10: Run on 1 chunk for test
chunks = [test_df.iloc[:100]]  # replace 100 with len(test_df) // N later

evaluate_and_save(model, tokenizer, chunks)

Processing chunk 1/1...


  1%|          | 1/100 [00:01<03:16,  1.99s/it]


 pred is: neutral
Truth is: surprise


  2%|▏         | 2/100 [00:04<03:22,  2.06s/it]


 pred is: neutral
Truth is: anger


  3%|▎         | 3/100 [00:05<02:59,  1.85s/it]


 pred is: neutral
Truth is: neutral


  4%|▍         | 4/100 [00:07<02:44,  1.72s/it]


 pred is: joy
Truth is: neutral


  5%|▌         | 5/100 [00:08<02:43,  1.72s/it]


 pred is: joy
Truth is: joy


  6%|▌         | 6/100 [00:10<02:29,  1.59s/it]


 pred is: joy
Truth is: joy


  7%|▋         | 7/100 [00:11<02:20,  1.51s/it]


 pred is: joy
Truth is: joy


  8%|▊         | 8/100 [00:12<02:13,  1.45s/it]


 pred is: joy
Truth is: joy


  9%|▉         | 9/100 [00:14<02:09,  1.42s/it]


 pred is: neutral
Truth is: joy


 10%|█         | 10/100 [00:15<02:05,  1.40s/it]


 pred is: joy
Truth is: joy


 11%|█         | 11/100 [00:17<02:04,  1.40s/it]


 pred is: joy
Truth is: joy


 12%|█▏        | 12/100 [00:18<02:02,  1.40s/it]


 pred is: neutral
Truth is: neutral


 13%|█▎        | 13/100 [00:20<02:10,  1.50s/it]


 pred is: neutral
Truth is: neutral


 14%|█▍        | 14/100 [00:21<02:15,  1.57s/it]


 pred is: neutral
Truth is: sadness


 15%|█▌        | 15/100 [00:23<02:09,  1.52s/it]


 pred is: anger
Truth is: surprise


 16%|█▌        | 16/100 [00:24<02:03,  1.47s/it]


 pred is: anger
Truth is: anger


 17%|█▋        | 17/100 [00:26<02:01,  1.47s/it]


 pred is: joy
Truth is: anger


 18%|█▊        | 18/100 [00:27<01:58,  1.45s/it]


 pred is: joy
Truth is: anger


 19%|█▉        | 19/100 [00:28<01:56,  1.44s/it]


 pred is: joy
Truth is: joy


 20%|██        | 20/100 [00:30<01:54,  1.43s/it]


 pred is: joy
Truth is: joy


 21%|██        | 21/100 [00:32<01:58,  1.50s/it]


 pred is: neutral
Truth is: neutral


 22%|██▏       | 22/100 [00:33<02:00,  1.54s/it]


 pred is: neutral
Truth is: neutral


 23%|██▎       | 23/100 [00:35<01:54,  1.49s/it]


 pred is: neutral
Truth is: neutral


 24%|██▍       | 24/100 [00:36<01:49,  1.44s/it]


 pred is: neutral
Truth is: fear


 25%|██▌       | 25/100 [00:37<01:47,  1.44s/it]


 pred is: neutral
Truth is: neutral


 26%|██▌       | 26/100 [00:39<01:46,  1.43s/it]


 pred is: neutral
Truth is: neutral


 27%|██▋       | 27/100 [00:40<01:43,  1.42s/it]


 pred is: neutral
Truth is: anger


 28%|██▊       | 28/100 [00:42<01:42,  1.42s/it]


 pred is: disgust
Truth is: disgust


 29%|██▉       | 29/100 [00:43<01:41,  1.43s/it]


 pred is: joy
Truth is: neutral


 30%|███       | 30/100 [00:45<01:46,  1.52s/it]


 pred is: surprise
Truth is: surprise


 31%|███       | 31/100 [00:46<01:41,  1.47s/it]


 pred is: surprise
Truth is: neutral


 32%|███▏      | 32/100 [00:47<01:37,  1.43s/it]


 pred is: joy
Truth is: surprise


 33%|███▎      | 33/100 [00:49<01:35,  1.42s/it]


 pred is: surprise
Truth is: surprise


 34%|███▍      | 34/100 [00:50<01:33,  1.41s/it]


 pred is: surprise
Truth is: neutral


 35%|███▌      | 35/100 [00:52<01:32,  1.42s/it]


 pred is: fear
Truth is: fear


 36%|███▌      | 36/100 [00:53<01:30,  1.42s/it]


 pred is: anger
Truth is: disgust


 37%|███▋      | 37/100 [00:54<01:28,  1.41s/it]


 pred is: anger
Truth is: anger


 38%|███▊      | 38/100 [00:56<01:32,  1.49s/it]


 pred is: joy
Truth is: neutral


 39%|███▉      | 39/100 [00:58<01:30,  1.48s/it]


 pred is: anger
Truth is: anger


 40%|████      | 40/100 [00:59<01:26,  1.45s/it]


 pred is: neutral
Truth is: neutral


 41%|████      | 41/100 [01:00<01:23,  1.42s/it]


 pred is: neutral
Truth is: neutral


 42%|████▏     | 42/100 [01:02<01:20,  1.39s/it]


 pred is: neutral
Truth is: neutral


 43%|████▎     | 43/100 [01:03<01:18,  1.38s/it]


 pred is: neutral
Truth is: neutral


 44%|████▍     | 44/100 [01:04<01:17,  1.38s/it]


 pred is: neutral
Truth is: neutral


 45%|████▌     | 45/100 [01:06<01:14,  1.36s/it]


 pred is: neutral
Truth is: neutral


 46%|████▌     | 46/100 [01:07<01:15,  1.40s/it]


 pred is: neutral
Truth is: neutral


 47%|████▋     | 47/100 [01:09<01:19,  1.50s/it]


 pred is: joy
Truth is: neutral


 48%|████▊     | 48/100 [01:10<01:15,  1.45s/it]


 pred is: neutral
Truth is: neutral


 49%|████▉     | 49/100 [01:12<01:12,  1.42s/it]


 pred is: neutral
Truth is: neutral


 50%|█████     | 50/100 [01:13<01:09,  1.39s/it]


 pred is: surprise
Truth is: surprise


 51%|█████     | 51/100 [01:14<01:07,  1.38s/it]


 pred is: neutral
Truth is: neutral


 52%|█████▏    | 52/100 [01:16<01:06,  1.38s/it]


 pred is: neutral
Truth is: neutral


 53%|█████▎    | 53/100 [01:17<01:04,  1.38s/it]


 pred is: anger
Truth is: neutral


 54%|█████▍    | 54/100 [01:18<01:03,  1.38s/it]


 pred is: neutral
Truth is: neutral


 55%|█████▌    | 55/100 [01:20<01:06,  1.47s/it]


 pred is: neutral
Truth is: surprise


 56%|█████▌    | 56/100 [01:22<01:06,  1.51s/it]


 pred is: neutral
Truth is: neutral


 57%|█████▋    | 57/100 [01:23<01:02,  1.46s/it]


 pred is: neutral
Truth is: surprise


 58%|█████▊    | 58/100 [01:24<01:00,  1.44s/it]


 pred is: joy
Truth is: neutral


 59%|█████▉    | 59/100 [01:26<00:58,  1.43s/it]


 pred is: neutral
Truth is: neutral


 60%|██████    | 60/100 [01:27<00:55,  1.39s/it]


 pred is: joy
Truth is: sadness


 61%|██████    | 61/100 [01:28<00:54,  1.39s/it]


 pred is: joy
Truth is: joy


 62%|██████▏   | 62/100 [01:30<00:52,  1.38s/it]


 pred is: sadness
Truth is: neutral


 63%|██████▎   | 63/100 [01:31<00:52,  1.41s/it]


 pred is: neutral
Truth is: surprise


 64%|██████▍   | 64/100 [01:33<00:54,  1.51s/it]


 pred is: neutral
Truth is: neutral


 65%|██████▌   | 65/100 [01:34<00:51,  1.46s/it]


 pred is: neutral
Truth is: neutral


 66%|██████▌   | 66/100 [01:36<00:48,  1.42s/it]


 pred is: neutral
Truth is: joy


 67%|██████▋   | 67/100 [01:37<00:46,  1.41s/it]


 pred is: joy
Truth is: fear


 68%|██████▊   | 68/100 [01:39<00:45,  1.41s/it]


 pred is: neutral
Truth is: neutral


 69%|██████▉   | 69/100 [01:40<00:43,  1.40s/it]


 pred is: neutral
Truth is: sadness


 70%|███████   | 70/100 [01:41<00:41,  1.39s/it]


 pred is: surprise
Truth is: surprise


 71%|███████   | 71/100 [01:43<00:39,  1.38s/it]


 pred is: surprise
Truth is: surprise


 72%|███████▏  | 72/100 [01:44<00:40,  1.45s/it]


 pred is: neutral
Truth is: neutral


 73%|███████▎  | 73/100 [01:46<00:40,  1.49s/it]


 pred is: surprise
Truth is: surprise


 74%|███████▍  | 74/100 [01:47<00:37,  1.46s/it]


 pred is: joy
Truth is: anger


 75%|███████▌  | 75/100 [01:49<00:35,  1.44s/it]


 pred is: neutral
Truth is: sadness


 76%|███████▌  | 76/100 [01:50<00:34,  1.42s/it]


 pred is: disgust
Truth is: sadness


 77%|███████▋  | 77/100 [01:51<00:32,  1.42s/it]


 pred is: sadness
Truth is: neutral


 78%|███████▊  | 78/100 [01:53<00:31,  1.41s/it]


 pred is: sadness
Truth is: sadness


 79%|███████▉  | 79/100 [01:54<00:29,  1.41s/it]


 pred is: neutral
Truth is: neutral


 80%|████████  | 80/100 [01:56<00:28,  1.43s/it]


 pred is: anger
Truth is: anger


 81%|████████  | 81/100 [01:57<00:28,  1.52s/it]


 pred is: neutral
Truth is: neutral


 82%|████████▏ | 82/100 [01:59<00:26,  1.47s/it]


 pred is: anger
Truth is: joy


 83%|████████▎ | 83/100 [02:00<00:24,  1.45s/it]


 pred is: neutral
Truth is: neutral


 84%|████████▍ | 84/100 [02:02<00:22,  1.43s/it]


 pred is: neutral
Truth is: neutral


 85%|████████▌ | 85/100 [02:03<00:21,  1.41s/it]


 pred is: fear
Truth is: anger


 86%|████████▌ | 86/100 [02:04<00:19,  1.41s/it]


 pred is: neutral
Truth is: neutral


 87%|████████▋ | 87/100 [02:06<00:18,  1.40s/it]


 pred is: neutral
Truth is: neutral


 88%|████████▊ | 88/100 [02:07<00:16,  1.40s/it]


 pred is: anger
Truth is: neutral


 89%|████████▉ | 89/100 [02:09<00:16,  1.48s/it]


 pred is: joy
Truth is: sadness


 90%|█████████ | 90/100 [02:10<00:14,  1.49s/it]


 pred is: fear
Truth is: fear


 91%|█████████ | 91/100 [02:12<00:13,  1.47s/it]


 pred is: neutral
Truth is: neutral


 92%|█████████▏| 92/100 [02:13<00:11,  1.44s/it]


 pred is: neutral
Truth is: neutral


 93%|█████████▎| 93/100 [02:14<00:09,  1.42s/it]


 pred is: neutral
Truth is: neutral


 94%|█████████▍| 94/100 [02:16<00:08,  1.41s/it]


 pred is: anger
Truth is: anger


 95%|█████████▌| 95/100 [02:17<00:07,  1.41s/it]


 pred is: neutral
Truth is: neutral


 96%|█████████▌| 96/100 [02:19<00:05,  1.42s/it]


 pred is: anger
Truth is: sadness


 97%|█████████▋| 97/100 [02:20<00:04,  1.48s/it]


 pred is: anger
Truth is: anger


 98%|█████████▊| 98/100 [02:22<00:03,  1.54s/it]


 pred is: anger
Truth is: anger


 99%|█████████▉| 99/100 [02:23<00:01,  1.49s/it]


 pred is: neutral
Truth is: anger


100%|██████████| 100/100 [02:25<00:00,  1.45s/it]


 pred is: joy
Truth is: sadness





In [None]:
import pandas as pd
import re
from sklearn.metrics import accuracy_score, classification_report

# Load predictions
df = pd.read_csv("meld_predictions_chunk1.csv")

# Drop rows where either true or predicted emotion is missing
df = df.dropna(subset=["true_emotion", "predicted_emotion"])

# Clean and normalize true labels
df["true_emotion"] = df["true_emotion"].astype(str).str.lower().str.strip()

# Function to extract the first valid emotion from predicted_emotion
def extract_first_emotion(s):
    s = str(s).lower().strip()
    # Remove enclosing brackets or parentheses
    s = re.sub(r'^[\[\(\"\']+|[\]\)\"\']+$', '', s)
    # Split on comma or space and return the first token that looks like a word
    parts = re.split(r'[,\s]+', s)
    for p in parts:
        if p in ['joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust', 'neutral']:
            return p
    return s  # fallback (may be empty or malformed)

# Normalize predicted_emotion
df["predicted_emotion"] = df["predicted_emotion"].apply(extract_first_emotion)

# Extract cleaned values
y_true = df["true_emotion"]
y_pred = df["predicted_emotion"]

# Compute accuracy
acc = accuracy_score(y_true, y_pred)
print(f"Accuracy: {acc:.4f}")

Accuracy: 0.6326


In [None]:
# Show precision, recall, F1-score
labels = ["joy", "sadness", "anger", "fear", "surprise", "disgust", "neutral"]
print(classification_report(y_true, y_pred, labels=labels))


              precision    recall  f1-score   support

         joy       0.52      0.69      0.59       402
     sadness       0.69      0.20      0.31       208
       anger       0.46      0.62      0.53       345
        fear       0.43      0.26      0.33        50
    surprise       0.69      0.40      0.50       281
     disgust       0.54      0.21      0.30        68
     neutral       0.74      0.78      0.76      1256

    accuracy                           0.63      2610
   macro avg       0.58      0.45      0.47      2610
weighted avg       0.65      0.63      0.62      2610



# ShEMO

In [None]:
!huggingface-cli login --token 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 #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

!huggingface-cli whoami

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `LLM-token` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `LLM-token`
nargesgholami


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

repo_path = "nargesgholami/SED"

base_model = AutoModelForCausalLM.from_pretrained(
    repo_path,

    torch_dtype="auto"
)

model_base = PeftModel.from_pretrained(base_model, repo_path)

tokenizer_base = AutoTokenizer.from_pretrained(repo_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_config.json:   0%|          | 0.00/866 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [None]:
!pip install pyloudnorm
!pip install praat-parselmouth

Collecting pyloudnorm
  Downloading pyloudnorm-0.1.1-py3-none-any.whl.metadata (5.6 kB)
Downloading pyloudnorm-0.1.1-py3-none-any.whl (9.6 kB)
Installing collected packages: pyloudnorm
Successfully installed pyloudnorm-0.1.1
Collecting praat-parselmouth
  Downloading praat_parselmouth-0.4.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.9 kB)
Downloading praat_parselmouth-0.4.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: praat-parselmouth
Successfully installed praat-parselmouth-0.4.6


In [None]:
import librosa
import pyloudnorm as pyln
import parselmouth

def describe_feature(path):
    y, sr = librosa.load(path, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)
    meter = pyln.Meter(sr, block_size=0.2)
    loudness = meter.integrated_loudness(y)

    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    pitch_values = pitches[magnitudes > np.median(magnitudes)]
    mean_pitch = np.mean(pitch_values)
    pitch_std = np.std(pitch_values)  # → pitch instability

    snd = parselmouth.Sound(path)
    point_process = parselmouth.praat.call(snd, "To PointProcess (periodic, cc)", 75, 500)
    jitter = parselmouth.praat.call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
    shimmer = parselmouth.praat.call([snd, point_process], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr).mean()

    num_words = len(transcribe.split())
    num_syllables = num_words * 1.5
    speaking_rate = num_syllables / duration
    features = {
    "Pitch": [mean_pitch, [100, 200], ["low", "medium", "high"]],
    "Pitch Stability": [pitch_std, [10, 30], ["stable", "moderately variable", "unstable"]],
    "Loudness": [loudness, [-35, -20], ["soft", "normal", "loud"]],
    "Jitter": [jitter, [0.005, 0.01], ["stable", "slightly trembling", "highly unstable"]],
    "Shimmer": [shimmer, [0.03, 0.05], ["smooth", "mildly rough", "very rough"]],
    "Timbre": [centroid, [1500, 3000], ["dark", "balanced", "bright"]],
    "Speaking Rate": [speaking_rate, [4, 7], ["slow", "normal", "fast"]]
    }

    result = ''
    for name, info in features.items():
        value, thresholds, descriptions = info
        if value < thresholds[0]:
            desc = descriptions[0]
        elif value < thresholds[1]:
            desc = descriptions[1]
        else:
            desc = descriptions[2]
        result += f"- {name}: {desc}\n"
    return result

In [None]:
prompt_template_base = """
You are an expert emotion detector. Your task is to identify the dominant emotional tone of a given text.
Use both the meaning of the words and the vocal characteristics (volume, pitch, speed).

Only reply with the name of the dominant emotion. Do not add any explanation or example.

Choose from:
["joy", "sadness", "anger", "fear", "surprise", "disgust", "neutral"]

Example 1:
Transcript:
"I'm so excited to see you after all these years!"

Extracted audio features:
- Pitch: high
- Pitch Stability: moderately variable
- Loudness: loud
- Jitter: slightly trembling
- Shimmer: mildly rough
- Timbre: bright
- Speaking Rate: fast

Emotion: joy

Example 2:
Transcript:
"I don't know... I just feel like nothing matters anymore."

Extracted audio features:
- Pitch: low
- Pitch Stability: stable
- Loudness: soft
- Jitter: stable
- Shimmer: smooth
- Timbre: dark
- Speaking Rate: slow

Emotion: sadness

Now, analyze the following:

"{Transcript}"

Extracted audio features:
{Features}
Emotion:
""".strip()

def create_prompt_base(text, sound_path):
    return prompt_template_base.format(Transcript=text, Features=describe_feature(sound_path))

def extract_last_emotion(prompt: str) -> str:
    lines = prompt.strip().splitlines()
    emotion_lines = [line for line in lines if line.strip().startswith("Emotion:")]

    if not emotion_lines:
        return "No emotion found"

    last_emotion = emotion_lines[-1].strip().replace("Emotion:", "").strip()
    return last_emotion


In [None]:
# ======================
# 🧠 DEFINE EMOTION MAP
# ======================
emotion_map = {
    'A': 'anger',
    'F': 'fear',
    'H': 'happiness',
    'N': 'neutral',
    'S': 'sadness',
    'W': 'surprise'
}

In [None]:
import os
import zipfile
import requests

def download_and_extract(url, base_output_dir, gender):
    filename = url.split('/')[-1]
    zip_path = os.path.join(base_output_dir, filename)
    gender_dir = os.path.join(base_output_dir, gender)

    os.makedirs(gender_dir, exist_ok=True)

    # Download the file
    print(f"Downloading {filename}...")
    response = requests.get(url)
    with open(zip_path, 'wb') as f:
        f.write(response.content)

    # Extract into gender-specific folder
    print(f"Extracting {filename} into {gender_dir}...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(gender_dir)

    os.remove(zip_path)
    print(f"{filename} done!")

# Define base directory
base_output_dir = './shemo_dataset'
os.makedirs(base_output_dir, exist_ok=True)

# Download and extract into separate folders
download_and_extract('https://github.com/mansourehk/ShEMO/raw/master/female.zip', base_output_dir, 'female')
download_and_extract('https://github.com/mansourehk/ShEMO/raw/master/male.zip', base_output_dir, 'male')
download_and_extract('https://github.com/mansourehk/ShEMO/raw/master/transcript.zip', base_output_dir, 'transcript')


Downloading female.zip...
Extracting female.zip into ./shemo_dataset/female...
female.zip done!
Downloading male.zip...
Extracting male.zip into ./shemo_dataset/male...
male.zip done!
Downloading transcript.zip...
Extracting transcript.zip into ./shemo_dataset/transcript...
transcript.zip done!


In [None]:
import json

with open('translated_text_a_path.json', 'r') as file:
    translated_text = json.load(file)



In [None]:
import os
import pandas as pd

# ======================
# 📂 LOAD DATASET with TRANSCRIPTS
# ======================
root_dir = 'shemo_dataset'
transcript_dir = os.path.join(root_dir, 'transcript', 'final text')

paths = []
labels = []
transcripts = []

for gender in ['male', 'female']:
    folder = os.path.join(root_dir, gender)
    for file in os.listdir(folder):
        if file.endswith('.wav'):
            base_name = file.replace('.wav', '')  # e.g., F01A01
            emotion_code = file[3]  # extract emotion character
            emotion = emotion_map.get(emotion_code)
            if emotion:
                audio_path = os.path.join(folder, file)

                # Match the transcript file
                transcript_file = os.path.join(transcript_dir, f"{base_name}.ort")
                if os.path.exists(transcript_file):
                    with open(transcript_file, 'r', encoding='utf-8') as f:
                        transcript = f.read().strip()
                else:
                    transcript = None  # fallback if not found

                paths.append(audio_path)
                labels.append(emotion)
                transcripts.append(transcript)

# Create the DataFrame
df = pd.DataFrame({
    'audio_path': paths,
    'label': labels,
    'transcript': transcripts
})

# Optional: show counts and sample rows
print(df['label'].value_counts())
print(df.head())


label
anger        1059
neutral      1028
sadness       449
surprise      225
happiness     201
fear           38
Name: count, dtype: int64
                      audio_path    label  \
0  shemo_dataset/male/M27N56.wav  neutral   
1  shemo_dataset/male/M04N01.wav  neutral   
2  shemo_dataset/male/M32S03.wav  sadness   
3  shemo_dataset/male/M28A36.wav    anger   
4  shemo_dataset/male/M12A22.wav    anger   

                                          transcript  
0          یه نفر شیاد و کلاه‎بردار به اسم بیل لارنر  
1                               حالا دیگه اونجا نیست  
2  این خیانت رو مادمازل مارتن به من کرد و اِلا من...  
3  من دوک ژاندراگون هستم، اگر دارهای شما برای سر ...  
4         خب پس برای چی اومدی اینجا؟ که بهم لطف کنی؟  


In [None]:
import pandas as pd
import json

df_json = pd.DataFrame(translated_text)

merged_df = pd.merge(df, df_json, on='audio_path', how='outer')

In [None]:
# ======================
# 🧾 MAP LABELS TO INTEGERS
# ======================
label_map = {label: idx for idx, label in enumerate(merged_df['label'].unique())}
inverse_label_map = {v: k for k, v in label_map.items()}
merged_df['label_cont'] = merged_df['label']
merged_df['label'] = merged_df['label'].map(label_map)


In [None]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset
import librosa
import pandas as pd

class SpeechEmotionDataset(Dataset):
    def __init__(self, df, processor, max_length=48000):
        self.df = df.reset_index(drop=True)
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        path = self.df.iloc[index]['audio_path']
        label = int(self.df.iloc[index]['label'])

        speech, _ = librosa.load(path, sr=16000)

        # Pad or truncate
        if len(speech) > self.max_length:
            speech = speech[:self.max_length]
        else:
            speech = np.pad(speech, (0, self.max_length - len(speech)))

        inputs = self.processor(
            speech,
            return_tensors='pt',
            sampling_rate=16000,
            padding=True,
            truncation=True,
            max_length=self.max_length
        )
        input_values = inputs.input_values.squeeze()

        return {
            'input_values': input_values,
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Example usage
#train_dataset = SpeechEmotionDataset(df, processor)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
def accuracy(dataset_path, labels, transcribe):
  preds = []
  for i in tqdm(range(0,len(dataset_path))):
    path = dataset_path[i]
    trs = transcribe[i]
    try:
      prompt = create_prompt_base(trs, path)
      #print("******",prompt)
      inputs = tokenizer_base(prompt, return_tensors="pt")
      output = model_base.generate(**inputs, max_new_tokens=3, do_sample=False, pad_token_id=tokenizer_base.eos_token_id)
      output_tokens = tokenizer_base.decode(output[0], skip_special_tokens=True)
      preds.append(extract_last_emotion(output_tokens))
    except:
      preds.append(None)
  print(labels, "   :", preds)
  print(' ')
  print("accuracy: ", accuracy_score(labels, preds))
  return preds

In [None]:
dataset_path = merged_df['audio_path'].to_list()[20:40]
labels = merged_df['label'].to_list()[20:40]
trans = merged_df['translated_text'].to_list()[20:40]
lab_cont = merged_df['label_cont'].to_list()[20:40]

In [None]:
path = dataset_path[2]
transcribe = trans[2]
prompt = create_prompt_base(transcribe, path)
inputs = tokenizer_base(prompt, return_tensors="pt")
output = model_base.generate(**inputs, max_new_tokens=3, do_sample=False, pad_token_id=tokenizer_base.eos_token_id)
output_tokens = tokenizer_base.decode(output[0], skip_special_tokens=True)
extract_last_emotion(output_tokens)

'neutral'

In [None]:
per = accuracy(dataset_path, labels,trans)

100%|██████████| 20/20 [3:43:55<00:00, 671.76s/it]

[0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]    : ['neutral', 'anger', 'neutral', 'neutral', 'neutral', 'surprise', 'sadness', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'joy', 'neutral', 'neutral', 'neutral', 'surprise', 'neutral', 'neutral']
 
accuracy:  0.0





In [None]:
print(lab_cont)

['anger', 'anger', 'anger', 'fear', 'fear', 'fear', 'happiness', 'happiness', 'happiness', 'happiness', 'happiness', 'happiness', 'happiness', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral']


In [None]:
print(per)

['neutral', 'anger', 'neutral', 'neutral', 'neutral', 'surprise', 'sadness', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'joy', 'neutral', 'neutral', 'neutral', 'surprise', 'neutral', 'neutral']


In [None]:
from sklearn.metrics import accuracy_score, classification_report

accuracy_score(lab_cont, per)

0.3

In [None]:
import json

with open('label_20_40.json', 'w') as file:
    json.dump(per, file)


In [None]:
dataset_path = df['audio_path'].to_list()[60:80]
labels = df['label'].to_list()[60:80]
trans = df['transcript'].to_list()[60:80]
lab_cont = df['label_cont'].to_list()[60:80]

In [None]:
path = dataset_path[2]
transcribe = trans[2]
prompt = create_prompt_base(transcribe, path)
inputs = tokenizer_base(prompt, return_tensors="pt")
output = model_base.generate(**inputs, max_new_tokens=3, do_sample=False, pad_token_id=tokenizer_base.eos_token_id)
output_tokens = tokenizer_base.decode(output[0], skip_special_tokens=True)
extract_last_emotion(output_tokens)

'sadness'

In [None]:
per = accuracy(dataset_path, labels,trans)

100%|██████████| 20/20 [1:50:21<00:00, 331.05s/it]

[2, 1, 0, 4, 0, 1, 0, 0, 2, 0, 4, 2, 2, 4, 0, 4, 4, 0, 0, 2]    : ['neutral', 'neutral', 'surprise', 'neutral', 'neutral', 'neutral', 'neutral', 'anger', 'neutral', 'anger', 'joy', 'fear', 'neutral', 'neutral', 'neutral', 'sadness', 'neutral', 'anger', 'neutral', 'neutral']
 
accuracy:  0.0





In [None]:
print(per)

['neutral', 'neutral', 'surprise', 'neutral', 'neutral', 'neutral', 'neutral', 'anger', 'neutral', 'anger', 'joy', 'fear', 'neutral', 'neutral', 'neutral', 'sadness', 'neutral', 'anger', 'neutral', 'neutral']


In [None]:
print(lab_cont)

['neutral', 'sadness', 'anger', 'happiness', 'anger', 'sadness', 'anger', 'anger', 'neutral', 'anger', 'happiness', 'neutral', 'neutral', 'happiness', 'anger', 'happiness', 'happiness', 'anger', 'anger', 'neutral']


In [None]:
import json

with open('label_60_80.json', 'w') as file:
    json.dump(per, file)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

accuracy_score(lab_cont, per)

0.35