## Features extraction

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torchaudio
import torch
from transformers import (
    Wav2Vec2FeatureExtractor, Wav2Vec2Model,
    AutoTokenizer, AutoModelForSequenceClassification,
    AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline,
    RobertaTokenizer, RobertaModel
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [16]:
from google.colab import drive
drive.mount('/content/drive')

ROOT_AUDIO_DIR = "/content/drive/MyDrive/speech_wav/audio"
TRANSCRIPTS_PATH = "/content/drive/MyDrive/speech_wav/text_sentences.xlsx"
LABELS_PATH = "/content/drive/MyDrive/speech_wav/text_sentences.xlsx"
OUTPUT_FEATURE_FILE = "/content/wav2vec_features.npy"
OUTPUT_NAMES_FILE = "/content/file_names.csv"
MAX_LENGTH = 300
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Audio: Wav2Vec2 model

In [3]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2FeatureExtractor # Import Wav2Vec2FeatureExtractor
import torch

# Load Wav2Vec2
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
wav2vec_model = Wav2Vec2Model.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition").to(DEVICE)
wav2vec_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.28k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (1-4): 4 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=1024, bias=True)
    (dropout)

In [4]:
# Extract Audio Features
def extract_wav2vec_features(audio_path, max_len=300):
    waveform, sr = torchaudio.load(audio_path)
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
    inputs = feature_extractor(waveform.squeeze(), sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        outputs = wav2vec_model(**inputs.to(DEVICE))
    features = outputs.last_hidden_state.squeeze(0).cpu().numpy()
    if features.shape[0] < max_len:
        features = np.pad(features, ((0, max_len - features.shape[0]), (0, 0)), mode='constant')
    else:
        features = features[:max_len]
    return features

### text: Roberta

In [5]:
!pip install transformers
from transformers import RobertaTokenizer, RobertaModel

# Load roberta
bert_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
bert_model = RobertaModel.from_pretrained("roberta-base").to(DEVICE)
bert_model.eval()



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [6]:
# Extract Text Features
def extract_text_features(text):
    inputs = bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = bert_model(**inputs.to(DEVICE))
    return outputs.last_hidden_state[:, 0, :].squeeze(0).cpu().numpy()  # [CLS] token output

## audio + text

In [7]:
# Load Transcripts
transcripts_df = pd.read_excel('/content/drive/MyDrive/speech_wav/text_sentences.xlsx')

# Detect correct column for audio matching
print("Transcript columns:", transcripts_df.columns.tolist())
audio_col = None
for col in transcripts_df.columns:
    if col.lower() in ["audio", "file", "clip", "filename", "clipname"]:
        audio_col = col
        break
if audio_col is None:
    raise ValueError("No suitable column found for matching audio file names in transcript file.")

Transcript columns: ['clipName', 'transcript', 'group', 'word', 'tone', 'indicator', 'type', 'sentence', 'word_count']


In [9]:
# Extract Features
features_audio, features_text, file_names = [], [], []
for folder in os.listdir(ROOT_AUDIO_DIR):
    folder_path = os.path.join(ROOT_AUDIO_DIR, folder)
    if not os.path.isdir(folder_path):
        continue
    for filename in tqdm(os.listdir(folder_path), desc=f"Processing {folder}"):
        if not filename.endswith(".wav"):
            continue
        clip_name = filename[:-4].strip()
        path = os.path.join(folder_path, filename)
        try:
            audio_feat = extract_wav2vec_features(path)
            text_row = transcripts_df[transcripts_df[audio_col] == clip_name]
            if len(text_row) == 0:
                print(f"Transcript not found for: {clip_name}")
                continue
            text_feat = extract_text_features(text_row.iloc[0]["transcript"])
            features_audio.append(audio_feat)
            features_text.append(text_feat)
            file_names.append(clip_name)
        except Exception as e:
            print(f" Failed: {clip_name} — {e}")

Processing IWTGH: 100%|██████████| 18/18 [01:25<00:00,  4.76s/it]
Processing IJRTB: 100%|██████████| 18/18 [01:24<00:00,  4.72s/it]
Processing WWIBA: 100%|██████████| 18/18 [01:23<00:00,  4.65s/it]
Processing TNAML: 100%|██████████| 18/18 [01:25<00:00,  4.74s/it]
Processing DYWDI: 100%|██████████| 18/18 [01:23<00:00,  4.66s/it]
Processing HDYFO: 100%|██████████| 18/18 [01:13<00:00,  4.09s/it]
Processing YSURE: 100%|██████████| 18/18 [01:33<00:00,  5.17s/it]
Processing NTHIN: 100%|██████████| 18/18 [01:06<00:00,  3.71s/it]
Processing WDYSM: 100%|██████████| 18/18 [01:27<00:00,  4.87s/it]
Processing TBMBW: 100%|██████████| 18/18 [01:47<00:00,  5.99s/it]


In [10]:
features_audio = np.array(features_audio)
features_text = np.array(features_text)
pd.DataFrame({"audio": file_names}).to_csv(OUTPUT_NAMES_FILE, index=False)
np.save("/content/features_audio.npy", features_audio)
np.save("/content/features_text.npy", features_text)

print(f"Successfully extracted features for {len(file_names)} samples.")

Successfully extracted features for 180 samples.


In [35]:
# Load and reshape audio features
features_audio = np.load("/content/features_audio.npy")
features_audio_2d = features_audio.reshape(features_audio.shape[0], -1)
feature_audio_df = pd.DataFrame(features_audio_2d)

# Drop columns where more than 95% of the values are zero
threshold = 0.95  # keep columns with >5% non-zero values
non_zero_mask = (feature_audio_df != 0).sum(axis=0) > (len(feature_audio_df) * threshold)
feature_audio_df = feature_audio_df.loc[:, non_zero_mask]
feature_audio_df.head()


# Load and reshape text features
features_text = np.load("/content/features_text.npy")
features_text_2d = features_text.reshape(features_text.shape[0], -1)
feature_text_df = pd.DataFrame(features_text_2d)


# Combine audio + text features
feature_df = pd.concat([feature_text_df, feature_audio_df], axis=1)
print(feature_df.head())

     0         1         2         3         4         5         6       \
0 -0.125237  0.145307  0.008368 -0.102581  0.126698 -0.064270 -0.013294   
1 -0.128443  0.172417  0.005144 -0.103018  0.139028 -0.067578 -0.021665   
2 -0.064666  0.083569 -0.004393 -0.118907  0.084864 -0.094979 -0.023517   
3 -0.125237  0.145307  0.008368 -0.102581  0.126698 -0.064270 -0.013294   
4 -0.125237  0.145307  0.008368 -0.102581  0.126698 -0.064270 -0.013294   

     7         8         9       ...    129014    129015    129016    129017  \
0  0.037022  0.007203 -0.096732  ... -0.288854  0.028038 -0.330415 -0.533572   
1  0.054693  0.014624 -0.098855  ... -0.285811  0.075828 -0.240275 -0.445118   
2  0.046185  0.035754 -0.088263  ... -0.136882 -0.029410  0.102305  0.091745   
3  0.037022  0.007203 -0.096732  ... -0.492177 -0.118839  0.408575 -0.175713   
4  0.037022  0.007203 -0.096732  ...  0.000000  0.000000  0.000000  0.000000   

     129018    129019    129020    129021    129022    129023  
0  0

In [37]:
# Load the audio file names
file_names_df = pd.read_csv(OUTPUT_NAMES_FILE)  # Should contain a column named 'audio'
file_names_df.head()

# load text_df
text_df = pd.read_excel(TRANSCRIPTS_PATH)

# merge all into one final dataframe
full_df = pd.concat([
    file_names_df.reset_index(drop=True),
    text_df.reset_index(drop=True),
    feature_df.reset_index(drop=True)
], axis=1)

full_df.head()

Unnamed: 0,audio,clipName,transcript,group,word,tone,indicator,type,sentence,word_count,...,129014,129015,129016,129017,129018,129019,129020,129021,129022,129023
0,IWTGH_1,IWTGH_1,I want to go home.,1,Neutral,Neutral,,Statement,Single,18,...,-0.288854,0.028038,-0.330415,-0.533572,0.488891,0.347258,-0.784329,0.299556,0.33995,-0.215756
1,IWTGH_7,IWTGH_2,I want to go home.,1,Neutral,Anger,,Statement,Single,18,...,-0.285811,0.075828,-0.240275,-0.445118,0.445552,0.385589,-0.690271,0.05057,0.501328,-0.174793
2,IWTGH_8,IWTGH_3,I want to go home.,1,Neutral,Disgust,,Statement,Single,18,...,-0.136882,-0.02941,0.102305,0.091745,0.150956,-0.948514,-0.257718,0.647594,-0.043676,0.157749
3,IWTGH_4,IWTGH_4,I want to go home.,1,Neutral,Fear,,Statement,Single,18,...,-0.492177,-0.118839,0.408575,-0.175713,-0.247137,-0.099237,0.106934,0.83618,-0.369091,-0.035886
4,IWTGH_2,IWTGH_5,I want to go home.,1,Neutral,Sad,,Statement,Single,18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Save final DataFrame to user's Google Drive
output_path = "/content/drive/MyDrive/speech_wav/feature_wave2vec_roberta.csv"
# Save to CSV
full_df.to_csv(output_path, index=False)

# Confirm path
output_path

'/content/drive/MyDrive/speech_wav/feature_wave2vec_roberta.csv'