In [2]:
import librosa
import numpy as np
import os

In [4]:
from pydub import AudioSegment

def split_mp3(filename, num_segments):
    ''' Evenly split an mp3 file into num_segments segments '''
    os.makedirs(filename[:-4], exist_ok=True)
    audio = AudioSegment.from_mp3(filename)
    duration_ms = len(audio)
    segment_length = duration_ms // num_segments
    for i in range(0, duration_ms, segment_length):
        segment = audio[i:i + segment_length]
        segment_filename = filename[:-4] + f"/segment_{i // segment_length + 1}.mp3"
        segment.export(segment_filename, format="mp3")
        print(f"Exported {segment_filename}")

path_to_file = 'AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137.mp3'

num_segments = 15
split_mp3(path_to_file, num_segments)

Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_1.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_2.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_3.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_4.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_5.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_6.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_7.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_8.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_9.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_10.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_11.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_12.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_13.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_14.mp3
E

In [13]:
# go through all mp3 files in the directory and subdirectories and split them
def split_all(path):
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith('.mp3'):
                filename = file.split('/')[-1][:-4]
                company = root.split('/')[-1]
                output_dir = 'AudioData/' + company + '/' + filename
                os.makedirs(output_dir, exist_ok=True)
                audio = AudioSegment.from_mp3(root + '/' + file)
                duration_ms = len(audio)
                segment_length = duration_ms // num_segments
                for i in range(0, duration_ms, segment_length):
                    segment = audio[i:i + segment_length]
                    segment_filename = output_dir + f"/segment_{i // segment_length + 1}.mp3"
                    segment.export(segment_filename, format="mp3")
                print(f"Exported {output_dir}")
                # delete segment 16 if exists
                if os.path.exists(output_dir + '/segment_16.mp3'):
                    os.remove(output_dir + '/segment_16.mp3', )

split_all('Audio/')

Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057
Exported AudioData/AAPL/2020-Jan-28-AAPL.OQ-137948852907
Exported AudioData/AAPL/2019-Jan-29-AAPL.OQ-140336997647
Exported AudioData/AAPL/2023-Aug-03-AAPL.OQ-138336763416
Exported AudioData/AAPL/2021-Jan-27-AAPL.OQ-137405328510
Exported AudioData/AAPL/2021-Apr-28-AAPL.OQ-139264952632
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137
Exported AudioData/AAPL/2020-Oct-29-AAPL.OQ-141103256170
Exported AudioData/AAPL/2022-Oct-27-AAPL.OQ-140449384074
Exported AudioData/AAPL/2022-Apr-28-AAPL.OQ-138491813375
Exported AudioData/AAPL/2019-Oct-30-AAPL.OQ-137802390954
Exported AudioData/AAPL/2023-Nov-02-AAPL.OQ-140502977515
Exported AudioData/AAPL/2021-Jul-27-AAPL.OQ-138310703827
Exported AudioData/AAPL/2021-Oct-28-AAPL.OQ-139435924054
Exported AudioData/AAPL/2023-Feb-02-AAPL.OQ-140682524715
Exported AudioData/AAPL/2022-Jan-27-AAPL.OQ-138984465849
Exported AudioData/AAPL/2024-Feb-01-AAPL.OQ-139920838259
Exported AudioData/AAPL/2019-Ju

In [3]:
def librosa_process(y, sr=16000):
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    return tempo, chroma_stft.mean(), rmse.mean(), spec_cent.mean(), spec_bw.mean(), rolloff.mean(), zcr.mean(), mfcc.mean()


from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)
import torch
import torch.nn as nn

class RegressionHead(nn.Module):
    r"""Classification head."""

    def __init__(self, config):

        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):

        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x

class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):

        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
            self,
            input_values,
    ):

        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)

        return hidden_states, logits

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name)

def wave2vec_process(y):
    y_processed = processor(y, sampling_rate=16000)
    y_processed = y_processed['input_values'][0]
    y_processed = y_processed.reshape(1, -1)
    y_processed = torch.from_numpy(y_processed).to(device)

    with torch.no_grad():
            y_processed, sent = model(y_processed)

    # convert to numpy
    y_processed = y_processed.detach().cpu().numpy()
    sent = sent.detach().cpu().numpy()

    return sent


  from .autonotebook import tqdm as notebook_tqdm
Some weights of EmotionModel were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
from transformers import ClapModel, ClapProcessor


model_clap = ClapModel.from_pretrained("laion/larger_clap_general")
processor_clap = ClapProcessor.from_pretrained("laion/larger_clap_general")

def clap_process(y):
    inputs = processor_clap(audios=y, return_tensors="pt")
    audio_embed = model_clap.get_audio_features(**inputs)
    return audio_embed



In [5]:
# function to extract company and date from a format like 2020-Jul-30-AAPL.OQ-139668219181
def extract_company_date(file):
    date = file.split('-')[0] + '-' + file.split('-')[1] + '-' + file.split('-')[2]
    company = file.split('-')[3]
    return company, date


def get_features(path, librosa_ = True, wave2vec = False, embeddings=False):
    ''' Get features for each segment of the audio file'''
    clap_features = {}
    features = {}
    for i in range(1, 16):
        if librosa_:
            features[f'company_{i}'] = []
            features[f'date_{i}'] = []
            features[f'tempo_segment_{i}'] = []
            features[f'chroma_stft_segment_{i}'] = []
            features[f'rmse_segment_{i}'] = []
            features[f'spec_cent_segment_{i}'] = []
            features[f'spec_bw_segment_{i}'] = []
            features[f'rolloff_segment_{i}'] = []
            features[f'zcr_segment_{i}'] = []
            features[f'mfcc_segment_{i}'] = []

        if wave2vec:
            features[f'arousal_segment_{i}'] = []
            features[f'valence_segment_{i}'] = []
            features[f'dominance_segment_{i}'] = []
        
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith('.mp3'):
                company, date = extract_company_date(root.split('/')[-1])
                # extract date from root
                y, _ = librosa.load(root + '/' + file, sr=16000)
                if librosa_:
                    tempo, chroma, rmse, spec_cent, spec_bw, rolloff, zcr, mfcc = librosa_process(y)
                    segment = int(file.split('_')[-1][:-4])
                    features[f'company_{segment}'].append(company)
                    features[f'date_{segment}'].append(date)
                    features[f'tempo_segment_{segment}'].append(tempo)
                    features[f'chroma_stft_segment_{segment}'].append(chroma)
                    features[f'rmse_segment_{segment}'].append(rmse)
                    features[f'spec_cent_segment_{segment}'].append(spec_cent)
                    features[f'spec_bw_segment_{segment}'].append(spec_bw)
                    features[f'rolloff_segment_{segment}'].append(rolloff)
                    features[f'zcr_segment_{segment}'].append(zcr)
                    features[f'mfcc_segment_{segment}'].append(mfcc)
                if wave2vec:
                    arousal, valence, dominance = wave2vec_process(y)
                    features[f'arousal_segment_{segment}'].append(arousal)
                    features[f'valence_segment_{segment}'].append(valence)
                    features[f'dominance_segment_{segment}'].append(dominance)
                if embeddings:
                    clap_features[(company, date, segment)] = clap_process(y)
    return features, clap_features
                        


    

In [6]:
features, clap_features = get_features('AudioData/', librosa_=True, wave2vec=False, embeddings=False)

In [7]:
for key in features:
    print(key, len(features[key]))

company_1 87
date_1 87
tempo_segment_1 87
chroma_stft_segment_1 87
rmse_segment_1 87
spec_cent_segment_1 87
spec_bw_segment_1 87
rolloff_segment_1 87
zcr_segment_1 87
mfcc_segment_1 87
company_2 87
date_2 87
tempo_segment_2 87
chroma_stft_segment_2 87
rmse_segment_2 87
spec_cent_segment_2 87
spec_bw_segment_2 87
rolloff_segment_2 87
zcr_segment_2 87
mfcc_segment_2 87
company_3 87
date_3 87
tempo_segment_3 87
chroma_stft_segment_3 87
rmse_segment_3 87
spec_cent_segment_3 87
spec_bw_segment_3 87
rolloff_segment_3 87
zcr_segment_3 87
mfcc_segment_3 87
company_4 87
date_4 87
tempo_segment_4 87
chroma_stft_segment_4 87
rmse_segment_4 87
spec_cent_segment_4 87
spec_bw_segment_4 87
rolloff_segment_4 87
zcr_segment_4 87
mfcc_segment_4 87
company_5 87
date_5 87
tempo_segment_5 87
chroma_stft_segment_5 87
rmse_segment_5 87
spec_cent_segment_5 87
spec_bw_segment_5 87
rolloff_segment_5 87
zcr_segment_5 87
mfcc_segment_5 87
company_6 87
date_6 87
tempo_segment_6 87
chroma_stft_segment_6 87
rmse_seg

In [8]:
# dic --> dataframe
import pandas as pd

# create dataframe from dictionary of lists
df = pd.DataFrame(features)
print(df.head())

df.to_csv('AudioData/features_audio.csv', index=False)

  company_1       date_1  tempo_segment_1  chroma_stft_segment_1  \
0   AAPL.OQ  2020-Jul-30       117.187500               0.365996   
1   AAPL.OQ  2022-Jul-28       117.187500               0.364630   
2   AAPL.OQ  2023-Feb-02        98.684211               0.352188   
3   AAPL.OQ  2023-Nov-02       125.000000               0.351912   
4   AAPL.OQ  2021-Jul-27       117.187500               0.371816   

   rmse_segment_1  spec_cent_segment_1  spec_bw_segment_1  rolloff_segment_1  \
0        0.027201          1436.623670        1135.887954        2656.791836   
1        0.016014          1093.471158        1067.038987        2113.469127   
2        0.042191          1130.034723        1079.233129        2186.137105   
3        0.027063          1052.445032        1035.682371        2025.102747   
4        0.039081          1368.223525        1144.997330        2593.960984   

   zcr_segment_1  mfcc_segment_1  ... company_15      date_15  \
0       0.138336      -20.716206  ...    AAPL

In [None]:
# save clap features as json file
import json

with open('AudioData/clap_features.json', 'w') as f:
    json.dump(clap_features, f)

In [5]:
for root, dirs, files in os.walk('Audio/'):
    print(root, dirs, files)

Audio/ ['AAPL', 'GOOGL', 'MSFT', 'NVDA'] []
Audio/AAPL [] ['2020-Apr-30-AAPL.OQ-140195689057.mp3', '2020-Jan-28-AAPL.OQ-137948852907.mp3', '2019-Jan-29-AAPL.OQ-140336997647.mp3', '2023-Aug-03-AAPL.OQ-138336763416.mp3', '2021-Jan-27-AAPL.OQ-137405328510.mp3', '2021-Apr-28-AAPL.OQ-139264952632.mp3', '2019-Apr-30-AAPL.OQ-139934481137.mp3', '2020-Oct-29-AAPL.OQ-141103256170.mp3', '2022-Oct-27-AAPL.OQ-140449384074.mp3', '2022-Apr-28-AAPL.OQ-138491813375.mp3', '2019-Oct-30-AAPL.OQ-137802390954.mp3', '2023-Nov-02-AAPL.OQ-140502977515.mp3', '2021-Jul-27-AAPL.OQ-138310703827.mp3', '2021-Oct-28-AAPL.OQ-139435924054.mp3', '2023-Feb-02-AAPL.OQ-140682524715.mp3', '2022-Jan-27-AAPL.OQ-138984465849.mp3', '2024-Feb-01-AAPL.OQ-139920838259.mp3', '2019-Jul-30-AAPL.OQ-139263252221.mp3', '2023-May-04-AAPL.OQ-139016265469.mp3', '2022-Jul-28-AAPL.OQ-139999304673.mp3', '2020-Jul-30-AAPL.OQ-139668219181.mp3']
Audio/GOOGL [] ['2023-Apr-25-GOOGL.OQ-140631444702.mp3', '2019-Apr-29-GOOGL.OQ-138437793323.mp3', '20

# TESTS

In [15]:
# Load audio file
filename = 'AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137.mp3'

In [55]:
filename = 'AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_1.mp3'

In [56]:
y, sr = librosa.load(filename, sr=16000, dtype=np.float32)

In [30]:
y

array([ 1.2667331e-06,  7.3400183e-06,  8.1522548e-06, ...,
        2.5624418e-04,  2.8384631e-04, -1.6588336e-04], dtype=float32)

In [36]:
def librosa_process(y, sr=16000):
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    # return single value for each
    return tempo, chroma_stft.mean(), rmse.mean(), spec_cent.mean(), spec_bw.mean(), rolloff.mean(), zcr.mean(), mfcc.mean()


In [37]:
librosa_process(y)

(144.23076923076923,
 0.37024707,
 0.009967313,
 1594.1986496905806,
 1222.5049879092867,
 2933.5234284262488,
 0.1484131851816565,
 -27.36655)

In [60]:
device = 'cpu'
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name)
processor(y, sampling_rate=16000)

Some weights of EmotionModel were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_values': [array([ 0.00112327,  0.00156151,  0.00162012, ...,  0.0195219 ,
        0.02151361, -0.01093792], dtype=float32)], 'attention_mask': [array([1, 1, 1, ..., 1, 1, 1], dtype=int32)]}

In [50]:
from transformers import pipeline

audio = y  # Your audio input

# Initialize the zero-shot audio classification pipeline with a suitable model
audio_classifier = pipeline(task="zero-shot-audio-classification", model="laion/larger_clap_general")

# Define candidate labels that represent the audio features you are interested in
candidate_labels = [
    "positive sentiment",  # Looking for positive emotional content
    "negative sentiment",  # Looking for negative emotional content
    "neutral sentiment",   # Checking for neutral emotional content
    "fast speech",         # Identifying fast speech rates
    "slow speech",         # Identifying slow speech rates
    "calm tone",           # Identifying calm tones
    "aggressive tone"      # Identifying aggressive tones
]

# Classify the audio with the newly defined candidate labels
output = audio_classifier(audio, candidate_labels=candidate_labels)

# Print the classification output
print(output)


[{'score': 0.7991957068443298, 'label': 'positive sentiment'}, {'score': 0.08826709538698196, 'label': 'aggressive tone'}, {'score': 0.07042617350816727, 'label': 'calm tone'}, {'score': 0.02783028595149517, 'label': 'negative sentiment'}, {'score': 0.010820894502103329, 'label': 'neutral sentiment'}, {'score': 0.0018461584113538265, 'label': 'slow speech'}, {'score': 0.001613809261471033, 'label': 'fast speech'}]


https://huggingface.co/Rajaram1996/Hubert_emotion/tree/main

https://huggingface.co/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition