In [1]:
import librosa
import numpy as np

In [4]:
from pydub import AudioSegment
import os

def split_mp3(filename, num_segments):
    ''' Evenly split an mp3 file into num_segments segments '''
    os.makedirs(filename[:-4], exist_ok=True)
    audio = AudioSegment.from_mp3(filename)
    duration_ms = len(audio)
    segment_length = duration_ms // num_segments
    for i in range(0, duration_ms, segment_length):
        segment = audio[i:i + segment_length]
        segment_filename = filename[:-4] + f"/segment_{i // segment_length + 1}.mp3"
        segment.export(segment_filename, format="mp3")
        print(f"Exported {segment_filename}")

path_to_file = 'AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137.mp3'

num_segments = 15
split_mp3(path_to_file, num_segments)

Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_1.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_2.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_3.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_4.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_5.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_6.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_7.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_8.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_9.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_10.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_11.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_12.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_13.mp3
Exported AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137/segment_14.mp3
E

In [12]:
# go through all mp3 files in the directory and subdirectories and split them
def split_all(path):
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith('.mp3'):
                filename = file.split('/')[-1][:-4]
                company = root.split('/')[-1]
                output_dir = 'AudioData/' + company + '/' + filename
                os.makedirs(output_dir, exist_ok=True)
                audio = AudioSegment.from_mp3(root + '/' + file)
                duration_ms = len(audio)
                segment_length = duration_ms // num_segments
                for i in range(0, duration_ms, segment_length):
                    segment = audio[i:i + segment_length]
                    segment_filename = output_dir + f"/segment_{i // segment_length + 1}.mp3"
                    segment.export(segment_filename, format="mp3")
                print(f"Exported {output_dir}")
                # delete segment 16 if exists
                if os.path.exists(output_dir + '/segment_16.mp3'):
                    os.remove(output_dir + '/segment_16.mp3', )

split_all('Audio/')

Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_1.mp3
Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_2.mp3
Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_3.mp3
Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_4.mp3
Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_5.mp3
Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_6.mp3
Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_7.mp3
Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_8.mp3
Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_9.mp3
Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_10.mp3
Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_11.mp3
Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_12.mp3
Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_13.mp3
Exported AudioData/AAPL/2020-Apr-30-AAPL.OQ-140195689057/segment_14.mp3
E

In [5]:
for root, dirs, files in os.walk('Audio/'):
    print(root, dirs, files)

Audio/ ['AAPL', 'GOOGL', 'MSFT', 'NVDA'] []
Audio/AAPL [] ['2020-Apr-30-AAPL.OQ-140195689057.mp3', '2020-Jan-28-AAPL.OQ-137948852907.mp3', '2019-Jan-29-AAPL.OQ-140336997647.mp3', '2023-Aug-03-AAPL.OQ-138336763416.mp3', '2021-Jan-27-AAPL.OQ-137405328510.mp3', '2021-Apr-28-AAPL.OQ-139264952632.mp3', '2019-Apr-30-AAPL.OQ-139934481137.mp3', '2020-Oct-29-AAPL.OQ-141103256170.mp3', '2022-Oct-27-AAPL.OQ-140449384074.mp3', '2022-Apr-28-AAPL.OQ-138491813375.mp3', '2019-Oct-30-AAPL.OQ-137802390954.mp3', '2023-Nov-02-AAPL.OQ-140502977515.mp3', '2021-Jul-27-AAPL.OQ-138310703827.mp3', '2021-Oct-28-AAPL.OQ-139435924054.mp3', '2023-Feb-02-AAPL.OQ-140682524715.mp3', '2022-Jan-27-AAPL.OQ-138984465849.mp3', '2024-Feb-01-AAPL.OQ-139920838259.mp3', '2019-Jul-30-AAPL.OQ-139263252221.mp3', '2023-May-04-AAPL.OQ-139016265469.mp3', '2022-Jul-28-AAPL.OQ-139999304673.mp3', '2020-Jul-30-AAPL.OQ-139668219181.mp3']
Audio/GOOGL [] ['2023-Apr-25-GOOGL.OQ-140631444702.mp3', '2019-Apr-29-GOOGL.OQ-138437793323.mp3', '20

In [55]:
# Load audio file
filename = 'AudioData/AAPL/2019-Apr-30-AAPL.OQ-139934481137.mp3'

In [38]:
filename = 'AudioData/file_example_MP3_700KB.mp3'

In [56]:
import soundfile as sf

_ = sf.read(filename)

In [57]:
y, sr = librosa.load(filename, sr=16000, dtype=np.float32)

In [58]:
y

array([ 1.0817012e-12,  3.0290646e-13, -7.8385882e-13, ...,
       -3.1494394e-06, -1.7602131e-06,  0.0000000e+00], dtype=float32)

In [59]:
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)
import torch
import torch.nn as nn

class RegressionHead(nn.Module):
    r"""Classification head."""

    def __init__(self, config):

        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):

        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x

class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):

        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
            self,
            input_values,
    ):

        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)

        return hidden_states, logits


In [60]:
device = 'cpu'
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name)

y_processed = processor(y, sampling_rate=16000)
y_processed = y_processed['input_values'][0]
y_processed = y_processed.reshape(1, -1)
y_processed = torch.from_numpy(y_processed).to(device)

with torch.no_grad():
        y_processed, sent = model(y_processed)

# convert to numpy
y_processed = y_processed.detach().cpu().numpy()
sent = sent.detach().cpu().numpy()



Some weights of EmotionModel were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
sent

array([[0.55590785, 0.5495001 , 0.39147374]], dtype=float32)

In [50]:
from transformers import pipeline

audio = y  # Your audio input

# Initialize the zero-shot audio classification pipeline with a suitable model
audio_classifier = pipeline(task="zero-shot-audio-classification", model="laion/larger_clap_general")

# Define candidate labels that represent the audio features you are interested in
candidate_labels = [
    "positive sentiment",  # Looking for positive emotional content
    "negative sentiment",  # Looking for negative emotional content
    "neutral sentiment",   # Checking for neutral emotional content
    "fast speech",         # Identifying fast speech rates
    "slow speech",         # Identifying slow speech rates
    "calm tone",           # Identifying calm tones
    "aggressive tone"      # Identifying aggressive tones
]

# Classify the audio with the newly defined candidate labels
output = audio_classifier(audio, candidate_labels=candidate_labels)

# Print the classification output
print(output)


[{'score': 0.7991957068443298, 'label': 'positive sentiment'}, {'score': 0.08826709538698196, 'label': 'aggressive tone'}, {'score': 0.07042617350816727, 'label': 'calm tone'}, {'score': 0.02783028595149517, 'label': 'negative sentiment'}, {'score': 0.010820894502103329, 'label': 'neutral sentiment'}, {'score': 0.0018461584113538265, 'label': 'slow speech'}, {'score': 0.001613809261471033, 'label': 'fast speech'}]


https://huggingface.co/Rajaram1996/Hubert_emotion/tree/main

https://huggingface.co/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition

In [46]:
from transformers import ClapModel, ClapProcessor


model = ClapModel.from_pretrained("laion/larger_clap_general")
processor = ClapProcessor.from_pretrained("laion/larger_clap_general")

inputs = processor(audios=audio, return_tensors="pt")
audio_embed = model.get_audio_features(**inputs)


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [48]:
audio_embed.shape

torch.Size([1, 512])

In [11]:
# Detect voiced segments (a simple approach using energy)
intensity = librosa.feature.rms(y=y)
voiced_threshold = np.median(intensity)  # Simple threshold, adjust based on your audio
voiced_segments = intensity > voiced_threshold

# Calculate speech speed (this is a very rudimentary calculation)
# For a more accurate approach, consider using speech recognition to count words or syllables
speech_duration_seconds = np.sum(voiced_segments) / sr
print(f"Estimated speech duration: {speech_duration_seconds} seconds")

# Note: This doesn't count words or syllables directly. You might need a speech-to-text service for accurate speech speed (words per minute).


Estimated speech duration: 0.041020408163265305 seconds


In [12]:
import parselmouth

# Load the audio file
sound = parselmouth.Sound(filename)

# Analyze pitch
pitch = sound.to_pitch()

# Get mean pitch in the audio file
mean_pitch = pitch.selected_array['frequency'].mean()
print(f"Mean pitch: {mean_pitch} Hz")

# You can extend this to analyze intensity, formants, etc., to infer more about the tone.


Mean pitch: 197.64412560750836 Hz
