In [2]:
import os
import tempfile
from time import sleep

import numpy as np
# Restore the `np.complex` alias (removed in NumPy 1.24) before librosa is
# imported; presumably the installed librosa still references it — confirm.
np.complex = complex

import librosa
import pandas as pd
import soundfile as sf
import torch
import whisper
from textblob import TextBlob

# Audio variants to compare: short label -> path of the WAV file to analyse.
files = {
    "clean": "harvard.wav",
    "noise": "harvard-noise.wav",
    "sine": "harvard-sine.wav",
}

# Columns shown in the printed summary. "chroma" and "mel" are computed and
# kept in the DataFrame but are not part of the printed table.
_SUMMARY_COLUMNS = [
    "file",
    "transcription",
    "mfcc_mean",
    "spectral_centroid",
    "zcr",
    "polarity",
    "subjectivity",
]


def _acoustic_features(y, sr):
    """Return one averaged value per librosa feature for a loaded clip.

    Args:
        y: Audio samples as returned by ``librosa.load``.
        sr: Sampling rate that belongs to ``y``.

    Returns:
        Dict with the keys ``mfcc_mean``, ``spectral_centroid``, ``zcr``,
        ``chroma`` and ``mel``; each value is the mean of the corresponding
        librosa feature matrix.
    """
    # MFCCs are averaged per coefficient first and then across coefficients,
    # so "mfcc_mean" is a single scalar rather than a vector.
    mfcc = librosa.feature.mfcc(y=y, sr=sr).mean(axis=1)
    return {
        "mfcc_mean": mfcc.mean(),
        "spectral_centroid": librosa.feature.spectral_centroid(y=y, sr=sr).mean(),
        "zcr": librosa.feature.zero_crossing_rate(y).mean(),
        "chroma": librosa.feature.chroma_stft(y=y, sr=sr).mean(),
        "mel": librosa.feature.melspectrogram(y=y, sr=sr).mean(),
    }


def _transcribe(asr_model, label, y, sr):
    """Transcribe a clip with Whisper via a self-cleaning temporary WAV file.

    The samples are written to ``temp_<label>.wav`` inside a throw-away
    directory that is deleted as soon as transcription finishes, so no
    intermediate file is left behind in (or overwritten in) the working
    directory.

    Args:
        asr_model: Model object returned by ``whisper.load_model``.
        label: Short name of the clip; used for the temp file name and the
            progress message.
        y: Audio samples as returned by ``librosa.load``.
        sr: Sampling rate that belongs to ``y``.

    Returns:
        The ``"text"`` entry of the Whisper transcription result.
    """
    # NOTE(review): the clip is re-written with soundfile before being handed
    # to Whisper; transcribing the source path directly may be equivalent —
    # confirm before dropping this round-trip.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_path = os.path.join(tmp_dir, f"temp_{label}.wav")
        sf.write(temp_path, y, sr)
        print(f"Processing {label}...")
        return asr_model.transcribe(temp_path)["text"]


model = whisper.load_model("base")
results = []

for label, path in files.items():
    # sr=None keeps the file's native sampling rate instead of resampling.
    y, sr = librosa.load(path, sr=None)
    features = _acoustic_features(y, sr)
    transcription = _transcribe(model, label, y, sr)
    sentiment = TextBlob(transcription).sentiment

    results.append({
        "file": label,
        "transcription": transcription,
        **features,
        "polarity": sentiment.polarity,
        "subjectivity": sentiment.subjectivity,
    })

df = pd.DataFrame(results)
print(df[_SUMMARY_COLUMNS])


Processing clean...
Processing noise...
Processing sine...
    file                                      transcription  mfcc_mean  \
0  clean   The stale smell of old beer lingers. It takes... -11.280766   
1  noise   The stale smell of old beer lingers. It takes...  -7.492465   
2   sine   The stale smell of old beer lingers. It takes... -10.140648   

   spectral_centroid       zcr  polarity  subjectivity  
0        3243.359570  0.068710  0.023810      0.578571  
1        8960.240049  0.303777  0.023810      0.578571  
2         842.228110  0.020277 -0.054167      0.631250  
