In [1]:
import numpy as np
import sklearn
import tensorflow as tf

print("NumPy version:", np.__version__)
print("scikit-learn version:", sklearn.__version__)
print("TensorFlow version:", tf.__version__)


NumPy version: 1.25.0
scikit-learn version: 1.4.1.post1
TensorFlow version: 2.18.0


In [2]:
import os
import json
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, LSTM, Bidirectional
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import matplotlib.pyplot as plt


In [3]:
# Path to JSON file and audio files
json_path = r'M:\Coding\NLP_Project\bad_words_timestamps2.json'
audio_dir = 'TempAudioData'

# Load annotations
with open(json_path, 'r', encoding='utf-8') as f:
    annotations = json.load(f)

# Sample Rate for loading audio files
SAMPLE_RATE = 22050


In [4]:
def extract_features(audio_path, start, end, sample_rate=SAMPLE_RATE):
    # Load the audio file and extract the segment
    audio, sr = librosa.load(audio_path, sr=sample_rate)
    segment = audio[int(start * sr):int(end * sr)]
    
    # Extract MFCC features
    mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=13)
    mfcc = np.mean(mfcc, axis=1)  # Get the mean across time dimension for each MFCC coefficient
    
    # Extract Mel Spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=segment, sr=sr)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
    mel_spectrogram_db = np.mean(mel_spectrogram_db, axis=1)  # Average over time
    
    return mfcc, mel_spectrogram_db


In [5]:
X_mfcc, X_spectrogram, y = [], [], []

for entry in tqdm(annotations):
    file_name = entry['file']
    word = entry['word']
    start = entry['start']
    end = entry['end']
    
    audio_path = os.path.join(audio_dir, file_name)
    mfcc, spectrogram = extract_features(audio_path, start, end)
    
    X_mfcc.append(mfcc)
    X_spectrogram.append(spectrogram)
    y.append(word)  # Label as abusive word


100%|██████████| 1960/1960 [00:27<00:00, 70.53it/s]


In [6]:
# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

# Split the dataset
X_mfcc = np.array(X_mfcc)
# Reshape spectrogram data for CNN model
# Assume X_spectrogram has shape (num_samples, num_mel_bands) after feature extraction
X_spectrogram = np.expand_dims(X_spectrogram, axis=-1)  # Add channel dimension for CNN
input_shape_cnn = (X_spectrogram.shape[1], X_spectrogram.shape[2], 1)  # (height, width, channels)

X_train, X_test, y_train, y_test = train_test_split(X_mfcc, y_categorical, test_size=0.2, random_state=42)


In [7]:
# CNN model for Spectrograms
# CNN model for Spectrograms with adjusted input shape
def create_cnn_model(input_shape):
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D((2, 2)),
        Dropout(0.25),
        
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Dropout(0.25),
        
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(len(label_encoder.classes_), activation='softmax')
    ])
    return model

# Define and compile CNN model



# RNN model for MFCC features
def create_rnn_model(input_shape):
    model = Sequential([
        Bidirectional(LSTM(64, return_sequences=True), input_shape=input_shape),
        Bidirectional(LSTM(64)),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(len(label_encoder.classes_), activation='softmax')
    ])
    return model


In [8]:
# Reshape MFCC data for RNN model
X_train_mfcc = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_mfcc = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# CNN model for spectrogram data
input_shape_cnn = (X_spectrogram.shape[1], 1, 1)  # Reshape as needed for CNN
cnn_model = create_cnn_model(input_shape_cnn)
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# RNN model for MFCC data
input_shape_rnn = (X_train_mfcc.shape[1], 1)
rnn_model = create_rnn_model(input_shape_rnn)
rnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Training CNN on Spectrogram data
cnn_history = cnn_model.fit(X_train_spectrogram, y_train, validation_data=(X_test_spectrogram, y_test), epochs=20, batch_size=32)

# Training RNN on MFCC data
rnn_history = rnn_model.fit(X_train_mfcc, y_train, validation_data=(X_test_mfcc, y_test), epochs=20, batch_size=32)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Computed output size would be negative. Received `inputs shape=(None, 128, 1, 1)`, `kernel shape=(3, 3, 1, 32)`, `dilation_rate=[1 1]`.

In [9]:
# Evaluate CNN model
cnn_eval = cnn_model.evaluate(X_test_spectrogram, y_test, verbose=0)
print(f"CNN Model Test Accuracy: {cnn_eval[1] * 100:.2f}%")

# Evaluate RNN model
rnn_eval = rnn_model.evaluate(X_test_mfcc, y_test, verbose=0)
print(f"RNN Model Test Accuracy: {rnn_eval[1] * 100:.2f}%")


NameError: name 'cnn_model' is not defined

In [None]:
# Plot accuracy for CNN
plt.plot(cnn_history.history['accuracy'], label='train_accuracy')
plt.plot(cnn_history.history['val_accuracy'], label='val_accuracy')
plt.title('CNN Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Plot accuracy for RNN
plt.plot(rnn_history.history['accuracy'], label='train_accuracy')
plt.plot(rnn_history.history['val_accuracy'], label='val_accuracy')
plt.title('RNN Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
