In [None]:
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to extract MFCC features from audio files
def extract_features(audio_file):
    try:
        # Load audio file
        audio, sample_rate = librosa.load(audio_file, sr=None)
        
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        return mfccs.T  # Transpose to get time-steps as rows
    except Exception as e:
        print(f"Error processing {audio_file}: {e}")
        return None  # Return None for failed cases

# Load dataset
data = pd.read_csv('/kaggle/input/annotated-data2/CSV_file.csv')  # Adjust path if necessary

features = []
labels = []

# Directory where audio files are stored
audio_directory = '/kaggle/input/hindi-hate-speech-audio-data-wav/Converted_Audio_Data/Converted_Audio_Data/'  # Adjust this path as needed

# Extract features for each audio file
for index, row in data.iterrows():
    audio_file_path = os.path.join(audio_directory, row['Name'])
#     print(audio_file_path)
    mfccs = extract_features(audio_file_path)
    
    if mfccs is not None:
        features.append(mfccs)
        labels.append(row['Label'])
    else:
        print(f"Failed to process: {audio_file_path}")

# Convert to numpy arrays and pad sequences to the same length
X = pad_sequences(features, padding='post', dtype='float32')
y = np.array(labels)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(np.unique(y_encoded))
y_encoded = np.eye(num_classes)[y_encoded]  # One-hot encoding

print(f"Shape of X: {X.shape}")
print(f"Shape of y_encoded: {y_encoded.shape}")

# Train/test split
if len(X) > 0 and len(y) > 0:
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.4, random_state=42)

    print(f"Shape of X_train: {X_train.shape}")
    print(f"Shape of y_train: {y_train.shape}")
    print(f"Shape of X_test: {X_test.shape}")
    print(f"Shape of y_test: {y_test.shape}")

    # Build the RCNN model
    input_shape = X_train.shape[1:]  # Shape (time_steps, n_mfcc)
    model = Sequential()
    # Convolutional layers
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    
    # Optional second Conv1D layer (can be commented out if not needed)
    model.add(Conv1D(128, kernel_size=3, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    
    # Recurrent (LSTM) layer
    model.add(LSTM(64, return_sequences=False))
    model.add(Dropout(0.5))
    
    # Fully connected layers
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

    # Evaluate the model
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Test accuracy: {test_accuracy}")
else:
    print("No valid features or labels to train the model.")


In [12]:
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Function to extract MFCC features from audio files
def extract_features(audio_file):
    try:
        # Load audio file with downsampling
        audio, sample_rate = librosa.load(audio_file)  # Downsampling to 16kHz
        
        # Extract MFCCs with fewer coefficients
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)  # Reduced MFCCs
        return mfccs.T  # Transpose to get time-steps as rows
    except Exception as e:
        print(f"Error processing {audio_file}: {e}")
        return None  # Return None for failed cases

# Load dataset
data = pd.read_csv('/kaggle/input/annotated-data2/CSV_file.csv')  # Adjust path if necessary

features = []
labels = []

# Directory where audio files are stored
audio_directory = '/kaggle/input/hindi-hate-speech-audio-data-wav/Converted_Audio_Data/Converted_Audio_Data/'  # Adjust this path as needed

# Extract features for each audio file
for index, row in data.iterrows():
    audio_file_path = os.path.join(audio_directory, row['Name'])
    mfccs = extract_features(audio_file_path)
    
    if mfccs is not None:
        features.append(mfccs)
        labels.append(row['Label'])
    else:
        print(f"Failed to process: {audio_file_path}")

# Convert to numpy arrays and pad sequences to the same length
X = pad_sequences(features, padding='post', dtype='float32', maxlen=200)  # Limit padding length to 200
y = np.array(labels)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(np.unique(y_encoded))
y_encoded = np.eye(num_classes)[y_encoded]  # One-hot encoding

print(f"Shape of X: {X.shape}")
print(f"Shape of y_encoded: {y_encoded.shape}")

# Train/test split
if len(X) > 0 and len(y) > 0:
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.4, random_state=42)

    print(f"Shape of X_train: {X_train.shape}")
    print(f"Shape of y_train: {y_train.shape}")
    print(f"Shape of X_test: {X_test.shape}")
    print(f"Shape of y_test: {y_test.shape}")

    # Set up a mirrored strategy for distributed training
    strategy = tf.distribute.MirroredStrategy(devices=["/GPU:0", "/GPU:1"])  # T4x2 GPUs on Kaggle
    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    with strategy.scope():
    # Build the WaveNet model
        input_shape = X_train.shape[1:]  # Shape (time_steps, n_mfcc)
        model = Sequential()

        # WaveNet-inspired dilated convolutions
        model.add(Conv1D(64, kernel_size=2, dilation_rate=2, activation='relu', input_shape=input_shape))
        model.add(Conv1D(128, kernel_size=2, dilation_rate=4, activation='relu'))
        model.add(Conv1D(256, kernel_size=2, dilation_rate=8, activation='relu'))
        model.add(Conv1D(512, kernel_size=2, dilation_rate=16, activation='relu'))

        # Flatten the output from the Conv1D layers before passing to dense layers
        model.add(Flatten())

        # Dense layers
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(num_classes, activation='softmax'))

        # Compile the model
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
    model.fit(X_train, y_train, epochs=5, batch_size=8, validation_data=(X_test, y_test))  # Smaller batch size

# Evaluate the model
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Test accuracy: {test_accuracy}")

Shape of X: (10120, 200, 40)
Shape of y_encoded: (10120, 2)
Shape of X_train: (6072, 200, 40)
Shape of y_train: (6072, 2)
Shape of X_test: (4048, 200, 40)
Shape of y_test: (4048, 2)
Number of devices: 2


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 19ms/step - accuracy: 0.6239 - loss: 7.1155 - val_accuracy: 0.6423 - val_loss: 0.6537
Epoch 2/5
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - accuracy: 0.6675 - loss: 0.6510 - val_accuracy: 0.6423 - val_loss: 0.6535
Epoch 3/5
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - accuracy: 0.6610 - loss: 0.6550 - val_accuracy: 0.6423 - val_loss: 0.6525
Epoch 4/5
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - accuracy: 0.6714 - loss: 0.6341 - val_accuracy: 0.6423 - val_loss: 0.6535
Epoch 5/5
[1m759/759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - accuracy: 0.6683 - loss: 0.6747 - val_accuracy: 0.6423 - val_loss: 0.6527
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6711 - loss: 0.6347
Test accuracy: 0.6724308133125305


In [16]:
from pydub import AudioSegment
import numpy as np
import librosa
import tensorflow as tf

# Load your audio file
audio_file_path = r'/kaggle/input/hindi-hate-speech-audio-data-wav/Converted_Audio_Data/Converted_Audio_Data/Audio_3009.wav'
audio = AudioSegment.from_wav(audio_file_path)

# Define the segment duration and the list of timestamps to mute
segment_duration = 500  # Duration of each segment in milliseconds (e.g., 500ms)
bad_word_indices = []  # This will contain the indices of segments predicted as abusive

# Function to extract features from audio segments
def extract_features_from_segment(segment):
    try:
        # Convert the AudioSegment to numpy array
        samples = np.array(segment.get_array_of_samples()).astype(np.float32)
        # Extract MFCCs (same settings as used in training, using 40 MFCCs for example)
        mfccs = librosa.feature.mfcc(y=samples, sr=segment.frame_rate, n_mfcc=40)  # Use 40 MFCCs
        return mfccs.T  # Return the MFCCs for each time-step (transposed)
    except Exception as e:
        print(f"Error processing segment: {e}")
        return None

# Split the audio into segments and process them
segments = []
for i in range(0, len(audio), segment_duration):
    segment = audio[i:i + segment_duration]
    mfccs = extract_features_from_segment(segment)
    
    if mfccs is not None:
        segments.append(mfccs)

# Convert list of segments to a numpy array and ensure consistent shape by padding/truncating
max_sequence_length = 5349  # Define the length that matches your model's expected input size
X_test = []

# Ensure all MFCCs have the same length (pad or truncate each segment)
for mfcc in segments:
    if mfcc.shape[0] < max_sequence_length:
        # Padding with zeros if MFCC sequence is shorter than max_sequence_length
        padded_mfcc = np.pad(mfcc, ((0, max_sequence_length - mfcc.shape[0]), (0, 0)), mode='constant')
    else:
        # Truncate if the MFCC sequence is longer than max_sequence_length
        padded_mfcc = mfcc[:max_sequence_length, :]
    X_test.append(padded_mfcc)

# Convert the list to a numpy array
X_test = np.array(X_test)

# Ensure the input has shape (num_samples, sequence_length, num_features)
# Here we're using 40 MFCC features per time-step
print(f"X_test shape before reshaping: {X_test.shape}")

# Ensure the input has the shape (batch_size, time_steps, num_features)
# Here we're using 40 MFCC features per time-step
# Reshape if necessary
if X_test.shape[-1] != 40:
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 40)  # Ensure it has 40 features per timestep

print(f"X_test shape after reshaping: {X_test.shape}")

# Now let's check the model's expected output shape before flattening
# If the error persists, check the actual shape of your model output
try:
    # This will give us an idea of the shape before it is passed to the Dense layers
    intermediate_output = model.layers[0].output  # First layer to check
    print(f"Intermediate output shape before Dense layers: {intermediate_output.shape}")

    # Make predictions on the audio segments
    predictions = model.predict(X_test)

    # Debugging predictions
    print(f"Predictions: {predictions}")

    for i, pred in enumerate(predictions):
        print(f"Segment {i} prediction: {np.argmax(pred)}")  # Debugging prediction for each segment

        if np.argmax(pred) == 1:  # Check if the segment is predicted as abusive
            start_time = i * segment_duration  # Start time in milliseconds
            end_time = start_time + segment_duration  # End time in milliseconds
            bad_word_indices.append((start_time, end_time))
    
    print(f"Bad word indices: {bad_word_indices}")  # Debugging the identified bad words

    # Mute the identified segments
    for start, end in bad_word_indices:
        audio = audio[:start] + AudioSegment.silent(duration=end - start) + audio[end:]

    # Save the modified audio
    audio.export("muted_audio_for_3009_500ms_audio.wav", format="wav")

except Exception as e:
    print(f"Error during prediction: {e}")
else:
    print("No valid segments to predict.")


X_test shape before reshaping: (163, 5349, 40)
X_test shape after reshaping: (163, 5349, 40)
Intermediate output shape before Dense layers: (None, 198, 64)
Error during prediction: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_6" is incompatible with the layer: expected axis -1 of input shape to have value 87040, but received input with shape (16, 2723328)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(16, 5349, 40), dtype=float32)
  • training=False
  • mask=None


In [3]:
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Function to extract MFCC features from audio files
def extract_features(audio_file):
    try:
        # Load audio file
        audio, sample_rate = librosa.load(audio_file, sr=None)
        
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=15)
        mfccs = np.mean(mfccs.T, axis=0)  # Take mean of MFCCs over time
        return mfccs
    except Exception as e:
        print(f"Error processing {audio_file}: {e}")
        return None  # Return None for failed cases

# Load your dataset of audio files and labels
data = pd.read_csv('/kaggle/input/annotated-data2/CSV_file.csv') # Ensure this path is correct
df = pd.DataFrame(data)# Adjust path if necessary

features = []
labels = []

# Directory where audio files are stored
audio_directory = '/kaggle/input/hindi-hate-speech-audio-data-wav/Converted_Audio_Data/Converted_Audio_Data/' # Adjust this

for index, row in df.iterrows():
    audio_file_path = os.path.join(audio_directory, row['Name'])  # Use the correct directory
    mfccs = extract_features(audio_file_path)  # Extract features
    
    if mfccs is not None:  # Only append if mfccs are valid
        features.append(mfccs)  # Append extracted features
        labels.append(row['Label'])  # Append the label for the current row
    else:
        print(f"Failed to extract features for {audio_file_path}")

# Convert to numpy arrays
X = np.array(features)
y = np.array(labels)


# Encode the labels (if they're not already in binary format)
unique_labels = np.unique(y)  # Get unique labels (e.g., 0 and 1)
num_classes = len(unique_labels)  # Number of classes (should be 2 for binary)
y_encoded = np.eye(num_classes)[y]  # Create one-hot encoded labels

# Print shapes of the labels after encoding
print(f"Shape of y (before encoding): {y.shape}")
print(f"Shape of y_encoded (after encoding): {y_encoded.shape}")

# Split the dataset into training and testing sets if features exist
if len(X) > 0 and len(y) > 0:
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.4, random_state=42)
    
    # Print shapes after splitting
    print(f"Shape of X_train: {X_train.shape}")
    print(f"Shape of y_train: {y_train.shape}")
    print(f"Shape of X_test: {X_test.shape}")
    print(f"Shape of y_test: {y_test.shape}")

    # Build the model
    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))  # Input layer
    model.add(Dropout(0.5))  # Dropout layer for regularization
    model.add(Dense(64, activation='relu'))  # Hidden layer
    model.add(Dropout(0.5))  # Dropout layer for regularization
    model.add(Dense(num_classes, activation='softmax'))  # Output layer (binary classification)

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

    # Evaluate the model
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Test accuracy: {test_accuracy}")
else:
    print("No valid features or labels to train the model.")


Shape of y (before encoding): (10120,)
Shape of y_encoded (after encoding): (10120, 2)
Shape of X_train: (6072, 15)
Shape of y_train: (6072, 2)
Shape of X_test: (4048, 15)
Shape of y_test: (4048, 2)
Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 22ms/step - accuracy: 0.5386 - loss: 15.2979 - val_accuracy: 0.6700 - val_loss: 0.6495
Epoch 2/5
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6038 - loss: 1.2080 - val_accuracy: 0.6700 - val_loss: 0.6440
Epoch 3/5
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6223 - loss: 0.7912 - val_accuracy: 0.6700 - val_loss: 0.6476
Epoch 4/5
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6448 - loss: 0.6955 - val_accuracy: 0.6700 - val_loss: 0.6379
Epoch 5/5
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6630 - loss: 0.6793 - val_accuracy: 0.6700 - val_loss: 0.6392
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6711 - loss: 0.6385
Test accuracy: 0.6699604988098145


In [37]:
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, LSTM, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# Function to extract MFCC features from audio files
def extract_features(audio_file):
    try:
        # Load audio file
        audio, sample_rate = librosa.load(audio_file, sr=None)
        
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=20)  # Increased to 20 MFCCs
        mfccs = np.mean(mfccs.T, axis=0)  # Take mean of MFCCs over time
        return mfccs
    except Exception as e:
        print(f"Error processing {audio_file}: {e}")
        return None  # Return None for failed cases

# Load your dataset of audio files and labels
data = pd.read_csv('/kaggle/input/annotated-data2/CSV_file.csv')  # Ensure this path is correct
df = pd.DataFrame(data)  # Adjust path if necessary

features = []
labels = []

# Directory where audio files are stored
audio_directory = '/kaggle/input/hindi-hate-speech-audio-data-wav/Converted_Audio_Data/Converted_Audio_Data/'  # Adjust this

for index, row in df.iterrows():
    audio_file_path = os.path.join(audio_directory, row['Name'])  # Use the correct directory
    mfccs = extract_features(audio_file_path)  # Extract features
    
    if mfccs is not None:  # Only append if mfccs are valid
        features.append(mfccs)  # Append extracted features
        labels.append(row['Label'])  # Append the label for the current row
    else:
        print(f"Failed to extract features for {audio_file_path}")

# Convert to numpy arrays
X = np.array(features)
y = np.array(labels)

# Encode the labels (if they're not already in binary format)
unique_labels = np.unique(y)  # Get unique labels (e.g., 0 and 1)
num_classes = len(unique_labels)  # Number of classes (should be 2 for binary)
y_encoded = np.eye(num_classes)[y]  # Create one-hot encoded labels

# Print shapes of the labels after encoding
print(f"Shape of y (before encoding): {y.shape}")
print(f"Shape of y_encoded (after encoding): {y_encoded.shape}")

# Split the dataset into training and testing sets if features exist
if len(X) > 0 and len(y) > 0:
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.4, random_state=42)
    
    # Print shapes after splitting
    print(f"Shape of X_train: {X_train.shape}")
    print(f"Shape of y_train: {y_train.shape}")
    print(f"Shape of X_test: {X_test.shape}")
    print(f"Shape of y_test: {y_test.shape}")

    # Reshape data for Conv1D and LSTM input (samples, time_steps, features)
    X_train = np.expand_dims(X_train, axis=-1)
    X_test = np.expand_dims(X_test, axis=-1)

    # Build the model
    model = Sequential()

    # Convolutional Layer for feature extraction
    model.add(Conv1D(64, 3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.5))

    # LSTM Layer to capture sequential information
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(64))
    model.add(Dropout(0.5))

    # Fully connected layers
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Early stopping to avoid overfitting
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model
    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stop])

    # Evaluate the model
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Test accuracy: {test_accuracy}")
else:
    print("No valid features or labels to train the model.")


Shape of y (before encoding): (10120,)
Shape of y_encoded (after encoding): (10120, 2)
Shape of X_train: (6072, 20)
Shape of y_train: (6072, 2)
Shape of X_test: (4048, 20)
Shape of y_test: (4048, 2)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.6478 - loss: 0.6535 - val_accuracy: 0.6700 - val_loss: 0.6374
Epoch 2/50
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6654 - loss: 0.6462 - val_accuracy: 0.6700 - val_loss: 0.6342
Epoch 3/50
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6623 - loss: 0.6448 - val_accuracy: 0.6700 - val_loss: 0.6372
Epoch 4/50
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6717 - loss: 0.6372 - val_accuracy: 0.6700 - val_loss: 0.6363
Epoch 5/50
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6669 - loss: 0.6392 - val_accuracy: 0.6700 - val_loss: 0.6343
Epoch 6/50
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6695 - loss: 0.6388 - val_accuracy: 0.6700 - val_loss: 0.6350
Epoch 7/50
[1m190/190[0m 

In [38]:
from pydub import AudioSegment
import numpy as np
import librosa
import tensorflow as tf

# Load your audio file
audio_file_path = r'/kaggle/input/hindi-hate-speech-audio-data-wav/Converted_Audio_Data/Converted_Audio_Data/Audio_3009.wav'
audio = AudioSegment.from_wav(audio_file_path)

# Define the segment duration and the list of timestamps to mute
segment_duration = 500  # Duration of each segment in milliseconds (e.g., 500ms)
bad_word_indices = []  # This will contain the indices of segments predicted as abusive

# Function to extract features from audio segments
def extract_features_from_segment(segment):
    try:
        # Convert the AudioSegment to numpy array
        samples = np.array(segment.get_array_of_samples()).astype(np.float32)
        # Extract MFCCs (same settings as used in training, using 40 MFCCs for example)
        mfccs = librosa.feature.mfcc(y=samples, sr=segment.frame_rate, n_mfcc=40)  # Use 40 MFCCs
        return mfccs.T  # Return the MFCCs for each time-step (transposed)
    except Exception as e:
        print(f"Error processing segment: {e}")
        return None

# Split the audio into segments and process them
segments = []
for i in range(0, len(audio), segment_duration):
    segment = audio[i:i + segment_duration]
    mfccs = extract_features_from_segment(segment)
    
    if mfccs is not None:
        segments.append(mfccs)

# Convert list of segments to a numpy array and ensure consistent shape by padding/truncating
max_sequence_length = 5349  # Define the length that matches your model's expected input size
X_test = []

# Ensure all MFCCs have the same length (pad or truncate each segment)
for mfcc in segments:
    if mfcc.shape[0] < max_sequence_length:
        # Padding with zeros if MFCC sequence is shorter than max_sequence_length
        padded_mfcc = np.pad(mfcc, ((0, max_sequence_length - mfcc.shape[0]), (0, 0)), mode='constant')
    else:
        # Truncate if the MFCC sequence is longer than max_sequence_length
        padded_mfcc = mfcc[:max_sequence_length, :]
    X_test.append(padded_mfcc)

# Convert the list to a numpy array
X_test = np.array(X_test)

# Ensure the input has shape (num_samples, sequence_length, num_features)
# Here we're using 40 MFCC features per time-step
print(f"X_test shape before reshaping: {X_test.shape}")

# Ensure the input has the shape (batch_size, time_steps, num_features)
# Here we're using 40 MFCC features per time-step
# Reshape if necessary
if X_test.shape[-1] != 40:
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 40)  # Ensure it has 40 features per timestep

print(f"X_test shape after reshaping: {X_test.shape}")

# Now let's check the model's expected output shape before flattening
# If the error persists, check the actual shape of your model output
try:
    # This will give us an idea of the shape before it is passed to the Dense layers
    intermediate_output = model.layers[0].output  # First layer to check
    print(f"Intermediate output shape before Dense layers: {intermediate_output.shape}")

    # Make predictions on the audio segments
    predictions = model.predict(X_test)

    # Debugging predictions
    print(f"Predictions: {predictions}")

    for i, pred in enumerate(predictions):
        print(f"Segment {i} prediction: {np.argmax(pred)}")  # Debugging prediction for each segment

        if np.argmax(pred) == 1:  # Check if the segment is predicted as abusive
            start_time = i * segment_duration  # Start time in milliseconds
            end_time = start_time + segment_duration  # End time in milliseconds
            bad_word_indices.append((start_time, end_time))
    
    print(f"Bad word indices: {bad_word_indices}")  # Debugging the identified bad words

    # Mute the identified segments
    for start, end in bad_word_indices:
        audio = audio[:start] + AudioSegment.silent(duration=end - start) + audio[end:]

    # Save the modified audio
    audio.export("muted_audio_for_3009_500ms_audio.wav", format="wav")

except Exception as e:
    print(f"Error during prediction: {e}")
else:
    print("No valid segments to predict.")


X_test shape before reshaping: (163, 5349, 40)
X_test shape after reshaping: (163, 5349, 40)
Intermediate output shape before Dense layers: (None, 18, 64)
Error during prediction: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "conv1d_28" is incompatible with the layer: expected axis -1 of input shape to have value 1, but received input with shape (32, 5349, 40)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32, 5349, 40), dtype=float32)
  • training=False
  • mask=None
