In [None]:
!pip install kaggle



In [None]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle (6).json


{'kaggle (6).json': b'{"username":"nadasalem81","key":"533c66068bcc937ae5bee94405d25391"}'}

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d murtadhanajim/gender-recognition-by-voiceoriginal --unzip

Dataset URL: https://www.kaggle.com/datasets/murtadhanajim/gender-recognition-by-voiceoriginal
License(s): apache-2.0
Downloading gender-recognition-by-voiceoriginal.zip to /content
 98% 1.29G/1.31G [00:11<00:00, 252MB/s]
100% 1.31G/1.31G [00:11<00:00, 118MB/s]


In [None]:
import os
import random
import librosa
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers, models, callbacks

# --- Folders ---
FEMALE_FOLDER = "/content/data/female"
MALE_FOLDER = "/content/data/male"

# --- Load file names ---
female_files = os.listdir(FEMALE_FOLDER)
male_files = os.listdir(MALE_FOLDER)

# --- Take only half of the files for each class ---
female_files = random.sample(female_files, len(female_files)//2)
male_files = random.sample(male_files, len(male_files)//2)

# --- Function to load and preprocess audio ---
def load_wav_16k_mono(filename, max_len=48000):
    wav, sr = librosa.load(filename, sr=16000, mono=True)
    if len(wav) > max_len:
        wav = wav[:max_len]  # trim
    else:
        wav = np.pad(wav, (0, max_len - len(wav)))  # pad
    return wav

# --- Load audio and labels ---
X = []
y = []

for f in female_files:
    path = os.path.join(FEMALE_FOLDER, f)
    X.append(load_wav_16k_mono(path))
    y.append(0)  # female label

for f in male_files:
    path = os.path.join(MALE_FOLDER, f)
    X.append(load_wav_16k_mono(path))
    y.append(1)  # male label

X = np.array(X)
y = np.array(y, dtype='int32')

# --- Convert audio to spectrograms and resize to 128x128 ---
X_spec = []
for x in X:
    spec = np.abs(librosa.stft(x, n_fft=512, hop_length=256))
    spec = tf.expand_dims(spec, -1)  # add channel dimension
    spec = tf.image.resize(spec, [128, 128])  # resize to 128x128
    X_spec.append(spec)

X_spec = tf.stack(X_spec)

# --- Create TensorFlow Dataset ---
dataset = tf.data.Dataset.from_tensor_slices((X_spec, y))

# Shuffle the whole dataset
dataset = dataset.shuffle(buffer_size=len(X_spec), seed=42)

# Split dataset into train and test
dataset_size = len(X_spec)
train_size = int(0.7 * dataset_size)

train_ds = dataset.take(train_size)
test_ds = dataset.skip(train_size)

# Batch and prefetch **after** splitting
BATCH_SIZE = 2
train_ds = train_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# --- Define CNN model ---
input_shape = (128, 128, 1)
model = models.Sequential([
    layers.Input(shape=input_shape),  # Input layer
    layers.Conv2D(16, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),
    layers.Conv2D(32, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# --- Compile model ---
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['Precision', 'Recall']
)

# --- Define callbacks ---
early_stop = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    min_lr=1e-6
)


In [10]:
# --- Train the model ---
history = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=6,
    callbacks=[early_stop, reduce_lr]
)


Epoch 1/6
[1m2826/2826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 54ms/step - Precision: 0.9987 - Recall: 0.9993 - loss: 0.0054 - val_Precision: 1.0000 - val_Recall: 1.0000 - val_loss: 1.8961e-04 - learning_rate: 0.0010
Epoch 2/6
[1m2826/2826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 51ms/step - Precision: 0.9999 - Recall: 0.9999 - loss: 0.0016 - val_Precision: 1.0000 - val_Recall: 1.0000 - val_loss: 2.0152e-05 - learning_rate: 0.0010
Epoch 3/6
[1m2826/2826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 48ms/step - Precision: 0.9996 - Recall: 0.9994 - loss: 0.0046 - val_Precision: 1.0000 - val_Recall: 1.0000 - val_loss: 5.3467e-07 - learning_rate: 0.0010
Epoch 4/6
[1m2826/2826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 47ms/step - Precision: 0.9999 - Recall: 0.9999 - loss: 1.9481e-04 - val_Precision: 1.0000 - val_Recall: 1.0000 - val_loss: 1.4175e-05 - learning_rate: 0.0010
Epoch 5/6
[1m2826/2826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [17]:
import librosa
import tensorflow as tf
import numpy as np

# --- Function to load and preprocess a single audio file ---
def preprocess_audio(filename, max_len=48000):
    wav, sr = librosa.load(filename, sr=16000, mono=True)
    if len(wav) > max_len:
        wav = wav[:max_len]
    else:
        wav = np.pad(wav, (0, max_len - len(wav)))

    # Convert to spectrogram
    spec = np.abs(librosa.stft(wav, n_fft=512, hop_length=256))
    spec = tf.expand_dims(spec, -1)         # Add channel dimension
    spec = tf.image.resize(spec, [128, 128])  # Resize to 128x128
    spec = tf.expand_dims(spec, 0)          # Add batch dimension
    return spec

# --- Example usage ---
female_file = "/content/female2.wav"
male_file   = "/content/male.wav"

# Preprocess audio
female_spec = preprocess_audio(female_file)
male_spec   = preprocess_audio(male_file)

# --- Predict ---
female_pred = model.predict(female_spec)
male_pred   = model.predict(male_spec)

# --- Print results ---
print("Female audio prediction:", "Male" if female_pred[0][0] > 0.5 else "Female")
print("Male audio prediction:", "Male" if male_pred[0][0] > 0.5 else "Female")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
Female audio prediction: Female
Male audio prediction: Male
