This notebook converts the audio datasets into a suitable CSV format for creating image datasets, and inspects the properties of the datasets.

In [1]:
import os
import shutil
from collections import Counter

from tqdm import tqdm
import pandas as pd
from scipy.io import wavfile
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Folder that contains the dataset directory; every path used below is built
# from it with os.path.join. Left empty here - set it before running the notebook.
dataset_folder_path = ""
# Name of the dataset directory inside dataset_folder_path (also left empty here).
dataset_name = ""

## Data Cleaning

In [None]:
# Show the content of the dataset directory. os.listdir returns entries in
# arbitrary order, so sort them to get the same listing on every run.
sorted(os.listdir(os.path.join(dataset_folder_path, dataset_name)))

In [None]:
# Create the output layout clean_data/fake and clean_data/real.
# exist_ok=True keeps this cell re-runnable (a plain makedirs raises
# FileExistsError once the folders exist); makedirs also creates the
# intermediate "clean_data" directory.
clean_data_path = os.path.join(dataset_folder_path, dataset_name, "clean_data")
for label_folder in ("fake", "real"):
    os.makedirs(os.path.join(clean_data_path, label_folder), exist_ok=True)

In [None]:
# Load the metadata table and print its size and label distribution.
meta_csv_path = os.path.join(dataset_folder_path, dataset_name, "meta.csv")
df = pd.read_csv(meta_csv_path)

label_column = df["label"]
summary_lines = [
    "length: " + str(len(df)),
    "labels: " + str(label_column.unique()),
    "number of fake audios: " + str(int((label_column == "spoof").sum())),
    "number of real audios: " + str(int((label_column == "bona-fide").sum())),
]
print("\n".join(summary_lines))

# Last expression of the cell: rich display of the first two rows.
df.head(2)

In [None]:
# Re-format the In The Wild dataset: copy every clip from "wavs" into
# clean_data/fake or clean_data/real, prefixing the file name with the speaker
# name (spaces replaced by underscores).
sample_count = 1000000  # per-class cap on the number of copied clips
spoof_counter = 0
real_counter = 0

wavs_path = os.path.join(dataset_folder_path, dataset_name, "wavs")
fake_target_path = os.path.join(dataset_folder_path, dataset_name, "clean_data", "fake")
real_target_path = os.path.join(dataset_folder_path, dataset_name, "clean_data", "real")

# Only the first three columns are read, by position: file name, speaker, label.
meta_rows = df.iloc[:, :3].itertuples(index=False, name=None)

# total=len(df) lets tqdm draw a real progress bar; the row iterator alone has
# no length, so tqdm could only show a running counter.
for original_name, speaker, category in tqdm(meta_rows, total=len(df)):
    new_name = speaker.replace(" ", "_") + "_" + original_name
    source_file = os.path.join(wavs_path, original_name)

    if category == "spoof" and spoof_counter < sample_count:
        shutil.copy(source_file, os.path.join(fake_target_path, new_name))
        spoof_counter += 1
    elif category == "bona-fide" and real_counter < sample_count:
        shutil.copy(source_file, os.path.join(real_target_path, new_name))
        real_counter += 1

    # Stop early once both classes have reached the cap.
    if spoof_counter == sample_count and real_counter == sample_count:
        break

In [None]:
# Create the csv file describing the real clips: one row per wav file with its
# path, sample rate, number of samples and duration in whole seconds.
real_path = os.path.join(dataset_folder_path, dataset_name, "clean_data", "real")
# Sort so the csv row order is reproducible (os.listdir order is arbitrary) and
# keep only .wav entries so stray files in the folder are not passed to
# wavfile.read.
real_wavs = sorted(name for name in os.listdir(real_path) if name.lower().endswith(".wav"))
real_dataframe_values = []

for wav_file in tqdm(real_wavs):
    file_path = os.path.join(real_path, wav_file)
    samplerate, data = wavfile.read(file_path)
    total_samples = len(data)
    # Floor division: the fractional part of the duration is dropped.
    audio_length = total_samples // samplerate
    real_dataframe_values.append([file_path, samplerate, total_samples, audio_length])

real_dataframe = pd.DataFrame(
    real_dataframe_values,
    columns=["path", "sample_rate", "total_sample_number", "seconds"],
)
real_dataframe.to_csv(os.path.join(dataset_folder_path, dataset_name, "clean_data", "real.csv"), index=False)

In [None]:
# Create the csv file describing the fake clips: one row per wav file with its
# path, sample rate, number of samples and duration in whole seconds.
fake_path = os.path.join(dataset_folder_path, dataset_name, "clean_data", "fake")
# Sort so the csv row order is reproducible (os.listdir order is arbitrary) and
# keep only .wav entries so stray files in the folder are not passed to
# wavfile.read.
fake_wavs = sorted(name for name in os.listdir(fake_path) if name.lower().endswith(".wav"))
fake_dataframe_values = []

for wav_file in tqdm(fake_wavs):
    file_path = os.path.join(fake_path, wav_file)
    samplerate, data = wavfile.read(file_path)
    total_samples = len(data)
    # Floor division: the fractional part of the duration is dropped.
    audio_length = total_samples // samplerate
    fake_dataframe_values.append([file_path, samplerate, total_samples, audio_length])

fake_dataframe = pd.DataFrame(
    fake_dataframe_values,
    columns=["path", "sample_rate", "total_sample_number", "seconds"],
)
fake_dataframe.to_csv(os.path.join(dataset_folder_path, dataset_name, "clean_data", "fake.csv"), index=False)

## Data Inspection

In [3]:
# Reload the per-class summaries written to clean_data/real.csv and
# clean_data/fake.csv.
real_df, fake_df = (
    pd.read_csv(os.path.join(dataset_folder_path, dataset_name, "clean_data", csv_name))
    for csv_name in ("real.csv", "fake.csv")
)

In [None]:
# Grouped bar chart: number of clips per duration (in whole seconds), real vs. fake.
real_seconds_occurance = real_df["seconds"].value_counts().sort_index()
fake_seconds_occurance = fake_df["seconds"].value_counts().sort_index()

# Real bars sit half a bar-width left of each integer second, fake bars half a
# bar-width right of it.
bar_width = 0.35
half_bar_width = bar_width / 2
bar_positions_real = real_seconds_occurance.index - half_bar_width
bar_positions_fake = fake_seconds_occurance.index + half_bar_width

fig, ax = plt.subplots(figsize=(12, 6))

shared_bar_style = dict(width=bar_width, alpha=0.7, edgecolor='black')
ax.bar(bar_positions_real, real_seconds_occurance, color='green', label='Real', **shared_bar_style)
ax.bar(bar_positions_fake, fake_seconds_occurance, color='red', label='Fake', **shared_bar_style)

ax.set(
    xlabel='Audio length in seconds',
    ylabel='Occurrences',
    title='Histogram of audio length occurrences',
)
# Fixed y-range with a tick every 500 occurrences.
ax.set_ylim(0, 5000)
ax.set_yticks(range(0, 5001, 500))
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Plot the number of clips per speaker, real vs. fake.
def _speaker_from_path(path):
    """Return the speaker part of a clip path whose file name is '<speaker>_<rest>'.

    The speaker is everything in the file name before its last underscore.
    Working on the base name (instead of searching the path for "/real/" or
    "/fake/") makes this independent of the OS path separator and of folder
    names that appear earlier in the path.
    """
    return os.path.basename(path).rsplit("_", 1)[0]


real_counts = Counter(_speaker_from_path(p) for p in real_df["path"])
fake_counts = Counter(_speaker_from_path(p) for p in fake_df["path"])

# Speakers with real clips come first, most frequent first (ties keep their
# first-seen order); speakers that only have fake clips are appended so they
# are still plotted instead of raising an error.
speaker_names = sorted(real_counts, key=real_counts.get, reverse=True)
speaker_names += sorted(
    (name for name in fake_counts if name not in real_counts),
    key=fake_counts.get,
    reverse=True,
)

# Both mappings share the same speaker order; a Counter returns 0 for a missing
# speaker, so the two bar series always have the same length.
real_speakers_occurance = {name: real_counts[name] for name in speaker_names}
fake_speakers_occurance = {name: fake_counts[name] for name in speaker_names}

real_names = tuple(real_speakers_occurance)
real_occurrences = tuple(real_speakers_occurance.values())
fake_names = tuple(fake_speakers_occurance)
fake_occurrences = tuple(fake_speakers_occurance.values())

fig, ax = plt.subplots(figsize=(15, 15))

bar_width = 0.35
bar_positions_real = np.arange(len(real_names))
bar_positions_fake = bar_positions_real + bar_width

ax.barh(bar_positions_real, real_occurrences, height=bar_width, color='green', edgecolor='black', label='Real')
ax.barh(bar_positions_fake, fake_occurrences, height=bar_width, color='red', edgecolor='black', label='Fake')

# Put each speaker label between its real and fake bar.
ax.set_yticks(bar_positions_real + bar_width/2)
ax.set_yticklabels(real_names)
ax.set_xlabel('Occurrences')
ax.set_title('Occurrences of Speakers')
ax.set_xlim(0, 3500)
ax.set_xticks(range(0, 3501, 500))
ax.legend()

plt.tight_layout()
plt.show()