In [2]:
import os
import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
from tqdm import tqdm

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

In [3]:
class AudioDeepfakeDataset(Dataset):
    def __init__(self, root_dir, transform=None, target_sample_rate=16000, duration=3):
        """
        Args:
            root_dir (string): Directory with 'real' and 'fake' subdirectories
            transform (callable, optional): Optional transform to be applied
            target_sample_rate (int): Target sample rate for audio files
            duration (float): Duration in seconds to which audio will be trimmed/padded
        """
        self.root_dir = root_dir
        self.transform = transform
        self.target_sample_rate = target_sample_rate
        self.duration = duration
        self.samples = []
        
        # Walk through the directory structure
        for label, folder in enumerate(['real', 'fake']):
            folder_path = os.path.join(root_dir, folder)
            if os.path.exists(folder_path):
                for file in os.listdir(folder_path):
                    if file.endswith('.wav'):
                        self.samples.append({
                            'path': os.path.join(folder_path, file),
                            'label': label
                        })
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        audio_path = self.samples[idx]['path']
        label = self.samples[idx]['label']
        
        # Load audio file
        try:
            audio, sr = librosa.load(audio_path, sr=self.target_sample_rate)
            
            # Ensure consistent duration
            target_length = int(self.duration * self.target_sample_rate)
            if len(audio) > target_length:
                audio = audio[:target_length]
            else:
                padding = target_length - len(audio)
                audio = np.pad(audio, (0, padding), mode='constant')
                
            if self.transform:
                audio = self.transform(audio)
                
            return torch.FloatTensor(audio), label
            
        except Exception as e:
            print(f"Error loading {audio_path}: {e}")
            return torch.zeros(int(self.duration * self.target_sample_rate)), label

# Create datasets
train_dataset = AudioDeepfakeDataset('/kaggle/input/the-fake-or-real-dataset/for-original/for-original/training')
val_dataset = AudioDeepfakeDataset('/kaggle/input/the-fake-or-real-dataset/for-original/for-original/validation')
test_dataset = AudioDeepfakeDataset('/kaggle/input/the-fake-or-real-dataset/for-original/for-original/testing')

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [4]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(out_channels)
        self.downsample = downsample
        
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out

class RawNet2(nn.Module):
    def __init__(self, num_classes=2):
        super(RawNet2, self).__init__()
        
        # Initial layers
        self.conv1 = nn.Conv1d(1, 64, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool1d(kernel_size=3, stride=3)
        
        # Residual blocks
        self.layer1 = self._make_layer(64, 64, 3)
        self.layer2 = self._make_layer(64, 128, 4, stride=2)
        self.layer3 = self._make_layer(128, 256, 6, stride=2)
        self.layer4 = self._make_layer(256, 512, 3, stride=2)
        
        # Attention block
        self.attention = nn.Sequential(
            nn.Conv1d(512, 256, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Conv1d(256, 512, kernel_size=1),
            nn.Softmax(dim=2)
        )
        
        # Classifier
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(512, num_classes)
        
    def _make_layer(self, in_channels, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm1d(out_channels)
            )
        layers = []
        layers.append(ResidualBlock(in_channels, out_channels, stride, downsample))
        for _ in range(1, blocks):
            layers.append(ResidualBlock(out_channels, out_channels))
        return nn.Sequential(*layers)
    
    def forward(self, x):
        # Add channel dimension
        x = x.unsqueeze(1)
        
        # Initial processing
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        
        # Residual blocks
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        
        # Attention
        att = self.attention(x)
        x = x * att
        
        # Pooling and classification
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        
        return x

In [5]:
# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RawNet2(num_classes=2).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)

train_dataset = AudioDeepfakeDataset('/kaggle/input/the-fake-or-real-dataset/for-original/for-original/training')
val_dataset = AudioDeepfakeDataset('/kaggle/input/the-fake-or-real-dataset/for-original/for-original/validation')
test_dataset = AudioDeepfakeDataset('/kaggle/input/the-fake-or-real-dataset/for-original/for-original/testing')

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=10):
    best_val_loss = float('inf')
    train_losses = []
    val_losses = []
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        
        for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
        train_loss = running_loss / len(train_loader)
        train_acc = correct / total
        train_losses.append(train_loss)
        
        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        val_loss = val_loss / len(val_loader)
        val_acc = correct / total
        val_losses.append(val_loss)
        
        scheduler.step(val_loss)
        
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}')
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), '/kaggle/working/best_model.pth')
    
    return train_losses, val_losses

# Train the model
train_losses, val_losses = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=5)

Epoch 1/5: 100%|██████████| 944/944 [37:55<00:00,  2.41s/it]


Epoch 1: Train Loss: 0.1808, Acc: 0.9211 | Val Loss: 0.0994, Acc: 0.9661


Epoch 2/5: 100%|██████████| 944/944 [30:53<00:00,  1.96s/it]


Epoch 2: Train Loss: 0.0567, Acc: 0.9808 | Val Loss: 0.0544, Acc: 0.9857


Epoch 3/5: 100%|██████████| 944/944 [30:56<00:00,  1.97s/it]


Epoch 3: Train Loss: 0.0289, Acc: 0.9906 | Val Loss: 0.0275, Acc: 0.9921


Epoch 4/5: 100%|██████████| 944/944 [30:54<00:00,  1.96s/it]


Epoch 4: Train Loss: 0.0213, Acc: 0.9933 | Val Loss: 0.0287, Acc: 0.9909


Epoch 5/5: 100%|██████████| 944/944 [31:00<00:00,  1.97s/it]


Epoch 5: Train Loss: 0.0147, Acc: 0.9953 | Val Loss: 0.0132, Acc: 0.9969


In [6]:
import os

# Paths to the directories
fake_dir = '/kaggle/input/the-fake-or-real-dataset/for-original/for-original/training/fake'
real_dir = '/kaggle/input/the-fake-or-real-dataset/for-original/for-original/training/real'

# Count the number of files in each directory
num_fake_files = len(os.listdir(fake_dir))
num_real_files = len(os.listdir(real_dir))

print(f"Number of files in 'fake' directory: {num_fake_files}")
print(f"Number of files in 'real' directory: {num_real_files}")


Number of files in 'fake' directory: 26941
Number of files in 'real' directory: 26941


In [21]:

from torch.utils.data import Subset
import random

def evaluate_model_subset(model, test_loader, subset_size=50):
    model.eval()
    
    # Set random seed for reproducibility
    random.seed(42)
    
    # Get a random subset of indices
    all_indices = list(range(len(test_loader.dataset)))
    subset_indices = random.sample(all_indices, min(subset_size, len(all_indices)))
    
    # Create a subset dataloader
    subset_dataset = Subset(test_loader.dataset, subset_indices)
    subset_loader = DataLoader(subset_dataset, batch_size=test_loader.batch_size, shuffle=False)
    
    correct = 0
    total = 0
    all_labels = []
    all_preds = []
    
    with torch.no_grad():
        for inputs, labels in subset_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())
    
    accuracy = correct / total
    f1 = f1_score(all_labels, all_preds)
    
    print(f'Test Accuracy : {accuracy:.4f}')

    
    # Plot confusion matrix
    import seaborn as sns
    import matplotlib.pyplot as plt
    
    
    return accuracy, f1

# Evaluate on exactly 50 samples
print("Evaluating trained model...")
test_accuracy = evaluate_model_subset(model, test_loader, subset_size=50)

Evaluating trained model...
Test Accuracy : 0.5800
