In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.models import resnet50, ResNet50_Weights
import os
import matplotlib.pyplot as plt
import seaborn as sns
import random
from PIL import Image
import cv2
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm.auto import tqdm


In [2]:
!ls /kaggle/input/


Dataset


In [3]:
main_path = '/kaggle/input/Dataset'
sub_dirs = ['Train', 'Test', 'Validation']


In [4]:
def create_dataframe(main_path, sub_dir, sample_fraction=1):
    data = {"file_path": [], "label": []}
    for label_dir, label in zip(['Real', 'Fake'], [1, 0]):
        folder_path = os.path.join(main_path, sub_dir, label_dir)
        all_files = os.listdir(folder_path)
        sampled_files = random.sample(all_files, int(len(all_files) * sample_fraction))
        for img_file in sampled_files:
            data["file_path"].append(os.path.join(folder_path, img_file))
            data["label"].append(label)
    return pd.DataFrame(data)

In [5]:
for sub_dir in sub_dirs:
    df = create_dataframe(main_path, sub_dir, sample_fraction=1)
    csv_path = f"{sub_dir}.csv"
    df.to_csv(csv_path, index=False)
    print(f"Saved {csv_path} with {len(df)} entries.")

Saved Train.csv with 140002 entries.
Saved Test.csv with 10905 entries.
Saved Validation.csv with 39428 entries.


In [6]:
train_df = pd.read_csv("Train.csv").sample(frac=1, random_state=42).reset_index(drop=True)
valid_df = pd.read_csv("Validation.csv").sample(frac=1, random_state=42).reset_index(drop=True)
test_df = pd.read_csv("Test.csv").sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
IMAGE_SIZE = (256, 256)
BATCH_SIZE = 64

In [8]:
class DeepfakeDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['file_path']
        label = float(self.dataframe.iloc[idx]['label'])
        
        # Load and convert image
        image = Image.open(img_path).convert('RGB')
        
        if self.transform:
            image = self.transform(image)

        label = torch.tensor(label, dtype=torch.float32)

            
        return image, label

In [9]:
train_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

test_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])



In [10]:
train_dataset = DeepfakeDataset(train_df, transform=train_transform)
valid_dataset = DeepfakeDataset(valid_df, transform=test_transform)
test_dataset = DeepfakeDataset(test_df, transform=test_transform)

In [11]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

print(f"Found {len(train_dataset)} validated image filenames belonging to 2 classes.")
print(f"Found {len(valid_dataset)} validated image filenames belonging to 2 classes.")
print(f"Found {len(test_dataset)} validated image filenames belonging to 2 classes.")

Found 140002 validated image filenames belonging to 2 classes.
Found 39428 validated image filenames belonging to 2 classes.
Found 10905 validated image filenames belonging to 2 classes.


In [12]:
class DeepfakeDetector(nn.Module):
    def __init__(self):
        super(DeepfakeDetector, self).__init__()
        # Load pretrained ResNet50
        self.resnet = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        # Replace final FC layer
        num_ftrs = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()
        
        # Custom classifier
        self.classifier = nn.Sequential(
            nn.Linear(num_ftrs, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),
            nn.Linear(512, 1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        features = self.resnet(x)
        output = self.classifier(features)
        return output


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [14]:
model = DeepfakeDetector().to(device)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 190MB/s]


In [15]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [16]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    all_targets = []
    all_outputs = []
    
    # Thêm thanh tiến trình cho training
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for inputs, targets in progress_bar:
        inputs, targets = inputs.to(device), targets.to(device).view(-1, 1)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Cập nhật metrics
        running_loss += loss.item() * inputs.size(0)
        predicted = (outputs > 0.5).float()
        total += targets.size(0)
        correct += (predicted == targets).sum().item()
        
        # Cập nhật postfix cho thanh tiến trình
        progress_bar.set_postfix({
            'train_loss': running_loss / total,
            'train_acc': correct / total
        })

        all_targets.extend(targets.cpu().numpy())
        all_outputs.extend(outputs.detach().cpu().numpy())
    
    epoch_loss = running_loss / len(dataloader.dataset)
    epoch_acc = correct / total
    epoch_auc = roc_auc_score(all_targets, all_outputs)
    
    return epoch_loss, epoch_acc, epoch_auc

In [17]:
def validate(model, dataloader, criterion, device, mode='Validation'):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    all_targets = []
    all_outputs = []

    progress_bar = tqdm(dataloader, desc="Validating", leave=True)
    with torch.no_grad():
        for inputs, targets in progress_bar:
            inputs, targets = inputs.to(device), targets.to(device).view(-1, 1)
            
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            running_loss += loss.item() * inputs.size(0)
            predicted = (outputs > 0.5).float()
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

            # Cập nhật postfix
            progress_bar.set_postfix({
                'val_loss': f"{(running_loss/total):.4f}",
                'val_acc': f"{(correct/total):.4f}"
            })

            all_targets.extend(targets.cpu().numpy())
            all_outputs.extend(outputs.cpu().numpy())
    
    epoch_loss = running_loss / len(dataloader.dataset)
    epoch_acc = correct / total
    epoch_auc = roc_auc_score(all_targets, all_outputs)
    
    return epoch_loss, epoch_acc, epoch_auc

In [18]:
num_epochs = 5
best_val_acc = 0.0
checkpoint_path = '/kaggle/working/140K_resnet50_model.pth'

# Lists to store metrics
train_losses, train_accs, train_aucs = [], [], []
val_losses, val_accs, val_aucs = [], [], []

epoch_bar = tqdm(range(num_epochs), desc="Training", unit="epoch")


for epoch in epoch_bar:
    # Train
    train_loss, train_acc, train_auc = train_epoch(model, train_loader, criterion, optimizer, device)
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    train_aucs.append(train_auc)
    
    # Validate
    val_loss, val_acc, val_auc = validate(model, valid_loader, criterion, device)
    val_losses.append(val_loss)
    val_accs.append(val_acc)
    val_aucs.append(val_auc)

    epoch_bar.set_postfix({
        'train_loss': f"{train_loss:.4f}",
        'train_acc': f"{train_acc:.4f}",
        'val_loss': f"{val_loss:.4f}",
        'val_acc': f"{val_acc:.4f}",
        'best_val': f"{best_val_acc:.4f}"
    })
    
    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), checkpoint_path)
        tqdm.write(f"[Epoch {epoch+1}] 🚀 Model improved! val_acc: {val_acc:.4f} (best: {best_val_acc:.4f})")
    else:
        tqdm.write(f"[Epoch {epoch+1}] ❌ No improvement. val_acc: {val_acc:.4f} (best: {best_val_acc:.4f})")

    tqdm.write(f"\nEpoch {epoch+1}/{num_epochs} summary:")
    tqdm.write(f"Train => Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | AUC: {train_auc:.4f}")
    tqdm.write(f"Valid => Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | AUC: {val_auc:.4f}\n")

tqdm.write("\n🔥 Training completed!")
tqdm.write(f"Best validation accuracy: {best_val_acc:.4f}")


Training:   0%|          | 0/5 [00:00<?, ?epoch/s]

Training:   0%|          | 0/2188 [00:00<?, ?it/s]

Validating:   0%|          | 0/617 [00:00<?, ?it/s]

[Epoch 1] 🚀 Model improved! val_acc: 0.9438 (best: 0.9438)

Epoch 1/5 summary:
Train => Loss: 0.1096 | Acc: 0.9562 | AUC: 0.9925
Valid => Loss: 0.1372 | Acc: 0.9438 | AUC: 0.9908



Training:   0%|          | 0/2188 [00:00<?, ?it/s]

Validating:   0%|          | 0/617 [00:00<?, ?it/s]

[Epoch 2] 🚀 Model improved! val_acc: 0.9575 (best: 0.9575)

Epoch 2/5 summary:
Train => Loss: 0.0722 | Acc: 0.9715 | AUC: 0.9966
Valid => Loss: 0.1092 | Acc: 0.9575 | AUC: 0.9929



Training:   0%|          | 0/2188 [00:00<?, ?it/s]

Validating:   0%|          | 0/617 [00:00<?, ?it/s]

[Epoch 3] ❌ No improvement. val_acc: 0.9544 (best: 0.9575)

Epoch 3/5 summary:
Train => Loss: 0.0626 | Acc: 0.9752 | AUC: 0.9974
Valid => Loss: 0.1140 | Acc: 0.9544 | AUC: 0.9925



Training:   0%|          | 0/2188 [00:00<?, ?it/s]

Validating:   0%|          | 0/617 [00:00<?, ?it/s]

[Epoch 4] 🚀 Model improved! val_acc: 0.9642 (best: 0.9642)

Epoch 4/5 summary:
Train => Loss: 0.0562 | Acc: 0.9776 | AUC: 0.9979
Valid => Loss: 0.0940 | Acc: 0.9642 | AUC: 0.9956



Training:   0%|          | 0/2188 [00:00<?, ?it/s]

Validating:   0%|          | 0/617 [00:00<?, ?it/s]

[Epoch 5] 🚀 Model improved! val_acc: 0.9687 (best: 0.9687)

Epoch 5/5 summary:
Train => Loss: 0.0505 | Acc: 0.9796 | AUC: 0.9983
Valid => Loss: 0.0812 | Acc: 0.9687 | AUC: 0.9962


🔥 Training completed!
Best validation accuracy: 0.9687


In [19]:
model.load_state_dict(torch.load(checkpoint_path))


  model.load_state_dict(torch.load(checkpoint_path))


<All keys matched successfully>

In [20]:
def test(model, test_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    all_targets = []
    all_outputs = []
    
    # Thêm thanh tiến trình cho test
    progress_bar = tqdm(test_loader, desc="Testing", leave=True)
    
    with torch.no_grad():
        for inputs, targets in progress_bar:
            inputs, targets = inputs.to(device), targets.to(device).view(-1, 1)
            
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            
            running_loss += loss.item() * inputs.size(0)
            
            predicted = (outputs > 0.5).float()
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
            
            progress_bar.set_postfix({
                'test_loss': f"{(running_loss/total):.4f}",
                'test_acc': f"{(correct/total):.4f}"
            })
            
            all_targets.extend(targets.cpu().numpy())
            all_outputs.extend(outputs.cpu().numpy())
    
    test_loss = running_loss / len(test_loader.dataset)
    test_acc = correct / total
    test_auc = roc_auc_score(all_targets, all_outputs)
    
    return test_loss, test_acc, test_auc

In [21]:
test_loss, test_acc, test_auc = validate(model, test_loader, criterion, device)
tqdm.write(f"\nFinal Test Results:")
tqdm.write(f"Loss: {test_loss:.4f} | Accuracy: {test_acc:.4f} | AUC: {test_auc:.4f}")

Validating:   0%|          | 0/171 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x79c5bca95e10>
Traceback (most recent call last):
Exception ignored in:   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x79c5bca95e10>    
<function _MultiProcessingDataLoaderIter.__del__ at 0x79c5bca95e10>Traceback (most recent call last):
self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
Traceback (most recent call last):
Exception ignored in:   File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__

    <function _MultiProcessingDataLoaderIter.__del__ at 0x79c5bca95e10>    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1587, in _shutdown_workers
Traceback (most recent call last):
self._shutdown_wo


Final Test Results:
Loss: 0.2542 | Accuracy: 0.9025 | AUC: 0.9729


In [22]:
plt.figure(figsize=(12, 4))

plt.subplot(1, 3, 1)
plt.plot(train_losses, label='Train')
plt.plot(val_losses, label='Validation')
plt.title('Loss')
plt.legend()

plt.subplot(1, 3, 2)
plt.plot(train_accs, label='Train')
plt.plot(val_accs, label='Validation')
plt.title('Accuracy')
plt.legend()

plt.subplot(1, 3, 3)
plt.plot(train_aucs, label='Train')
plt.plot(val_aucs, label='Validation')
plt.title('AUC')
plt.legend()

plt.tight_layout()
plt.savefig('training_history.png')
plt.close()

In [23]:
y_pred, y_true = evaluate(model, test_loader, device)

# Calculate metrics
print('Classification Report:')
print(classification_report(y_true, y_pred))

print('Confusion Matrix:')
conf_matrix = confusion_matrix(y_true, y_pred)
print(conf_matrix)

# Save model in PyTorch format
torch.save(model, 'deepfake_model_full.pth')

# Export to ONNX (for deployment)
dummy_input = torch.randn(1, 3, img_size, img_size, device=device)
torch.onnx.export(model, dummy_input, "deepfake_model.onnx", 
                  input_names=["input"], output_names=["output"], 
                  verbose=True, opset_version=11)
print("Model exported to ONNX format")

NameError: name 'evaluate' is not defined