<a href="https://colab.research.google.com/github/me1nna/fake-image-detection/blob/main/RF_clf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Random Forest + CLIP for fake image detection

Импорт библиотек и проверка доступности GPU

In [1]:
import torch
import numpy as np

# Detect once whether a CUDA device is visible to PyTorch.
train_on_gpu = torch.cuda.is_available()

# Report which device the rest of the notebook will run on.
print(
    'CUDA is available!  Training on GPU ...'
    if train_on_gpu
    else 'CUDA is not available.  Training on CPU ...'
)

CUDA is available!  Training on GPU ...


In [2]:
import random
import os

import pickle
import numpy as np
from skimage import io

from tqdm import tqdm, tqdm_notebook
from PIL import Image
from pathlib import Path

from torchvision.transforms import v2
from torchvision import transforms
from multiprocessing.pool import ThreadPool
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn as nn

from matplotlib import colors, pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [3]:
# Single source of truth for every RNG seed below, so the values cannot drift apart.
SEED = 42

# PyTorch: CPU generator and every CUDA generator.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# NumPy's legacy global generator and Python's built-in `random` module.
np.random.seed(SEED)
random.seed(SEED)
# NOTE: the running interpreter reads PYTHONHASHSEED at start-up, so setting it
# here only affects subprocesses launched afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)

In [4]:
# Record the installed Pillow version alongside the results.
import PIL
print(PIL.__version__)

9.4.0


### Загрузка датасета

In [None]:
# Download the dataset files from Google Drive by file ID.
# The `--id` flag is deprecated in gdown; a bare file ID is accepted directly.
!gdown 13rZ1HNOWsN-cTYgYHu0S3UYUHI0KAUBV
!gdown 1qjK-uwy20OAWCO1l39RNvjRgjsZiFqm0
!gdown 1Heeu9E9vI3MtkYE-_iZdGGcytm5Mw8ha
!gdown 1vnG3dlMb9wxo0P-noc84LUBaZNjwumMq
!gdown 1BW2ARIM2r0EgL5Z4A8E0O4PXB2BNRhmA

# -o overwrites existing files without prompting, so re-running the cell does not
# block on an interactive question; -q suppresses the per-file listing.
!unzip -o -q "images.zip"

In [6]:
import seaborn as sns
import pandas as pd

In [7]:
# Read the train and test tables from CSV files in the working directory.
data, data_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [8]:
# Display the training table (columns: id, target).
data

Unnamed: 0,id,target
0,gL5fWrgNPU.jpg,0
1,KKWXJDN8uM.png,1
2,Wb2I0CXlAY.jpg,0
3,G7PLhKpaf7.jpeg,1
4,DEUkeefz6Z.jpg,0
...,...,...
1007,fqoVycdxLV,1
1008,TN2ZOiNI5e.jpg,0
1009,2WMWt6CAWQ.jpg,0
1010,fQCY5n2vcF.jpg,0


Некоторые файлы в датасете указаны без расширения (например, `fqoVycdxLV` в таблице выше). Нетрудно заметить, что если расширение не указано, то файл имеет расширение `.png`.

In [9]:
def load_images(image_paths, image_dir='/content/images'):
    """Resolve image file names to full paths inside ``image_dir``.

    Despite its name, this function does not read any image data; it only
    builds path strings.

    Some ids in the dataset come without a file extension. When the joined
    path is not an existing file, ``.png`` is appended to it.

    Args:
        image_paths: Iterable of file names, with or without an extension.
        image_dir: Directory the names are relative to. Defaults to the
            location this notebook unpacks the images to.

    Returns:
        list[str]: One path per input name, in input order, so the result
        stays aligned with per-row labels. The ``.png`` fallback is not
        checked for existence; a missing file surfaces when it is opened.
    """
    resolved = []
    for image_path in image_paths:
        full_path = os.path.join(image_dir, image_path)
        if not os.path.isfile(full_path):
            # Extension-less ids are stored on disk as PNG files.
            full_path += ".png"
        resolved.append(full_path)
    return resolved

In [10]:
# Turn the "id" column of each table into a list of image paths (same order as the rows).
train_files = load_images(data["id"])
test_files = load_images(data_test["id"])

In [12]:
# Labels are the "target" column of the training table (values 0 and 1 in the displayed rows).
train_labels = data["target"]

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): the install tracks the repository's default branch; pin a commit for reproducibility.
%pip install -q git+https://github.com/openai/CLIP.git

Вытащим эмбеддинги с помощью image-encoder из CLIP

In [13]:
import clip
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Load the CLIP model and its preprocessing transform. Fall back to the CPU when
# no CUDA device is present instead of failing on a hard-coded 'cuda' device.
model, preprocess = clip.load(
    'ViT-B/32',
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

# Preprocess images and compute their embeddings
def get_image_embeddings(image_paths):
    """Encode images with the CLIP image encoder.

    Uses the module-level ``model`` and ``preprocess`` objects.

    Args:
        image_paths: Iterable of paths to image files that PIL can open.

    Returns:
        np.ndarray: Per-image encoder outputs stacked row-wise with
        ``np.vstack``, in input order.

    Raises:
        ValueError: If ``image_paths`` is empty (``np.vstack`` needs at
            least one array).
    """
    # Run on whichever device the model's weights live on rather than assuming 'cuda'.
    device = next(model.parameters()).device
    embeddings = []
    # Inference only: no gradients are needed for any of the images.
    with torch.no_grad():
        for path in image_paths:
            # The context manager closes the file handle once the tensor is built.
            with Image.open(path) as img:
                image = preprocess(img).unsqueeze(0).to(device)
            embedding = model.encode_image(image).cpu().numpy()
            embeddings.append(embedding)
    return np.vstack(embeddings)


# Encode every train and test image; each image is passed through the model one at a time.
train_embeddings = get_image_embeddings(train_files)
test_embeddings = get_image_embeddings(test_files)

100%|████████████████████████████████████████| 338M/338M [00:02<00:00, 170MiB/s]


Обучим случайный лес

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: 100 trees limited to depth 1, with a fixed seed.
forest = RandomForestClassifier(max_depth=1, random_state=42, n_estimators=100)

# Fit on the CLIP embeddings of the training images.
forest.fit(train_embeddings, train_labels)

In [20]:
# Predict on the same data the model was fitted on.
y_pred = forest.predict(train_embeddings)

from sklearn.metrics import accuracy_score
# This is training accuracy: it does not measure how well the model generalizes.
accuracy = accuracy_score(train_labels, y_pred)

print(f"Точность модели на трейне: {accuracy}")

Точность модели на трейне: 0.7045454545454546


Подберём более подходящие гиперпараметры леса с помощью поиска по сетке (GridSearchCV)

In [37]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: tree depth x number of trees.
param_grid = {
    'max_depth': [5, 7, 9],
    'n_estimators': [100, 200, 300, 400, 500]
}

# Fixed seed so the search does not depend on global RNG state or cell execution order.
forest = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search; n_jobs=-1 evaluates candidates on all CPU cores.
grid_search = GridSearchCV(forest, param_grid, cv=5, n_jobs=-1)

grid_search.fit(train_embeddings, train_labels)

best_params = grid_search.best_params_

# best_estimator_ is refit on the whole training set, so its score on that same
# data is optimistic; the cross-validated score below is the one to compare models by.
best_rf = grid_search.best_estimator_
rf_score = best_rf.score(train_embeddings, train_labels)
cv_score = grid_search.best_score_

print(f"Лучшие параметры Random Forest: {best_params}")
print(f"Score лучшей Random Forest модели на трейне: {rf_score}")
print(f"Средний score лучшей Random Forest модели на кросс-валидации: {cv_score}")

Лучшие параметры Random Forest: {'max_depth': 9, 'n_estimators': 300}
Score лучшей Random Forest модели на трейне: 1.0


In [26]:
# Accuracy of the tuned forest on the data it was fitted on (not a held-out estimate).
y_pred = best_rf.predict(train_embeddings)
accuracy = accuracy_score(train_labels, y_pred)

print(f"Точность модели на трейне: {accuracy}")

Точность модели на трейне: 1.0


In [None]:
# Take column 1 of predict_proba for the test embeddings.
# NOTE(review): assumes column 1 corresponds to label 1 (best_rf.classes_ == [0, 1]) — confirm.
probs = best_rf.predict_proba(test_embeddings)[:, 1]
probs

In [30]:
# Take the row ids from the sample submission and attach the predicted probabilities.
# NOTE(review): assumes sample_submission.csv lists ids in the same order as test.csv — confirm.
sample_submission = pd.read_csv("sample_submission.csv")
submission_columns = {"id": sample_submission["id"], "target": probs}

my_submit = pd.DataFrame(submission_columns)
my_submit.head()

Unnamed: 0,id,target
0,Qt1fGUB0Vz.jpeg,0.059719
1,j4Rhioq7R3.jpeg,0.397144
2,rD0hgFHJUZ.jpeg,0.32206
3,aY5z1EJsJ6.jpeg,0.910327
4,qZ3IoxD2TE.jpeg,0.193315


In [31]:
# Write the first submission without the DataFrame index column.
my_submit.to_csv("sub_rf1.csv", index=False)

Продолжим искать оптимальный лес! =)

In [39]:
# Second search: deeper trees and larger ensembles than in the first grid.
param_grid = {
    'max_depth': [9, 10, 11, 12],
    'n_estimators': [500, 600, 700, 800]
}

# Fixed seed so the search does not depend on global RNG state or cell execution order.
forest = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search; n_jobs=-1 evaluates candidates on all CPU cores.
grid_search = GridSearchCV(forest, param_grid, cv=5, n_jobs=-1)

grid_search.fit(train_embeddings, train_labels)

best_params = grid_search.best_params_

# best_estimator_ is refit on the whole training set, so its score on that same
# data is optimistic; the cross-validated score below is the one to compare models by.
best_rf = grid_search.best_estimator_
rf_score = best_rf.score(train_embeddings, train_labels)
cv_score = grid_search.best_score_

print(f"Лучшие параметры Random Forest: {best_params}")
print(f"Score лучшей Random Forest модели на трейне: {rf_score}")
print(f"Средний score лучшей Random Forest модели на кросс-валидации: {cv_score}")

Лучшие параметры Random Forest: {'max_depth': 10, 'n_estimators': 700}
Score лучшей Random Forest модели на трейне: 1.0


In [33]:
# Accuracy of the re-tuned forest on the data it was fitted on (not a held-out estimate).
y_pred = best_rf.predict(train_embeddings)
accuracy = accuracy_score(train_labels, y_pred)

print(f"Точность модели на трейне: {accuracy}")

Точность модели на трейне: 1.0


In [None]:
# Take column 1 of predict_proba for the test embeddings, using the re-tuned forest.
# NOTE(review): assumes column 1 corresponds to label 1 (best_rf.classes_ == [0, 1]) — confirm.
probs = best_rf.predict_proba(test_embeddings)[:, 1]
probs

In [36]:
# Rebuild the submission frame from the sample file's ids and the new probabilities.
# NOTE(review): assumes sample_submission.csv lists ids in the same order as test.csv — confirm.
sample_submission = pd.read_csv("sample_submission.csv")
submission_columns = {"id": sample_submission["id"], "target": probs}

my_submit = pd.DataFrame(submission_columns)
my_submit.head()

Unnamed: 0,id,target
0,Qt1fGUB0Vz.jpeg,0.060417
1,j4Rhioq7R3.jpeg,0.436622
2,rD0hgFHJUZ.jpeg,0.352238
3,aY5z1EJsJ6.jpeg,0.899952
4,qZ3IoxD2TE.jpeg,0.182378


In [38]:
# Write the second submission without the DataFrame index column.
my_submit.to_csv("sub_rf2.csv", index=False)

In [41]:
# Inspect column 1 of predict_proba on the training embeddings.
best_rf.predict_proba(train_embeddings)[:, 1]

array([0.050633  , 0.76730129, 0.03146498, ..., 0.05779466, 0.04882568,
       0.82290436])