In [1]:
import os
import pickle

import numpy as np
from sklearn import svm
from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import train_test_split

In [2]:
# Load the pristine and fake image arrays from the .npy files in the working directory.
pristine_data, fake_data = np.load('sample_pristine_np.npy'), np.load('sample_fakes_np.npy')

In [3]:
# 80/20 split of each class. The shuffle seed is fixed so that the partitions, and every
# number derived from them further down, are reproducible after a kernel restart.
SPLIT_SEED = 42
pristine_train, pristine_test = train_test_split(
    pristine_data, train_size=0.8, shuffle=True, random_state=SPLIT_SEED
)
fake_train, fake_test = train_test_split(
    fake_data, train_size=0.8, shuffle=True, random_state=SPLIT_SEED
)

In [4]:
# Shapes of the four partitions, in the order: pristine train/test, then fake train/test.
tuple(part.shape for part in (pristine_train, pristine_test, fake_train, fake_test))

((37720, 64, 64, 3), (9430, 64, 64, 3), (36988, 64, 64, 3), (9247, 64, 64, 3))

In [7]:
# Stack the pristine and fake training images along the sample axis (axis 0 is the default).
pca_train = np.concatenate([pristine_train, fake_train])

In [8]:
# Sanity check: row count should equal pristine_train + fake_train.
np.shape(pca_train)

(74708, 64, 64, 3)

The goal is to fit a single, reusable PCA that can compress both pristine and fake images, and then to train a OneClassSVM on the PCA projections of pristine images only.

In [9]:
# Fit one shared PCA on the combined pristine + fake training images.
# (IncrementalPCA is already imported in the notebook's first cell.)
pca_train_flat = pca_train.reshape(-1, 64*64*3)  # one flattened row per 64x64x3 image
ipca = IncrementalPCA(batch_size=300, n_components=256)
ipca.fit(pca_train_flat)


IncrementalPCA(batch_size=300, copy=True, n_components=256, whiten=False)

In [11]:
# Total explained-variance ratio across the retained PCA components.
sum(ipca.explained_variance_ratio_)

0.9594004115191553

Out-of-sample experiment: the OneClassSVM is trained and evaluated on the held-out 20% splits, which were never seen by the PCA during fitting.

In [12]:
# Project the held-out 20% splits with the fitted PCA.
# Despite the names, `x_train_transformed` holds pristine_test (the OneClassSVM is fitted on it
# in the next cell) and `x_test_transformed` holds fake_test.
x_train_transformed, x_test_transformed = (
    ipca.transform(held_out.reshape(-1, 64*64*3))
    for held_out in (pristine_test, fake_test)
)

In [13]:
# Fit the one-class SVM on the projected pristine samples only.
# `svm` is imported in the notebook's import cell rather than re-imported here.
# NOTE(review): nu is documented as an upper bound on the fraction of training errors, yet the
# recorded in-sample inlier rate below is far lower than 1 - nu; check feature scaling and
# gamma before relying on this model.
model = svm.OneClassSVM(kernel='rbf', nu=0.0005, gamma=0.007)
model.fit(x_train_transformed)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.007, kernel='rbf',
            max_iter=-1, nu=0.0005, random_state=None, shrinking=True,
            tol=0.001, verbose=False)

In [14]:
# Score the same samples the SVM was fitted on; predict() returns +1 for inliers, -1 for outliers.
y_train_pred = model.predict(x_train_transformed)

# Count inliers once with a vectorised reduction instead of iterating the array twice
# with the builtin sum().
n_inliers = (y_train_pred == 1).sum()
n_inliers, len(y_train_pred), n_inliers / len(y_train_pred)

(4118, 9430, 0.4366914103923648)

In [15]:
# Score the projected fake samples; ideally none are accepted as inliers (+1).
y_test_pred = model.predict(x_test_transformed)

# Vectorised count, computed once, instead of two passes with the builtin sum().
n_inliers = (y_test_pred == 1).sum()
n_inliers, len(y_test_pred), n_inliers / len(y_test_pred)

(4, 9247, 0.00043257272628960744)

Next, the samples that were visible to the PCA during fitting (the 80% splits) are reused to train the OneClassSVM.

In [16]:
# Project the 80% splits the PCA was fitted on: pristine_train feeds the SVM,
# fake_train is used for evaluation.
flat_dim = 64*64*3  # flattened size of one 64x64x3 image
x_train_transformed = ipca.transform(pristine_train.reshape(-1, flat_dim))
x_test_transformed = ipca.transform(fake_train.reshape(-1, flat_dim))

In [17]:
# Refit the one-class SVM, with the same hyper-parameters as before, on the projected
# pristine training split. `svm` is imported in the notebook's import cell.
model = svm.OneClassSVM(kernel='rbf', nu=0.0005, gamma=0.007)
model.fit(x_train_transformed)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.007, kernel='rbf',
            max_iter=-1, nu=0.0005, random_state=None, shrinking=True,
            tol=0.001, verbose=False)

In [18]:
# In-sample check: score the pristine projections the SVM was just fitted on (+1 = inlier).
y_train_pred = model.predict(x_train_transformed)

# One vectorised count replaces two element-wise passes with the builtin sum().
n_inliers = (y_train_pred == 1).sum()
n_inliers, len(y_train_pred), n_inliers / len(y_train_pred)

(18485, 37720, 0.49005832449628844)

In [19]:
# Score the projected fake samples; a good detector accepts (+1) as few of them as possible.
y_test_pred = model.predict(x_test_transformed)

# One vectorised count replaces two element-wise passes with the builtin sum().
n_inliers = (y_test_pred == 1).sum()
n_inliers, len(y_test_pred), n_inliers / len(y_test_pred)

(0, 36988, 0.0)

What if the PCA is instead fitted on pristine samples only?

In [21]:
# Refit the PCA from scratch, this time on the pristine training images only.
# (IncrementalPCA is already imported in the notebook's first cell.)
pristine_train_flat = pristine_train.reshape(-1, 64*64*3)  # one flattened row per image
ipca = IncrementalPCA(batch_size=300, n_components=256).fit(pristine_train_flat)

# Total explained-variance ratio across the retained components.
sum(ipca.explained_variance_ratio_)

0.9566036841099145

In [22]:
# Project the 80% splits with the pristine-only PCA.
# Note: the fake set used here is fake_train (the 80% split); fake_test is projected in a later cell.
x_train_transformed = ipca.transform(np.reshape(pristine_train, (-1, 64*64*3)))
x_test_transformed = ipca.transform(np.reshape(fake_train, (-1, 64*64*3)))

In [23]:
# Fit the one-class SVM on the pristine training projections.
# `svm` is imported in the notebook's import cell rather than re-imported here.
model = svm.OneClassSVM(kernel='rbf', nu=0.0005, gamma=0.007)
model.fit(x_train_transformed)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.007, kernel='rbf',
            max_iter=-1, nu=0.0005, random_state=None, shrinking=True,
            tol=0.001, verbose=False)

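In-sample performance for pristine samples. Fake samples are, of course, always out-of-sample for the OneClassSVM, since it is trained on pristine data only. Note that the cells directly below score the 80% fake split (`fake_train`, 36988 samples); the 20% split (`fake_test`) is scored in the out-of-sample section further down for easy comparison.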

In [24]:
# In-sample check on the pristine projections the SVM was fitted on (+1 = inlier).
y_train_pred = model.predict(x_train_transformed)
# Vectorised count, computed once, instead of two passes with the builtin sum().
n_inliers = (y_train_pred == 1).sum()
n_inliers, len(y_train_pred), n_inliers / len(y_train_pred)

(18485, 37720, 0.49005832449628844)

In [25]:
# Score the projected fake samples; every +1 here is a fake accepted as pristine.
y_test_pred = model.predict(x_test_transformed)
# Vectorised count, computed once, instead of two passes with the builtin sum().
n_inliers = (y_test_pred == 1).sum()
n_inliers, len(y_test_pred), n_inliers / len(y_test_pred)

(0, 36988, 0.0)

Out-of-sample performance on the held-out 20% splits (pristine and fake):

In [26]:
# Project the held-out 20% splits. The variable names are reused from earlier cells:
# `x_train_transformed` now holds pristine_test and `x_test_transformed` holds fake_test.
x_train_transformed, x_test_transformed = [
    ipca.transform(held_out.reshape(-1, 64*64*3))
    for held_out in (pristine_test, fake_test)
]

In [27]:
# Out-of-sample check: `x_train_transformed` holds the held-out pristine projections at this point.
y_train_pred = model.predict(x_train_transformed)
# Vectorised count, computed once, instead of two passes with the builtin sum().
n_inliers = (y_train_pred == 1).sum()
n_inliers, len(y_train_pred), n_inliers / len(y_train_pred)

(27, 9430, 0.002863202545068929)

In [28]:
# Score the held-out fake projections; every +1 here is a fake accepted as pristine.
y_test_pred = model.predict(x_test_transformed)
# Vectorised count, computed once, instead of two passes with the builtin sum().
n_inliers = (y_test_pred == 1).sum()
n_inliers, len(y_test_pred), n_inliers / len(y_test_pred)

(0, 9247, 0.0)

In [None]:
# Persist the current PCA and one-class SVM as a pair so they can be reloaded together.
# open(..., "wb") does not create missing parent directories, so make sure the target
# folder exists first; otherwise the save fails on a fresh checkout.
os.makedirs("OC_models", exist_ok=True)

with open("OC_models/pca_trained_with_80_percent_pristine_data.pickle", "wb") as f:
    pickle.dump(ipca, f)

with open("OC_models/corresponding_iPCA_OCSVM.pickle", "wb") as f:
    pickle.dump(model, f)