In [None]:
from typing import List
from numpy import load, ndarray
from sklearn.model_selection import PredefinedSplit
from sqlmodel import create_engine, SQLModel, Session, select

from models import Config, Sample


__author__ = "Marius Benthin"


# load secrets from environment
# NOTE(review): Config is declared in the project-local `models` module; where
# its values actually come from (environment, .env file, ...) is not visible
# here -- confirm against models.Config.
config = Config()

# create database connection and tables
# create_all() is run against the engine so that the tables registered on
# SQLModel.metadata exist before any query is issued
sql_engine = create_engine(config.database_url)
SQLModel.metadata.create_all(sql_engine)

# Read the feature matrix and the sample ID pairs from the numpy archive.
# NOTE(review): allow_pickle=True lets numpy unpickle object arrays, which can
# execute arbitrary code -- only load archives from a trusted source.
with load(file=config.numpy_file, allow_pickle=True) as data:
    X: ndarray = data['X']
    sample_ids: ndarray = data['sample_ids']

    # Collect one class label and one fold ID per row of sample_ids; only the
    # first entry of each row (the parent ID) is used for the lookup.
    y: List[str] = []
    test_fold: List[int] = []
    with Session(sql_engine) as session:
        for parent_id, child_id in sample_ids:
            # exactly one Sample row has to match, otherwise .one() raises
            parent_query = select(Sample).where(Sample.id == parent_id)
            parent: Sample = session.exec(parent_query).one()
            y += [parent.actor.name]
            test_fold += [parent.fold_id]

# Cross-validation splitter that reuses the fold IDs stored with the samples
# (the original comment states that this yields k = 8 folds).
cv: PredefinedSplit = PredefinedSplit(test_fold)

In [None]:
from sklearn.svm import SVC
from numpy import savez_compressed
from sklearn.base import ClassifierMixin
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Linear-kernel SVM.
# probability=True is required for the soft-voting run below: without it SVC
# does not provide predict_proba and cross_val_predict raises AttributeError.
svm_linear: ClassifierMixin = SVC(
    kernel='linear',
    probability=True,
    random_state=config.random_state,
    max_iter=config.max_iter,
)

# predict class labels (for hard voting)
# Pass the PredefinedSplit object itself: an integer cv makes scikit-learn
# build its own stratified k-fold split and ignore the fold IDs loaded from
# the database.
# NOTE(review): cross_val_predict needs every sample to be in exactly one test
# fold, so no fold ID may be -1 -- confirm against the data.
y_pred = cross_val_predict(estimator=svm_linear, X=X, y=y, method='predict', cv=cv)
print(classification_report(y, y_pred))

# predict class probabilities (for soft voting)
probabilities = cross_val_predict(estimator=svm_linear, X=X, y=y, method='predict_proba', cv=cv)
# export the out-of-fold class probabilities together with the sample IDs
savez_compressed(file='model_C_svm_linear_probabilities.npz', P=probabilities, sample_ids=sample_ids)

In [None]:
from sklearn.svm import SVC
from numpy import savez_compressed
from sklearn.base import ClassifierMixin
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Polynomial-kernel SVM.
# probability=True is required for the soft-voting run below: without it SVC
# does not provide predict_proba and cross_val_predict raises AttributeError.
svm_poly: ClassifierMixin = SVC(
    kernel='poly',
    probability=True,
    random_state=config.random_state,
    max_iter=config.max_iter,
)

# predict class labels (for hard voting)
# Pass the PredefinedSplit object itself: an integer cv makes scikit-learn
# build its own stratified k-fold split and ignore the fold IDs loaded from
# the database.
# NOTE(review): cross_val_predict needs every sample to be in exactly one test
# fold, so no fold ID may be -1 -- confirm against the data.
y_pred = cross_val_predict(estimator=svm_poly, X=X, y=y, method='predict', cv=cv)
print(classification_report(y, y_pred))

# predict class probabilities (for soft voting)
probabilities = cross_val_predict(estimator=svm_poly, X=X, y=y, method='predict_proba', cv=cv)
# export the out-of-fold class probabilities together with the sample IDs
savez_compressed(file='model_C_svm_poly_probabilities.npz', P=probabilities, sample_ids=sample_ids)

In [None]:
from sklearn.svm import SVC
from numpy import savez_compressed
from sklearn.base import ClassifierMixin
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# RBF-kernel SVM.
# probability=True is required for the soft-voting run below: without it SVC
# does not provide predict_proba and cross_val_predict raises AttributeError.
svm_rbf: ClassifierMixin = SVC(
    kernel='rbf',
    probability=True,
    random_state=config.random_state,
    max_iter=config.max_iter,
)

# predict class labels (for hard voting)
# Pass the PredefinedSplit object itself: an integer cv makes scikit-learn
# build its own stratified k-fold split and ignore the fold IDs loaded from
# the database.
# NOTE(review): cross_val_predict needs every sample to be in exactly one test
# fold, so no fold ID may be -1 -- confirm against the data.
y_pred = cross_val_predict(estimator=svm_rbf, X=X, y=y, method='predict', cv=cv)
print(classification_report(y, y_pred))

# predict class probabilities (for soft voting)
probabilities = cross_val_predict(estimator=svm_rbf, X=X, y=y, method='predict_proba', cv=cv)
# export the out-of-fold class probabilities together with the sample IDs
savez_compressed(file='model_C_svm_rbf_probabilities.npz', P=probabilities, sample_ids=sample_ids)

In [None]:
from numpy import savez_compressed
from sklearn.base import ClassifierMixin
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

# Random forest with a fixed seed so repeated runs give the same trees.
rfc: ClassifierMixin = RandomForestClassifier(random_state=config.random_state)

# predict class labels (for hard voting)
# Pass the PredefinedSplit object itself: an integer cv makes scikit-learn
# build its own stratified k-fold split and ignore the fold IDs loaded from
# the database.
# NOTE(review): cross_val_predict needs every sample to be in exactly one test
# fold, so no fold ID may be -1 -- confirm against the data.
y_pred = cross_val_predict(estimator=rfc, X=X, y=y, method='predict', cv=cv)
print(classification_report(y, y_pred))

# predict class probabilities (for soft voting)
probabilities = cross_val_predict(estimator=rfc, X=X, y=y, method='predict_proba', cv=cv)
# export the out-of-fold class probabilities together with the sample IDs
savez_compressed(file='model_C_rfc_probabilities.npz', P=probabilities, sample_ids=sample_ids)

In [None]:
# install pytorch with CUDA
!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117

In [None]:
import torch
from numpy import savez_compressed
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline
from skorch import NeuralNetClassifier

from neural_network import DNN, Preprocessor

# pick the compute device: CUDA when torch reports a usable GPU, CPU otherwise
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    # release cached GPU memory that earlier runs in this kernel may still hold
    torch.cuda.empty_cache()
print(f"Using device: {device}")

# map the string class labels to integer class indices for the network
y_encoder = preprocessing.LabelEncoder()
y_encoded = y_encoder.fit(y).transform(y)

# feed-forward network wrapped by skorch so it offers the scikit-learn API
fnn_classifier = NeuralNetClassifier(
    module=DNN,
    # input width = number of feature columns, output width = number of classes
    module__D_in=X.shape[1],
    module__D_out=len(y_encoder.classes_),
    # no internal validation split: cross_val_predict does the splitting
    train_split=None,
    criterion=torch.nn.CrossEntropyLoss,
    optimizer=torch.optim.Adam,
    # NOTE(review): the learning rate is taken from config.max_iter, the same
    # value used as the SVM iteration limit -- this looks like a copy-paste
    # slip; confirm which Config field holds the intended learning rate.
    lr=config.max_iter,
    device=device,
    max_epochs=config.epochs,
)

# run the project's Preprocessor in front of the network
fnn = make_pipeline(Preprocessor(), fnn_classifier)

# predict class labels (for hard voting)
# Pass the PredefinedSplit object itself: an integer cv makes scikit-learn
# build its own stratified k-fold split and ignore the fold IDs loaded from
# the database.
# NOTE(review): cross_val_predict needs every sample to be in exactly one test
# fold, so no fold ID may be -1 -- confirm against the data.
y_pred = cross_val_predict(estimator=fnn, X=X, y=y_encoded, method='predict', cv=cv)
# decode the integer predictions so the report shows the original label names
print(classification_report(y, y_encoder.inverse_transform(y_pred)))

# predict class probabilities (for soft voting)
probabilities = cross_val_predict(estimator=fnn, X=X, y=y_encoded, method='predict_proba', cv=cv)
# export the out-of-fold class probabilities together with the sample IDs
savez_compressed(file='model_C_fnn_probabilities.npz', P=probabilities, sample_ids=sample_ids)