In [1]:
import numpy as np
import pandas as pd
import os

import skops.io as sio
import torch

from scipy.io import arff
from sklearn.linear_model import RidgeClassifierCV
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from classifier import FeatureExtractor
from classifier import train_model, test_model

# Seed every RNG the notebook can draw from so Restart & Run All reproduces the
# same results: NumPy's global generator and torch's generators.
np.random.seed(0)
torch.manual_seed(0)
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Загрузка данных

In [2]:
# loadarff's result is indexed at [0] to get the record array; the rest is not used here.
train_records = arff.loadarff("Ham/Ham_TRAIN.arff")[0]
# frac=1 keeps every row but returns them in shuffled order.
train = pd.DataFrame(train_records).sample(frac=1)
train.head()

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att423,att424,att425,att426,att427,att428,att429,att430,att431,target
84,-0.678676,-0.692275,-0.700148,-0.701222,-0.700506,-0.696927,-0.689412,-0.679392,-0.669729,-0.66114,...,0.363438,0.120087,-0.091056,-0.255675,-0.373772,-0.452503,-0.499742,-0.521572,-0.526224,b'2'
10,-0.664186,-0.639378,-0.601115,-0.54351,-0.476235,-0.434188,-0.421573,-0.421573,-0.429983,-0.451006,...,-0.32907,-0.582614,-0.73945,-0.818499,-0.847091,-0.852978,-0.842466,-0.812612,-0.784441,b'1'
75,-0.721131,-0.659126,-0.623255,-0.669375,-0.752389,-0.772887,-0.766225,-0.765713,-0.781086,-0.798509,...,0.586096,0.263261,-0.049326,-0.320918,-0.515644,-0.633504,-0.690385,-0.710882,-0.718569,b'2'
2,-0.695711,-0.702015,-0.695317,-0.677191,-0.640544,-0.558976,-0.436034,-0.365105,-0.361165,-0.392688,...,-0.392688,-0.455736,-0.530605,-0.606262,-0.684677,-0.760334,-0.818653,-0.852147,-0.865545,b'1'
24,-0.925798,-0.905106,-0.887431,-0.886138,-0.891311,-0.903381,-0.915452,-0.920194,-0.920194,-0.919332,...,0.656295,0.134678,-0.223125,-0.44729,-0.574892,-0.643866,-0.671456,-0.681802,-0.683958,b'1'


In [3]:
# Distinct per-column counts of missing values.
train.isnull().sum(axis=0).unique()

array([0])

In [4]:
# Distinct class labels present in the training split.
train["target"].unique()

array([b'2', b'1'], dtype=object)

In [5]:
# Convert the target column (loaded as byte strings) to integers.
train["target"] = train["target"].astype(int)

In [6]:
# Preview the first five rows after the label conversion.
train.head(n=5)

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att423,att424,att425,att426,att427,att428,att429,att430,att431,target
84,-0.678676,-0.692275,-0.700148,-0.701222,-0.700506,-0.696927,-0.689412,-0.679392,-0.669729,-0.66114,...,0.363438,0.120087,-0.091056,-0.255675,-0.373772,-0.452503,-0.499742,-0.521572,-0.526224,2
10,-0.664186,-0.639378,-0.601115,-0.54351,-0.476235,-0.434188,-0.421573,-0.421573,-0.429983,-0.451006,...,-0.32907,-0.582614,-0.73945,-0.818499,-0.847091,-0.852978,-0.842466,-0.812612,-0.784441,1
75,-0.721131,-0.659126,-0.623255,-0.669375,-0.752389,-0.772887,-0.766225,-0.765713,-0.781086,-0.798509,...,0.586096,0.263261,-0.049326,-0.320918,-0.515644,-0.633504,-0.690385,-0.710882,-0.718569,2
2,-0.695711,-0.702015,-0.695317,-0.677191,-0.640544,-0.558976,-0.436034,-0.365105,-0.361165,-0.392688,...,-0.392688,-0.455736,-0.530605,-0.606262,-0.684677,-0.760334,-0.818653,-0.852147,-0.865545,1
24,-0.925798,-0.905106,-0.887431,-0.886138,-0.891311,-0.903381,-0.915452,-0.920194,-0.920194,-0.919332,...,0.656295,0.134678,-0.223125,-0.44729,-0.574892,-0.643866,-0.671456,-0.681802,-0.683958,1


In [7]:
# Dimensions of the training frame as (rows, columns).
train.shape

(109, 432)

In [8]:
# Every column except the last holds the input values; the last column is the label.
train_features = train.iloc[:, :-1].to_numpy()
# Insert a singleton middle axis: (rows, columns) -> (rows, 1, columns).
train_inputs = torch.tensor(train_features, dtype=torch.float)[:, None, :]
# Affine relabelling x -> 2x - 3, which sends label 1 to -1 and label 2 to +1.
train_labels = 2 * train.iloc[:, -1].to_numpy() - 3

In [9]:
# loadarff's result is indexed at [0] to get the record array; the rest is not used here.
test_records = arff.loadarff("Ham/Ham_TEST.arff")[0]
test = pd.DataFrame(test_records)
# Every column except the last holds the input values; the last column is the label.
test_features = test.iloc[:, :-1].to_numpy()
# Insert a singleton middle axis: (rows, columns) -> (rows, 1, columns).
test_inputs = torch.tensor(test_features, dtype=torch.float)[:, None, :]
# Cast the labels to int, then apply the same x -> 2x - 3 relabelling as the training split.
test_labels = 2 * test.iloc[:, -1].astype(int).to_numpy() - 3

# Модель 1

In [10]:
# Number of independent train/evaluate runs per kernel sampling scheme.
num_iterations = 100
# One slot per sampling scheme; each stays None until that scheme's sweep stores its scores.
accuracy_histories = dict.fromkeys(("normal", "binary", "tertiary"))

In [13]:
# Sweep for the "normal" kernel sampling scheme: train and evaluate num_iterations
# freshly built pipelines, record every score, and keep on disk the artifacts of
# the best-scoring run.
kernel_sampling_type = "normal"
model_dir = f"models/{kernel_sampling_type}_sampling"
os.makedirs(model_dir, exist_ok=True)

num_iterations = 100
# NaN marks iterations that never ran, so an interrupted sweep cannot be mistaken
# for runs that scored 0.
accuracy_history = np.full(num_iterations, np.nan)
# Register the array before the loop: it is filled in place, so the scores gathered
# so far stay reachable through accuracy_histories even if the cell is interrupted.
accuracy_histories[kernel_sampling_type] = accuracy_history
best_accuracy = 0
desc = f"Examining models with {kernel_sampling_type} kernel sampling"
for i in tqdm(range(num_iterations), desc=desc, maxinterval=0.1):
    # A new extractor, scaler, and classifier are built on every iteration.
    feature_extractor = FeatureExtractor(input_size=train_inputs.shape[-1], num_conv=10000, kernel_sampling_type=kernel_sampling_type)
    scaler = StandardScaler()
    classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))

    train_model(classifier, feature_extractor, scaler, train_inputs, train_labels)
    accuracy = test_model(classifier, feature_extractor, scaler, test_inputs, test_labels)
    accuracy_history[i] = accuracy
    # NOTE(review): the checkpoint is chosen by the score on the test split, so
    # best_accuracy is an optimistic estimate - confirm this is intended.
    if accuracy > best_accuracy:
        best_accuracy = accuracy

        # Overwrite the stored artifacts with those of the new best run.
        torch.save(feature_extractor.state_dict(), f=f"{model_dir}/feature_extractor.pt")
        sio.dump(scaler, file=f"{model_dir}/scaler.skops")
        sio.dump(classifier, file=f"{model_dir}/classifier.skops")

Examining models with normal kernel sampling:  21%|██        | 21/100 [09:18<34:32, 26.23s/it]

# Модель 2

In [None]:
# Sweep for the "binary" kernel sampling scheme: train and evaluate num_iterations
# freshly built pipelines, record every score, and keep on disk the artifacts of
# the best-scoring run.
kernel_sampling_type = "binary"
model_dir = f"models/{kernel_sampling_type}_sampling"
os.makedirs(model_dir, exist_ok=True)

num_iterations = 100
# NaN marks iterations that never ran, so an interrupted sweep cannot be mistaken
# for runs that scored 0.
accuracy_history = np.full(num_iterations, np.nan)
# Register the array before the loop: it is filled in place, so the scores gathered
# so far stay reachable through accuracy_histories even if the cell is interrupted.
accuracy_histories[kernel_sampling_type] = accuracy_history
best_accuracy = 0
desc = f"Examining models with {kernel_sampling_type} kernel sampling"
for i in tqdm(range(num_iterations), desc=desc):
    # A new extractor, scaler, and classifier are built on every iteration.
    feature_extractor = FeatureExtractor(input_size=train_inputs.shape[-1], num_conv=10000, kernel_sampling_type=kernel_sampling_type)
    scaler = StandardScaler()
    classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))

    train_model(classifier, feature_extractor, scaler, train_inputs, train_labels)
    accuracy = test_model(classifier, feature_extractor, scaler, test_inputs, test_labels)
    accuracy_history[i] = accuracy
    # NOTE(review): the checkpoint is chosen by the score on the test split, so
    # best_accuracy is an optimistic estimate - confirm this is intended.
    if accuracy > best_accuracy:
        best_accuracy = accuracy

        # Overwrite the stored artifacts with those of the new best run.
        torch.save(feature_extractor.state_dict(), f=f"{model_dir}/feature_extractor.pt")
        sio.dump(scaler, file=f"{model_dir}/scaler.skops")
        sio.dump(classifier, file=f"{model_dir}/classifier.skops")

# Модель 3

In [None]:
# Sweep for the "tertiary" kernel sampling scheme: train and evaluate num_iterations
# freshly built pipelines, record every score, and keep on disk the artifacts of
# the best-scoring run.
kernel_sampling_type = "tertiary"
model_dir = f"models/{kernel_sampling_type}_sampling"
os.makedirs(model_dir, exist_ok=True)

num_iterations = 100
# NaN marks iterations that never ran, so an interrupted sweep cannot be mistaken
# for runs that scored 0.
accuracy_history = np.full(num_iterations, np.nan)
# Register the array before the loop: it is filled in place, so the scores gathered
# so far stay reachable through accuracy_histories even if the cell is interrupted.
accuracy_histories[kernel_sampling_type] = accuracy_history
best_accuracy = 0
desc = f"Examining models with {kernel_sampling_type} kernel sampling"
for i in tqdm(range(num_iterations), desc=desc):
    # A new extractor, scaler, and classifier are built on every iteration.
    feature_extractor = FeatureExtractor(input_size=train_inputs.shape[-1], num_conv=10000, kernel_sampling_type=kernel_sampling_type)
    scaler = StandardScaler()
    classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))

    train_model(classifier, feature_extractor, scaler, train_inputs, train_labels)
    accuracy = test_model(classifier, feature_extractor, scaler, test_inputs, test_labels)
    accuracy_history[i] = accuracy
    # NOTE(review): the checkpoint is chosen by the score on the test split, so
    # best_accuracy is an optimistic estimate - confirm this is intended.
    if accuracy > best_accuracy:
        best_accuracy = accuracy

        # Overwrite the stored artifacts with those of the new best run.
        torch.save(feature_extractor.state_dict(), f=f"{model_dir}/feature_extractor.pt")
        sio.dump(scaler, file=f"{model_dir}/scaler.skops")
        sio.dump(classifier, file=f"{model_dir}/classifier.skops")

# Сравнительный анализ и результаты в статье