In [91]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn

In [92]:
training = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
training = training.iloc[:, 1:] # Dropping 'Unnamed' column
test = test.iloc[:, 1:] # Dropping 'Unnamed' column

In [93]:
def onehot_feature(pd_data, column_name):
    # Retrieve the unique values (the categories) and an index for each sample
    # specifying the sample category (values[value_idx] reconstruct the original array)
    col_values = pd_data[column_name].to_numpy().astype('<U')
    values, value_idx = np.unique(col_values, return_inverse=True)
    n_values = values.size
    # Create a temporary identity matrix to convert value_idx into one-hot features
    onehots = np.eye(n_values) #when you use an array to index another array in NumPy, it selects rows from the indexed array based on the values in the index array. e[a] selects rows from the identity matrix e based on the values in array a
    value_onehot = onehots[value_idx]
    # Remove the categorical feature
    pd_data = pd_data.drop(column_name, axis=1)
    # Add the new featues
    for i in range(n_values):
        pd_data["{}_{}".format(column_name, values[i])] = value_onehot[:, i]
    return pd_data

In [94]:
training_for_class = onehot_feature(training, 'DNAtype')
training_for_class = training_for_class.drop(["SpeciesID", "SpeciesName","Ncodons","AGA"], axis = 1)

In [95]:
X_for_class = training_for_class.drop("Kingdom", axis = 1)
y_for_class = training_for_class.loc[:, "Kingdom"]

In [96]:
X = X_for_class.to_numpy()
y = y_for_class.to_numpy()

## Quello che vorrei fare è provare prima le SVM lineari e fare la grid search sul parametro C, poi provo con i kernel non lineari e poi li confronto

In [97]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

In [100]:
# Divido i dati in training e test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [99]:
param_grid_linear = {'C': [100, 1000, 10000, 100000]}

# Grid Search per C con modello lineare
svm_linear = GridSearchCV(SVC(kernel='linear'), param_grid_linear, cv=5, scoring='accuracy')
svm_linear.fit(X_train, y_train)
print("Best C parameter for the linear model:", svm_linear.best_params_)

KeyboardInterrupt: 

In [18]:
model = SVC(kernel='linear', C=1000)
model.fit(X_train, y_train)

train_acc = accuracy_score(model.predict(X_train), y_train)
test_acc = accuracy_score(model.predict(X_test), y_test)

print("SVM train accuracy:", train_acc)
print("SVM test accuracy:", test_acc)

SVM train accuracy: 0.8973269362577108
SVM test accuracy: 0.868244323632875


In [101]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np

def refine_grid_search(estimator, param_grid, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1):
    while True:
        grid_search = GridSearchCV(estimator, param_grid, cv=cv, scoring=scoring, refit=True, n_jobs=n_jobs)
        grid_search.fit(X_train, y_train)
        
        best_params = grid_search.best_params_
        
        refine_needed = False
        
        for param in param_grid:    #here param_grid for the first kernel is 
            min_val, max_val = min(param_grid[param]), max(param_grid[param])
            
            if best_params[param] == min_val:
                if param in ['C', 'gamma']:
                    new_min = min_val / 10
                    param_grid[param] = np.linspace(new_min, max_val, len(param_grid[param])).tolist()
                else:
                    new_min = min_val - 1
                    param_grid[param] = [int(round(x)) for x in np.linspace(min_val, new_max, len(param_grid[param]))]
                refine_needed = True
            
            elif best_params[param] == max_val:
                if param in ['C', 'gamma']:
                    new_max = max_val * 10
                    param_grid[param] = np.linspace(min_val, new_max, len(param_grid[param])).tolist()
                else:
                    new_max = max_val + 1
                    param_grid[param] = [int(round(x)) for x in np.linspace(min_val, new_max, len(param_grid[param]))]
                refine_needed = True

        if not refine_needed:
            break
    
    return best_params

In [102]:
import time

param_grids = {
    'linear': {
        'C': [0.01, 0.1, 1, 10, 100, 1000]
    },
    'poly': {
        'C': [0.01, 0.1, 1, 10, 100, 1000],
        'degree': [2, 3, 4, 5],
        'coef0': [-10, -1, 0.1, 1, 10]
    },
    'rbf': {
        'C': [0.01, 0.1, 1, 10, 100, 1000],
        'gamma': [1e-5, 1e-4, 1e-3, 0.01, 0.1, 1, 10]
    },
    'sigmoid': {
        'C': [0.01, 0.1, 1, 10, 100, 1000],
        'coef0': [-10, -1, 0.1, 1, 10]
    }
}

kernels = ('linear', 'poly', 'rbf', 'sigmoid')

start_time = time.time()

for kernel in kernels:
    print(f"\nEseguendo GridSearch per kernel: {kernel}")
    svm = SVC(kernel=kernel)
    param_grid = param_grids[kernel]    
    best_params = refine_grid_search(svm, param_grid, X_train, y_train)
    print(f"Parametri finali ottimali per kernel {kernel}: {best_params}")
    
    
end_time = time.time()  # End the timer

print(f"Tempo di esecuzione totale: {end_time - start_time:.2f} secondi")


Eseguendo GridSearch per kernel: linear
Parametri finali ottimali per kernel linear: {'C': 2000.008}

Eseguendo GridSearch per kernel: poly


KeyboardInterrupt: 

In [26]:
# polynomial
param_grid_poly = {'C': [10000, 100000, 1000000], 'coef0': np.linspace(-10, 10, 3), 'degree': [4, 5]}

svm_poly = GridSearchCV(SVC(kernel='poly'), param_grid_poly, cv=5, scoring='accuracy', refit=True, n_jobs=-1)
svm_poly.fit(X_train, y_train)
print("Migliori parametri per il kernel polinomiale:", svm_poly.best_params_)

Migliori parametri per il kernel polinomiale: {'C': 10000, 'degree': 5}


In [None]:
model = SVC(kernel='poly', C=, degree=)
model.fit(X_train, y_train)

train_acc = accuracy_score(model.predict(X_train), y_train)
test_acc = accuracy_score(model.predict(X_test), y_test)

print("SVM train accuracy:", train_acc)
print("SVM test accuracy:", test_acc)

In [None]:
# RBF (radial basis)
param_grid_rbf = {'C': np.linspace(0.001, 100, 3), 'gamma': np.linspace(0.001, 100, 3)}

svm_rbf = GridSearchCV(SVC(kernel='rbf'), param_grid_rbf, cv=5, scoring='accuracy', refit=True, n_jobs = -1)
svm_rbf.fit(X_train, y_train)
print("Migliori parametri per il kernel rbf:", svm_rbf.best_params_)

In [None]:
model = SVC(kernel='rbf', C=1000)
model.fit(X_train, y_train)

train_acc = accuracy_score(model.predict(X_train), y_train)
test_acc = accuracy_score(model.predict(X_test), y_test)

print("SVM train accuracy:", train_acc)
print("SVM test accuracy:", test_acc)

In [None]:
# Sigmoid
param_grid_sigmoid = {'C': np.linspace(0.001, 100, 3), 'gamma': np.linspace(0.001, 100, 3),'coef0': np.linspace(-10, 10, 3)}

sigm_poly = GridSearchCV(SVC(kernel='sigmoid'), param_grid_sigmoid, cv=5, scoring='accuracy', refit=True, n_jobs = -1)
sigm_poly.fit(X_train, y_train)
print("Migliori parametri per il kernel sigmoid:", grid_search_s.best_params_)