First, let's split data into train and validation sets

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import Pipeline

In [2]:
X = pd.read_csv('../data/preprocessed/gaussian_blur_train.csv', header=None)
y = pd.read_csv('../data/original/train/train_labels.csv')

y.head()

Unnamed: 0,Volcano?,Type,Radius,Number Volcanoes
0,1,3.0,17.46,1.0
1,0,,,
2,0,,,
3,0,,,
4,0,,,


In [3]:
y = y['Volcano?']

y

0       1
1       0
2       0
3       0
4       0
       ..
6995    0
6996    0
6997    0
6998    0
6999    0
Name: Volcano?, Length: 7000, dtype: int64

In [4]:
X.shape

(7000, 12100)

In [5]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12090,12091,12092,12093,12094,12095,12096,12097,12098,12099
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
import numpy as np
import random
from sklearn.utils import check_random_state

seed = 42

np.random.seed(seed)

random.seed(seed)

check_random_state(seed)

RandomState(MT19937) at 0x293D6783740

We will try 80/20 division since there are few data avaliable

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=seed)

In [8]:
# Pipeline para SVM
pipe_svm = Pipeline([
    ('svm', SVC(random_state=seed))
])

pipe_dt = Pipeline([
    ('dt', DecisionTreeClassifier(random_state=seed))
])

pipe_xgb = Pipeline([
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=seed))
])

In [9]:
# Parâmetros para GridSearchCV
param_grid_svm = {
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf']
}

param_grid_dt = {
    'dt__max_depth': [3, 5, 10],
    'dt__min_samples_split': [2, 5, 10]
}

param_grid_xgb = {
    'xgb__n_estimators': [50, 100, 200],
    'xgb__learning_rate': [0.01, 0.1, 0.2]
}

In [10]:
from joblib import Parallel, delayed
from sklearn.model_selection import GridSearchCV

def run_grid_search(pipe, param_grid, X_train, y_train):
    grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search

# Criar uma lista de tarefas para GridSearchCV
tasks = [
    (pipe_svm, param_grid_svm),
    (pipe_dt, param_grid_dt),
    (pipe_xgb, param_grid_xgb)
]

# Executar GridSearchCV para cada tarefa em paralelo
results = Parallel(n_jobs=-1)(
    delayed(run_grid_search)(pipe, param_grid, X_train, y_train)
    for pipe, param_grid in tasks
)

KeyboardInterrupt: 

In [4]:

from config import get_config
from data_preparation.dataset import load_data, split_data
from preprocessing.pipeline import preprocess_pipeline

### Get config file

config_path = '../configs/configs.json'
config = get_config(config_path)

### Load Data

X_train, y_train, X_test, y_test = load_data(config)

print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_train.shape}')

### Pre Process Data

X_proc_train = preprocess_pipeline(X_train, config)

print(f'X_proc_train shape: {X_proc_train.shape}')

### Augment Data

### Data Split (training/validation)

X_train, X_val, y_train, y_val = split_data(X_train, y_train, config)

NameError: name '__file__' is not defined