In [None]:
import pandas as pd
import dask.dataframe as dd

### Los sets de train, test y val son los mismos utilizados en el otro modelo. Ahí está todo el proceso.

# Hyper Parameter Search

In [None]:
from sklearn.neural_network import MLPClassifier
from dask_ml.model_selection import RandomizedSearchCV as DaskRandomizedSearchCV  
# Randomized hyper-parameter search for the MLP on a 1.5% sample of the
# training set.
#
# The sample and the network's weight initialisation are both seeded so that
# the reported "best" configuration is reproducible from a fresh kernel; the
# search itself was already seeded with the same value.
train_ddf = dd.read_parquet('train_normalized.parq').sample(frac=0.015, random_state=42)
y_train = train_ddf['label']
train_df = train_ddf.drop('label', axis=1)

model = MLPClassifier(random_state=42)

# Candidate values the search draws from. Note that scikit-learn only uses
# 'learning_rate' when the solver is 'sgd'; it is ignored for the other solvers.
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['sgd', 'adam', 'lbfgs'],
    'learning_rate': ['constant', 'adaptive']}

search = DaskRandomizedSearchCV(model, param_grid, random_state=42, scoring='f1')
search.fit(train_df, y_train)

print(f'Best score: {search.best_score_}')
print(f'Best params: {search.best_params_}')
print(f'Best estimator: {search.best_estimator_}')

# Training

In [2]:
import pyarrow.parquet as pq
from sklearn.metrics import f1_score  
from sklearn.neural_network import MLPClassifier

# Incrementally train the MLP by streaming the training Parquet file in
# fixed-size row batches, then score it (F1) on the validation set.

# Rows per streamed batch.
BATCH_SIZE = 100000
# partial_fit performs a single optimisation pass over the rows it receives,
# so MLPClassifier's max_iter is never consulted in this loop (the previous
# max_iter=40 had no effect). The number of passes over the file is controlled
# explicitly here; 1 keeps the original single-pass training schedule.
N_EPOCHS = 1

train_parquet = pq.ParquetFile('train_normalized.parq')

# Seeded so the weight initialisation (and therefore the score) is reproducible.
model = MLPClassifier(random_state=42)

feature_columns = None
for _epoch in range(N_EPOCHS):
    for batch in train_parquet.iter_batches(batch_size=BATCH_SIZE):
        df_batch = batch.to_pandas()
        y_batch = df_batch['label']
        X_batch = df_batch.drop('label', axis=1)
        if feature_columns is None:
            # Remember the training column order so evaluation data can be aligned to it.
            feature_columns = list(X_batch.columns)
        # Train the model on the current batch; classes must list every label
        # because a single batch is not guaranteed to contain both.
        model.partial_fit(X_batch, y_batch, classes=[0, 1])

if feature_columns is None:
    raise ValueError('train_normalized.parq yielded no batches; the model was not trained')

print('Entrenamiento completo para todos los batches')
valid_df = pd.read_parquet('val_normalized.parq')
y_valid = valid_df['label']
# Align validation columns with the order used during training.
X_valid = valid_df.drop('label', axis=1)[feature_columns]

y_pred = model.predict(X_valid)
f1 = f1_score(y_valid, y_pred)
print(f'Puntaje F1 del modelo: {f1}')

Entrenamiento completo para todos los batches
Puntaje F1 del modelo: 0.7177400664383387


# Predict for test

In [3]:
# Build the test feature matrix with columns in the same order as the training
# features.
#
# Only the training column names are needed, so they are read from the Parquet
# schema instead of loading every training row into memory (the training cell
# streams this same file in batches). Converting an empty table built from the
# schema to pandas applies the usual Arrow-to-pandas column handling, so the
# labels should match what a full read would produce.
# `pq` is pyarrow.parquet, imported in the training cell.
train_columns = pq.read_schema('train_normalized.parq').empty_table().to_pandas().columns
feature_names = train_columns.drop('label')

test_df = pd.read_parquet('test_normalized.parq')
# Kept aside to group the per-row predictions by attacker IP afterwards.
ips = test_df['attacker_ip_enum']
# Reorder the columns of the test set so they match the order of the training set
X_test_reordered = test_df[feature_names]

In [None]:
# Predict a label for every test row, collapse the rows of each attacker IP to
# a single label by majority vote, and write the Kaggle submission file.
y_pred = model.predict(X_test_reordered)


def _most_frequent_label(labels):
    """Return the most common label; on a tie, mode() sorts its values so the smallest wins."""
    return labels.mode().iloc[0]


row_level_predictions = pd.DataFrame({'attacker_ip_enum': ips, 'label': y_pred})
labels_by_ip = row_level_predictions.groupby('attacker_ip_enum')['label']
kaggle_df = labels_by_ip.apply(_most_frequent_label).reset_index(name='label')
kaggle_df.to_csv('submission_mlp_model.csv', index=False)