In [30]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

def train_model(X_data, y_data, model):
    X_model = X_data.to_numpy()
    y_model = y_data.to_numpy().ravel()

    imputer = SimpleImputer(strategy='median')
    X_model = imputer.fit_transform(X_model)

    X_train, X_test, y_train, y_test = train_test_split(X_model, y_model, test_size=0.2, random_state=42)
    
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='micro')

    print("F1 Score:", f1)

    return f1

def predict_model(X_data, model):
    X_model = X_data.to_numpy()

    imputer = SimpleImputer(strategy='median')
    X_model = imputer.fit_transform(X_model)

    y_pred = model.predict(X_model)

    return y_pred


In [54]:
import pandas as pd

X = pd.read_csv('features/X_train_features.csv', index_col='id').to_numpy()
y = pd.read_csv('public/y_train.csv', index_col='id').to_numpy()
X.shape, y.shape

((5117, 300), (5117, 1))

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.feature_selection import SelectKBest, f_classif


# Create and train the Random Forest model with selected features
model_selected = GradientBoostingClassifier(learning_rate=0.05, n_estimators=1000, max_depth=20, 
                                 min_samples_split=60, min_samples_leaf=20, subsample=1.0,
                                 max_features=100, random_state=42)
model_selected.fit(X_train_selected, y_train.ravel())

# Evaluate the model with selected features
f1_selected = f1_score(y_test, model_selected.predict(X_test_selected), average='micro')
print("F1 Score with selected features:", f1_selected)



F1 Score with selected features: 0.814453125


In [49]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import nevergrad as ng
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the objective function for Nevergrad optimization

def objective_function(hyperparameters):
    
    # Create and train the SVC model
    model = GradientBoostingClassifier(
        n_estimators=hyperparameters["n_estimators"],
        learning_rate=hyperparameters["learning_rate"],
        subsample=hyperparameters["subsample"],
        max_depth=hyperparameters["max_depth"],
        min_samples_split=hyperparameters["min_samples_split"],
        min_samples_leaf=hyperparameters["min_samples_leaf"],
        max_features=hyperparameters["max_features"],
        random_state=42
        )
    model.fit(X_train, y_train.ravel())
    preds = model.predict(X_test)
    
    # Compute the score
    score = -f1_score(y_test, preds, average='micro')  # Negative because Nevergrad minimizes the objective function

    # Print the score and hyperparameters
    print(f"Score: {-score}, Hyperparameters: {hyperparameters}")

    return score

# Set up the Nevergrad optimizer
parametrization = ng.p.Dict(
    n_estimators=ng.p.Scalar(lower=50, upper=500, init=300).set_integer_casting(),
    max_depth=ng.p.Scalar(lower=10, upper=100, init=50).set_integer_casting(),
    min_samples_split=ng.p.Scalar(lower=2, upper=10, init=2).set_integer_casting(),
    min_samples_leaf=ng.p.Scalar(lower=1, upper=10, init=1).set_integer_casting(),
    max_features=ng.p.Scalar(lower=10, upper=100, init=50).set_integer_casting(),
    learning_rate=ng.p.Scalar(lower=0.01, upper=0.1, init=0.05),
    subsample=ng.p.Scalar(lower=0.5, upper=1.0, init=1.0),
)
optimizer = ng.optimizers.NGOpt(parametrization=parametrization, budget=10)

# Run the optimization
recommendation = optimizer.minimize(objective_function)
# Train the final model with optimized hyperparameters
best_hyperparams = recommendation.value
print("Best hyperparameters:", best_hyperparams)

final_model = RandomForestClassifier(**best_hyperparams)
final_model.fit(X_train, y_train.ravel())

y_pred = final_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='micro')
print("F1 Score:", f1)
# Now, final_model is your trained model with optimized hyperparameters

Score: 0.7998046875, Hyperparameters: {'n_estimators': 300, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 50, 'learning_rate': 0.05, 'subsample': 1.0}
Score: 0.8017578125, Hyperparameters: {'n_estimators': 320, 'max_depth': 53, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 50, 'learning_rate': 0.03617076608959525, 'subsample': 0.8794644869775123}
Score: 0.798828125, Hyperparameters: {'n_estimators': 294, 'max_depth': 25, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 64, 'learning_rate': 0.03617076608959525, 'subsample': 0.9224495452624744}
Score: 0.798828125, Hyperparameters: {'n_estimators': 320, 'max_depth': 53, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 56, 'learning_rate': 0.03617076608959525, 'subsample': 0.8794644869775123}
Score: 0.80078125, Hyperparameters: {'n_estimators': 239, 'max_depth': 72, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 50, 'learning_rate': 0.04124877012

TypeError: __init__() got an unexpected keyword argument 'learning_rate'

In [None]:
# Predict the test set
y_pred = final_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='micro')
f1

In [None]:

out = pd.DataFrame(index=X_out.index, columns=['y'])
y_out = predict_model(X_out, final_model)
out['y'] = y_out
out.to_csv("out.csv")