In [1]:
import optuna
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import average_precision_score

In [2]:
wine_data = pd.read_csv('winequality_red.csv')
x = wine_data.drop('good', axis =1)

# indep_var
y = wine_data['good']

indep_vars = x.columns
print(indep_vars)

scaler = StandardScaler()

scaler.fit(x)
scaled_x = scaler.transform(x)

x_train, x_test, y_train, y_test = train_test_split(scaled_x, y , test_size = 0.3, random_state  = 999)

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')


In [3]:
# Def logit modl w gridsearch

# Create regression instance, saga solver because l1 and l2 support.
log_reg = LogisticRegression(n_jobs=-1, penalty='elasticnet',  random_state=0, solver='saga',
                             fit_intercept=True) 

# Define a grid of hyperparameters to search
# # Regularization type and C hyperparameter
parameters = {
    'penalty': ['elasticnet'],           
    'C': [0.001, 0.01, 0.5, 0.1, 1, 10],
    'l1_ratio': [0.1, 0.2, 0.5, 0.8]}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator = log_reg, param_grid = parameters, scoring = 'roc_auc', cv=5, verbose=0)
grid_search.fit(x_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Fit the model with the best hyperparameters on the training data
log_Reg2 = LogisticRegression(solver='saga', **best_params)
log_Reg2.fit(x_train, y_train)

print('Val0', average_precision_score(log_Reg2.predict(x_train), y_train))
print('Val1', average_precision_score(log_Reg2.predict(x_test), y_test))

Best Hyperparameters: {'C': 0.1, 'l1_ratio': 0.1, 'penalty': 'elasticnet'}
Val0 0.22923992897183246
Val1 0.2023834961334961


In [4]:
def create_objective_tgm(variables0, x_train, x_test, y_train, y_test):

    def objective(trial):
        # Suggest a value between 0 and 1
        min_hyper = 1e-2
        max_hyper = 10
        l1_ratio = trial.suggest_float('l1_ratio', 0, 1)
        # Suggest a value from a logarithmic scale
        C = trial.suggest_float('C', min_hyper, max_hyper, log=True)
        
        _tgmodel = LogisticRegression(n_jobs=-1, penalty='elasticnet',  random_state=0, solver='saga',
                                      l1_ratio=l1_ratio, C=C, fit_intercept=True)
        variables = variables0
        
        # indep_vars_index = [tgmodel.all_vars_init.index(x) for x in variables]

        _tgmodel.fit(x_train, y_train)

        score_train = average_precision_score(_tgmodel.predict(x_train), y_train)
        
        score_test = average_precision_score(_tgmodel.predict(x_test), y_test)
        
        trial.set_user_attr("final_features", variables)
        trial.set_user_attr("final_coeffs", _tgmodel.coef_)
        trial.set_user_attr('final_removed', [x for x in variables0 if x not in variables])
        
        return score_train, score_test

    return objective

In [15]:
# Create a study for optuna
sampler = optuna.samplers.TPESampler(seed=42)

study = optuna.create_study(
    directions=['maximize', 'maximize'], sampler=sampler, pruner=optuna.pruners.HyperbandPruner())

study.optimize(create_objective_tgm(indep_vars, x_train, x_test, y_train, y_test),
               n_trials= 20, 
               n_jobs= -1, 
               catch=(), 
               gc_after_trial=True)

best_trials = study.trials_dataframe()

[I 2023-12-06 18:57:31,340] A new study created in memory with name: no-name-da13e20c-7818-4e1a-a193-692ede54204f
[I 2023-12-06 18:57:31,487] Trial 5 finished with values: [0.22836805732248092, 0.184593837535014] and parameters: {'l1_ratio': 0.1306614934693242, 'C': 0.05800480245404324}. 
[I 2023-12-06 18:57:31,501] Trial 2 finished with values: [0.17953867563908998, 0.13468915343915344] and parameters: {'l1_ratio': 0.4443395973466012, 'C': 0.035068011614143704}. 
[I 2023-12-06 18:57:31,739] Trial 0 finished with values: [0.20262383412085044, 0.1646164021164021] and parameters: {'l1_ratio': 0.8035790125681523, 'C': 0.08253123654274648}. 
[I 2023-12-06 18:57:31,894] Trial 7 finished with values: [0.24879204436845187, 0.21458333333333332] and parameters: {'l1_ratio': 0.6948453826638531, 'C': 0.18147147398621}. 
[I 2023-12-06 18:57:31,910] Trial 4 finished with values: [0.26930853190444576, 0.23694167589516424] and parameters: {'l1_ratio': 0.23707277961022954, 'C': 4.643684595984111}. 
[I

In [6]:
C = best_trials['params_C'][0]
l1_ratio = best_trials['params_l1_ratio'][0]
indep_vars = best_trials['user_attrs_final_features'][0]

print('C       ', C)
print('l1_ratio', l1_ratio)
print('Val 0   ', best_trials['values_0'][0])
print('Val 1   ', best_trials['values_1'][0])

C        7.114476009343421
l1_ratio 0.3745401188473625
Val 0    0.26930853190444576
Val 1    0.23694167589516424


In [8]:
C = best_trials['params_C'][0]
l1_ratio = best_trials['params_l1_ratio'][0]
indep_vars = best_trials['user_attrs_final_features'][0]

print('C       ', C)
print('l1_ratio', l1_ratio)
print('Val 0   ', best_trials['values_0'][0])
print('Val 1   ', best_trials['values_1'][0])

C        7.114476009343421
l1_ratio 0.3745401188473625
Val 0    0.26930853190444576
Val 1    0.23694167589516424


In [16]:
C = best_trials['params_C'][0]
l1_ratio = best_trials['params_l1_ratio'][0]
indep_vars = best_trials['user_attrs_final_features'][0]

print('C       ', C)
print('l1_ratio', l1_ratio)
print('Val 0   ', best_trials['values_0'][0])
print('Val 1   ', best_trials['values_1'][0])

C        0.08253123654274648
l1_ratio 0.8035790125681523
Val 0    0.20262383412085044
Val 1    0.1646164021164021


In [12]:
C = best_trials['params_C'][0]
l1_ratio = best_trials['params_l1_ratio'][0]
indep_vars = best_trials['user_attrs_final_features'][0]

print('C       ', C)
print('l1_ratio', l1_ratio)
print('Val 0   ', best_trials['values_0'][0])
print('Val 1   ', best_trials['values_1'][0])

C        2.9253981196925185
l1_ratio 0.8438981427233
Val 0    0.26930853190444576
Val 1    0.23694167589516424


In [14]:
C = best_trials['params_C'][0]
l1_ratio = best_trials['params_l1_ratio'][0]
indep_vars = best_trials['user_attrs_final_features'][0]

print('C       ', C)
print('l1_ratio', l1_ratio)
print('Val 0   ', best_trials['values_0'][0])
print('Val 1   ', best_trials['values_1'][0])

C        0.1448202507113456
l1_ratio 0.06102588367650874
Val 0    0.24834325638615182
Val 1    0.20039682539682538
