In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import optuna
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [38]:
df=pd.read_csv("suicide.csv")

In [39]:
index = df['suicides_no'] > df['suicides_no'].mean()
df.loc[index, 'suicides_no'] = 'High level of suicide'
df.loc[~index, 'suicides_no'] = 'Low level of suicide'

  df.loc[index, 'suicides_no'] = 'High level of suicide'


In [40]:
SEED = 42 # как random_state
TARGET = 'suicides_no'
FEATURES = df.columns.drop(TARGET)

NUMERICAL = df[FEATURES].select_dtypes('number').columns
print(f"Numerical features: {', '.join(NUMERICAL)}")

CATEGORICAL = pd.Index(np.setdiff1d(FEATURES, NUMERICAL))
print(f"Categorical features: {', '.join(CATEGORICAL)}")

Numerical features: year, population, suicides/100k pop, HDI for year, gdp_per_capita ($)
Categorical features:  gdp_for_year ($) , age, country, country-year, generation, sex


In [28]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=TARGET), df[TARGET],
                                                    test_size=0.2, random_state=SEED,
                                                    stratify=df[TARGET])
num_imputer = SimpleImputer(strategy='mean')
train_num_imputed = num_imputer.fit_transform(X_train[NUMERICAL])

scaler = MinMaxScaler()
train_num_scaled = scaler.fit_transform(train_num_imputed)

cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
train_cat_imputed = cat_imputer.fit_transform(X_train[CATEGORICAL])

encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False) # ?
train_cat_encoded = encoder.fit_transform(train_cat_imputed)

train_preprocessed = np.concatenate((train_num_scaled, train_cat_encoded), axis=1)

columns = np.append(NUMERICAL, encoder.get_feature_names_out(CATEGORICAL))
pd.DataFrame(train_preprocessed, columns=columns, index=X_train.index).head()

Unnamed: 0,year,population,suicides/100k pop,HDI for year,gdp_per_capita ($),"gdp_for_year ($) _1,011,797,457,139","gdp_for_year ($) _1,016,418,229","gdp_for_year ($) _1,018,847,043,277","gdp_for_year ($) _1,022,191,296","gdp_for_year ($) _1,023,196,003,075",...,country-year_Uzbekistan2011,country-year_Uzbekistan2012,country-year_Uzbekistan2013,country-year_Uzbekistan2014,generation_G.I. Generation,generation_Generation X,generation_Generation Z,generation_Millenials,generation_Silent,sex_male
3739,0.774194,0.013992,0.002889,0.63592,0.375723,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
11438,0.387097,0.032543,0.32391,0.63592,0.036518,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
27300,0.258065,0.005143,0.021692,0.63592,0.039088,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16612,0.129032,0.01086,0.077521,0.63592,0.022791,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3131,0.774194,0.000425,0.0,0.63592,0.132878,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [29]:
numerical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

preprocessors = ColumnTransformer(transformers=[
    ('num', numerical_pipe, NUMERICAL),
    ('cat', categorical_pipe, CATEGORICAL)
])

pipe = Pipeline([
    ('preprocessors', preprocessors),
    ('model', LogisticRegression())
])

pipe.fit(X_train, y_train)

In [32]:
print(f"Train ROC-AUC: {calculate_roc_auc(pipe, X_train, y_train):.4f}")
print(f"Test ROC-AUC: {calculate_roc_auc(pipe, X_test, y_test):.4f}")

Train ROC-AUC: 0.9958
Test ROC-AUC: 0.9881


# OPTUNA

In [47]:
def objective(trial):
    logistic_regression_C = trial.suggest_loguniform('logistic_regression_C', 1e-4, 1e2)
    pipe.named_steps['model'].C = logistic_regression_C
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    return accuracy
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print('Best trial:')
best_trial = study.best_trial
print('  Value: {:.4f}'.format(best_trial.value))
print('  Params: ')
for key, value in best_trial.params.items():
    print('    {}: {}'.format(key, value))
best_pipe = pipe


[I 2024-06-14 14:14:49,199] A new study created in memory with name: no-name-d411e879-c9c4-4ddc-b03a-a0d63fc4a037
  logistic_regression_C = trial.suggest_loguniform('logistic_regression_C', 1e-4, 1e2)
[I 2024-06-14 14:14:55,026] Trial 0 finished with value: 0.9541696621135873 and parameters: {'logistic_regression_C': 0.39503028741210927}. Best is trial 0 with value: 0.9541696621135873.
  logistic_regression_C = trial.suggest_loguniform('logistic_regression_C', 1e-4, 1e2)
[I 2024-06-14 14:14:59,563] Trial 1 finished with value: 0.9293673616103523 and parameters: {'logistic_regression_C': 0.0425868856810804}. Best is trial 0 with value: 0.9541696621135873.
  logistic_regression_C = trial.suggest_loguniform('logistic_regression_C', 1e-4, 1e2)
[I 2024-06-14 14:15:01,918] Trial 2 finished with value: 0.8306973400431344 and parameters: {'logistic_regression_C': 0.00025194233926271407}. Best is trial 0 with value: 0.9541696621135873.
  logistic_regression_C = trial.suggest_loguniform('logisti

Best trial:
  Value: 0.9551
  Params: 
    logistic_regression_C: 2.6910996144553994
