In [9]:
import pandas as pd
from sklearn.calibration import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestCentroid
import os

In [16]:
# Tuned hyper-parameters, one dictionary per (annotation method, dataset variant).
# Each dictionary is unpacked as keyword arguments into the matching estimator.

# Glinert labels, dataset with combinations.
glinert_with_combinations_params = dict(
    n_estimators=200,
    learning_rate=0.005102526970639764,
    max_depth=2,
    subsample=0.9802199511834219,
    colsample_bytree=0.5040893101729162,
    min_child_weight=2,
)

# Glinert labels, dataset without combinations.
# NOTE(review): C is an integer of ~4.3e15 while every other tuned value is a
# float; this looks like a lost decimal point. Confirm against the tuning log.
glinert_without_combinations_params = dict(
    C=4313419626880376,
    fit_intercept=True,
    solver='liblinear',
    max_iter=905,
    penalty='l1',
)

# Blau labels, dataset with combinations.
blau_with_combinations_params = dict(
    n_estimators=214,
    learning_rate=0.08413767421235069,
    max_depth=2,
    subsample=0.8509639676276632,
    colsample_bytree=0.9536060098203626,
    min_child_weight=1,
)

# Blau labels, dataset without combinations.
blau_without_combinations_params = dict(
    shrink_threshold=0.06913327679322363,
)

In [15]:

def test_train(method, comb=False, test_size=0.2, random_state=42):
    """Load one annotated dataset and return an encoded train/test split.

    The workbook ``../excel/merged_comb_{method}.xlsx`` (``comb=True``) or
    ``../excel/merged_{method}.xlsx`` (``comb=False``) is read, the
    ``'target word_0'`` column is dropped, and rows whose
    ``method.capitalize()`` column is missing or equal to ``'-'`` are removed.
    Remaining missing values become empty strings.

    Every column except the last is treated as a categorical feature and
    one-hot encoded; the last column is used as the label.
    NOTE(review): this assumes the last column is the ``method.capitalize()``
    label column - confirm against the workbooks.

    Parameters
    ----------
    method : str
        Annotation scheme name used in the file name (e.g. ``'glinert'``).
    comb : bool, default False
        Whether to load the "combinations" variant of the dataset.
    test_size : float, default 0.2
        Fraction of rows held out for testing.
    random_state : int, default 42
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with dense one-hot features and
        integer-encoded labels.

    Raises
    ------
    ValueError
        If the test split contains a label that never occurs in the training
        split (raised by ``LabelEncoder.transform``).
    """
    target_col = method.capitalize()
    prefix = 'merged_comb' if comb else 'merged'
    path = os.path.join('../excel', f'{prefix}_{method}.xlsx')

    # Build the cleaned frame without in-place mutation so no step operates
    # on a filtered view of an earlier frame.
    df = (
        pd.read_excel(path)
        .replace({None: pd.NA})
        .drop(columns=['target word_0'])
        .dropna(subset=[target_col])
    )
    df = df[df[target_col] != '-'].fillna('')

    X = df.iloc[:, :-1].astype(str)
    y = df.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
    # keyword was removed in 1.4; support both spellings.
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    # Fit on the training rows only; categories unseen in training are encoded
    # as all-zero columns in the test set (handle_unknown='ignore').
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.transform(X_test)

    # Fit the label mapping once, on the training labels, and reuse it for the
    # test labels. Re-fitting on the test labels would assign different
    # integer codes whenever the two splits contain different class sets.
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)
    return X_train, X_test, y_train, y_test

In [22]:
import warnings
# Suppress every warning for the rest of the kernel session so the comparison
# output below stays readable. NOTE(review): this also hides any convergence
# or deprecation warnings the estimators may emit.
warnings.filterwarnings('ignore')

def test_optimization():
    """Compare tuned and default hyper-parameters for each method/dataset pair.

    For every ``(method, comb)`` combination the matching estimator is trained
    25 times with the tuned parameters and 25 times with its defaults, always
    on the split returned by ``test_train(method, comb)``. The mean test
    accuracy of both variants is printed, followed by the classification
    report of the last run of the better variant (the default model's report
    is shown when the means are equal).

    Returns
    -------
    None
        Results are printed only.
    """
    methods = {
        ('glinert', True): (XGBClassifier, glinert_with_combinations_params),
        ('glinert', False): (LogisticRegression, glinert_without_combinations_params),
        ('blau', True): (XGBClassifier, blau_with_combinations_params),
        ('blau', False): (NearestCentroid, blau_without_combinations_params),
    }
    n_runs = 25

    for (method, comb), (model_cls, tuned_params) in methods.items():
        # test_train splits with a fixed seed, so every call returns the same
        # data; load it once instead of re-reading the workbook on every run.
        # The repeated runs therefore only average out randomness inside the
        # estimators, not variation between splits.
        X_train, X_test, y_train, y_test = test_train(method, comb)

        result = {'optimized': [], 'not_optimized': []}
        for _ in range(n_runs):
            opt_model = model_cls(**tuned_params)
            opt_model.fit(X_train, y_train)
            y_pred_opt = opt_model.predict(X_test)
            result['optimized'].append(accuracy_score(y_test, y_pred_opt))

            not_opt_model = model_cls()
            not_opt_model.fit(X_train, y_train)
            y_pred = not_opt_model.predict(X_test)
            result['not_optimized'].append(accuracy_score(y_test, y_pred))

        optimized_mean = sum(result['optimized']) / len(result['optimized'])
        not_optimized_mean = sum(result['not_optimized']) / len(result['not_optimized'])

        print(f'optimized mean: {optimized_mean}')
        print(f'not optimized mean: {not_optimized_mean}')

        if optimized_mean > not_optimized_mean:
            print(f'{method} with combinations={comb} is better with optimization')
            print(classification_report(y_test, y_pred_opt))
        elif optimized_mean == not_optimized_mean:
            # A tie is not evidence for either variant; say so explicitly
            # instead of reporting the default model as "better".
            print(f'{method} with combinations={comb} performs the same with and without optimization')
            print(classification_report(y_test, y_pred))
        else:
            print(f'{method} with combinations={comb} is better without optimization')
            print(classification_report(y_test, y_pred))



# Run the tuned-vs-default comparison; the results are printed as cell output.
test_optimization()

optimized mean: 0.5242718446601944
not optimized mean: 0.4854368932038836
glinert with combinations=True is better with optimization
              precision    recall  f1-score   support

           0       0.11      0.07      0.09        14
           1       0.52      0.68      0.59        41
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         2
           4       0.62      0.60      0.61        42

    accuracy                           0.52       103
   macro avg       0.25      0.27      0.26       103
weighted avg       0.48      0.52      0.50       103

optimized mean: 0.5339805825242719
not optimized mean: 0.5339805825242719
glinert with combinations=False is better without optimization
              precision    recall  f1-score   support

           0       0.20      0.29      0.24        14
           1       0.56      0.68      0.62        41
           2       0.00      0.00      0.00         4
           3       0.00 

As we can see:

- For Glinert, it is best not to use the combinations, and hyperparameter tuning makes no difference compared with the default parameters.
- For Blau, it is best to use the combinations, without Optuna's tuning.

Now we will train the best model for each approach on the whole dataset.

In [25]:
def encoded_dataset(method, comb=True):
    """Load one annotated dataset and return it fully encoded (no split).

    The workbook ``../excel/merged_comb_{method}.xlsx`` (``comb=True``) or
    ``../excel/merged_{method}.xlsx`` (``comb=False``) is read, the
    ``'target word_0'`` column is dropped, and rows whose
    ``method.capitalize()`` column is missing or equal to ``'-'`` are removed.
    Remaining missing values become empty strings.

    Every column except the last is one-hot encoded as a categorical feature;
    the last column is integer-encoded as the label.
    NOTE(review): this assumes the last column is the ``method.capitalize()``
    label column - confirm against the workbooks.
    NOTE(review): the fitted encoders are local to this function and are not
    returned, so callers cannot use them to encode new samples or to decode
    predictions.

    Parameters
    ----------
    method : str
        Annotation scheme name used in the file name (e.g. ``'blau'``).
    comb : bool, default True
        Whether to load the "combinations" variant of the dataset.

    Returns
    -------
    tuple
        ``(X, y)``: dense one-hot feature matrix and integer label vector.
    """
    excel_folder = '../excel'
    target_col = method.capitalize()
    prefix = 'merged_comb' if comb else 'merged'
    path = os.path.join(excel_folder, f'{prefix}_{method}.xlsx')

    # Build the cleaned frame without in-place mutation so no step operates
    # on a filtered view of an earlier frame.
    df = (
        pd.read_excel(path)
        .replace({None: pd.NA})
        .drop(columns=['target word_0'])
        .dropna(subset=[target_col])
    )
    df = df[df[target_col] != '-'].fillna('')

    X = df.iloc[:, :-1].astype(str)
    y = df.iloc[:, -1]

    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
    # keyword was removed in 1.4; support both spellings.
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    X = encoder.fit_transform(X)

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    return X, y

Model for Glinert

In [26]:
# Final Glinert model: logistic regression on the dataset WITHOUT combinations.
# The tuned parameters were found for that variant, so the data must be loaded
# with comb=False; the function's default (comb=True) would train on the
# combinations workbook instead.
glinert_final_model = LogisticRegression(**glinert_without_combinations_params)
X, y = encoded_dataset('glinert', comb=False)
glinert_final_model.fit(X, y)

Model for Blau

In [27]:
# Final Blau model: XGBoost with default hyper-parameters, trained on the
# full dataset that includes the combinations.
X, y = encoded_dataset('blau', comb=True)
blau_final_model = XGBClassifier()
blau_final_model.fit(X, y)