# Traditional Machine Learning Models

This notebook focuses on implementing traditional machine learning models, each tailored to a specific annotator. The models are solely trained on the annotator's labeled data and undergo hyperparameter fine-tuning before.

Additionally, the performance of different feature extraction approaches on the dataset is compared, namely CountVectorizer, TfidfVectorizer, and Transformer ([BERT](https://huggingface.co/dbmdz/bert-base-german-cased)).

Training is conducted on a dataset augmented twice. The training process is conducted in two stages: initially, a binary classification determines whether a data point exhibits sexism, followed by a more detailed classification of the level of sexism for each predicted instance.

### Import and Installation of Libraries

In [1]:
!pip install torch -q
!pip install imbalanced-learn -q
!pip install xgboost -q
!pip install lightgbm -q
!pip install catboost -q
!pip install mlxtend -q
!pip install tqdm -q
!pip install gensim -q
!pip install transformers -q
!python -m spacy download de_core_news_md -q
!pip install spacy -q
!pip install nltk -q

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_md')


In [2]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import mlxtend
import spacy
import os
import pickle
import logging

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, mean_squared_error, r2_score, make_scorer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier
from scipy.spatial.distance import jensenshannon

from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertForSequenceClassification
from catboost import CatBoostClassifier

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from typing import Dict
from scipy.stats import randint

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['LOGGING_LEVEL'] = 'INFO' 
logging.basicConfig(level=os.environ.get('LOGGING_LEVEL', 'WARNING'),
                    format='%(asctime)s - %(levelname)s - %(message)s')

In [4]:
ANNOTATOR_COLUMNS = ['A001', 'A002', 'A003', 'A004', 'A005', 'A007', 'A008', 'A009', 'A010', 'A012']
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#nlp = spacy.load('de_core_news_md')

checkpoint = "google-bert/bert-base-german-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model_bert = AutoModel.from_pretrained(checkpoint)

### Data Preparation

In [16]:
def lemma(text):
    doc = nlp(text)
    result = ' '.join([
        token.text if token.is_punct else token.lemma_
        for token in doc
        if not token.is_stop
    ])
    return result.lower()

def get_data(classes, lemmat, path):
    """
    Processes the input data based on specified classes and lemmatization option.
    """
    df = pd.read_csv(path)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    df = df[:100]
    
    if lemmat:
        df['text'] = df['text'].apply(lambda x: lemma(x))
    
    if classes==2:
        df[ANNOTATOR_COLUMNS] = df[ANNOTATOR_COLUMNS].replace([2, 3, 4], 1)

    elif classes==4:
        replace_dict = {0: -1, 1: 0, 2: 1, 3: 2, 4: 3}
        df[ANNOTATOR_COLUMNS] = df[ANNOTATOR_COLUMNS].replace(replace_dict)

    elif classes != 5:
        print('Specify right nr of classes!')
    
    dataframes = {}

    for col in ANNOTATOR_COLUMNS:
        df_task = df[['text', col]].dropna()
        df_task = df_task[df_task[col] != -1]
        dataframes[col] = df_task
    
    return dataframes

In [17]:
eng_stopwords = stopwords.words('german')

def NLP1(text):
    text_tok = word_tokenize(text)
    
    eng_stopwords = stopwords.words('german')
    text_stp = [word for word in text_tok if (word.lower() not in eng_stopwords) and word.isalpha()]
    
    stemmer = SnowballStemmer(language='german')
    text_stm = [stemmer.stem(word) for word in text_stp]
    return text_stm

In [18]:
# Uses pickle to save and load vectorizers to/from disk.
vecs = {
    'A001': None, 'A002': None, 'A003': None, 'A004': None, 'A005': None, 'A007': None,
     'A008': None, 'A009': None, 'A010': None, 'A012': None
} 

def get_text_features(df, method, train, anno):
    """
    Generate text features from the dataset using different vectorization methods (indicated by parameter).
    """
    vectorizer_filename = f'{anno}_{method.lower()}_vectorizer.pkl'
    
    if method == 'CountVectorizer':
        if train:
            vectorizer = CountVectorizer()
            X = vectorizer.fit_transform(df['text'])
            with open(vectorizer_filename, 'wb') as f:
                pickle.dump(vectorizer, f)
            vecs[anno] = vectorizer
            return X
        else:
            if vecs[anno] is None:
                with open(vectorizer_filename, 'rb') as f:
                    vecs[anno] = pickle.load(f)
            return vecs[anno].transform(df['text'])
    
    elif method == 'TfidfVectorizer':
        if train:
            vectorizer = TfidfVectorizer()
            X = vectorizer.fit_transform(df['text'])
            with open(vectorizer_filename, 'wb') as f:
                pickle.dump(vectorizer, f)
            vecs[anno] = vectorizer
            return X
        else:
            if vecs[anno] is None:
                with open(vectorizer_filename, 'rb') as f:
                    vecs[anno] = pickle.load(f)
            return vecs[anno].transform(df['text'])
    
    elif method == 'Transformer':
        
        inputs = tokenizer(df, return_tensors='pt', max_length=64, truncation=True, padding='max_length')
        inputs.to(device)
        model_bert.to(device)
        with torch.no_grad():
            outputs = model_bert(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

In [19]:
def compute_class_weights(df, anno):
    """
    Computes class weights for a specific annotator's column in the DataFrame.
    """
    y = df[anno].values
    classes = np.unique(y)
    class_weights = compute_class_weight('balanced', classes=classes, y=y)
    
    return dict(zip(classes, class_weights))

### Define Training Methods for each Model

The selected models are:
- Random Forest
- Extreme Gradient Boosting (XGB)
- LightGBM
- CatBoost
- Support Vector Machine (SVM)


In [21]:
def train_random_forest(X_train, y_train, class_weights, anno, method, paras):
    
   # if method=='Transformer':
   #     X_train = np.array([np.array(x) for x in X_train])
    
    model = RandomForestClassifier(random_state=42,
                                   class_weight=class_weights,
                                   bootstrap=paras['bootstrap'],
                                   max_depth=paras['max_depth'],
                                   min_samples_leaf=paras['min_samples_leaf'],
                                   min_samples_split=paras['min_samples_split'],
                                   n_estimators=paras['n_estimators']
                                  )
    model.fit(X_train, y_train)
    return model

def train_xgboost(X_train, y_train, anno, paras):
    if anno=='A002':
        X_train = np.array(X_train.tolist())
    
    model = xgb.XGBClassifier(random_state=42,
                              colsample_bytree=paras['colsample_bytree'],
                              learning_rate=paras['learning_rate'],
                              max_depth=paras['max_depth'],
                              n_estimators=paras['n_estimators'],
                              subsample=paras['subsample']
                             )
    
    model.fit(X_train, y_train)
    return model

def train_lightgbm(X_train, y_train, class_weights, anno, paras):
    X_train = X_train.astype('float32')
    
    model = lgb.LGBMClassifier(class_weight=class_weights,
                               max_depth=paras['max_depth'],
                               learning_rate=paras['learning_rate'],
                               n_estimators=paras['n_estimators'],
                               verbose=-1
                              )
    model.fit(X_train, y_train)
    return model

def train_catboost(X_train, y_train, class_weights, anno, paras):
    
    model = CatBoostClassifier(iterations=paras['iterations'],
                               learning_rate=paras['learning_rate'],
                               depth=paras['depth'],
                               logging_level='Silent',
                               class_weights=class_weights
                              )
    
    model.fit(X_train, y_train, verbose=True)
    return model

def train_svm(X_train, y_train, class_weights, anno, paras):
    
    model = SVC(C=paras['C'],
                kernel=paras['kernel'],
                gamma=paras['gamma'],
                class_weight=class_weights,
                verbose=False 
               )
    model.fit(X_train, y_train)
    return model

In [29]:
def get_features(df, method, train, anno):
    if method=='Transformer':
        df['text_features'] = df['text'].apply(lambda text: get_text_features(df=text, method='Transformer', train=True, anno=anno))
        X_train, X_test, y_train, y_test = train_test_split(df['text_features'].tolist(), df[anno], test_size=0.15, random_state=42)
        #X_train = df['text_features'].tolist()
        #y_train = df[anno]
        return X_train, X_test, y_train, y_test
    
        #X_train = df['text'].apply(lambda text: get_text_features(text, method='Transformer', train=train, anno=anno))
        #y_train = df[anno].values
        #return X_train, y_train
    
    else:
        train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
        X_train = get_text_features(train_df, method=method, train=True, anno=anno)
        X_test = get_text_features(test_df, method=method, train=False, anno=anno)
        y_train = train_df[anno].values
        y_test = test_df[anno].values
        return X_train, X_test, y_train, y_test

In [30]:
"""
Determined the optimal model, feature extraction technique, and parameter combinations using grid search 
for each annotator individually. Results were saved in dictionaries to ensure reproducibility, and findings 
were categorized for both binary and multi-class classifications.
"""

best_combis_bin = {
    'A001': {'method': 'Transformer', 'model': 'Catboost', 
            'paras': {'depth': 6, 'iterations': 500, 'learning_rate': 0.01}},
    'A002': {'method': 'CountVectorizer', 'model': 'Rf', 
            'paras': {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}},
    'A003': {'method': 'Transformer', 'model': 'Rf', 
            'paras': {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}},
    'A004': {'method': 'Transformer', 'model': 'Xgb', 
            'paras': {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.8}},
    'A005': {'method': 'Transformer', 'model': 'Xgb', 
            'paras': {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}},
    'A007': {'method': 'TfidfVectorizer', 'model': 'SVM', 
            'paras': {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}},
    'A008': {'method': 'CountVectorizer', 'model': 'SVM', 
            'paras': {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}},
    'A009': {'method': 'Transformer', 'model': 'Xgb', 
            'paras': {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.8}},
    'A010': {'method': 'Transformer', 'model': 'Light', 
            'paras': {'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 100}},
    'A012': {'method': 'Transformer', 'model': 'Xgb', 
            'paras': {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 500, 'subsample': 0.8}}
}

best_combis_multi = {
    'A001': {'method': 'Transformer', 'model': 'Xgb', 
            'paras': {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}},
    'A002': {'method': 'Transformer', 'model': 'Xgb', 
            'paras': {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.8}},
    'A003': {'method': 'TfidfVectorizer', 'model': 'Xgb', 
            'paras': {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}},
    'A004': {'method': 'CountVectorizer', 'model': 'Rf', 
            'paras': {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}},
    'A005': {'method': 'Transformer', 'model': 'Rf', 
            'paras': {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}},
    'A007': {'method': 'CountVectorizer', 'model': 'Light', 
            'paras': {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}},
    'A008': {'method': 'Transformer', 'model': 'Xgb', 
            'paras': {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}},
    'A009': {'method': 'TfidfVectorizer', 'model': 'Light', 
            'paras': {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100}},
    'A010': {'method': 'CountVectorizer', 'model': 'Rf', 
            'paras': {'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}},
    'A012': {'method': 'CountVectorizer', 'model': 'Rf', 
            'paras': {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}},
}

### Prepare and start Training

In [31]:
def evaluate_model(model, X_test, y_test, method, model_type='traditional'):
        if method != 'Transformer':
            X_test = X_test.astype('float32')
            
        #X_test = X_test.reshape(-1, 1)
        if model_type == 'traditional':
            y_pred = model.predict(X_test)
            
        accuracy = round(accuracy_score(y_test, y_pred), 3)
        precision = round(precision_score(y_test, y_pred, average='macro', zero_division=1), 3)
        recall = round(recall_score(y_test, y_pred, average='macro', zero_division=1), 3)
        f1 = round(f1_score(y_test, y_pred, average='macro', zero_division=1), 3)
    
        return accuracy, precision, recall, f1

In [32]:
def evaluate_model(model, X_test, y_test, method):
    """
    Evaluates the performance of the trained model on the test set. 
    """
    
    if method != 'Transformer':
        X_test = X_test.astype('float32')

    #X_test = X_test.reshape(-1, 1)
    y_pred = model.predict(X_test)

    accuracy = round(accuracy_score(y_test, y_pred), 3)
    precision = round(precision_score(y_test, y_pred, average='macro', zero_division=1), 3)
    recall = round(recall_score(y_test, y_pred, average='macro', zero_division=1), 3)
    f1 = round(f1_score(y_test, y_pred, average='macro', zero_division=1), 3)

    return accuracy, precision, recall, f1

In [33]:
def train_models(dataframes, best_combis):  
    """
    Trains models for each annotator based on the best parameter config from grid search.
    """
    
    models = {}
    for anno, df in dataframes.items():
        print(f"Training model(s) for Annotator {anno}...")

        method = best_combis[anno]['method']
        model_name = best_combis[anno]['model']
        paras = best_combis[anno]['paras']
        print(f'Method: {method}')

        df = df.sample(frac=1, random_state=42).reset_index(drop=True)
        class_weights_dict = compute_class_weights(df, anno)
        X_train, X_test, y_train, y_test = get_features(df, method, True, anno)

        if method == 'Transformer':
            X_train = np.concatenate(X_train, axis=0)
            X_test = np.concatenate(X_test, axis=0)
            
        model = None
        
        if model_name=='Rf':
            model = train_random_forest(X_train, y_train, class_weights_dict, anno, method, paras)
        elif model_name=='Xgb':
            model = train_xgboost(X_train, y_train, anno, paras)
        elif model_name=='Light':
            model = train_lightgbm(X_train, y_train, class_weights_dict, anno, paras)
        elif model_name=='Catboost':
            model =  train_catboost(X_train, y_train, class_weights_dict, anno, paras)
        elif model_name=='SVM':
            model = train_svm(X_train, y_train, class_weights_dict, anno, paras)

        metrics = evaluate_model(model, X_test, y_test, method)
        print(metrics)

        models[anno] = {
            'Model name': model_name,
            'Model': model,
            'Text feature method': method,
            'Metrics': metrics
        }
    return models

In [34]:
path = 'df_train_original.csv'
classes = 2
lemmat = False
dataframes = get_data(classes=classes, lemmat=lemmat, path=path)
best_combis = best_combis_bin if classes == 2 else best_combis_multi

models = train_models(dataframes, best_combis)

Training model(s) for Annotator A001...
Method: Transformer
(1.0, 1.0, 1.0, 1.0)
Training model(s) for Annotator A002...
Method: CountVectorizer
(0.867, 0.933, 0.5, 0.464)
Training model(s) for Annotator A003...
Method: Transformer
(0.8, 0.875, 0.75, 0.762)
Training model(s) for Annotator A004...
Method: Transformer
(0.75, 0.875, 0.5, 0.429)
Training model(s) for Annotator A005...
Method: Transformer
(0.333, 0.25, 0.25, 0.25)
Training model(s) for Annotator A007...
Method: TfidfVectorizer
(0.5, 0.75, 0.5, 0.333)
Training model(s) for Annotator A008...
Method: CountVectorizer
(0.6, 0.8, 0.5, 0.375)
Training model(s) for Annotator A009...
Method: Transformer
(0.875, 0.917, 0.833, 0.855)
Training model(s) for Annotator A010...
Method: Transformer
(0.6, 0.568, 0.556, 0.55)
Training model(s) for Annotator A012...
Method: Transformer
(0.667, 0.667, 0.611, 0.603)


In [None]:
print(models)

### Apply trained Models for Inference

In [None]:
def predict_and_update(df, models, binary_prediction):
    """
    Predicts values for cells in the dataframe based on trained models stored in 'models'.
    """
    
    df_predicted = df.copy()

    # Iterate over each annotator and model
    for anno, model_info in tqdm(models.items(), desc='Annotators', total=len(models)):

        model = model_info['Model']
        method = model_info['Text feature method']

        if method == 'CountVectorizer':
            text_features = get_text_features(df, method='CountVectorizer', train=False, anno=anno).toarray()
        elif method == 'TfidfVectorizer':
            text_features = get_text_features(df, method='TfidfVectorizer', train=False, anno=anno).toarray()
        elif method == 'Transformer':
            text_features = df['text'].apply(lambda text: get_text_features(df=text, method='Transformer', train=False, anno=anno))
            text_features = list(text_features)

        for idx, value in enumerate(df[anno]):
            if value == 1:   
                text_to_predict = text_features[idx]
                if method=='Transformer':
                    text_to_predict = np.concatenate(text_to_predict, axis=0)
                predicted_value = model.predict([text_to_predict])[0]
                
                if binary_prediction:
                    df_predicted.loc[idx, anno] = predicted_value #+ 1  # Increment by 1 as requested
                else: 
                    df_predicted.loc[idx, anno] = predicted_value + 1
                    
            if binary_prediction:
                df_predicted.loc[idx, anno] = -1
                
    return df_predicted

In [132]:
df_test = pd.read_csv('df_comp_bin_3.csv')
df_new = predict_and_update(df_test, models)
df_new = df_new.drop('Unnamed: 0', axis=1)
df_new.head(2)

Annotators: 10it [00:52,  5.26s/it]


In [133]:
#df_new.to_csv('df_com_bin_3.csv')
#df_new.to_csv('df_comp_multi_3.tsv', sep='\t', index=False)

Unnamed: 0,id,text,A008,A007,A003,A005,A004,A012,A009,A002,A001,A010
0,f3b81af2f6852bf1b9896629525d2f41,"Ja, Frauen können krankhaft eifersüchtig werde...",2,4,4,0,0,2,0,0,0,0
1,cf8b8bac7165144bb62b399a98843366,"Ich hau' auf jede Religion gern drauf, aber de...",0,0,2,0,0,0,0,0,0,0
2,0c45cdf4cca5eec566d6dd53653b532b,"Wow, Vorstadtmama, perfekte Dransleischen. Und...",0,4,4,0,0,2,0,0,2,3
3,3a60877d2c04ba65f457f7cc3e003169,gratuliere USA\ndie erste schwarze & frau als ...,0,0,0,0,0,0,0,0,0,0
4,f389b63364d8da93860e3c7e6569bf5b,Frauen wählten mehrheitlich Biden ...\nwürden ...,4,0,4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1981,2f7322c62b63ff74ec945bb38ed9f258,Ihre Partnerin schämt sich wahrscheinlich für ...,-1,-1,-1,-1,-1,-1,0,0,-1,2
1982,ec5fe35f542aac2f3155177dbf2731c2,glaube ich diese These einfach nicht.Grund: He...,-1,-1,-1,-1,-1,-1,0,0,-1,0
1983,6674986a02bab67b011df90cc7396a96,Damit die Ehefrau dann ein Dauervisum erhält m...,-1,-1,-1,-1,-1,-1,0,0,-1,0
1984,2a3774eba33afe18af2f0d312d081bb3,Ich selbst habe in meiner Hierarchie zwei Frau...,-1,-1,-1,-1,-1,-1,0,0,-1,0


### Inference

In [64]:
def get_test_df(): # to predict df, has 1 and NaN
    df_test = pd.read_csv('test.csv')
    df_test = df_test.drop('Unnamed: 0', axis=1)
    df_test.replace(0, np.nan, inplace=True) # replace 0 with NaN 
    valid_counts = df_test[ANNOTATOR_COLUMNS].apply(lambda x: x.notna()) # add nr annos
    df_test['N'] = valid_counts.sum(axis=1)
    return df_test

In [2]:
def predict(models, df, text_vectorized, bin):
    #df[ANNOTATOR_COLUMNS] = df[ANNOTATOR_COLUMNS].replace(0, -1)

    for index, row in tqdm(df.iterrows(), total=len(df), position=0, leave=True, desc="Predicting"):
        for annotator in ANNOTATOR_COLUMNS:
            
            if bin:
                if not pd.isna(row[annotator]): # dataframe has 1 and NaN
                    model = models.get(annotator)
                    if model is not None:
                        pred = model.predict(text_vectorized[index].reshape(1, -1)) 
                        df.at[index, annotator] = pred[0]     
                        
            else: 
                 if row[annotator] == -1 or row[annotator] == -1.0: # weil um -2 dekr, eig 1, also eig sexis
                    model = models.get(annotator)
                    if model is not None:
                        pred = model.predict(text_vectorized[index].reshape(1, -1)) 
                        df.at[index, annotator] = pred[0]  
    return df


In [27]:
_, _, df_mul = get_train_df(split=3, path='df_train_original.csv', upsample=False, lemmat=False)

*Other code used for hyperparameter tuning via grid search.*

In [None]:
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted', zero_division=1),
    'recall': make_scorer(recall_score, average='weighted', zero_division=1),
    'f1': make_scorer(f1_score, average='weighted', zero_division=1)
}

def hyperparameter_search_rf(X_train, y_train, class_weights):
    
    param_grid = {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 10, 20],
        'min_samples_leaf': [1, 4, 10],
        'bootstrap': [False],
        'class_weight': [class_weights]
    }
    
    # if method=='Transformer':
    #    X_train = np.array([np.array(x) for x in X_train])
    
    model = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, scoring=scoring, refit='accuracy', n_jobs=-1, verbose=0)
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters for RandomForest: {grid_search.best_params_}")
    print(f"Best score for RandomForest: {grid_search.best_score_}")
    return grid_search.best_estimator_

def hyperparameter_search_xgb(X_train, y_train):
    
    param_grid = {
        'n_estimators': [100, 200, 500],
        'max_depth': [3, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    
    model = xgb.XGBClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, scoring=scoring, refit='accuracy', n_jobs=-1, verbose=0)
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters for XGBoost: {grid_search.best_params_}")
    print(f"Best score for XGBoost: {grid_search.best_score_}")
    return grid_search.best_estimator_

def hyperparameter_search_lgb(X_train, y_train, class_weights):
    X_train = X_train.astype('float32')
    
    param_grid = {
        'n_estimators': [100],
        'max_depth': [10],
        'learning_rate': [0.01, 0.1, 0.2],
        'class_weight': [class_weights]
    }
    
    model = lgb.LGBMClassifier(verbose=-1)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, scoring=scoring, refit='accuracy', n_jobs=-1, verbose=0)
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters for LightGBM: {grid_search.best_params_}")
    print(f"Best score for LightGBM: {grid_search.best_score_}")
    return grid_search.best_estimator_

def hyperparameter_search_cb(X_train, y_train, class_weights=None):
    param_grid = {
        'iterations': [750],
        'depth': [5],
        'learning_rate': [0.05],
        'class_weights': [class_weights]
    }
    
    model = CatBoostClassifier(metric_period=1000)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, scoring=scoring, refit='accuracy', n_jobs=-1, verbose=0)
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters for CatBoost: {grid_search.best_params_}")
    print(f"Best score for CatBoost: {grid_search.best_score_}")
    return grid_search.best_estimator_

In [None]:
dataframes = get_data(classes=4, lemmat=False, path='df_train_original.csv')
results = {}

for anno, df in dataframes.items():
    print(f"Training models for Annotator {anno}...")
    
    methods = ['TfidfVectorizer', 'CountVectorizer', 'TfidfVectorizer']
    
    for method in methods:
        print(method)
        
        X_train, X_test, y_train, y_test = get_features(df, method, True, anno)

        # for multi
        if method == 'Transformer':
            X_train = np.concatenate(X_train, axis=0)
            X_test = np.concatenate(X_test, axis=0)

        class_weights_dict = compute_class_weights(df, anno)
        model = None

        def evaluate_model(model, X_test, y_test, model_type='traditional'):
            if method != 'Transformer':
                X_test = X_test.astype('float32')
            if model_type == 'traditional':
                X_test = X_test[0]
                y_pred = model.predict(X_test)

            accuracy = round(accuracy_score(y_test, y_pred), 3)
            precision = round(precision_score(y_test, y_pred, average='macro', zero_division=1), 3)
            recall = round(recall_score(y_test, y_pred, average='macro', zero_division=1), 3)

            return accuracy, precision, recall

        model = hyperparameter_search_rf(X_train, y_train, class_weights_dict)
        print(f'RF: {evaluate_model(model, X_test, y_test)}')

        model = hyperparameter_search_xgb(X_train, y_train)
        print(f'Xgb: {evaluate_model(model, X_test, y_test)}')

        model = hyperparameter_search_lgb(X_train, y_train, class_weights_dict)
        print(f'Light: {evaluate_model(model, X_test, y_test)}')

        model = hyperparameter_search_cb(X_train, y_train, class_weights_dict)
        print(f'Cat: {evaluate_model(model, X_test, y_test)}')

        model = train_svm(X_train, y_train, class_weights_dict, anno)
        print(f'SVM: {evaluate_model(model, X_test, y_test)}')