In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
import ast
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.utils import resample

  from pandas.core import (


In [2]:
# Load the annotated texts together with their precomputed feature columns.
annotated_texts_df = pd.read_csv(
    "data/annotated_dataset/annotated_texts_repr.csv",
    sep=",",
    encoding="utf-8",
)

In [3]:
# Peek at the first two rows to check the label columns and the serialized feature columns.
annotated_texts_df.head(2)

Unnamed: 0,id,text,pop_sum,manichean,peoplecentrism,antielitism,emotional,polarization,tfidf,doc_embedding,doc_embedding_pos,linguistic_profile
0,ParlaMint-IT_2013-08-01-LEG17-Senato-sed-86.u153,"PETROCELLI . Signor Presidente, senatrici e se...",4,1,1,1,1,1,"[0.5361957907801886, 0.049413195954373046, 0.0...","[0.009776607354980394, 0.04375904489842546, -0...","[0.0025272382080579183, 0.002842237250819832, ...","[47.0, 1831.0, 38.95744680851064, 4.6773997569..."
1,ParlaMint-IT_2014-02-05-LEG17-Senato-sed-184.u79,Lo dico al senatore Casson e agli altri: capis...,3,0,1,1,1,1,"[0.46272910958786384, 0.09530122244710613, -0....","[0.01605109330957291, 0.024485928836790936, -0...","[0.003973030663484822, -0.023834898513667484, ...","[74.0, 1771.0, 23.93243243243243, 5.1573248407..."


In [4]:
def train_fit_multiple_models(X_train, X_test, y_train, y_test, include_bagging=False):
    """Fit a fixed set of baseline classifiers and print a test-set report for each.

    Every model is trained on ``(X_train, y_train)`` and evaluated on
    ``(X_test, y_test)`` with ``classification_report``. The models run in this
    order: LightGBM, linear SVC, RBF-kernel SVC, random forest, (optionally the
    three bagging ensembles), AdaBoost with its default base estimator, and
    AdaBoost over random forests.

    Parameters
    ----------
    X_train, X_test
        Feature matrices passed straight to each estimator's ``fit`` / ``predict``.
    y_train, y_test
        Label vectors for the two partitions.
    include_bagging : bool, default False
        When True, also run the three ``BaggingClassifier`` variants (default
        base estimator, SVC, random forest). They are off by default, which
        matches the function's previous behavior.

    Returns
    -------
    None
        Reports are printed, not returned.
    """
    # (report header, unfitted estimator) pairs, evaluated in list order.
    # The header strings are printed verbatim in front of each report.
    candidates = [
        ("=======LIGHTGBM: \n", LGBMClassifier(random_state=8)),
        ("=======LINEAR SVC: \n", LinearSVC(C=1.0, random_state=42)),
        ("=======NON-LINEAR SVC: \n",
         SVC(gamma='auto', C=0.1, kernel='rbf', random_state=42)),
        ("========RANDOM FOREST: \n",
         RandomForestClassifier(n_estimators=150,
                                criterion='gini',
                                max_depth=None,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0,
                                max_features='sqrt',
                                random_state=0,
                                n_jobs=-1)),
    ]

    if include_bagging:
        candidates += [
            ("========BAGGING: \n",
             BaggingClassifier(estimator=None, n_estimators=100, random_state=0)),
            ("========BAGGING SVC: \n",
             BaggingClassifier(estimator=SVC(C=1000), n_estimators=10, random_state=0)),
            ("========BAGGING RF: \n",
             BaggingClassifier(estimator=RandomForestClassifier(n_estimators=100),
                               n_estimators=100, random_state=0)),
        ]

    candidates += [
        ("========ADABOOST \n",
         AdaBoostClassifier(estimator=None, n_estimators=100, random_state=0)),
        ("========ADABOOST RF \n",
         AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=100),
                            n_estimators=100, random_state=0)),
    ]

    for header, clf in candidates:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        # zero_division=0 yields the same 0.00 entries as the default for classes
        # that are never predicted, without raising a warning for every report.
        print(header, classification_report(y_test, y_pred, zero_division=0))

In [5]:
def _search_and_report(estimator, param_distributions, n_iter, header,
                       X_train, X_test, y_train, y_test):
    """Run a 5-fold randomized search and print the test report of its best model."""
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_distributions,
        n_iter=n_iter,               # Number of parameter settings sampled
        scoring='accuracy',
        cv=5,
        verbose=2,
        random_state=42,
        n_jobs=-1
    )
    search.fit(X_train, y_train)

    # With the default refit=True the search has already retrained the winning
    # configuration on the whole of X_train, so best_estimator_ is ready to use;
    # fitting it again would only repeat that training.
    y_pred = search.best_estimator_.predict(X_test)
    # zero_division=0 gives the same 0.00 entries as the default for classes
    # that are never predicted, without raising a warning.
    print(header, classification_report(y_test, y_pred, zero_division=0))


def train_fit_best_params(X_train, X_test, y_train, y_test):
    """Tune LightGBM and Random Forest with randomized search and report both.

    For each model family a ``RandomizedSearchCV`` (5-fold, accuracy scoring,
    ``random_state=42``) is fitted on ``(X_train, y_train)``; the best
    estimator found is then evaluated on ``(X_test, y_test)`` and its
    ``classification_report`` is printed. LightGBM samples 50 configurations,
    Random Forest 20.

    Parameters
    ----------
    X_train, X_test
        Feature matrices passed to the estimators.
    y_train, y_test
        Label vectors for the two partitions.

    Returns
    -------
    None
        Reports are printed, not returned.
    """
    # scipy conventions used below: randint(low, high) excludes `high`;
    # uniform(loc, scale) samples from [loc, loc + scale].

    #Light GBM
    lgbm_space = {
        'n_estimators': randint(100, 1200),                # Range for boosting rounds
        'learning_rate': uniform(0.001, 0.3),              # Learning rate in [0.001, 0.301]
        'num_leaves': randint(20, 150),                    # Maximum number of leaves in one tree
        'max_depth': randint(3, 15),                       # Maximum depth of the tree
        'min_data_in_leaf': randint(10, 100),              # Minimum number of samples in a leaf
        'feature_fraction': uniform(0.5, 0.5),             # Proportion of features per tree, in [0.5, 1.0]
        'bagging_fraction': uniform(0.5, 0.5),             # Proportion of data per iteration, in [0.5, 1.0]
        'bagging_freq': randint(1, 10),                    # Frequency of bagging
        'lambda_l1': uniform(0, 5),                        # L1 regularization term
        'lambda_l2': uniform(0, 5)                         # L2 regularization term
    }
    _search_and_report(
        lgb.LGBMClassifier(verbose=-1, random_state=42),
        lgbm_space,
        50,
        "=======BEST PARAM LIGHTGBM:\n",
        X_train, X_test, y_train, y_test,
    )

    #Random Forest
    rf_space = {
        'n_estimators': randint(100, 500),                  # Number of trees
        'max_depth': randint(5, 15),                         # Maximum depth of each tree
        'min_samples_split': randint(2, 20),                 # Minimum samples needed to split a node
        'min_samples_leaf': randint(1, 20),                  # Minimum samples needed in each leaf
        'max_features': ['sqrt', 'log2', None],              # Number of features to consider at each split
        'bootstrap': [True, False],                          # Whether to use bootstrapping
        'class_weight': [None, 'balanced']                   # Handle class imbalance
    }
    _search_and_report(
        RandomForestClassifier(random_state=42),
        rf_space,
        20,
        "=======BEST PARAM RANDOM FOREST:\n",
        X_train, X_test, y_train, y_test,
    )


### Populism

In [6]:
# Hold out 25% of the texts, stratified on the populism score so that both
# partitions keep the same label mix.
df_pop_train, df_pop_test = train_test_split(
    annotated_texts_df,
    test_size=0.25,
    stratify=annotated_texts_df["pop_sum"],
    random_state=42,
)
y_train_pop, y_test_pop = (
    part["pop_sum"].values for part in (df_pop_train, df_pop_test)
)

#### TF-IDF

In [8]:
# Each "tfidf" cell is a list serialized as a string; parse the cells back into numeric rows.
X_train = np.array(df_pop_train["tfidf"].map(ast.literal_eval).tolist())
X_test = np.array(df_pop_test["tfidf"].map(ast.literal_eval).tolist())

train_fit_multiple_models(X_train, X_test, y_train_pop, y_test_pop)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 8130, number of used features: 300
[LightGBM] [Info] Start training from score -0.998283
[LightGBM] [Info] Start training from score -1.520072
[LightGBM] [Info] Start training from score -2.189872
[LightGBM] [Info] Start training from score -2.392620
[LightGBM] [Info] Start training from score -1.563170
               precision    recall  f1-score   support

           0       0.64      0.85      0.73       999
           1       0.42      0.41      0.41       593
           2       0.21      0.05      0.08       303
           3       0.29      0.05      0.08       248
           4       0.63      0.76      0.69       567

    accuracy                           0.57      2710
   macro avg       0.44      0.42      0.40      2710
weigh



               precision    recall  f1-score   support

           0       0.67      0.89      0.76       999
           1       0.49      0.44      0.46       593
           2       0.23      0.02      0.04       303
           3       0.29      0.02      0.04       248
           4       0.61      0.86      0.71       567

    accuracy                           0.61      2710
   macro avg       0.46      0.45      0.40      2710
weighted avg       0.53      0.61      0.54      2710



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

           0       0.37      1.00      0.54       999
           1       0.00      0.00      0.00       593
           2       0.00      0.00      0.00       303
           3       0.00      0.00      0.00       248
           4       0.00      0.00      0.00       567

    accuracy                           0.37      2710
   macro avg       0.07      0.20      0.11      2710
weighted avg       0.14      0.37      0.20      2710



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

           0       0.51      0.94      0.66       999
           1       0.42      0.20      0.28       593
           2       0.50      0.00      0.01       303
           3       0.00      0.00      0.00       248
           4       0.66      0.65      0.66       567

    accuracy                           0.53      2710
   macro avg       0.42      0.36      0.32      2710
weighted avg       0.47      0.53      0.44      2710

               precision    recall  f1-score   support

           0       0.66      0.77      0.71       999
           1       0.41      0.42      0.41       593
           2       0.17      0.08      0.11       303
           3       0.19      0.09      0.12       248
           4       0.59      0.70      0.64       567

    accuracy                           0.54      2710
   macro avg       0.40      0.41      0.40      2710
weighted avg       0.49      0.54      0.51      2710

               preci

In [9]:
# Randomized hyperparameter search (LightGBM, then Random Forest) on the populism target.
# Uses whichever X_train / X_test were assigned last: the TF-IDF features when run top to bottom.
# Long-running: 50 x 5 LightGBM fits followed by 20 x 5 Random Forest fits.
train_fit_best_params(X_train, X_test, y_train_pop, y_test_pop)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


KeyboardInterrupt: 

#### Doc Embedding

In [23]:
# Each "doc_embedding" cell is a list serialized as a string; parse the cells back into numeric rows.
X_train = np.array(df_pop_train["doc_embedding"].map(ast.literal_eval).tolist())
X_test = np.array(df_pop_test["doc_embedding"].map(ast.literal_eval).tolist())

train_fit_multiple_models(X_train, X_test, y_train_pop, y_test_pop)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32640
[LightGBM] [Info] Number of data points in the train set: 8130, number of used features: 128
[LightGBM] [Info] Start training from score -0.998283
[LightGBM] [Info] Start training from score -1.520072
[LightGBM] [Info] Start training from score -2.189872
[LightGBM] [Info] Start training from score -2.392620
[LightGBM] [Info] Start training from score -1.563170
               precision    recall  f1-score   support

           0       0.65      0.83      0.73       999
           1       0.40      0.39      0.39       593
           2       0.22      0.07      0.10       303
           3       0.26      0.06      0.10       248
           4       0.58      0.74      0.65       567

    accuracy                           0.56      2710
   macro avg       0.42      0.42      0.40      2710
weigh

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

           0       0.59      0.90      0.71       999
           1       0.49      0.26      0.34       593
           2       0.00      0.00      0.00       303
           3       0.00      0.00      0.00       248
           4       0.53      0.79      0.63       567

    accuracy                           0.56      2710
   macro avg       0.32      0.39      0.34      2710
weighted avg       0.43      0.56      0.47      2710



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

           0       0.37      1.00      0.54       999
           1       0.00      0.00      0.00       593
           2       0.00      0.00      0.00       303
           3       0.00      0.00      0.00       248
           4       0.00      0.00      0.00       567

    accuracy                           0.37      2710
   macro avg       0.07      0.20      0.11      2710
weighted avg       0.14      0.37      0.20      2710

               precision    recall  f1-score   support

           0       0.62      0.85      0.72       999
           1       0.41      0.38      0.39       593
           2       0.18      0.01      0.02       303
           3       0.50      0.02      0.04       248
           4       0.55      0.75      0.64       567

    accuracy                           0.56      2710
   macro avg       0.45      0.40      0.36      2710
weighted avg       0.50      0.56      0.49      2710

               preci

In [None]:
# Randomized hyperparameter search on the populism target.
# Uses whichever X_train / X_test were assigned last: the document embeddings when run top to bottom.
train_fit_best_params(X_train, X_test, y_train_pop, y_test_pop)

#### Doc Embedding POS

In [24]:
# Each "doc_embedding_pos" cell is a list serialized as a string; parse the cells back into numeric rows.
X_train = np.array(df_pop_train["doc_embedding_pos"].map(ast.literal_eval).tolist())
X_test = np.array(df_pop_test["doc_embedding_pos"].map(ast.literal_eval).tolist())

train_fit_multiple_models(X_train, X_test, y_train_pop, y_test_pop)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004112 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32640
[LightGBM] [Info] Number of data points in the train set: 8130, number of used features: 128
[LightGBM] [Info] Start training from score -0.998283
[LightGBM] [Info] Start training from score -1.520072
[LightGBM] [Info] Start training from score -2.189872
[LightGBM] [Info] Start training from score -2.392620
[LightGBM] [Info] Start training from score -1.563170
               precision    recall  f1-score   support

           0       0.66      0.82      0.73       999
           1       0.43      0.41      0.42       593
           2       0.25      0.08      0.12       303
           3       0.24      0.07      0.11       248
           4       0.57      0.74      0.65       567

    accuracy                           0.56      2710
   macro avg       0.43      0.42      0.41      2710
weigh

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

           0       0.60      0.89      0.71       999
           1       0.49      0.30      0.37       593
           2       0.00      0.00      0.00       303
           3       0.00      0.00      0.00       248
           4       0.52      0.80      0.63       567

    accuracy                           0.56      2710
   macro avg       0.32      0.40      0.34      2710
weighted avg       0.44      0.56      0.48      2710



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

           0       0.37      1.00      0.54       999
           1       0.00      0.00      0.00       593
           2       0.00      0.00      0.00       303
           3       0.00      0.00      0.00       248
           4       0.00      0.00      0.00       567

    accuracy                           0.37      2710
   macro avg       0.07      0.20      0.11      2710
weighted avg       0.14      0.37      0.20      2710

               precision    recall  f1-score   support

           0       0.61      0.84      0.71       999
           1       0.44      0.39      0.41       593
           2       0.34      0.03      0.06       303
           3       0.57      0.02      0.03       248
           4       0.55      0.76      0.64       567

    accuracy                           0.56      2710
   macro avg       0.50      0.41      0.37      2710
weighted avg       0.53      0.56      0.49      2710

               preci

In [None]:
# Randomized hyperparameter search on the populism target.
# Uses whichever X_train / X_test were assigned last: the POS document embeddings when run top to bottom.
train_fit_best_params(X_train, X_test, y_train_pop, y_test_pop)

#### Linguistic Profile

In [25]:
# Each "linguistic_profile" cell is a list serialized as a string; parse the cells back into numeric rows.
X_train = np.array(df_pop_train["linguistic_profile"].map(ast.literal_eval).tolist())
X_test = np.array(df_pop_test["linguistic_profile"].map(ast.literal_eval).tolist())

train_fit_multiple_models(X_train, X_test, y_train_pop, y_test_pop)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33046
[LightGBM] [Info] Number of data points in the train set: 8130, number of used features: 140
[LightGBM] [Info] Start training from score -0.998283
[LightGBM] [Info] Start training from score -1.520072
[LightGBM] [Info] Start training from score -2.189872
[LightGBM] [Info] Start training from score -2.392620
[LightGBM] [Info] Start training from score -1.563170
               precision    recall  f1-score   support

           0       0.59      0.79      0.67       999
           1       0.33      0.33      0.33       593
           2       0.09      0.01      0.02       303
           3       0.12      0.02      0.03       248
           4       0.55      0.67      0.61       567

    accuracy                           0.51      2710
   macro avg       0.34      0.37      0.33      2710
weigh

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

           0       0.41      0.99      0.58       999
           1       0.50      0.01      0.01       593
           2       0.00      0.00      0.00       303
           3       0.00      0.00      0.00       248
           4       0.69      0.37      0.48       567

    accuracy                           0.44      2710
   macro avg       0.32      0.27      0.21      2710
weighted avg       0.41      0.44      0.32      2710



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

           0       0.37      1.00      0.54       999
           1       0.00      0.00      0.00       593
           2       0.00      0.00      0.00       303
           3       0.00      0.00      0.00       248
           4       0.00      0.00      0.00       567

    accuracy                           0.37      2710
   macro avg       0.07      0.20      0.11      2710
weighted avg       0.14      0.37      0.20      2710

               precision    recall  f1-score   support

           0       0.54      0.85      0.66       999
           1       0.35      0.25      0.29       593
           2       0.00      0.00      0.00       303
           3       0.33      0.00      0.01       248
           4       0.53      0.67      0.59       567

    accuracy                           0.51      2710
   macro avg       0.35      0.35      0.31      2710
weighted avg       0.42      0.51      0.43      2710

               preci

In [None]:
# Randomized hyperparameter search on the populism target.
# Uses whichever X_train / X_test were assigned last: the linguistic-profile features when run top to bottom.
train_fit_best_params(X_train, X_test, y_train_pop, y_test_pop)

### Polarization

In [None]:
# Hold out 25% of the texts, stratified on the polarization label so that both
# partitions keep the same label mix.
df_pol_train, df_pol_test = train_test_split(
    annotated_texts_df,
    test_size=0.25,
    stratify=annotated_texts_df["polarization"],
    random_state=42,
)

y_train_pol, y_test_pol = (
    part["polarization"].values for part in (df_pol_train, df_pol_test)
)

#### TF-IDF

In [None]:
# Each "tfidf" cell is a list serialized as a string; parse the cells back into numeric rows.
X_train = np.array(df_pol_train["tfidf"].map(ast.literal_eval).tolist())
X_test = np.array(df_pol_test["tfidf"].map(ast.literal_eval).tolist())

train_fit_multiple_models(X_train, X_test, y_train_pol, y_test_pol)

In [None]:
# Randomized hyperparameter search (LightGBM, then Random Forest) on the polarization target.
# Uses whichever X_train / X_test were assigned last: the TF-IDF features when run top to bottom.
# Long-running: 50 x 5 LightGBM fits followed by 20 x 5 Random Forest fits.
train_fit_best_params(X_train, X_test, y_train_pol, y_test_pol)

#### Doc Embedding

In [27]:
# Each "doc_embedding" cell is a list serialized as a string; parse the cells back into numeric rows.
X_train = np.array(df_pol_train["doc_embedding"].map(ast.literal_eval).tolist())
X_test = np.array(df_pol_test["doc_embedding"].map(ast.literal_eval).tolist())

train_fit_multiple_models(X_train, X_test, y_train_pol, y_test_pol)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004972 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32640
[LightGBM] [Info] Number of data points in the train set: 8130, number of used features: 128
[LightGBM] [Info] Start training from score -1.882064
[LightGBM] [Info] Start training from score -0.782375
[LightGBM] [Info] Start training from score -0.940568
               precision    recall  f1-score   support

           0       0.52      0.25      0.34       413
           1       0.64      0.73      0.68      1239
           2       0.67      0.69      0.68      1058

    accuracy                           0.64      2710
   macro avg       0.61      0.56      0.57      2710
weighted avg       0.63      0.64      0.63      2710





               precision    recall  f1-score   support

           0       0.60      0.07      0.12       413
           1       0.61      0.70      0.65      1239
           2       0.60      0.70      0.64      1058

    accuracy                           0.60      2710
   macro avg       0.60      0.49      0.47      2710
weighted avg       0.60      0.60      0.57      2710



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

           0       0.00      0.00      0.00       413
           1       0.46      1.00      0.63      1239
           2       0.00      0.00      0.00      1058

    accuracy                           0.46      2710
   macro avg       0.15      0.33      0.21      2710
weighted avg       0.21      0.46      0.29      2710

               precision    recall  f1-score   support

           0       0.58      0.17      0.27       413
           1       0.63      0.75      0.68      1239
           2       0.65      0.68      0.67      1058

    accuracy                           0.64      2710
   macro avg       0.62      0.54      0.54      2710
weighted avg       0.63      0.64      0.61      2710

               precision    recall  f1-score   support

           0       0.45      0.30      0.36       413
           1       0.60      0.66      0.63      1239
           2       0.62      0.64      0.63      1058

    accuracy     

In [None]:
# Randomized hyperparameter search on the polarization target.
# Uses whichever X_train / X_test were assigned last: the document embeddings when run top to bottom.
train_fit_best_params(X_train, X_test, y_train_pol, y_test_pol)

#### Doc Embedding POS

In [28]:
# Each "doc_embedding_pos" cell is a list serialized as a string; parse the cells back into numeric rows.
X_train = np.array(df_pol_train["doc_embedding_pos"].map(ast.literal_eval).tolist())
X_test = np.array(df_pol_test["doc_embedding_pos"].map(ast.literal_eval).tolist())

train_fit_multiple_models(X_train, X_test, y_train_pol, y_test_pol)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004426 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32640
[LightGBM] [Info] Number of data points in the train set: 8130, number of used features: 128
[LightGBM] [Info] Start training from score -1.882064
[LightGBM] [Info] Start training from score -0.782375
[LightGBM] [Info] Start training from score -0.940568
               precision    recall  f1-score   support

           0       0.53      0.26      0.35       413
           1       0.64      0.73      0.68      1239
           2       0.66      0.68      0.67      1058

    accuracy                           0.64      2710
   macro avg       0.61      0.56      0.57      2710
weighted avg       0.63      0.64      0.63      2710





               precision    recall  f1-score   support

           0       0.52      0.11      0.18       413
           1       0.61      0.72      0.66      1239
           2       0.62      0.68      0.65      1058

    accuracy                           0.61      2710
   macro avg       0.58      0.50      0.50      2710
weighted avg       0.60      0.61      0.58      2710



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

           0       0.00      0.00      0.00       413
           1       0.46      1.00      0.63      1239
           2       0.00      0.00      0.00      1058

    accuracy                           0.46      2710
   macro avg       0.15      0.33      0.21      2710
weighted avg       0.21      0.46      0.29      2710

               precision    recall  f1-score   support

           0       0.52      0.16      0.24       413
           1       0.62      0.73      0.67      1239
           2       0.63      0.66      0.65      1058

    accuracy                           0.62      2710
   macro avg       0.59      0.52      0.52      2710
weighted avg       0.61      0.62      0.59      2710

               precision    recall  f1-score   support

           0       0.48      0.31      0.38       413
           1       0.60      0.65      0.63      1239
           2       0.60      0.63      0.62      1058

    accuracy     

In [None]:
# Randomized hyperparameter search on the polarization target.
# Uses whichever X_train / X_test were assigned last: the POS document embeddings when run top to bottom.
train_fit_best_params(X_train, X_test, y_train_pol, y_test_pol)

#### Linguistic Profile

In [29]:
# Each "linguistic_profile" cell is a list serialized as a string; parse the cells back into numeric rows.
X_train = np.array(df_pol_train["linguistic_profile"].map(ast.literal_eval).tolist())
X_test = np.array(df_pol_test["linguistic_profile"].map(ast.literal_eval).tolist())

train_fit_multiple_models(X_train, X_test, y_train_pol, y_test_pol)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33056
[LightGBM] [Info] Number of data points in the train set: 8130, number of used features: 140
[LightGBM] [Info] Start training from score -1.882064
[LightGBM] [Info] Start training from score -0.782375
[LightGBM] [Info] Start training from score -0.940568
               precision    recall  f1-score   support

           0       0.47      0.13      0.21       413
           1       0.60      0.70      0.65      1239
           2       0.61      0.66      0.63      1058

    accuracy                           0.60      2710
   macro avg       0.56      0.50      0.49      2710
weighted avg       0.58      0.60      0.57      2710





               precision    recall  f1-score   support

           0       0.50      0.00      0.00       413
           1       0.46      1.00      0.63      1239
           2       0.73      0.01      0.02      1058

    accuracy                           0.46      2710
   macro avg       0.56      0.34      0.22      2710
weighted avg       0.57      0.46      0.30      2710



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

           0       0.00      0.00      0.00       413
           1       0.46      1.00      0.63      1239
           2       0.00      0.00      0.00      1058

    accuracy                           0.46      2710
   macro avg       0.15      0.33      0.21      2710
weighted avg       0.21      0.46      0.29      2710

               precision    recall  f1-score   support

           0       0.62      0.02      0.04       413
           1       0.58      0.73      0.65      1239
           2       0.60      0.65      0.62      1058

    accuracy                           0.59      2710
   macro avg       0.60      0.46      0.43      2710
weighted avg       0.59      0.59      0.54      2710

               precision    recall  f1-score   support

           0       0.34      0.19      0.25       413
           1       0.59      0.63      0.61      1239
           2       0.58      0.63      0.60      1058

    accuracy     

In [None]:
# Randomized hyperparameter search on the polarization target.
# Uses whichever X_train / X_test were assigned last: the linguistic-profile features when run top to bottom.
train_fit_best_params(X_train, X_test, y_train_pol, y_test_pol)