In [50]:
import pandas as pd
import numpy as np
import ast

from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [51]:
import warnings

# Install a blanket "ignore" filter: every warning category is silenced from
# here on, including any raised while fitting or scoring the models below.
warnings.simplefilter("ignore")

In [52]:
annotated_texts_df = pd.read_csv(
    "data/annotated_dataset/annotated_texts_repr.csv",
    sep=",",
    encoding="utf-8",
)

# Every representation column holds one Python literal per row (stored as text
# in the CSV); each cell is parsed with ast.literal_eval and the parsed rows
# are stacked into a NumPy array.
standard_reprs = {
    repr_name: np.array(
        [ast.literal_eval(cell) for cell in annotated_texts_df[repr_name]]
    )
    for repr_name in (
        "tfidf",
        "doc_embedding",
        "doc_embedding_pos",
        "linguistic_profile",
    )
}

# Individual handles onto the same arrays, under the notebook's usual names.
x_tfidf = standard_reprs["tfidf"]
x_docembedding = standard_reprs["doc_embedding"]
x_docembedding_pos = standard_reprs["doc_embedding_pos"]
x_linguistic_profile = standard_reprs["linguistic_profile"]

In [53]:
annotated_texts_pro_df = pd.read_csv(
    "data/annotated_dataset/annotated_texts_repr_pro_complete.csv",
    sep=",",
    encoding="utf-8",
)

# Representation key -> column name in the "pro" CSV. The keys mirror the ones
# used for the standard representations; note that the POS-embedding column is
# spelled "docembedding_pos_pro" (no underscore after "doc").
extended_reprs = {
    repr_name: np.array(
        [ast.literal_eval(cell) for cell in annotated_texts_pro_df[column]]
    )
    for repr_name, column in (
        ("tfidf", "tfidf_pro"),
        ("doc_embedding", "doc_embedding_pro"),
        ("doc_embedding_pos", "docembedding_pos_pro"),
        ("linguistic_profile", "linguistic_profile_pro"),
    )
}

# Individual handles onto the same arrays, under the notebook's usual names.
x_tfidf_extended = extended_reprs["tfidf"]
x_docembedding_extended = extended_reprs["doc_embedding"]
x_docembedding_pos_extended = extended_reprs["doc_embedding_pos"]
x_linguistic_profile_extended = extended_reprs["linguistic_profile"]

In [54]:
# Targets are read from the standard-representation frame.
y_pol = annotated_texts_df["polarization"]
y_pop = annotated_texts_df["pop_sum"]

# Binary populism target: 0 where pop_sum is below 2, 1 everywhere else.
y_pop_bin = np.where(y_pop < 2, 0, 1).tolist()

In [55]:
# Linear support-vector classifier.
clf_svc = LinearSVC(random_state=42, C=0.1)

# Random forest of 150 trees; the remaining hyperparameters are pinned
# explicitly so the configuration is visible in the notebook.
clf_rf = RandomForestClassifier(
    n_estimators=150,
    criterion='gini',
    max_features='sqrt',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    n_jobs=-1,
    random_state=0,
)

# Gradient-boosted trees; verbose=-1 is passed to LightGBM.
clf_lgbm = LGBMClassifier(verbose=-1, random_state=8)

# Name -> estimator registry iterated by the evaluation helper.
models = dict(svc=clf_svc, rf=clf_rf, lgbm=clf_lgbm)

In [56]:
def models_x_repr(reprs, y, estimators=None, cv=5):
    """Print a cross-validated classification report for each model/representation pair.

    For every estimator and every feature matrix in ``reprs``, predictions are
    obtained with ``cross_val_predict`` and compared against ``y`` with
    ``classification_report``. The reports are printed; nothing is returned.

    Parameters
    ----------
    reprs : dict
        Maps a representation name to the feature matrix handed to
        ``cross_val_predict``.
    y : array-like
        Target labels passed to ``cross_val_predict`` and
        ``classification_report``.
    estimators : dict, optional
        Maps a model name to an estimator. When omitted, the notebook-level
        ``models`` dict is used, which is what the function always did before
        this parameter existed.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``cross_val_predict``.
    """
    if estimators is None:
        # Looked up at call time so the notebook-level registry can be
        # redefined between calls without redefining this function.
        estimators = models

    for model_name, model in estimators.items():
        print("-------------------")
        for rep_name, rep in reprs.items():
            print(model_name)
            print(rep_name)
            predictions = cross_val_predict(model, rep, y, cv=cv)
            # zero_division=0 reports 0.00 for classes that are never
            # predicted without relying on warnings being silenced globally.
            print(classification_report(y, predictions, zero_division=0))

# Document level

### Polarization - Standard repr - Document Level

In [57]:
# Polarization target on the standard representations: one report per model/representation pair.
models_x_repr(reprs=standard_reprs, y=y_pol)

-------------------
svc
tfidf
              precision    recall  f1-score   support

           0       0.70      0.20      0.31      1651
           1       0.65      0.79      0.72      4957
           2       0.70      0.73      0.72      4232

    accuracy                           0.68     10840
   macro avg       0.69      0.57      0.58     10840
weighted avg       0.68      0.68      0.65     10840

svc
doc_embedding
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1651
           1       0.58      0.69      0.63      4957
           2       0.56      0.64      0.60      4232

    accuracy                           0.57     10840
   macro avg       0.38      0.45      0.41     10840
weighted avg       0.48      0.57      0.52     10840

svc
doc_embedding_pos
              precision    recall  f1-score   support

           0       0.71      0.02      0.04      1651
           1       0.58      0.71      0.64      4957
     

### Polarization - Extended repr - Document Level

In [58]:
# Polarization target on the extended ("pro") representations.
models_x_repr(reprs=extended_reprs, y=y_pol)

-------------------
svc
tfidf
              precision    recall  f1-score   support

           0       0.69      0.21      0.32      1651
           1       0.65      0.78      0.71      4957
           2       0.69      0.73      0.71      4232

    accuracy                           0.67     10840
   macro avg       0.68      0.57      0.58     10840
weighted avg       0.68      0.67      0.65     10840

svc
doc_embedding
              precision    recall  f1-score   support

           0       0.55      0.01      0.02      1651
           1       0.57      0.68      0.62      4957
           2       0.56      0.65      0.60      4232

    accuracy                           0.57     10840
   macro avg       0.56      0.45      0.42     10840
weighted avg       0.57      0.57      0.52     10840

svc
doc_embedding_pos
              precision    recall  f1-score   support

           0       0.59      0.04      0.08      1651
           1       0.59      0.70      0.64      4957
     

### Populism - Fine - Standard repr - Document Level

In [59]:
# Fine-grained populism target (raw pop_sum values) on the standard representations.
models_x_repr(reprs=standard_reprs, y=y_pop)

-------------------
svc
tfidf
              precision    recall  f1-score   support

           0       0.59      0.89      0.71      3995
           1       0.40      0.31      0.35      2371
           2       0.00      0.00      0.00      1213
           3       0.00      0.00      0.00       991
           4       0.59      0.77      0.67      2270

    accuracy                           0.56     10840
   macro avg       0.32      0.39      0.35     10840
weighted avg       0.43      0.56      0.48     10840

svc
doc_embedding
              precision    recall  f1-score   support

           0       0.52      0.91      0.66      3995
           1       0.46      0.13      0.21      2371
           2       0.00      0.00      0.00      1213
           3       0.00      0.00      0.00       991
           4       0.48      0.67      0.56      2270

    accuracy                           0.50     10840
   macro avg       0.29      0.34      0.29     10840
weighted avg       0.39      

### Populism - Fine - Extended repr - Document Level

In [60]:
# Fine-grained populism target (raw pop_sum values) on the extended ("pro") representations.
models_x_repr(reprs=extended_reprs, y=y_pop)

-------------------
svc
tfidf
              precision    recall  f1-score   support

           0       0.60      0.88      0.71      3995
           1       0.40      0.32      0.35      2371
           2       0.17      0.00      0.00      1213
           3       0.00      0.00      0.00       991
           4       0.57      0.77      0.66      2270

    accuracy                           0.56     10840
   macro avg       0.35      0.39      0.35     10840
weighted avg       0.45      0.56      0.48     10840

svc
doc_embedding
              precision    recall  f1-score   support

           0       0.56      0.87      0.68      3995
           1       0.40      0.18      0.24      2371
           2       0.00      0.00      0.00      1213
           3       0.00      0.00      0.00       991
           4       0.47      0.73      0.57      2270

    accuracy                           0.51     10840
   macro avg       0.28      0.35      0.30     10840
weighted avg       0.39      

### Populism - Bin - Standard repr - Document Level

In [61]:
# Binarised populism target (pop_sum < 2 -> 0, otherwise 1) on the standard representations.
models_x_repr(reprs=standard_reprs, y=y_pop_bin)

-------------------
svc
tfidf
              precision    recall  f1-score   support

           0       0.81      0.89      0.85      6366
           1       0.82      0.71      0.76      4474

    accuracy                           0.82     10840
   macro avg       0.82      0.80      0.81     10840
weighted avg       0.82      0.82      0.81     10840

svc
doc_embedding
              precision    recall  f1-score   support

           0       0.75      0.85      0.79      6366
           1       0.73      0.59      0.65      4474

    accuracy                           0.74     10840
   macro avg       0.74      0.72      0.72     10840
weighted avg       0.74      0.74      0.73     10840

svc
doc_embedding_pos
              precision    recall  f1-score   support

           0       0.77      0.85      0.81      6366
           1       0.75      0.64      0.69      4474

    accuracy                           0.76     10840
   macro avg       0.76      0.75      0.75     10840
weig

### Populism - Bin - Extended repr - Document Level

In [62]:
# Binarised populism target (pop_sum < 2 -> 0, otherwise 1) on the extended ("pro") representations.
models_x_repr(reprs=extended_reprs, y=y_pop_bin)

-------------------
svc
tfidf
              precision    recall  f1-score   support

           0       0.81      0.88      0.85      6366
           1       0.81      0.71      0.76      4474

    accuracy                           0.81     10840
   macro avg       0.81      0.80      0.80     10840
weighted avg       0.81      0.81      0.81     10840

svc
doc_embedding
              precision    recall  f1-score   support

           0       0.77      0.82      0.79      6366
           1       0.72      0.66      0.69      4474

    accuracy                           0.75     10840
   macro avg       0.74      0.74      0.74     10840
weighted avg       0.75      0.75      0.75     10840

svc
doc_embedding_pos
              precision    recall  f1-score   support

           0       0.78      0.83      0.81      6366
           1       0.74      0.67      0.70      4474

    accuracy                           0.77     10840
   macro avg       0.76      0.75      0.76     10840
weig