In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [28]:
annotated_texts_df = pd.read_csv("data/annotated_dataset/annotated_texts_repr.csv", sep=",", encoding="utf-8")

### Undersampling

In [30]:
annotated_texts_df.groupby("polarization")["id"].count()

polarization
0    1651
1    4957
2    4232
Name: id, dtype: int64

In [31]:
counts = annotated_texts_df["polarization"].value_counts()
n_samples = min(counts.values)  # 1651 in your case
df_balanced = annotated_texts_df.groupby("polarization").apply(lambda x: x.sample(n=n_samples, random_state=42)).reset_index(drop=True)
print(df_balanced["polarization"].value_counts())

polarization
0    1651
1    1651
2    1651
Name: count, dtype: int64


  df_balanced = annotated_texts_df.groupby("polarization").apply(lambda x: x.sample(n=n_samples, random_state=42)).reset_index(drop=True)


### tf-idf based one-hot encoding

In [32]:
merged_df = df_balanced.groupby('polarization', as_index=False).agg({'text': ' '.join})

In [33]:
texts = merged_df["text"]

In [34]:
ita_stopwords = stopwords.words('italian')

In [35]:
vectorizer = TfidfVectorizer(stop_words=ita_stopwords)
tfidf_matrix = vectorizer.fit_transform(texts)

In [36]:
feature_names = vectorizer.get_feature_names_out()

In [37]:
top_n = 20
top_20_common_words = []
for i, text in enumerate(texts):
    tfidf_values = tfidf_matrix[i].toarray().flatten()  # Get TF-IDF values for document i
    top_indices = np.argsort(tfidf_values)[::-1][:top_n]  # Get indices of top words
    top_words = [(feature_names[idx], tfidf_values[idx]) for idx in top_indices]
    top_20_common_words += [feature_names[idx] for idx in top_indices]
    print(f"\nTop {top_n} words for class {i}:")
    for word, score in top_words:
        print(f"{word}: {score:.4f}")


Top 20 words for class 0:
legge: 0.1940
presidente: 0.1861
paese: 0.1826
governo: 0.1810
essere: 0.1755
stato: 0.1708
oggi: 0.1595
fatto: 0.1368
lavoro: 0.1349
solo: 0.1330
fare: 0.1126
molto: 0.1079
quindi: 0.1061
italia: 0.1048
parte: 0.1042
signor: 0.1023
anni: 0.1022
politica: 0.1009
ancora: 0.0911
prima: 0.0898

Top 20 words for class 1:
legge: 0.2230
stato: 0.1892
governo: 0.1848
essere: 0.1711
presidente: 0.1705
paese: 0.1320
quindi: 0.1317
articolo: 0.1251
fatto: 0.1183
oggi: 0.1173
solo: 0.1166
parte: 0.1164
fare: 0.1059
italia: 0.1050
anni: 0.1048
commissione: 0.1043
lavoro: 0.0992
decreto: 0.0961
già: 0.0942
poi: 0.0891

Top 20 words for class 2:
governo: 0.2549
presidente: 0.2250
legge: 0.1850
stato: 0.1808
essere: 0.1767
italia: 0.1487
paese: 0.1427
fatto: 0.1423
oggi: 0.1278
solo: 0.1270
fare: 0.1253
signor: 0.1217
quindi: 0.1148
parte: 0.1127
poi: 0.1098
provvedimento: 0.1081
quando: 0.1002
ancora: 0.0973
anni: 0.0941
ministro: 0.0929


In [38]:
top_20_common_words = list(set(top_20_common_words))

In [163]:
def extract_present_words(text, word_list):
    words = set(text.lower().split())  # Basic tokenization (split by spaces)
    return list(words.intersection(word_list))

In [164]:
df_balanced["matched_words"] = df_balanced["text"].apply(lambda x: extract_present_words(x, top_20_common_words))

In [165]:
# One-hot encoding
mlb = MultiLabelBinarizer(classes=top_20_common_words)
one_hot = mlb.fit_transform(df_balanced["matched_words"])

# Store as new column
df_balanced["one_hot"] = list(one_hot)
df_balanced.head(2)

Unnamed: 0,id,text,pop_sum,manichean,peoplecentrism,antielitism,emotional,polarization,tfidf,doc_embedding,doc_embedding_pos,linguistic_profile,matched_words,one_hot
0,ParlaMint-IT_2016-02-03-LEG17-Senato-sed-570.u44,"Signora Presidente, signori senatori, sono uno...",3,1,0,1,1,0,"[0.6199100123464134, 0.11861059301499045, -0.1...","[0.031349862797715955, 0.042622194620060935, -...","[0.028593717550653826, -0.01512078152371765, -...","[36.0, 1908.0, 53.0, 4.983529411764706, 0.64, ...","[ancora, già, stato, lavoro, poi, molto, fare,...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, ..."
1,ParlaMint-IT_2013-07-18-LEG17-Senato-sed-72.u9,"Signor Presidente, vorrei iniziare questo mio ...",1,0,0,0,1,0,"[0.5769382085840078, -0.08147886577024067, -0....","[0.01926288266461509, 0.06818148308746241, -0....","[0.0030541136772034593, 0.011180668423000339, ...","[50.0, 1555.0, 31.1, 4.775924583031182, 0.66, ...","[quindi, ancora, ministro, già, stato, italia,...","[1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, ..."


### Sentiment & Emotion Extraction

In [166]:
emotion_texts_df = pd.read_csv("data/annotated_dataset/emotion_annotated_texts.csv", sep=",", encoding="utf-8")
df_balanced = df_balanced.merge(emotion_texts_df, how='left', left_on='id', right_on="id")

In [167]:
sentiment = df_balanced['sentiment']

In [168]:
stars = [int(eval(sent)[0].split()[0]) for sent in sentiment]
sentiment = [eval(sent)[1:] for sent in sentiment]

sentiment = [[star]+sent for star, sent in zip(stars, sentiment)]

In [169]:
df_balanced['sentiment'] = sentiment

In [170]:
df_balanced.head(1)

Unnamed: 0,id,text,pop_sum,manichean,peoplecentrism,antielitism,emotional,polarization,tfidf,doc_embedding,doc_embedding_pos,linguistic_profile,matched_words,one_hot,sentiment
0,ParlaMint-IT_2016-02-03-LEG17-Senato-sed-570.u44,"Signora Presidente, signori senatori, sono uno...",3,1,0,1,1,0,"[0.6199100123464134, 0.11861059301499045, -0.1...","[0.031349862797715955, 0.042622194620060935, -...","[0.028593717550653826, -0.01512078152371765, -...","[36.0, 1908.0, 53.0, 4.983529411764706, 0.64, ...","[ancora, già, stato, lavoro, poi, molto, fare,...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, ...","[1, 0.96742103099823, 0.013662347570061684, 0...."


### New Columns Creation

In [172]:
#df_balanced

x_onehot = df_balanced["one_hot"]
x_sentiment = df_balanced["sentiment"]


#tf_idf
x_tfidf = df_balanced["tfidf"]
x_tfidf = np.array([ast.literal_eval(item) for item in x_tfidf])
x_tfidf_pro = [list(a)+list(b)+list(c) for a,b,c in zip(x_tfidf, x_onehot, x_sentiment)]
df_balanced["tfidf_pro"] = x_tfidf_pro

#doc_embedding
x_docembedding = df_balanced["doc_embedding"]
x_docembedding = np.array([ast.literal_eval(item) for item in x_docembedding])
x_docembedding_pro = [list(a)+list(b)+list(c) for a,b,c in zip(x_docembedding, x_onehot, x_sentiment)]
df_balanced["doc_embedding_pro"] = x_docembedding_pro

#doc_embedding_pos
x_docembedding_pos = df_balanced["doc_embedding_pos"]
x_docembedding_pos = np.array([ast.literal_eval(item) for item in x_docembedding_pos])
x_docembedding_pos_pro = [list(a)+list(b)+list(c) for a,b,c in zip(x_docembedding_pos, x_onehot, x_sentiment)]
df_balanced["docembedding_pos_pro"] = x_docembedding_pos_pro

#linguistic_profile
x_linguistic_profile = df_balanced["linguistic_profile"]
x_linguistic_profile = np.array([ast.literal_eval(item) for item in x_linguistic_profile])
x_linguistic_profile_pro = [list(a)+list(b)+list(c) for a,b,c in zip(x_linguistic_profile, x_onehot, x_sentiment)]
df_balanced["linguistic_profile_pro"] = x_linguistic_profile_pro

In [173]:
df_balanced.to_csv("data/annotated_dataset/annotated_texts_repr_pro.csv", index=False)

# Classification

In [29]:
import pandas as pd
from lightgbm import LGBMClassifier
import numpy as np
import ast
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

In [30]:
df = pd.read_csv("data/annotated_dataset/annotated_texts_repr_pro.csv", sep=",", encoding="utf-8")

In [31]:
x_tfidf_pro = df["tfidf_pro"]
x_tfidf_pro = np.array([ast.literal_eval(item) for item in x_tfidf_pro])

x_docembedding_pro = df["doc_embedding_pro"]
x_docembedding_pro = np.array([ast.literal_eval(item) for item in x_docembedding_pro])

x_docembedding_pos_pro = df["docembedding_pos_pro"]
x_docembedding_pos_pro = np.array([ast.literal_eval(item) for item in x_docembedding_pos_pro])

x_linguistic_profile_pro = df["linguistic_profile_pro"]
x_linguistic_profile_pro = np.array([ast.literal_eval(item) for item in x_linguistic_profile_pro])

y = df["polarization"]
y_pop = df["pop_sum"]

In [4]:
clf_lgbm = LGBMClassifier(random_state=8, verbose=-1)

clf_linear_svc = LinearSVC(C=1.0, random_state=42)

clf_rf = RandomForestClassifier(n_estimators=150, 
                             criterion='gini', 
                             max_depth=None, 
                             min_samples_split=2, 
                             min_samples_leaf=1, 
                             min_weight_fraction_leaf=0.0, 
                             max_features='sqrt', 
                             random_state=0, 
                             n_jobs=-1)

clf_ada_rf = AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=100), n_estimators=100, random_state=0)

clf_bag = BaggingClassifier(estimator=None, n_estimators=100, random_state=0)

### Polarization

In [179]:
models = [clf_lgbm,clf_linear_svc,clf_rf,clf_ada_rf,clf_bag]
reprs = [x_tfidf_pro, x_docembedding_pro, x_docembedding_pos_pro, x_linguistic_profile_pro]

for model in models:
    print("---")
    for rep in reprs:
        predictions = cross_val_predict(model, rep, y, cv=5)
        print(classification_report(y, predictions))

---
              precision    recall  f1-score   support

           0       0.66      0.66      0.66      1651
           1       0.61      0.59      0.60      1651
           2       0.68      0.72      0.70      1651

    accuracy                           0.65      4953
   macro avg       0.65      0.65      0.65      4953
weighted avg       0.65      0.65      0.65      4953

              precision    recall  f1-score   support

           0       0.62      0.62      0.62      1651
           1       0.60      0.54      0.57      1651
           2       0.62      0.68      0.65      1651

    accuracy                           0.61      4953
   macro avg       0.61      0.61      0.61      4953
weighted avg       0.61      0.61      0.61      4953

              precision    recall  f1-score   support

           0       0.61      0.61      0.61      1651
           1       0.58      0.53      0.55      1651
           2       0.60      0.66      0.63      1651

    accuracy    



              precision    recall  f1-score   support

           0       0.72      0.71      0.72      1651
           1       0.66      0.63      0.65      1651
           2       0.73      0.77      0.75      1651

    accuracy                           0.71      4953
   macro avg       0.71      0.71      0.71      4953
weighted avg       0.71      0.71      0.71      4953





              precision    recall  f1-score   support

           0       0.58      0.61      0.59      1651
           1       0.57      0.45      0.50      1651
           2       0.58      0.67      0.62      1651

    accuracy                           0.58      4953
   macro avg       0.57      0.58      0.57      4953
weighted avg       0.57      0.58      0.57      4953





              precision    recall  f1-score   support

           0       0.59      0.61      0.60      1651
           1       0.57      0.48      0.52      1651
           2       0.59      0.66      0.62      1651

    accuracy                           0.58      4953
   macro avg       0.58      0.58      0.58      4953
weighted avg       0.58      0.58      0.58      4953





              precision    recall  f1-score   support

           0       0.40      0.39      0.40      1651
           1       0.39      0.46      0.42      1651
           2       0.46      0.39      0.42      1651

    accuracy                           0.41      4953
   macro avg       0.42      0.41      0.41      4953
weighted avg       0.42      0.41      0.41      4953

---
              precision    recall  f1-score   support

           0       0.61      0.59      0.60      1651
           1       0.55      0.50      0.53      1651
           2       0.60      0.68      0.64      1651

    accuracy                           0.59      4953
   macro avg       0.59      0.59      0.59      4953
weighted avg       0.59      0.59      0.59      4953

              precision    recall  f1-score   support

           0       0.58      0.60      0.59      1651
           1       0.57      0.46      0.51      1651
           2       0.58      0.66      0.61      1651

    accuracy    

KeyboardInterrupt: 

### Populism (fine-grained)

In [186]:
models = [clf_lgbm,clf_linear_svc,clf_rf,clf_ada_rf,clf_bag]
reprs = [x_tfidf_pro, x_docembedding_pro, x_docembedding_pos_pro, x_linguistic_profile_pro]

for model in models:
    print("---")
    for rep in reprs:
        predictions = cross_val_predict(model, rep, y_pop, cv=5)
        print(classification_report(y_pop, predictions))

---
              precision    recall  f1-score   support

           0       0.61      0.80      0.69      1767
           1       0.43      0.49      0.46      1248
           2       0.19      0.03      0.06       577
           3       0.21      0.03      0.05       456
           4       0.58      0.67      0.62       905

    accuracy                           0.54      4953
   macro avg       0.40      0.41      0.38      4953
weighted avg       0.47      0.54      0.49      4953

              precision    recall  f1-score   support

           0       0.64      0.78      0.71      1767
           1       0.45      0.53      0.49      1248
           2       0.25      0.08      0.12       577
           3       0.24      0.05      0.09       456
           4       0.56      0.65      0.61       905

    accuracy                           0.55      4953
   macro avg       0.43      0.42      0.40      4953
weighted avg       0.50      0.55      0.51      4953

              prec



              precision    recall  f1-score   support

           0       0.65      0.82      0.73      1767
           1       0.48      0.49      0.48      1248
           2       0.19      0.04      0.06       577
           3       0.27      0.06      0.10       456
           4       0.58      0.77      0.66       905

    accuracy                           0.57      4953
   macro avg       0.43      0.44      0.41      4953
weighted avg       0.51      0.57      0.52      4953



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.58      0.85      0.69      1767
           1       0.47      0.40      0.43      1248
           2       0.00      0.00      0.00       577
           3       0.00      0.00      0.00       456
           4       0.49      0.71      0.58       905

    accuracy                           0.53      4953
   macro avg       0.31      0.39      0.34      4953
weighted avg       0.42      0.53      0.46      4953



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.60      0.85      0.70      1767
           1       0.49      0.45      0.47      1248
           2       0.33      0.00      0.00       577
           3       0.00      0.00      0.00       456
           4       0.50      0.74      0.60       905

    accuracy                           0.55      4953
   macro avg       0.39      0.41      0.35      4953
weighted avg       0.47      0.55      0.48      4953





              precision    recall  f1-score   support

           0       0.78      0.14      0.24      1767
           1       0.30      0.35      0.32      1248
           2       0.09      0.02      0.03       577
           3       0.09      0.20      0.12       456
           4       0.24      0.56      0.34       905

    accuracy                           0.26      4953
   macro avg       0.30      0.25      0.21      4953
weighted avg       0.42      0.26      0.24      4953

---


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.52      0.87      0.65      1767
           1       0.42      0.31      0.35      1248
           2       0.20      0.00      0.00       577
           3       0.00      0.00      0.00       456
           4       0.50      0.60      0.55       905

    accuracy                           0.50      4953
   macro avg       0.33      0.36      0.31      4953
weighted avg       0.41      0.50      0.42      4953

              precision    recall  f1-score   support

           0       0.62      0.80      0.70      1767
           1       0.43      0.51      0.46      1248
           2       0.21      0.01      0.02       577
           3       0.33      0.02      0.03       456
           4       0.53      0.65      0.58       905

    accuracy                           0.54      4953
   macro avg       0.42      0.40      0.36      4953
weighted avg       0.48      0.54      0.48      4953

              precisio

KeyboardInterrupt: 

### Populism (coarse-grained)

In [7]:
df = pd.read_csv("data/annotated_dataset/annotated_texts_repr_pro.csv", sep=",", encoding="utf-8")

In [8]:
y_pop = df["pop_sum"]
y_pop_coarse = [0 if x < 2 else 1 for x in y_pop]
df["pop_bin"] = y_pop_coarse
counts = df["pop_bin"].value_counts()
n_samples = min(counts.values)  # 1651 in your case
df = df.groupby("pop_bin").apply(lambda x: x.sample(n=n_samples, random_state=42)).reset_index(drop=True)
print(df["pop_bin"].value_counts())

pop_bin
0    1938
1    1938
Name: count, dtype: int64


  df = df.groupby("pop_bin").apply(lambda x: x.sample(n=n_samples, random_state=42)).reset_index(drop=True)


In [9]:
x_tfidf_pro = df["tfidf_pro"]
x_tfidf_pro = np.array([ast.literal_eval(item) for item in x_tfidf_pro])

x_docembedding_pro = df["doc_embedding_pro"]
x_docembedding_pro = np.array([ast.literal_eval(item) for item in x_docembedding_pro])

x_docembedding_pos_pro = df["docembedding_pos_pro"]
x_docembedding_pos_pro = np.array([ast.literal_eval(item) for item in x_docembedding_pos_pro])

x_linguistic_profile_pro = df["linguistic_profile_pro"]
x_linguistic_profile_pro = np.array([ast.literal_eval(item) for item in x_linguistic_profile_pro])

y_pop_bin = df["pop_bin"]

In [80]:
models = [clf_lgbm,clf_linear_svc,clf_rf,clf_ada_rf,clf_bag]
reprs = [x_tfidf_pro, x_docembedding_pro, x_docembedding_pos_pro, x_linguistic_profile_pro]

for model in models:
    print("---")
    for rep in reprs:
        predictions = cross_val_predict(model, rep, y_pop_bin, cv=5)
        print(classification_report(y_pop_bin, predictions))

---
              precision    recall  f1-score   support

           0       0.78      0.79      0.79      1938
           1       0.79      0.78      0.78      1938

    accuracy                           0.79      3876
   macro avg       0.79      0.79      0.79      3876
weighted avg       0.79      0.79      0.79      3876

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      1938
           1       0.78      0.78      0.78      1938

    accuracy                           0.78      3876
   macro avg       0.78      0.78      0.78      3876
weighted avg       0.78      0.78      0.78      3876

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      1938
           1       0.78      0.78      0.78      1938

    accuracy                           0.78      3876
   macro avg       0.78      0.78      0.78      3876
weighted avg       0.78      0.78      0.78      3876

              p



              precision    recall  f1-score   support

           0       0.80      0.82      0.81      1938
           1       0.82      0.80      0.81      1938

    accuracy                           0.81      3876
   macro avg       0.81      0.81      0.81      3876
weighted avg       0.81      0.81      0.81      3876





              precision    recall  f1-score   support

           0       0.76      0.74      0.75      1938
           1       0.75      0.77      0.76      1938

    accuracy                           0.76      3876
   macro avg       0.76      0.76      0.76      3876
weighted avg       0.76      0.76      0.76      3876





              precision    recall  f1-score   support

           0       0.77      0.77      0.77      1938
           1       0.77      0.77      0.77      1938

    accuracy                           0.77      3876
   macro avg       0.77      0.77      0.77      3876
weighted avg       0.77      0.77      0.77      3876





              precision    recall  f1-score   support

           0       0.62      0.46      0.53      1938
           1       0.57      0.72      0.64      1938

    accuracy                           0.59      3876
   macro avg       0.60      0.59      0.58      3876
weighted avg       0.60      0.59      0.58      3876

---
              precision    recall  f1-score   support

           0       0.74      0.76      0.75      1938
           1       0.75      0.73      0.74      1938

    accuracy                           0.75      3876
   macro avg       0.75      0.75      0.75      3876
weighted avg       0.75      0.75      0.75      3876

              precision    recall  f1-score   support

           0       0.76      0.78      0.77      1938
           1       0.77      0.76      0.77      1938

    accuracy                           0.77      3876
   macro avg       0.77      0.77      0.77      3876
weighted avg       0.77      0.77      0.77      3876

              p

KeyboardInterrupt: 