In [1]:
import pandas as pd
import numpy as np
import ast
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

  from pandas.core import (


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [7]:

#annotated_df = pd.read_csv("data/annotated_dataset/annotated_texts_repr.csv", sep=",", encoding="utf-8")
annotated_df = pd.read_csv("data/annotated_dataset/annotated_texts_repr_pro_complete.csv", sep=",", encoding="utf-8")
meta_df = pd.read_csv("data/cleaned_dataset/meta.tsv", sep=",", encoding="utf-8")
merged_df = annotated_df.merge(meta_df, how='left', left_on='id', right_on="ID")

ling_prof_df = merged_df[["id", "pop_sum","polarization","Speaker_ID", "Speaker_party","Party_orientation","linguistic_profile_pro", "tfidf_pro", "doc_embedding_pro", "docembedding_pos_pro"]]
diz = {
    "LN-Aut": "Lega",
    "L-SP": "Lega",
    "M5S.1": "M5S",
    "M5S.2": "M5S",
}
ling_prof_df = ling_prof_df.replace({"Speaker_party": diz})
ling_prof_df = ling_prof_df.rename(columns={"linguistic_profile_pro": "linguistic_profile", "tfidf_pro": "tfidf", "doc_embedding_pro":"doc_embedding", "docembedding_pos_pro":"pos_docembedding"})


In [9]:
def mean_std_lists(lists):
    lists = np.array([ast.literal_eval(item) for item in lists])
    return np.mean(lists, axis=0).tolist(), np.std(lists, axis=0).tolist()

#Aggregazione per senatore
df_grouped = ling_prof_df.groupby('Speaker_ID', as_index=False).agg({
    'pop_sum': lambda x: x.sum() / x.count(),
    'polarization': lambda x: x.sum() / x.count(),
    'Speaker_party': lambda x: x.mode()[0],
    'Party_orientation': lambda x: x.mode()[0]
})

#Media e std delle rappresentazioni del testo (per tutti i testi di un senatore)
for col in ['linguistic_profile', 'tfidf', 'doc_embedding', 'pos_docembedding']:
    df_grouped[col], df_grouped[col + '_std'] = zip(*ling_prof_df.groupby('Speaker_ID')[col].apply(lambda x: mean_std_lists(list(x))))

df_grouped["linguistic_profile"] = df_grouped["linguistic_profile"] + df_grouped["linguistic_profile_std"]
df_grouped["tfidf"] = df_grouped["tfidf"] + df_grouped["tfidf_std"]
df_grouped["doc_embedding"] = df_grouped["doc_embedding"] + df_grouped["doc_embedding_std"]
df_grouped["pos_docembedding"] = df_grouped["pos_docembedding"] + df_grouped["pos_docembedding_std"]
    
group_sizes = ling_prof_df.groupby('Speaker_ID').size().reset_index(name='Count_Per_Group') #quanti testi per senatore
df_grouped = df_grouped.merge(group_sizes, on='Speaker_ID')

In [11]:
#df_grouped.to_csv("data/annotated_dataset/speaker_data.csv", index=False)

#### Estraggo X e y

In [16]:
speaker_df = pd.read_csv("data/annotated_dataset/speaker_data.csv", sep=",", encoding="utf-8")
speaker_df = speaker_df[speaker_df["Count_Per_Group"] > 10]

x_tfidf = speaker_df["tfidf"]
x_tfidf = np.array([ast.literal_eval(item) for item in x_tfidf])

x_docembedding = speaker_df["doc_embedding"]
x_docembedding = np.array([ast.literal_eval(item) for item in x_docembedding])

x_pos_docembedding = speaker_df["pos_docembedding"]
x_pos_docembedding = np.array([ast.literal_eval(item) for item in x_pos_docembedding])

x_linguistic_profile = speaker_df["linguistic_profile"]
x_linguistic_profile = np.array([ast.literal_eval(item) for item in x_linguistic_profile])

reprs = {
    "tfidf": x_tfidf,
    "doc_embedding": x_docembedding,
    "pos_docembedding": x_pos_docembedding,
    "linguistic_profile": x_linguistic_profile
}

y_pol = speaker_df["polarization"]
y_pop = speaker_df["pop_sum"]
y_pop_bin = [0 if x < 2 else 1 for x in y_pop]

## Regressione

In [17]:
model = LinearRegression()
scaler = StandardScaler()
def regressione(reprs, y):
    for name, X in reprs.items():
        print(name)
        X = scaler.fit_transform(X)
        cv_scores = cross_val_score(model, X, y, cv=3, scoring='r2')  # R^2 Score
        cv_predictions = cross_val_predict(model, X, y, cv=3)

        print("Cross-Validation R² Scores:", cv_scores)
        print("Mean R² Score:", cv_scores.mean())

        mse = mean_squared_error(y, cv_predictions)
        spearman = stats.spearmanr(y, cv_predictions)
        
        print("Cross-Validated MSE:", mse)
        print("Spearman:", spearman)
        print("\n\n")

In [18]:
regressione(reprs, y_pop)

tfidf
Cross-Validation R² Scores: [0.62570368 0.67371458 0.68717556]
Mean R² Score: 0.6621979389447488
Cross-Validated MSE: 0.34233359363247823
Spearman: SignificanceResult(statistic=0.8209921611229694, pvalue=3.5604708173779363e-78)



doc_embedding
Cross-Validation R² Scores: [0.4301833  0.32932184 0.34026724]
Mean R² Score: 0.36659079339117606
Cross-Validated MSE: 0.6329532439782148
Spearman: SignificanceResult(statistic=0.7337698103620822, pvalue=1.7458807620159238e-54)



pos_docembedding
Cross-Validation R² Scores: [0.48500917 0.51050037 0.52041807]
Mean R² Score: 0.5053092012836897
Cross-Validated MSE: 0.4990100067901557
Spearman: SignificanceResult(statistic=0.7822060670673977, pvalue=2.746046061975103e-66)



linguistic_profile
Cross-Validation R² Scores: [ 0.26515859 -0.06502144  0.23664814]
Mean R² Score: 0.14559509388114503
Cross-Validated MSE: 0.854745290599053
Spearman: SignificanceResult(statistic=0.6610316302018271, pvalue=6.178635069701292e-41)





In [19]:
regressione(reprs, y_pol)

tfidf
Cross-Validation R² Scores: [0.53534629 0.58055947 0.51870942]
Mean R² Score: 0.5448717249046443
Cross-Validated MSE: 0.1402155241324299
Spearman: SignificanceResult(statistic=0.744919472296562, pvalue=5.800422382480333e-57)



doc_embedding
Cross-Validation R² Scores: [-0.69699673 -0.39784321 -0.11240653]
Mean R² Score: -0.4024154894505367
Cross-Validated MSE: 0.4297830361562644
Spearman: SignificanceResult(statistic=0.5350730691486638, pvalue=9.92976330952728e-25)



pos_docembedding
Cross-Validation R² Scores: [-0.60327721 -1.46174049 -0.26754508]
Mean R² Score: -0.7775209253426114
Cross-Validated MSE: 0.5429915473486618
Spearman: SignificanceResult(statistic=0.489165471654992, pvalue=2.352436814187315e-20)



linguistic_profile
Cross-Validation R² Scores: [-0.6449824  -0.59766782 -0.4888334 ]
Mean R² Score: -0.5771612073666954
Cross-Validated MSE: 0.48466788399847904
Spearman: SignificanceResult(statistic=0.4166460627934451, pvalue=1.1708507128118755e-14)





# Classificazione

In [20]:
y_pol_discr = [0 if x <= 0.75 else 1 if x<=1.45 else 2 for x in y_pol]
y_pop_discr = [0 if x <= 1 else 1 for x in y_pop]

In [21]:
clf_svc = LinearSVC(C=0.1, random_state=42)

clf_rf = RandomForestClassifier(n_estimators=150, 
                             criterion='gini', 
                             max_depth=None, 
                             min_samples_split=2, 
                             min_samples_leaf=1, 
                             min_weight_fraction_leaf=0.0, 
                             max_features='sqrt', 
                             random_state=0, 
                             n_jobs=-1)

clf_lgbm = LGBMClassifier(random_state=8, verbose=-1)

models = {
    "svc": clf_svc,
    "rf": clf_rf,
    "lgbm": clf_lgbm
}

In [22]:
def models_x_repr(reprs, y):
    for model_name, model in models.items():
        print("-------------------")
        for rep_name, rep in reprs.items():
            print(model_name)
            print(rep_name)
            predictions = cross_val_predict(model, rep, y, cv=5)
            print(classification_report(y, predictions))

In [23]:
models_x_repr(reprs, y_pop_discr)

-------------------
svc
tfidf
              precision    recall  f1-score   support

           0       0.82      0.75      0.78       144
           1       0.80      0.86      0.83       171

    accuracy                           0.81       315
   macro avg       0.81      0.80      0.81       315
weighted avg       0.81      0.81      0.81       315

svc
doc_embedding
              precision    recall  f1-score   support

           0       0.82      0.75      0.79       144
           1       0.80      0.87      0.83       171

    accuracy                           0.81       315
   macro avg       0.81      0.81      0.81       315
weighted avg       0.81      0.81      0.81       315

svc
pos_docembedding
              precision    recall  f1-score   support

           0       0.82      0.75      0.79       144
           1       0.80      0.87      0.83       171

    accuracy                           0.81       315
   macro avg       0.81      0.81      0.81       315
weigh

In [24]:
models_x_repr(reprs, y_pol_discr)

-------------------
svc
tfidf
              precision    recall  f1-score   support

           0       0.65      0.60      0.62        89
           1       0.63      0.53      0.58       116
           2       0.63      0.76      0.69       110

    accuracy                           0.63       315
   macro avg       0.63      0.63      0.63       315
weighted avg       0.63      0.63      0.63       315

svc
doc_embedding
              precision    recall  f1-score   support

           0       0.63      0.58      0.60        89
           1       0.62      0.52      0.56       116
           2       0.62      0.76      0.69       110

    accuracy                           0.62       315
   macro avg       0.62      0.62      0.62       315
weighted avg       0.62      0.62      0.62       315

svc
pos_docembedding
              precision    recall  f1-score   support

           0       0.62      0.57      0.60        89
           1       0.61      0.52      0.56       116
      

### Classificazione top, con ottimizzazione parametri

In [39]:
scaler = StandardScaler()
def linearSvcBestParams(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    svc = LinearSVC(dual=False, max_iter=10000)

    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100],
        'loss': ['squared_hinge', 'hinge'],
        'penalty': ['l1', 'l2'],
        'class_weight': ['balanced']
    }

    grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print("Best Parameters:", best_params)
    print("Best Cross-Validation Accuracy:", grid_search.best_score_)

    best_model = LinearSVC(dual=False, **best_params)
    
    X_scaled = scaler.transform(X)
    predictions = cross_val_predict(best_model, X_scaled, y, cv=3)
    
    print(classification_report(y, predictions))

In [40]:
linearSvcBestParams(x_tfidf, y_pol_discr)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'C': 0.1, 'class_weight': 'balanced', 'loss': 'squared_hinge', 'penalty': 'l1'}
Best Cross-Validation Accuracy: 0.7541176470588236
              precision    recall  f1-score   support

           0       0.67      0.67      0.67        89
           1       0.71      0.72      0.71       116
           2       0.83      0.81      0.82       110

    accuracy                           0.74       315
   macro avg       0.74      0.74      0.74       315
weighted avg       0.74      0.74      0.74       315



In [41]:
linearSvcBestParams(x_tfidf, y_pop_discr)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'C': 0.1, 'class_weight': 'balanced', 'loss': 'squared_hinge', 'penalty': 'l1'}
Best Cross-Validation Accuracy: 0.8569411764705883
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       144
           1       0.87      0.82      0.84       171

    accuracy                           0.83       315
   macro avg       0.83      0.84      0.83       315
weighted avg       0.84      0.83      0.84       315

