In [12]:
import pandas as pd
import numpy as np
import ast
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [13]:
#annotated_df = pd.read_csv("data/annotated_dataset/annotated_texts_repr.csv", sep=",", encoding="utf-8")
annotated_df = pd.read_csv("data/annotated_dataset/annotated_texts_repr_pro_complete.csv", sep=",", encoding="utf-8")
meta_df = pd.read_csv("data/cleaned_dataset/meta.tsv", sep=",", encoding="utf-8")
merged_df = annotated_df.merge(meta_df, how='left', left_on='id', right_on="ID")

ling_prof_df = merged_df[["id", "pop_sum","polarization","Speaker_ID", "Speaker_party","Party_orientation","linguistic_profile_pro", "tfidf_pro", "doc_embedding_pro"]]
diz = {
    "LN-Aut": "Lega",
    "L-SP": "Lega",
    "M5S.1": "M5S",
    "M5S.2": "M5S",
}
ling_prof_df = ling_prof_df.replace({"Speaker_party": diz})
ling_prof_df = ling_prof_df.rename(columns={"linguistic_profile_pro": "linguistic_profile", "tfidf_pro": "tfidf", "doc_embedding_pro":"doc_embedding"})

In [14]:
def mean_std_lists(lists):
    lists = np.array([ast.literal_eval(item) for item in lists])
    return np.mean(lists, axis=0).tolist(), np.std(lists, axis=0).tolist()

#Aggregazione per senatore
df_grouped = ling_prof_df.groupby('Speaker_ID', as_index=False).agg({
    'pop_sum': lambda x: x.sum() / x.count(),
    'polarization': lambda x: x.sum() / x.count(),
    'Speaker_party': lambda x: x.mode()[0],
    'Party_orientation': lambda x: x.mode()[0]
})

#Media e std delle rappresentazioni del testo (per tutti i testi di un senatore)
for col in ['linguistic_profile', 'tfidf', 'doc_embedding']:
    df_grouped[col], df_grouped[col + '_std'] = zip(*ling_prof_df.groupby('Speaker_ID')[col].apply(lambda x: mean_std_lists(list(x))))

df_grouped["linguistic_profile"] = df_grouped["linguistic_profile"] + df_grouped["linguistic_profile_std"]
df_grouped["tfidf"] = df_grouped["tfidf"] + df_grouped["tfidf_std"]
df_grouped["doc_embedding"] = df_grouped["doc_embedding"] + df_grouped["doc_embedding_std"]
    
group_sizes = ling_prof_df.groupby('Speaker_ID').size().reset_index(name='Count_Per_Group') #quanti testi per senatore
df_grouped = df_grouped.merge(group_sizes, on='Speaker_ID')

#### Rimuovo i senatori con meno di 11 discorsi

In [15]:
df_grouped = df_grouped[df_grouped["Count_Per_Group"] > 10]

#### Estraggo X e y

In [16]:
X_ling = np.vstack(df_grouped["linguistic_profile"].values)
X_tfidf = np.vstack(df_grouped["tfidf"].values)
X_embed = np.vstack(df_grouped["doc_embedding"].values)

y_pop = np.vstack(df_grouped["pop_sum"].values)
y_pol = np.vstack(df_grouped["polarization"].values)

## Regressione

In [6]:
X_list = [X_ling,X_tfidf,X_embed]
model = LinearRegression()

In [151]:
scaler = StandardScaler()
def regressione(X_list, y):
    for X in X_list:
        X = scaler.fit_transform(X)
        cv_scores = cross_val_score(model, X, y, cv=3, scoring='r2')  # R^2 Score
        cv_predictions = cross_val_predict(model, X, y, cv=3)

        print("Cross-Validation R² Scores:", cv_scores)
        print("Mean R² Score:", cv_scores.mean())

        mse = mean_squared_error(y, cv_predictions)
        spearman = stats.spearmanr(y, cv_predictions)
        
        print("Cross-Validated MSE:", mse)
        print("Spearman:", spearman)
        print("\n\n")

In [165]:
regressione(X_list, y_pop)

Cross-Validation R² Scores: [ 0.26510557 -0.06528716  0.23662885]
Mean R² Score: 0.1454824194421639
Cross-Validated MSE: 0.8548573955855646
Spearman: SignificanceResult(statistic=0.6610623454741413, pvalue=6.109004153408424e-41)



Cross-Validation R² Scores: [0.62089376 0.69518964 0.68555786]
Mean R² Score: 0.667213753967158
Cross-Validated MSE: 0.33768524078182305
Spearman: SignificanceResult(statistic=0.8248523029710477, pvalue=1.628287915449282e-79)



Cross-Validation R² Scores: [0.43020179 0.32932422 0.34027319]
Mean R² Score: 0.3665997339634924
Cross-Validated MSE: 0.632943655167474
Spearman: SignificanceResult(statistic=0.7338055168661475, pvalue=1.71505146919277e-54)





In [166]:
regressione(X_list, y_pol)

Cross-Validation R² Scores: [-0.64498003 -0.59766047 -0.48887788]
Mean R² Score: -0.5771727957108537
Cross-Validated MSE: 0.48467166359955105
Spearman: SignificanceResult(statistic=0.4166182167292887, pvalue=1.1760826875320605e-14)



Cross-Validation R² Scores: [0.53581305 0.57911488 0.5058381 ]
Mean R² Score: 0.5402553448782266
Cross-Validated MSE: 0.1416874331141398
Spearman: SignificanceResult(statistic=0.742848109206971, pvalue=1.7124493886377726e-56)



Cross-Validation R² Scores: [-0.69716258 -0.39784321 -0.11238367]
Mean R² Score: -0.4024631559874166
Cross-Validated MSE: 0.42979735919827833
Spearman: SignificanceResult(statistic=0.535111861596661, pvalue=9.839077590182364e-25)





# Classificazione

In [7]:
X_ling = np.vstack(df_grouped["linguistic_profile"].values)
X_tfidf = np.vstack(df_grouped["tfidf"].values)
X_embed = np.vstack(df_grouped["doc_embedding"].values)

y_pop = np.vstack(df_grouped["pop_sum"].values)
y_pol = np.vstack(df_grouped["polarization"].values)

In [8]:
X_list = [X_ling,X_tfidf,X_embed]
y_pol = [0 if x <= 0.75 else 1 if x<=1.45 else 2 for x in y_pol]
y_pop = [0 if x < 1 else 1 for x in y_pop]

In [9]:
clf_linear_svc = LinearSVC(C=1.0, random_state=42)

In [10]:
scaler = StandardScaler()
def classificazione(X_list, y):
    for X in X_list:
        X = scaler.fit_transform(X)
        predictions = cross_val_predict(clf_linear_svc, X, y, cv=3)
        print(classification_report(y, predictions))

In [57]:
classificazione(X_list, y_pop)

              precision    recall  f1-score   support

           0       0.76      0.82      0.79       136
           1       0.86      0.80      0.83       179

    accuracy                           0.81       315
   macro avg       0.81      0.81      0.81       315
weighted avg       0.82      0.81      0.81       315

              precision    recall  f1-score   support

           0       0.75      0.84      0.79       136
           1       0.87      0.79      0.83       179

    accuracy                           0.81       315
   macro avg       0.81      0.82      0.81       315
weighted avg       0.82      0.81      0.81       315

              precision    recall  f1-score   support

           0       0.78      0.82      0.80       136
           1       0.86      0.83      0.84       179

    accuracy                           0.82       315
   macro avg       0.82      0.82      0.82       315
weighted avg       0.82      0.82      0.82       315





In [70]:
classificazione(X_list, y_pol)



              precision    recall  f1-score   support

           0       0.53      0.61      0.57        89
           1       0.54      0.53      0.53       116
           2       0.72      0.65      0.69       110

    accuracy                           0.59       315
   macro avg       0.60      0.60      0.59       315
weighted avg       0.60      0.59      0.60       315

              precision    recall  f1-score   support

           0       0.65      0.80      0.71        89
           1       0.77      0.66      0.71       116
           2       0.88      0.85      0.86       110

    accuracy                           0.76       315
   macro avg       0.76      0.77      0.76       315
weighted avg       0.77      0.76      0.76       315

              precision    recall  f1-score   support

           0       0.66      0.69      0.67        89
           1       0.62      0.59      0.61       116
           2       0.71      0.72      0.71       110

    accuracy        



### Classificazione top, con ottimizzazione parametri

In [17]:
y_pol = [0 if x <= 0.75 else 1 if x<=1.45 else 2 for x in y_pol]
y_pop = [0 if x < 1 else 1 for x in y_pop]
#X_tfidf

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [19]:
scaler = StandardScaler()
def linearSvcBestParams(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    svc = LinearSVC(dual=False, max_iter=10000)

    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100],
        'loss': ['squared_hinge', 'hinge'],
        'penalty': ['l1', 'l2'],
        'class_weight': ['balanced']
    }

    grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print("Best Parameters:", best_params)
    print("Best Cross-Validation Accuracy:", grid_search.best_score_)

    best_model = LinearSVC(dual=False, **best_params)
    
    X_scaled = scaler.transform(X)
    predictions = cross_val_predict(best_model, X_scaled, y, cv=3)
    
    print(classification_report(y, predictions))

In [20]:
linearSvcBestParams(X_tfidf, y_pol)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


50 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\fabio\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\fabio\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fabio\anaconda3\Lib\site-packages\sklearn\svm\_classes.py", line 315, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
                                           ^^^^^^^^^^^^^^^
  File "C:\U

Best Parameters: {'C': 0.1, 'class_weight': 'balanced', 'loss': 'squared_hinge', 'penalty': 'l1'}
Best Cross-Validation Accuracy: 0.7541176470588236
              precision    recall  f1-score   support

           0       0.67      0.67      0.67        89
           1       0.71      0.72      0.71       116
           2       0.83      0.81      0.82       110

    accuracy                           0.74       315
   macro avg       0.74      0.74      0.74       315
weighted avg       0.74      0.74      0.74       315



In [19]:
linearSvcBestParams(X_tfidf, y_pop)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


50 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\fabio\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\fabio\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fabio\anaconda3\Lib\site-packages\sklearn\svm\_classes.py", line 315, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
                                           ^^^^^^^^^^^^^^^
  File "C:\U

Best Parameters: {'C': 1, 'class_weight': 'balanced', 'loss': 'squared_hinge', 'penalty': 'l1'}
Best Cross-Validation Accuracy: 0.8211764705882352
              precision    recall  f1-score   support

           0       0.79      0.79      0.79       136
           1       0.84      0.84      0.84       179

    accuracy                           0.82       315
   macro avg       0.81      0.81      0.81       315
weighted avg       0.82      0.82      0.82       315



