In [11]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
import ast
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
import statistics

In [2]:
df = pd.read_csv("data/annotated_dataset/annotated_texts_repr_pro_complete.csv", sep=",", encoding="utf-8")

In [3]:
x_tfidf_pro = df["tfidf_pro"]
x_tfidf_pro = np.array([ast.literal_eval(item) for item in x_tfidf_pro])

y_pol = df["polarization"]

y_pop = df["pop_sum"]
y_pop = [0 if x < 2 else 1 for x in y_pop]

In [4]:
features =['parte','molto','lavoro','quando','anni','provvedimento','solo','prima','ancora','presidente','paese','legge','signor','governo','poi','fatto','italia','politica','commissione','già','stato','oggi','articolo','essere','ministro','quindi','fare','decreto','stars','score','neutral','approval','annoyance','realization','disapproval','confusion','disappointment','admiration','anger','disgust','amusement','optimism','excitement','sadness','joy','curiosity','fear','surprise','love','caring','gratitude','embarrassment','desire','grief','pride','relief','nervousness','remorse']
list300 = ["f"+str(i) for i in range(300)]
features_final = list300 + features

In [12]:
def get_coefs(X, Y):
    model = LinearSVC(C=0.1, class_weight='balanced', loss='squared_hinge', penalty='l1', dual=False, max_iter=10000)
    model.fit(X, Y)
    coefs_list = model.coef_
    
    return coefs_list

def coefs_analysis(X, Y, features, thr=1):
    coefs_list = get_coefs(X,Y)

    for coefs in coefs_list:
        diz = dict(zip(features, coefs))
        sorted_items = sorted(diz.items(), key=lambda item: item[1], reverse=True)
        filtered_data = [(s, n) for s, n in sorted_items if abs(n) >= thr]
        print(filtered_data)
        print("------")
        coefs_values = [x[1] for x in filtered_data]
        print("media: ", statistics.mean(coefs_values))
        print("std", statistics.stdev(coefs_values))
        print("median", statistics.median(coefs_values))
        print("min", min(coefs_values))
        print("max", max(coefs_values))
        #print(coefs_values)
        print("--------------")

def coefs_analysis_interpretable(X, Y):
    coefs_list = get_coefs(X,Y)
    
    for coefs in coefs_list:
        diz = dict(zip(features, coefs[301:]))
        sorted_items = sorted(diz.items(), key=lambda item: item[1], reverse=True)
        filtered_data = [(s, n) for s, n in sorted_items if abs(n) >= 0.05]
        print(filtered_data)
        print("------")
        
def how_many_sv(X,Y):
    print("Prestazioni:")
    model = SVC(kernel='linear', C=0.1, random_state=42)
    predictions = cross_val_predict(model, X, Y, cv=3)
    print(classification_report(Y, predictions))
    
    model = SVC(kernel='linear', C=0.1, random_state=42)
    model.fit(X, Y)
    support_vectors = model.support_vectors_
    print("#SV: ", len(support_vectors))

### Populism

In [13]:
coefs_analysis(x_tfidf_pro, y_pop, features_final)

[('f5', 7.566617821730621), ('f19', 3.5866729020432935), ('f3', 3.2281429328938107), ('f30', 2.864822543078501), ('f11', 2.7360443706329987), ('f10', 2.605091305371848), ('f2', 2.299439046229669), ('f26', 2.2411002460599594), ('f51', 2.1442759593212433), ('f22', 1.85628068426205), ('f116', 1.853032409708936), ('f20', 1.5337858984103108), ('f21', 1.135464118871208), ('f31', 1.0593882874546332), ('f71', -1.074343313313792), ('f39', -1.1052047139043608), ('f58', -1.1903667505793791), ('f12', -1.26846115638948), ('f121', -1.2887597678278904), ('f55', -1.3014203081768392), ('f18', -1.3465099779400225), ('f9', -1.3811904530731265), ('f44', -1.4432598574309456), ('f32', -1.4720610664056895), ('f7', -2.2848510631490684), ('f8', -2.5583302324253263), ('f43', -3.1857482103849084), ('f1', -3.2504472448608097)]
------
media:  0.44854301465026586
std 2.532335967794366
median -0.007477512929579433
min -3.2504472448608097
max 7.566617821730621
--------------


In [51]:
coefs_analysis_interpretable(x_tfidf_pro, y_pop)

[('stato', 0.05747833698055603), ('ministro', 0.05312843207764623), ('decreto', -0.07712492061537851), ('lavoro', -0.08106680171590175), ('stars', -0.20005228621286594)]
------


In [64]:
how_many_sv(x_tfidf_pro, y_pop)

Prestazioni:
              precision    recall  f1-score   support

           0       0.78      0.87      0.82      6366
           1       0.78      0.65      0.71      4474

    accuracy                           0.78     10840
   macro avg       0.78      0.76      0.76     10840
weighted avg       0.78      0.78      0.77     10840

#SV:  6922


### Polarization

In [15]:
coefs_analysis(x_tfidf_pro, y_pol, features_final)

[('f39', 3.748387252762641), ('f131', 2.738592592972002), ('f73', 2.361999978138488), ('f62', 2.2696264883732082), ('f6', 2.2053322928079266), ('f17', 2.0171616076235317), ('f72', 1.9861393132302811), ('f91', 1.9736522616406897), ('f79', 1.8923264593973383), ('f77', 1.7951934103994158), ('f71', 1.736065540100317), ('f138', 1.6745844122440559), ('f48', 1.6224639524877609), ('f4', 1.508182268570854), ('f28', 1.4769936792601874), ('f114', 1.3190149796137158), ('f9', 1.2993966753907198), ('f80', 1.2766268537529084), ('f65', 1.2550319170367035), ('f11', 1.2517614440614044), ('f33', 1.163189414295332), ('f169', 1.1109509569163507), ('f75', 1.1014113417574358), ('f8', 1.0864777564969326), ('f191', 1.011895662680073), ('f121', -1.0551859851317174), ('f146', -1.1811667607173861), ('f46', -1.3376156646878195), ('f143', -1.4167384295232697), ('f51', -1.4481311003194297), ('f23', -1.468349847976177), ('f52', -1.5543160728783671), ('f90', -1.7298784306628399), ('f7', -1.7495172911895474), ('f97', -

In [52]:
coefs_analysis_interpretable(x_tfidf_pro, y_pol)

[('ancora', 0.083552834200633), ('paese', 0.054372245285676885), ('già', 0.051433153737565525), ('quando', 0.05104282993806386), ('essere', 0.05101189851674764), ('commissione', -0.05961460110504681), ('lavoro', -0.06860178435019187)]
------
[('articolo', 0.05366397876571643), ('parte', -0.052001333089398846), ('poi', -0.08504811929068705), ('oggi', -0.10487147327355666)]
------
[('poi', 0.13514473440287336), ('oggi', 0.12470855914136139), ('lavoro', 0.07728284565577828), ('ministro', 0.06483028616401644), ('commissione', 0.058077055166231455), ('già', -0.06693448778850328), ('stars', -0.37463085059798684)]
------


In [65]:
how_many_sv(x_tfidf_pro, y_pol)

Prestazioni:
              precision    recall  f1-score   support

           0       0.88      0.00      0.01      1651
           1       0.58      0.74      0.65      4957
           2       0.61      0.65      0.63      4232

    accuracy                           0.59     10840
   macro avg       0.69      0.46      0.43     10840
weighted avg       0.64      0.59      0.54     10840

#SV:  8850


# Speaker data

In [16]:
df_speaker = pd.read_csv("data/annotated_dataset/speaker_data.csv", sep=",", encoding="utf-8")

In [17]:
x_tfidf_speaker = df_speaker["tfidf"]
x_tfidf_speaker = np.array([ast.literal_eval(item) for item in x_tfidf_speaker])

y_pol = df_speaker["polarization"]
y_pol = [0 if x <= 0.75 else 1 if x<=1.45 else 2 for x in y_pol]

y_pop = df_speaker["pop_sum"]
y_pop = [0 if x < 2 else 1 for x in y_pop]

In [18]:
interpretable_feats = ['parte','molto','lavoro','quando','anni','provvedimento','solo','prima','ancora','presidente','paese','legge','signor','governo','poi','fatto','italia','politica','commissione','già','stato','oggi','articolo','essere','ministro','quindi','fare','decreto','stars','score','neutral','approval','annoyance','realization','disapproval','confusion','disappointment','admiration','anger','disgust','amusement','optimism','excitement','sadness','joy','curiosity','fear','surprise','love','caring','gratitude','embarrassment','desire','grief','pride','relief','nervousness']
feature_speaker = ["f"+str(i) for i in range(300)] + [x+"_avg" for x in interpretable_feats] + ["f"+str(i) for i in range(358, 658)] + [x+"_std" for x in interpretable_feats]

#### Populism

In [19]:
coefs_analysis(x_tfidf_speaker, y_pop, feature_speaker, 0.01)

[('signor_avg', 0.7429140003979866), ('molto_avg', 0.5048570832888686), ('paese_avg', 0.4643182685672703), ('prima_avg', 0.3880001018644799), ('decreto_avg', 0.21724585291221388), ('italia_avg', 0.1814819164601278), ('politica_avg', 0.17642131214320178), ('commissione_avg', -0.1476567748269452), ('articolo_std', -0.29010781929159135), ('ancora_avg', -0.43577547319040694), ('presidente_avg', -0.48323445963928136), ('già_avg', -0.5263347401398555), ('stars_avg', -0.5859467234358269)]
------
media:  0.015860195777710896
std 0.4500616576212871
median 0.17642131214320178
min -0.5859467234358269
max 0.7429140003979866
--------------


In [45]:
how_many_sv(x_tfidf_speaker, y_pop)

Prestazioni:
              precision    recall  f1-score   support

           0       0.75      0.96      0.84       344
           1       0.79      0.33      0.47       163

    accuracy                           0.76       507
   macro avg       0.77      0.65      0.66       507
weighted avg       0.77      0.76      0.72       507

#SV:  311


#### Polarization

In [20]:
coefs_analysis(x_tfidf_speaker, y_pol, feature_speaker, 0.01)

[('presidente_avg', 0.6599375940811097), ('stato_avg', 0.38618037632415736), ('stars_std', 0.22346443460718274), ('essere_avg', 0.19865754865299498), ('parte_std', 0.17044188780773023), ('signor_std', 0.1137665292854202), ('stars_avg', 0.09929347927494614), ('oggi_avg', 0.056360300068453884), ('provvedimento_avg', 0.0561189929877814), ('anni_avg', 0.03991487540893726), ('politica_avg', 0.034557294544689844), ('parte_avg', -0.01913874886763936), ('lavoro_avg', -0.16228807022263778), ('già_avg', -0.16570224948328813), ('quindi_avg', -0.22918687864580028), ('articolo_avg', -0.2865442741082326), ('legge_avg', -0.39008730623772053), ('quando_avg', -0.5826138939288844)]
------
media:  0.011285105086066706
std 0.28831453950867447
median 0.048016934198359326
min -0.5826138939288844
max 0.6599375940811097
--------------
[('articolo_std', 0.6527827548933043), ('decreto_avg', 0.3268805153762193), ('legge_avg', 0.310485518895897), ('parte_avg', 0.1907360394759764), ('governo_avg', 0.07170269942584

In [51]:
how_many_sv(x_tfidf_speaker, y_pol)

Prestazioni:
              precision    recall  f1-score   support

           0       1.00      0.03      0.06       121
           1       0.53      0.78      0.63       207
           2       0.61      0.68      0.64       179

    accuracy                           0.57       507
   macro avg       0.71      0.50      0.45       507
weighted avg       0.67      0.57      0.50       507

#SV:  464
