In [59]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
import ast
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [11]:
df = pd.read_csv("data/annotated_dataset/annotated_texts_repr_pro_complete.csv", sep=",", encoding="utf-8")

In [17]:
x_tfidf_pro = df["tfidf_pro"]
x_tfidf_pro = np.array([ast.literal_eval(item) for item in x_tfidf_pro])

y_pol = df["polarization"]

y_pop = df["pop_sum"]
y_pop = [0 if x < 2 else 1 for x in y_pop]

In [63]:
features =['parte','molto','lavoro','quando','anni','provvedimento','solo','prima','ancora','presidente','paese','legge','signor','governo','poi','fatto','italia','politica','commissione','già','stato','oggi','articolo','essere','ministro','quindi','fare','decreto','stars','score','neutral','approval','annoyance','realization','disapproval','confusion','disappointment','admiration','anger','disgust','amusement','optimism','excitement','sadness','joy','curiosity','fear','surprise','love','caring','gratitude','embarrassment','desire','grief','pride','relief','nervousness','remorse']

def get_coefs(X, Y):
    model = LinearSVC(C=0.1, class_weight='balanced', loss='squared_hinge', penalty='l1', dual=False, max_iter=10000)
    model.fit(X, Y)
    coefs_list = model.coef_
    
    return coefs_list

def coefs_analysis(X, Y):
    coefs_list = get_coefs(X,Y)

    for coefs in coefs_list:
        list300 = ["f"+str(i) for i in range(300)]
        diz = dict(zip(list300+features, coefs))
        sorted_items = sorted(diz.items(), key=lambda item: item[1], reverse=True)
        filtered_data = [(s, n) for s, n in sorted_items if abs(n) >= 1]
        print(filtered_data)
        print("------")

def coefs_analysis_interpretable(X, Y):
    coefs_list = get_coefs(X,Y)
    
    for coefs in coefs_list:
        diz = dict(zip(features, coefs[301:]))
        sorted_items = sorted(diz.items(), key=lambda item: item[1], reverse=True)
        filtered_data = [(s, n) for s, n in sorted_items if abs(n) >= 0.05]
        print(filtered_data)
        print("------")
        
def how_many_sv(X,Y):
    print("Prestazioni:")
    model = SVC(kernel='linear', C=0.1, random_state=42)
    predictions = cross_val_predict(model, X, Y, cv=3)
    print(classification_report(Y, predictions))
    
    model = SVC(kernel='linear', C=0.1, random_state=42)
    model.fit(X, Y)
    support_vectors = model.support_vectors_
    print("#SV: ", len(support_vectors))

### Populism

In [40]:
coefs_analysis(x_tfidf_pro, y_pop)

[('f5', 7.566545083548244), ('f19', 3.586608866343949), ('f3', 3.2280973569332243), ('f30', 2.864806515487532), ('f11', 2.7360636192172443), ('f10', 2.605006476915449), ('f2', 2.2994145549109906), ('f26', 2.2410041819558826), ('f51', 2.1442968979744546), ('f22', 1.8562598608142868), ('f116', 1.853015305484178), ('f20', 1.5337862427635147), ('f21', 1.135365787331736), ('f31', 1.0593751439680643), ('f71', -1.0744156930274542), ('f39', -1.1051431445616497), ('f58', -1.190401162714236), ('f12', -1.2685178815003957), ('f121', -1.2887906349684597), ('f55', -1.3013362633430992), ('f18', -1.3464241018069916), ('f9', -1.381122191424459), ('f44', -1.4431021533095623), ('f32', -1.4720052740836993), ('f7', -2.2847283062079247), ('f8', -2.5584409058266515), ('f43', -3.1857453717216395), ('f1', -3.2504953774085936)]
------


In [51]:
coefs_analysis_interpretable(x_tfidf_pro, y_pop)

[('stato', 0.05747833698055603), ('ministro', 0.05312843207764623), ('decreto', -0.07712492061537851), ('lavoro', -0.08106680171590175), ('stars', -0.20005228621286594)]
------


In [64]:
how_many_sv(x_tfidf_pro, y_pop)

Prestazioni:
              precision    recall  f1-score   support

           0       0.78      0.87      0.82      6366
           1       0.78      0.65      0.71      4474

    accuracy                           0.78     10840
   macro avg       0.78      0.76      0.76     10840
weighted avg       0.78      0.78      0.77     10840

#SV:  6922


### Polarization

In [42]:
coefs_analysis(x_tfidf_pro, y_pol)

[('f39', 3.7481519824705973), ('f131', 2.7385391808777277), ('f73', 2.3622456487425985), ('f62', 2.2693830248041253), ('f6', 2.2047207838136553), ('f17', 2.0171405778536156), ('f72', 1.9860131380249353), ('f91', 1.9736573042425753), ('f79', 1.8920533794866587), ('f77', 1.7952646203899327), ('f71', 1.736197708135079), ('f138', 1.674231567886354), ('f48', 1.6222513424442078), ('f4', 1.5084264669836167), ('f28', 1.4770414048189002), ('f114', 1.3189615710952296), ('f9', 1.2991259411322562), ('f80', 1.2767455033352144), ('f65', 1.2551478333744879), ('f11', 1.2516898943070411), ('f33', 1.1631476852233364), ('f169', 1.1109576127517922), ('f75', 1.1013722870216074), ('f8', 1.0868137338307922), ('f191', 1.0119573813378389), ('f121', -1.055143867652856), ('f146', -1.1811103658680646), ('f46', -1.3379549697647917), ('f143', -1.4168508852540391), ('f51', -1.4482703321853412), ('f23', -1.4683333159097127), ('f52', -1.5540794235899262), ('f90', -1.7301095936810278), ('f7', -1.7498352206972683), ('f9

In [52]:
coefs_analysis_interpretable(x_tfidf_pro, y_pol)

[('ancora', 0.083552834200633), ('paese', 0.054372245285676885), ('già', 0.051433153737565525), ('quando', 0.05104282993806386), ('essere', 0.05101189851674764), ('commissione', -0.05961460110504681), ('lavoro', -0.06860178435019187)]
------
[('articolo', 0.05366397876571643), ('parte', -0.052001333089398846), ('poi', -0.08504811929068705), ('oggi', -0.10487147327355666)]
------
[('poi', 0.13514473440287336), ('oggi', 0.12470855914136139), ('lavoro', 0.07728284565577828), ('ministro', 0.06483028616401644), ('commissione', 0.058077055166231455), ('già', -0.06693448778850328), ('stars', -0.37463085059798684)]
------


In [65]:
how_many_sv(x_tfidf_pro, y_pol)

Prestazioni:
              precision    recall  f1-score   support

           0       0.88      0.00      0.01      1651
           1       0.58      0.74      0.65      4957
           2       0.61      0.65      0.63      4232

    accuracy                           0.59     10840
   macro avg       0.69      0.46      0.43     10840
weighted avg       0.64      0.59      0.54     10840

#SV:  8850
