In [42]:
import pandas as pd
import sqlite3
import stanza
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [32]:
nlp = spacy.load("it_core_news_sm", disable=["ner", "parser"])

In [24]:
nlp = stanza.Pipeline(lang ='it', processors="tokenize,pos,lemma", use_gpu=True)  # Enable GPU if available

2025-03-11 15:38:21 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2025-03-11 15:38:22 INFO: Downloaded file to C:\Users\fabio\stanza_resources\resources.json
2025-03-11 15:38:22 INFO: Loading these models for language: it (Italian):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2025-03-11 15:38:22 INFO: Using device: cpu
2025-03-11 15:38:22 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2025-03-11 15:38:22 INFO: Loading: mwt
2025-03-11 15:38:22 INFO: Loading: pos
2025-03-11 15:38:23 INFO: Loading: lemma
2025-03-11 15:38:23 INFO: Done loading processors!


In [2]:
annotated_texts_pd = pd.read_csv("data/annotated_dataset/annotated_texts.csv", sep=",", encoding="utf-8")

## TF-IDF

In [2]:
annotated_texts_df = pd.read_csv("data/annotated_dataset/annotated_texts_repr.csv", sep=",", encoding="utf-8")

In [3]:
vectorizer = TfidfVectorizer()

In [5]:
tfidf_matrix = vectorizer.fit_transform(annotated_texts_df['text'])

In [43]:
n_components = 300
svd = TruncatedSVD(n_components=n_components)
reduced_matrix = svd.fit_transform(tfidf_matrix)

In [50]:
tfidf_final = reduced_matrix.tolist()

In [51]:
annotated_texts_df["tfidf"] = tfidf_final

In [56]:
#annotated_texts_df.to_csv("data/annotated_dataset/annotated_texts_repr.csv", index=False)

## Document Embeddings

In [64]:
sqliteConnection = sqlite3.connect('word_embeddings/itwac128.sqlite')
cursor = sqliteConnection.cursor()
vettori_estratti = {}

In [65]:
def estraiVettore(token):
    if token in vettori_estratti.keys():
        return vettori_estratti[token]
    
    sqlite_select_query = """SELECT * from store WHERE key=?"""
    cursor.execute(sqlite_select_query, (token,))
    record = cursor.fetchall()
    if len(record)>0:
        record = list(record[0])
        vettori_estratti[token] = record[1:-1]
    else:
        return -2
    return record[1:-1] #escludo il primo e l'ultimo elemento (il token e un codice identificativo)

In [66]:
def create_doc_representations(texts_list, pos_option):
    doc_vecs = []
    testi_processati = 0
    for text in texts_list:
        word_vec_list = []
        text_nlp = nlp(text)

        for token in text_nlp:
            if pos_option == False:
                word_vec = estraiVettore(token.lemma_)
            elif token.pos_ in ["NOUN", "VERB", "ADJ"]:
                word_vec = estraiVettore(token.lemma_)
            else:
                continue

            if word_vec == -2:
                continue

            word_vec_list.append(word_vec)

        doc_vec = np.mean(word_vec_list, axis=0).tolist()

        doc_vecs.append(doc_vec)
        testi_processati+=1
        if testi_processati%100 == 0:
            print(testi_processati, end = " ")

    return doc_vecs
    #annotated_texts_pd[column_name] = doc_vecs

In [67]:
texts = annotated_texts_pd.text
doc_vecs = create_doc_representations(texts, False)
annotated_texts_pd["doc_embedding"] = doc_vecs

100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 10100 10200 10300 10400 10500 10600 10700 10800 

#### POS Doc Embedding

In [94]:
texts = annotated_texts_pd.text
doc_vecs = create_doc_representations(texts, True)
annotated_texts_pd["doc_embedding_pos"] = doc_vecs

100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 10100 10200 10300 10400 10500 10600 10700 10800 

## Linguistic Features

In [99]:
conllu_df = pd.read_csv("data/annotated_dataset/linguistic_profile_conllu.csv", sep="\t", encoding="utf-8")

In [100]:
conllu_df

Unnamed: 0,Filename,n_sentences,n_tokens,tokens_per_sent,char_per_tok,ttr_lemma_chunks_100,ttr_lemma_chunks_200,ttr_form_chunks_100,ttr_form_chunks_200,upos_dist_ADJ,...,principal_proposition_dist,subordinate_proposition_dist,subordinate_post,subordinate_pre,avg_subordinate_chain_len,subordinate_dist_1,subordinate_dist_2,subordinate_dist_3,subordinate_dist_4,subordinate_dist_5
0,separated_texts/ParlaMint-IT_2016-11-08-LEG17-...,34,1252,36.823529,5.165639,0.64,0.525,0.69,0.580,8.226837,...,25.454545,74.545455,97.560976,2.439024,1.333333,73.015873,22.222222,3.174603,1.587302,0.0
1,separated_texts/ParlaMint-IT_2016-09-20-LEG17-...,102,2584,25.333333,4.759769,0.63,0.520,0.71,0.615,4.721362,...,33.453237,66.546763,81.621622,18.378378,1.272109,78.231293,17.687075,2.721088,1.360544,0.0
2,separated_texts/ParlaMint-IT_2021-07-07-LEG18-...,26,915,35.192308,4.633129,0.63,0.515,0.69,0.605,5.792350,...,28.089888,71.910112,81.250000,18.750000,1.489362,63.829787,23.404255,12.765957,0.000000,0.0
3,separated_texts/ParlaMint-IT_2021-06-24-LEG18-...,58,1494,25.758621,4.686377,0.53,0.535,0.62,0.630,6.492637,...,36.764706,63.235294,93.023256,6.976744,1.294118,76.470588,17.647059,5.882353,0.000000,0.0
4,separated_texts/ParlaMint-IT_2022-05-12-LEG18-...,53,1720,32.452830,4.766667,0.63,0.530,0.74,0.605,6.802326,...,30.519481,69.480519,75.700935,24.299065,1.273810,78.571429,16.666667,3.571429,1.190476,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10835,separated_texts/ParlaMint-IT_2015-11-20-LEG17-...,53,1658,31.283019,4.701731,0.70,0.565,0.82,0.670,5.548854,...,28.930818,71.069182,88.495575,11.504425,1.269663,76.404494,20.224719,3.370787,0.000000,0.0
10836,separated_texts/ParlaMint-IT_2016-03-16-LEG17-...,41,920,22.439024,4.662665,0.58,0.500,0.64,0.580,5.108696,...,40.476190,59.523810,86.000000,14.000000,1.282051,76.923077,17.948718,5.128205,0.000000,0.0
10837,separated_texts/ParlaMint-IT_2020-10-21-LEG18-...,51,1470,28.823529,4.868421,0.68,0.545,0.74,0.635,5.578231,...,35.074627,64.925373,87.356322,12.643678,1.205479,84.931507,9.589041,5.479452,0.000000,0.0
10838,separated_texts/ParlaMint-IT_2016-05-12-LEG17-...,83,2035,24.518072,4.677036,0.62,0.490,0.71,0.570,3.341523,...,34.666667,65.333333,88.435374,11.564626,1.361111,68.518519,26.851852,4.629630,0.000000,0.0


In [117]:
conllu_df = pd.read_csv("data/annotated_dataset/linguistic_profile_conllu.csv", sep="\t", encoding="utf-8")

conllu_df["Filename"] = conllu_df["Filename"].str.replace("separated_texts/", "")
conllu_df["Filename"] = conllu_df["Filename"].str.replace(".conllu", "")

columns_to_aggregate = list(conllu_df.columns)
columns_to_aggregate.remove("Filename")

conllu_df['linguistic_profile'] = conllu_df[columns_to_aggregate].apply(lambda row: row.tolist(), axis=1)
conllu_df = conllu_df.drop(columns=columns_to_aggregate)

In [118]:
conllu_df

Unnamed: 0,Filename,linguistic_profile
0,ParlaMint-IT_2016-11-08-LEG17-Senato-sed-717.u65,"[34.0, 1252.0, 36.8235294117647, 5.16563876651..."
1,ParlaMint-IT_2016-09-20-LEG17-Senato-sed-681.u30,"[102.0, 2584.0, 25.33333333333333, 4.759769094..."
2,ParlaMint-IT_2021-07-07-LEG18-Senato-sed-343.u34,"[26.0, 915.0, 35.19230769230769, 4.63312883435..."
3,ParlaMint-IT_2021-06-24-LEG18-Senato-sed-341.u22,"[58.0, 1494.0, 25.75862068965517, 4.6863772455..."
4,ParlaMint-IT_2022-05-12-LEG18-Senato-sed-432.u34,"[53.0, 1720.0, 32.45283018867924, 4.7666666666..."
...,...,...
10835,ParlaMint-IT_2015-11-20-LEG17-Senato-sed-540.u72,"[53.0, 1658.0, 31.28301886792453, 4.7017310252..."
10836,ParlaMint-IT_2016-03-16-LEG17-Senato-sed-594.u71,"[41.0, 920.0, 22.4390243902439, 4.662665066026..."
10837,ParlaMint-IT_2020-10-21-LEG18-Senato-sed-267.u36,"[51.0, 1470.0, 28.823529411764707, 4.868421052..."
10838,ParlaMint-IT_2016-05-12-LEG17-Senato-sed-625.u39,"[83.0, 2035.0, 24.518072289156628, 4.677036199..."


In [119]:
annotated_texts_pd = annotated_texts_pd.merge(conllu_df, how='left', left_on='id', right_on="Filename")
annotated_texts_pd = annotated_texts_pd.drop(columns="Filename")

MergeError: Passing 'suffixes' which cause duplicate columns {'linguistic_profile_x'} is not allowed.

In [123]:
annotated_texts_pd

Unnamed: 0,id,text,pop_sum,manichean,peoplecentrism,antielitism,emotional,polarization,tfidf,doc_embedding,doc_embedding_pos,linguistic_profile
0,ParlaMint-IT_2013-08-01-LEG17-Senato-sed-86.u153,"PETROCELLI . Signor Presidente, senatrici e se...",4,1,1,1,1,1,"[0.0, 0.011335531563572565, 0.0, 0.0, 0.0, 0.0...","[0.009776607354980394, 0.04375904489842546, -0...","[0.0025272382080579183, 0.002842237250819832, ...","[47.0, 1831.0, 38.95744680851064, 4.6773997569..."
1,ParlaMint-IT_2014-02-05-LEG17-Senato-sed-184.u79,Lo dico al senatore Casson e agli altri: capis...,3,0,1,1,1,1,"[0.0, 0.02875879481417657, 0.0, 0.0, 0.0, 0.0,...","[0.01605109330957291, 0.024485928836790936, -0...","[0.003973030663484822, -0.023834898513667484, ...","[74.0, 1771.0, 23.93243243243243, 5.1573248407..."
2,ParlaMint-IT_2019-03-06-LEG18-Senato-sed-97.u7,"Signor Presidente, onorevoli colleghe e colleg...",4,1,1,1,1,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.011278112216687432, 0.05985850600530272, -0...","[-0.008258688151666298, 0.021317027991727473, ...","[33.0, 1097.0, 33.24242424242424, 4.5995934959..."
3,ParlaMint-IT_2013-05-29-LEG17-Senato-sed-30.u44,"Signora Presidente, colleghi tutti, spero di e...",4,1,1,1,1,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0006722444550905756, 0.07540853068564031, ...","[-0.014405182751811067, 0.03559116223514138, 0...","[55.0, 1515.0, 27.545454545454547, 4.777949113..."
4,ParlaMint-IT_2022-06-21-LEG18-Senato-sed-443.u59,"Signor Presidente, il MoVimento 5 Stelle dal 2...",2,0,1,0,1,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.027362786197616205, 0.06471848793768352, -0...","[0.029057294426015463, 0.02832268738432608, -0...","[30.0, 869.0, 28.966666666666665, 5.0075282308..."
...,...,...,...,...,...,...,...,...,...,...,...,...
10835,ParlaMint-IT_2022-09-13-LEG18-Senato-sed-464.u21,"Signor Presidente, onorevoli colleghi, quello ...",0,0,0,0,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.016540360350742925, 0.04646638334196503, -0...","[0.0016293232910784535, -0.005975170270886753,...","[32.0, 849.0, 26.53125, 5.015686274509804, 0.6..."
10836,ParlaMint-IT_2022-09-13-LEG18-Senato-sed-464.u23,"Signor Presidente, oggi avrei voluto passare u...",4,1,1,1,1,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0032089870735787546, 0.06251824687105181, 0...","[-0.012754363530149821, 0.01493947182789026, 0...","[32.0, 925.0, 28.90625, 4.836320191158901, 0.7..."
10837,ParlaMint-IT_2022-09-13-LEG18-Senato-sed-464.u203,"COMINCINI . Signor Presidente, il decreto-legg...",0,0,0,0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.019410288011486792, 0.0504892838201821, -0....","[0.014195417501666627, 0.0057480887436942105, ...","[27.0, 878.0, 32.51851851851852, 4.97997496871..."
10838,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.u8,"VALENTE . Signor Presidente, garantisco che pe...",1,0,0,0,1,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.014758021257211556, 0.0771865495514303, -0....","[-0.01461941689657874, 0.030339930842891095, -...","[33.0, 1001.0, 30.33333333333333, 4.8287037037..."


In [124]:
annotated_texts_pd.to_csv("data/annotated_dataset/annotated_texts_repr.csv", index=False)