In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import ast

  from pandas.core import (


In [2]:
annotated_texts_df = pd.read_csv("data/annotated_dataset/annotated_texts_repr.csv", sep=",", encoding="utf-8")

## One-hot Lexic Polarization Based

In [3]:
merged_df = annotated_texts_df.groupby('polarization', as_index=False).agg({'text': ' '.join})
texts = merged_df["text"]
ita_stopwords = stopwords.words('italian')

vectorizer = TfidfVectorizer(stop_words=ita_stopwords)
tfidf_matrix = vectorizer.fit_transform(texts)

feature_names = vectorizer.get_feature_names_out()

top_n = 20
top_20_common_words = []
for i, text in enumerate(texts):
    tfidf_values = tfidf_matrix[i].toarray().flatten()  # Get TF-IDF values for document i
    top_indices = np.argsort(tfidf_values)[::-1][:top_n]  # Get indices of top words
    top_words = [(feature_names[idx], tfidf_values[idx]) for idx in top_indices]
    top_20_common_words += [feature_names[idx] for idx in top_indices]
    
top_20_common_words = list(set(top_20_common_words))

def extract_present_words(text, word_list):
    words = set(text.lower().split())  # Basic tokenization (split by spaces)
    return list(words.intersection(word_list))

annotated_texts_df["matched_words"] = annotated_texts_df["text"].apply(lambda x: extract_present_words(x, top_20_common_words))

mlb = MultiLabelBinarizer(classes=top_20_common_words)
one_hot = mlb.fit_transform(annotated_texts_df["matched_words"])

annotated_texts_df["one_hot"] = list(one_hot)
annotated_texts_df.head(2)

Unnamed: 0,id,text,pop_sum,manichean,peoplecentrism,antielitism,emotional,polarization,tfidf,doc_embedding,doc_embedding_pos,linguistic_profile,matched_words,one_hot
0,ParlaMint-IT_2013-08-01-LEG17-Senato-sed-86.u153,"PETROCELLI . Signor Presidente, senatrici e se...",4,1,1,1,1,1,"[0.5361957907801886, 0.049413195954373046, 0.0...","[0.009776607354980394, 0.04375904489842546, -0...","[0.0025272382080579183, 0.002842237250819832, ...","[47.0, 1831.0, 38.95744680851064, 4.6773997569...","[legge, stato, signor, ancora, governo, poi, a...","[1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, ..."
1,ParlaMint-IT_2014-02-05-LEG17-Senato-sed-184.u79,Lo dico al senatore Casson e agli altri: capis...,3,0,1,1,1,1,"[0.46272910958786384, 0.09530122244710613, -0....","[0.01605109330957291, 0.024485928836790936, -0...","[0.003973030663484822, -0.023834898513667484, ...","[74.0, 1771.0, 23.93243243243243, 5.1573248407...","[commissione, legge, ancora, governo, poi, ann...","[1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, ..."


In [12]:
len(top_20_common_words)

28

## Sentiment and Emotions Extraction

In [30]:
emotion_texts_df = pd.read_csv("data/annotated_dataset/emotion_annotated_texts.csv", sep=",", encoding="utf-8")
annotated_texts_df = annotated_texts_df.merge(emotion_texts_df, how='left', left_on='id', right_on="id")

sentiment = annotated_texts_df['sentiment']

stars = [int(eval(sent)[0].split()[0]) for sent in sentiment]
sentiment = [eval(sent)[1:] for sent in sentiment]

sentiment = [[star]+sent for star, sent in zip(stars, sentiment)]

annotated_texts_df['sentiment'] = sentiment

## New Columns Creation 

In [31]:
x_onehot = annotated_texts_df["one_hot"]
x_sentiment = annotated_texts_df["sentiment"]

#tf_idf
x_tfidf = annotated_texts_df["tfidf"]
x_tfidf = np.array([ast.literal_eval(item) for item in x_tfidf])
x_tfidf_pro = [list(a)+list(b)+list(c) for a,b,c in zip(x_tfidf, x_onehot, x_sentiment)]
annotated_texts_df["tfidf_pro"] = x_tfidf_pro

#doc_embedding
x_docembedding = annotated_texts_df["doc_embedding"]
x_docembedding = np.array([ast.literal_eval(item) for item in x_docembedding])
x_docembedding_pro = [list(a)+list(b)+list(c) for a,b,c in zip(x_docembedding, x_onehot, x_sentiment)]
annotated_texts_df["doc_embedding_pro"] = x_docembedding_pro

#doc_embedding_pos
x_docembedding_pos = annotated_texts_df["doc_embedding_pos"]
x_docembedding_pos = np.array([ast.literal_eval(item) for item in x_docembedding_pos])
x_docembedding_pos_pro = [list(a)+list(b)+list(c) for a,b,c in zip(x_docembedding_pos, x_onehot, x_sentiment)]
annotated_texts_df["docembedding_pos_pro"] = x_docembedding_pos_pro

#linguistic_profile
x_linguistic_profile = annotated_texts_df["linguistic_profile"]
x_linguistic_profile = np.array([ast.literal_eval(item) for item in x_linguistic_profile])
x_linguistic_profile_pro = [list(a)+list(b)+list(c) for a,b,c in zip(x_linguistic_profile, x_onehot, x_sentiment)]
annotated_texts_df["linguistic_profile_pro"] = x_linguistic_profile_pro

In [33]:
annotated_texts_df.head(2)

Unnamed: 0,id,text,pop_sum,manichean,peoplecentrism,antielitism,emotional,polarization,tfidf,doc_embedding,doc_embedding_pos,linguistic_profile,matched_words,one_hot,sentiment,tfidf_pro,doc_embedding_pro,docembedding_pos_pro,linguistic_profile_pro
0,ParlaMint-IT_2013-08-01-LEG17-Senato-sed-86.u153,"PETROCELLI . Signor Presidente, senatrici e se...",4,1,1,1,1,1,"[0.5361957907801886, 0.049413195954373046, 0.0...","[0.009776607354980394, 0.04375904489842546, -0...","[0.0025272382080579183, 0.002842237250819832, ...","[47.0, 1831.0, 38.95744680851064, 4.6773997569...","[essere, ancora, lavoro, governo, già, politic...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, ...","[1, 0.9576303899288178, 0.012579217460006476, ...","[0.5361957907801886, 0.049413195954373046, 0.0...","[0.009776607354980394, 0.04375904489842546, -0...","[0.0025272382080579183, 0.002842237250819832, ...","[47.0, 1831.0, 38.95744680851064, 4.6773997569..."
1,ParlaMint-IT_2014-02-05-LEG17-Senato-sed-184.u79,Lo dico al senatore Casson e agli altri: capis...,3,0,1,1,1,1,"[0.46272910958786384, 0.09530122244710613, -0....","[0.01605109330957291, 0.024485928836790936, -0...","[0.003973030663484822, -0.023834898513667484, ...","[74.0, 1771.0, 23.93243243243243, 5.1573248407...","[essere, ancora, lavoro, governo, già, politic...","[1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, ...","[2, 0.9442833840847016, 0.014085201174020767, ...","[0.46272910958786384, 0.09530122244710613, -0....","[0.01605109330957291, 0.024485928836790936, -0...","[0.003973030663484822, -0.023834898513667484, ...","[74.0, 1771.0, 23.93243243243243, 5.1573248407..."


## New columns creation

In [34]:
annotated_texts_df.to_csv("data/annotated_dataset/annotated_texts_repr_pro_complete.csv", index=False)