# Cargamos interacciones etiquetadas

In [2]:
import pandas as pd

# Load the labeled tweet/reply interactions and show how many rows and columns were read.
df = pd.read_json("./data/covid_twitter_sample_2020-04-16_interactions_labeled_chatgpt_revised_full.json")
df.shape

(970, 4)

In [3]:
df.columns

Index(['tweet1', 'tweet2', 'razonamiento', 'etiqueta'], dtype='object')

In [4]:
# Load the raw interactions table and list its columns; unlike the labeled
# file, it carries the tweet ids of both the original tweet and the reply.
df_interactions = pd.read_csv("data/covid_twitter_sample_2020-04-16_interactions.csv")
df_interactions.columns

Index(['created_date', 'topic', 'topic_prob', 'original_created',
       'original_tweet_id', 'original_tweet_text', 'original_user_id',
       'reply_created', 'reply_tweet_id', 'reply_text', 'reply_user_id'],
      dtype='object')

In [5]:
# Attach the tweet ids to every labeled pair by matching on the exact
# (original text, reply text) combination.
# NOTE(review): this rebinds `df`, so running the cell a second time merges
# the already-merged frame again.
df = pd.merge(
    df,
    df_interactions[['original_tweet_id', 'original_tweet_text', 'reply_tweet_id', 'reply_text']],
    how="inner",
    left_on=["tweet1", "tweet2"],
    right_on=['original_tweet_text', "reply_text"],
)

In [6]:
df.shape

(970, 8)

In [7]:
# Unique original tweets, exposed under the common (tweet_id, full_text) schema.
df_originals = (
    df.loc[:, ["original_tweet_id", "original_tweet_text"]]
    .drop_duplicates()
    .rename(columns={"original_tweet_id": "tweet_id", "original_tweet_text": "full_text"})
)

# Replies under the same schema (deduplicated together with the originals below).
df_replies = df.loc[:, ["reply_tweet_id", "reply_text"]].rename(
    columns={"reply_tweet_id": "tweet_id", "reply_text": "full_text"}
)

# One row per distinct (tweet_id, full_text), originals first and replies after.
df_single_tuits = pd.concat([df_originals, df_replies], axis=0).drop_duplicates()

In [8]:
df_single_tuits.shape

(1110, 2)

In [9]:
# Plain Python list with the ids of every individual tweet (despite the `df_` prefix).
df_tuit_ids = df_single_tuits["tweet_id"].tolist()

In [10]:
len(df_tuit_ids)

1110

# Generamos _features_
Inspiradas en el trabajo
[Explotando características contextuales para la detección de posturas en Twitter en el marco de la vacunación del COVID-19 en Argentina.](https://docs.google.com/document/d/10kzaOA857nJynijoRkFd9J-OuBVBsMphwoVQr7EiRs8/edit?usp=sharing)

(Trabajo Especial de Licenciatura en Ciencias de la Computación, FaMAF, UNC. Mariano Schmidt (2021))


## Sociales
Para cada tweet y su respuesta, extraemos las siguientes features de actividad social:

- Cantidad de Retweets: Número de retweets que tiene el tweet analizado.
- Cantidad de Replies
- Cantidad de Likes
- Es cita: Si el tweet analizado es una cita de otro tweet.
- Cantidad de citas al tuit analizado.


In [11]:
from pymongo import MongoClient

In [12]:
# Open a MongoDB client using the driver's default connection settings.
client = MongoClient()
db = client["twits_db"]
# NOTE(review): the result of this call is discarded (it is not the last
# expression of the cell); at most it acts as a connectivity check.
db.list_collection_names()
# Collection that the social-feature aggregation further down is run against.
col = db["db.5e8fc63ddd8efca7e56c3215"]

In [14]:
from datetime import datetime
# Time window (UTC offsets given explicitly) for the tweets of interest. The
# aggregation `$match` further down applies it as a half-open interval:
# created_at >= start_ts and created_at < end_ts.
start_ts = datetime.fromisoformat('2020-04-12T00:00:00+00:00')
end_ts =  datetime.fromisoformat('2020-04-19T00:00:00+00:00')

In [19]:
col.find_one()

{'_id': ObjectId('5e8fc682dd8efca7e56c329e'),
 'created_at': 'Fri Apr 10 01:05:07 +0000 2020',
 'id': 1248416689315692544,
 'id_str': '1248416689315692544',
 'text': 'RT @Hispantv: El Ministerio iraní de Petróleo dice que el acuerdo entre Rusia y la OPEP para reducir la producción masiva de petróleo no af…',
 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
 'truncated': False,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 801507974422228992,
  'id_str': '801507974422228992',
  'name': 'LUCI ESPERANZA',
  'screen_name': 'luci_esperanza1',
  'location': None,
  'url': None,
  'description': 'Busquemos mejorar a México con tu ayuda... Eliminemos a los malos gobiernos es la única solución! INFORMAR ES LA ÚNICA ARMA ANTE LA MANIPULACIÓN.\n#VaPorMéxico',
  'translator_type': 'none',
  'protected': False,
  'verified': Fal

In [20]:
df_single_tuits["tweet_id"].values.tolist()

[1250575897687666694,
 1250575908295016450,
 1250576574207987714,
 1250579583470055425,
 1250577707777159169,
 1250583051312250881,
 1250579339281862658,
 1250583534575779842,
 1250581294574194688,
 1250586109035327489,
 1250586649198813184,
 1250587987685052418,
 1250590481677004801,
 1250586418948329475,
 1250592728959811584,
 1250594178188402689,
 1250589340016705537,
 1250584505708052482,
 1250598499252568065,
 1250591944780087296,
 1250582988166967297,
 1250602587059040257,
 1250599957276643328,
 1250606178884898816,
 1250584493464915970,
 1250625142444695554,
 1250625731111108613,
 1250575964456849408,
 1250628144790454274,
 1250630404731789312,
 1250626608827904003,
 1250638099786412032,
 1250584102979461122,
 1250629056590200832,
 1250646550596014080,
 1250657731306033164,
 1250652645003583488,
 1250662717880492032,
 1250639405955629062,
 1250662142904348672,
 1250648615053393922,
 1250677828699521024,
 1250678670244622340,
 1250687811382059010,
 1250685113341542400,
 125070934

In [23]:
# Aggregation in two stages:
#   1. keep only the tweet id and the social-activity fields, parsing the
#      textual `created_at` into a date;
#   2. keep the tweets of our id list whose date falls in [start_ts, end_ts).
pipeline = [
    {'$project': {
        "id": 1,
        "_id": 0,
        'created_at': {'$dateFromString': {'dateString': '$created_at'}},
        'is_quote_status': 1,
        'quote_count': 1,
        'reply_count': 1,
        'retweet_count': 1,
        'favorite_count': 1,
        'favorited': 1,
        'retweeted': 1,
    }},
    {'$match': {
        "created_at": {"$gte": start_ts, "$lt": end_ts},
        "id": {"$in": df_single_tuits["tweet_id"].values.tolist()},
    }},
]

cursor = col.aggregate(pipeline)

# `created_at` was only needed for the date filter, so it is left out of the rows.
social_feats_rows = [
    {field: value for field, value in doc.items() if field != "created_at"}
    for doc in cursor
]

social_feats_df = pd.DataFrame(social_feats_rows)

In [24]:
social_feats_df.shape

(1110, 8)

In [25]:
social_feats_df

Unnamed: 0,id,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted
0,1250575897687666694,False,0,0,0,0,False,False
1,1250575908295016450,False,0,0,0,0,False,False
2,1250575964456849408,True,0,0,0,0,False,False
3,1250576180253728776,False,0,0,0,0,False,False
4,1250576574207987714,False,0,0,0,0,False,False
...,...,...,...,...,...,...,...,...
1105,1251583711126011904,False,0,0,0,0,False,False
1106,1251584868988841984,False,0,0,0,0,False,False
1107,1251624849967022081,False,0,0,0,0,False,False
1108,1251657306712006657,False,0,0,0,0,False,False


In [26]:
# Save the social features to disk, omitting the index column.
social_feats_df.to_csv("data/social_features.csv", index=False)

In [84]:
import re
import numpy as np
import pandas as pd

def extract_structural_features(df, text_col='full_text', inplace=False):
    """
    Add structural (surface-level) text features computed from `df[text_col]`.

    Missing texts are treated as empty strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts.
    text_col : str
        Name of the column with the text to analyse.
    inplace : bool
        When False (default) the features are added to a copy of `df`;
        when True they are added to `df` itself.

    Returns
    -------
    pandas.DataFrame
        The frame (copy or original, see `inplace`) with these columns added,
        in this order:
          - question_mark_count: occurrences of '?' plus occurrences of '¿'
          - question_mark (bool): at least one question mark
          - starts_with_question_mark (bool): first non-blank char is '?' or '¿'
          - ends_with_question_mark (bool): last non-blank char is '?'
          - exclamation_mark_count: occurrences of '!' plus occurrences of '¡'
          - exclamation_mark (bool): at least one exclamation mark
          - ellipsis_occurrence (bool): three or more consecutive dots
          - hashtag_presence (bool): a '#' followed by a word character
          - url_count: number of http(s):// or www. URLs
          - quotes (bool): two quote characters on the same line
          - text_length: characters left after removing URLs, @handles and
            #hashtags and trimming the ends
          - text_word_count: whitespace-separated tokens of that cleaned text
          - upper_ratio: uppercase letters / alphabetic letters (0.0 if none)
    """
    out = df if inplace else df.copy()
    text = out[text_col].fillna('').astype(str)

    leading_trimmed = text.str.lstrip()
    trailing_trimmed = text.str.rstrip()

    # Question marks (opening and closing signs are both counted)
    n_questions = text.str.count(r'\?') + text.str.count(r'¿')
    out['question_mark_count'] = n_questions
    out['question_mark'] = n_questions > 0
    out['starts_with_question_mark'] = (
        leading_trimmed.str.startswith('?') | leading_trimmed.str.startswith('¿')
    )
    out['ends_with_question_mark'] = trailing_trimmed.str.endswith('?')

    # Exclamation marks (opening and closing signs are both counted)
    n_exclamations = text.str.count(r'!') + text.str.count(r'¡')
    out['exclamation_mark_count'] = n_exclamations
    out['exclamation_mark'] = n_exclamations > 0

    # Presence flags and URL count
    out['ellipsis_occurrence'] = text.str.contains(r'\.{3,}', regex=True)
    out['hashtag_presence'] = text.str.contains(r'#\w', regex=True)
    out['url_count'] = text.str.count(r'https?://\S+|www\.\S+')
    out['quotes'] = text.str.contains(r'["“”«»\'].*?["“”«»\']', regex=True)

    # Size of the text once URLs, handles and hashtags are taken out
    without_entities = text.str.replace(r'https?://\S+|www\.\S+|@\w+|#\w+', '', regex=True)
    trimmed = without_entities.str.strip()
    out['text_length'] = trimmed.str.len()
    single_spaced = trimmed.replace(r'\s+', ' ', regex=True)
    out['text_word_count'] = single_spaced.str.split().map(
        lambda words: len(words) if isinstance(words, list) else 0
    )

    def upper_ratio_fn(value):
        """Share of the alphabetic characters of `value` that are uppercase."""
        n_alpha = 0
        n_upper = 0
        for ch in value:
            if ch.isalpha():
                n_alpha += 1
                n_upper += ch.isupper()
        return n_upper / n_alpha if n_alpha else 0.0

    out['upper_ratio'] = text.map(upper_ratio_fn)

    return out

In [85]:
# Structural features for every individual tweet (originals and replies).
# `inplace` keeps its default (False), so `df_single_tuits` itself is not modified.
df_structural = extract_structural_features(df_single_tuits, text_col='full_text')

In [86]:
df_structural

Unnamed: 0,tweet_id,full_text,question_mark_count,question_mark,starts_with_question_mark,ends_with_question_mark,exclamation_mark_count,exclamation_mark,ellipsis_occurrence,hashtag_presence,url_count,quotes,text_length,text_word_count,upper_ratio
0,1250575897687666694,Ministerio de Trabajo analiza suspensión labor...,0,False,False,False,0,False,False,False,2,False,113,20,0.121951
1,1250575908295016450,"Director de la OMS pidió a Trump ""poner en cua...",0,False,False,False,0,False,False,False,1,True,163,27,0.103448
2,1250576574207987714,En México hay 5 mil 847 casos confirmados de #...,0,False,False,False,0,False,False,True,1,False,203,34,0.031847
3,1250579583470055425,"Ahora sí lo escuché todo, venezolanos, nicarag...",0,False,False,False,0,False,False,True,1,False,168,30,0.067073
4,1250577707777159169,Gobernadores de PAN afirman que devolverán las...,0,False,False,False,0,False,False,False,1,False,132,19,0.109375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,1251583711126011904,@Javier_Alatorre Los expertos no han dado luz ...,0,False,False,False,1,True,False,False,0,False,265,46,0.035398
966,1251584868988841984,@CarlosAlvQ https://t.co/kpiF1eh270 lo creo M...,0,False,False,False,0,False,False,False,1,False,49,11,0.080645
967,1251624849967022081,"@ladivaza Si no es el COVID, alguna venérea po...",0,False,False,False,0,False,False,False,0,False,59,10,0.109091
968,1251657306712006657,@alvarosarco Todos los muertos valen lo mismo ...,0,False,False,False,0,False,False,False,0,False,96,18,0.011364



## Textuales

- Sentence embedding de tweet y respuesta (mismos que utilizamos para el modelado de temas).

In [8]:
import pandas as pd
import numpy as np

# Full tweet sample and its precomputed sentence embeddings.
# NOTE(review): the two files are assumed to be row-aligned (row k of the
# matrix is the embedding of row k of the CSV) — confirm.
df_sample = pd.read_csv("data/covid_twitter_sample_2020-04-16.csv")
embeddings_sample = np.load("data/covid_twitter_sample_2020-04-16.npy")

In [11]:
df_sample.shape

(971679, 5)

In [9]:
embeddings_sample.shape

(971679, 384)

In [12]:
df_sample.columns

Index(['tweet_id', 'user_id', 'created_at', 'full_text', 'created_date'], dtype='object')

In [69]:
# Row positions (not index labels) of the sample rows whose tweet id is one of ours.
inds_in_sample = np.flatnonzero(df_sample["tweet_id"].isin(df_tuit_ids).to_numpy())

In [70]:
len(inds_in_sample)

882

In [52]:
# Map every tweet id found in the sample to its precomputed embedding.
# `inds_in_sample` holds row positions within `df_sample`, and the embedding
# matrix is assumed to be row-aligned with it (both have the same number of
# rows), so the vector must be read at that same position `ind`. Reading it at
# an enumeration counter instead would hand out the vectors of the first
# len(inds_in_sample) sample rows, i.e. embeddings of unrelated tweets.
embeddings_by_tweet_id = {
    df_sample["tweet_id"].values[ind]: embeddings_sample[ind]
    for ind in inds_in_sample
}

In [71]:
len(embeddings_by_tweet_id)

882

In [54]:
# Agregamos embeddings para los tweets faltantes

In [55]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used below for the tweets that have no precomputed vector.
# NOTE(review): presumably the same model that produced the precomputed sample
# embeddings — confirm, otherwise the two sets of vectors are not comparable.
embeddings_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

In [72]:
# Tweets that did not get an embedding from the precomputed sample.
df_faltantes = df_single_tuits.loc[
    ~df_single_tuits["tweet_id"].isin(embeddings_by_tweet_id.keys())
]

In [73]:
df_faltantes.shape

(228, 2)

In [74]:
# Encode the texts of the tweets that are still missing an embedding.
# NOTE(review): the output rows are assumed to follow the order of the input
# texts; the loop that stores them pairs ids and vectors by position.
embeddings_faltantes = embeddings_model.encode(df_faltantes["full_text"].values,
                                               batch_size=1024, show_progress_bar=True
                                               )

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [76]:
# Register the freshly computed embeddings, pairing ids and vectors by position.
embeddings_by_tweet_id.update(
    zip(df_faltantes["tweet_id"], embeddings_faltantes)
)

In [101]:
len(embeddings_by_tweet_id)

1110


## Conversacionales
- Distancia de Levenshtein entre el texto del tweet y el de la respuesta
- Similitud coseno entre tweet y respuesta (sentence embeddings)

In [100]:
# !pip install levenshtein

In [97]:
from Levenshtein import distance as levenshtein_distance
from sklearn.metrics.pairwise import cosine_similarity

In [98]:
# Edit distance between the lower-cased original tweet and its reply.
# The values are computed row by row from `df`, so they are stored on `df`
# itself. Writing them into `df_interactions` would align them by index label
# with a different table, attaching each distance to an unrelated interaction,
# and would leave the feature out of `df`, which is the frame that gets split
# into train/test.
# The column name keeps its existing spelling ("levehnstein").
df["levehnstein"] = df.apply(
    lambda row: levenshtein_distance(
        row["original_tweet_text"].lower(),
        row["reply_text"].lower()
    ),
    axis=1
)

In [99]:
# Cosine similarity between the sentence embeddings of the original tweet and
# of its reply (each vector is reshaped to a 1 x dim matrix, so the result is
# the single entry of a 1 x 1 similarity matrix).
# The values are computed row by row from `df`, so they are stored on `df`
# itself. Writing them into `df_interactions` would align them by index label
# with a different table, attaching each similarity to an unrelated
# interaction, and would leave the feature out of `df`, which is the frame
# that gets split into train/test.
df["cosine_similarity"] = df.apply(
    lambda row: cosine_similarity(
        embeddings_by_tweet_id[row["original_tweet_id"]].reshape(1, -1),
        embeddings_by_tweet_id[row["reply_tweet_id"]].reshape(1, -1)
    )[0, 0],
    axis=1
)


## Afectivas

- Conteos de palabras afectivas (positivas, negativas) en tweet y respuesta (según léxico NRC en Español)

In [103]:
# !pip install NRCLex

In [104]:
from nrclex import NRCLex

In [105]:
# Spanish NRC lexicon; the cells below read its `palabra` (word) and
# `sentimiento` (sentiment label) columns.
df_lex = pd.read_csv("data/lexico_nrc_es.csv")

In [108]:
# Build {word: [sentiment labels]} from the lexicon rows, keeping row order.
lex_json = {}
for palabra, sentimiento in zip(df_lex["palabra"], df_lex["sentimiento"]):
    lex_json.setdefault(palabra, []).append(sentimiento)

In [109]:
import json
# Persist the lexicon as JSON. The file is opened explicitly as UTF-8 because
# ensure_ascii=False writes accented characters as-is, and the platform
# default encoding may not be able to represent them.
with open("data/lexico_nrc_es.json", "w", encoding="utf-8") as f:
    json.dump(lex_json, f, ensure_ascii=False, indent=2)

In [112]:
import re

import unicodedata

def remove_accents(input_str):
    """Return `input_str` reduced to plain ASCII.

    The text is NFKD-normalized, which splits accented letters into a base
    letter plus a combining mark. Every character that cannot be encoded as
    ASCII (the combining marks and any other non-ASCII symbol) is then dropped.

    Parameters
    ----------
    input_str : str
        Text to normalize.

    Returns
    -------
    str
        The ASCII-only version of the text.
    """
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    # Decode the bytes rather than calling str() on them: str(b'abc') returns
    # the repr "b'abc'", which would wrap every text in a b'...' literal.
    return only_ascii.decode('ASCII')

In [113]:
import spacy
import string

# Spanish spaCy pipeline, used for tokenization
nlp = spacy.load("es_core_news_sm")

# Punctuation characters, kept as one string (so `x in punctuations` is a substring test)
punctuations = string.punctuation

# spaCy's Spanish stopwords followed by URL fragments and corpus-wide terms
es_stopwords = [
    *spacy.lang.es.stop_words.STOP_WORDS,
    "https", "http", "com", "covid", "covid19", "19", "co", "coronavirus", "rt",
]

In [116]:
def count_nrc_sentiments(text, lexicon):
    """Count, per sentiment label, the lexicon hits among the tokens of `text`.

    The text is lower-cased, passed through `remove_accents` and tokenized
    with `nlp`. Tokens found in `punctuations` or `es_stopwords` are skipped.
    Every remaining token that is a key of `lexicon` adds one to each of the
    labels listed for it.

    NOTE(review): tokens are looked up after accent removal, while `lexicon`
    keys and `es_stopwords` are used as given. If those contain accented
    words they can never match — confirm whether they should be normalized too.

    Parameters
    ----------
    text : str
        Text to analyse.
    lexicon : dict
        Maps a word to the list of sentiment labels associated with it.

    Returns
    -------
    dict
        {sentiment label: count}, containing only labels seen at least once.
    """
    normalized = remove_accents(text.lower())
    words = (
        tok.text
        for tok in nlp(normalized)
        if not (tok.text in punctuations or tok.text in es_stopwords)
    )

    totals = {}
    for word in words:
        if word not in lexicon:
            continue
        for label in lexicon[word]:
            totals[label] = totals.get(label, 0) + 1
    return totals

In [130]:
# One record per tweet: its id plus the count of each sentiment label found in its text.
sentiment_count_rows = [
    {"tweet_id": tweet_id} | count_nrc_sentiments(text, lex_json)
    for tweet_id, text in zip(df_single_tuits["tweet_id"], df_single_tuits["full_text"])
]

# A label absent from a tweet is missing from its record; those gaps become 0.
sentiment_counts_df = pd.DataFrame(sentiment_count_rows).fillna(0)
sentiment_counts_df

Unnamed: 0,tweet_id,positivo,negativo,miedo,anticipación,confianza,alegría,enfado,tristeza,asco,sorpresa
0,1250575897687666694,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1250575908295016450,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1250576574207987714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1250579583470055425,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1250577707777159169,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1105,1251583711126011904,4.0,6.0,2.0,1.0,2.0,2.0,3.0,2.0,2.0,1.0
1106,1251584868988841984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1107,1251624849967022081,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1108,1251657306712006657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Particionamos en train y test

In [83]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed, stratified on the label column `etiqueta`.
df_train, df_test = train_test_split(
    df,
    stratify=df["etiqueta"],
    test_size=0.2,
    random_state=42,
)

# Entrenamos clasificador

# Evaluamos en test