In [151]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json
import re
import emoji

# Preparation of final dataset

We merge the information in ```df_tweets``` with the annotations.
We then extract the most common hashtags, mentions, and emojis and turn them into binary variables (one-hot encoding).

## Merge annotations

In [152]:
# Load sample information and annotations information.
# NOTE(review): pickle.load can execute arbitrary code; only load this file from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("results/df_tweets.pkl", "rb") as tweets_file:
    df_tweets = pickle.load(tweets_file)
df_multilabel_annotations = pd.read_csv("datasets/annotated_datasets/df_multilabel_annotations.csv")
# Cast the join key to str (presumably df_tweets stores new_id as str -- TODO confirm)
df_multilabel_annotations['new_id'] = df_multilabel_annotations['new_id'].astype(str)

# Left join the two dataframes: every annotated row is kept.
# For columns present in both frames, the annotation-side copy gets the "_x" suffix
# and the df_tweets version keeps the plain name.
df_multilabel_annotations = df_multilabel_annotations.merge(df_tweets, on="new_id", how="left", suffixes=("_x", ""))

# Keep only the relevant columns.
# .copy() makes the subset an independent frame so the column assignments below
# do not trigger pandas' SettingWithCopyWarning.
variables = ['new_id', 'component_id', 'main_tweet', 'previous_context', 'posterior_context', 'Ad Hominem', 'Appeal to Fear', 'Appeal to Ridicule', 'False Dilemma', 'Hasty Generalization', 'Loaded Language', 'None of the above', 'created_at', 'followers', 'tweet_count', 'hashtags', 'cashtags', 'mentions', 'retweet_count', 'reply_count', 'like_count', 'quote_count']
df_multilabel_annotations = df_multilabel_annotations[variables].copy()

# Replace NaN values with empty strings in previous and posterior context
df_multilabel_annotations['previous_context'] = df_multilabel_annotations['previous_context'].fillna('')
df_multilabel_annotations['posterior_context'] = df_multilabel_annotations['posterior_context'].fillna('')

# Remove [main_tweet] from the main tweet and [context] from previous and posterior context
df_multilabel_annotations['main_tweet'] = df_multilabel_annotations['main_tweet'].apply(lambda x: re.sub(r'\[main_tweet\]', '', x))
df_multilabel_annotations['previous_context'] = df_multilabel_annotations['previous_context'].apply(lambda x: re.sub(r'\[context\]', '', x))
df_multilabel_annotations['posterior_context'] = df_multilabel_annotations['posterior_context'].apply(lambda x: re.sub(r'\[context\]', '', x))

df_multilabel_annotations

Unnamed: 0,new_id,component_id,main_tweet,previous_context,posterior_context,Ad Hominem,Appeal to Fear,Appeal to Ridicule,False Dilemma,Hasty Generalization,...,created_at,followers,tweet_count,hashtags,cashtags,mentions,retweet_count,reply_count,like_count,quote_count
0,144793,249,[user104337]: @user @user ... @user Kyrie Irv...,[user47446]: @user @user @user yeh bringing b...,[user79987]: @user @user ... @user Totally d...,0,0,0,0,0,...,2020-06-13 19:03:08+00:00,252,92808,(),(),"(nabinn_, TATAbox503, atantum99, BleacherReport)",1,1,2,0
1,124801,249,[user79987]: @user @user ... @user Totally di...,[user47446]: @user @user @user yeh bringing b...,[user104337]: @user @user ... @user That's s...,0,0,0,0,0,...,2020-06-13 19:38:45+00:00,386,6510,(),(),"(ddpage369, nabinn_, TATAbox503, atantum99, Bl...",0,1,4,0
2,83279,249,[user104337]: @user @user ... @user That's so...,[user104337]: @user @user ... @user Kyrie Irv...,[user79987]: @user @user ... @user The unint...,0,0,0,0,0,...,2020-06-13 19:50:00+00:00,252,92808,(),(),"(LuvLyricsQuotes, nabinn_, TATAbox503, atantum...",0,1,2,0
3,124800,249,[user79987]: @user @user ... @user The uninte...,[user79987]: @user @user ... @user Totally di...,[user1779]: @user @user ... @user facts. if ...,1,0,0,0,0,...,2020-06-13 20:13:49+00:00,386,6510,(),(),"(ddpage369, nabinn_, TATAbox503, atantum99, Bl...",0,6,2,0
4,165415,249,[user47446]: @user @user ... @user It's been ...,[user79987]: @user @user ... @user The uninte...,[user1779]: @user @user ... @user this shit'...,0,0,0,0,0,...,2020-06-13 21:21:31+00:00,127,4229,(),(),"(wealljusteggsfr, LuvLyricsQuotes, ddpage369, ...",0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2911,152989,108897,[user16135]: @user just stop! Go grab a drink...,,,1,0,1,0,1,...,2020-04-27 00:35:17+00:00,83,6649,"(WorstPresidentInHistory,)",(),"(realDonaldTrump,)",0,0,0,0
2912,334394,118011,[user39585]: This is in reference to the @use...,,,0,1,0,0,1,...,2021-01-25 06:53:31+00:00,140,760,(),(),"(WHO, NIH)",0,0,0,0
2913,194751,119026,[user20600]: Christina Cuomo Says She Took Cl...,,,0,0,0,0,0,...,2020-04-25 16:54:28+00:00,2096,18165,(),(),(),0,0,0,0
2914,269651,120326,[user23717]: Moderna To Seek Limited Emergenc...,,,0,0,0,0,0,...,2020-09-17 17:41:20+00:00,95,100176,(),(),(),0,0,0,0


## Hashtags, emojis and mentions

In [153]:
def create_binary_columns(df, column_name, elements_to_check):
    '''
    Adds one 0/1 indicator column per element of elements_to_check.

    For each element, a column named '<column_name>_<element>' is built. It holds 1
    on the rows where `element in df[column_name]` is true and 0 elsewhere.
    The input DataFrame is left untouched; a new, wider DataFrame is returned.

    Args:
    df: pandas DataFrame
    column_name: str, name of the column whose values are tested for membership
    elements_to_check: list of elements to look for

    Returns:
    df: pandas DataFrame with the indicator columns appended on the right
    '''
    source_values = df[column_name]
    # `target=target` binds the current element to each lambda
    indicator_columns = {
        f'{column_name}_{target}': source_values.apply(lambda cell, target=target: 1 if target in cell else 0)
        for target in elements_to_check
    }
    indicators = pd.DataFrame(indicator_columns)
    return pd.concat([df, indicators], axis=1)

def extract_emojis(text):
    '''
    Extracts emojis from a text, including compound emojis.

    The text is scanned character by character. Three kinds of compound emojis
    are kept together as a single string:
      * flags: two consecutive regional-indicator characters;
      * an emoji followed by a skin tone modifier, optionally followed by a gender sign;
      * an emoji directly followed by a gender sign.
    Any other character found in emoji.EMOJI_DATA is returned on its own.

    Args:
    text (str): The text from which to extract emojis.

    Returns:
    tuple: A tuple containing the emojis found in the text, in order of appearance.
    '''
    flag_pattern = r'[\U0001F1E6-\U0001F1FF]'
    skin_tone_pattern = r'[\U0001F3FB-\U0001F3FF]'
    gender_signs = ('\u2640', '\u2642')

    emoji_list = []
    n = len(text)
    i = 0
    while i < n:
        # Check for flags (pair of regional indicators)
        if i + 1 < n and re.match(flag_pattern, text[i]) and re.match(flag_pattern, text[i + 1]):
            length = 2
        elif text[i] in emoji.EMOJI_DATA:
            length = 1
            # Check for skin tone modifiers
            if i + 1 < n and re.match(skin_tone_pattern, text[i + 1]):
                length = 2
                # Skin tone followed by a gender sign: keep all three characters together.
                # Exactly one emoji is appended per match. The previous version also
                # appended a second, bogus two-character entry after this case (or raised
                # IndexError when the emoji was at the end of the text).
                if i + 2 < n and text[i + 2] in gender_signs:
                    length = 3
            # Check for gender modifiers
            elif i + 1 < n and text[i + 1] in gender_signs:
                length = 2
        else:
            # Not an emoji: move on to the next character
            i += 1
            continue

        emoji_list.append(text[i:i + length])
        i += length

    return tuple(emoji_list)


In [154]:
# Load selected hashtags, mentions and emojis.
# The file is read as UTF-8 explicitly so that non-ASCII content (e.g. emojis)
# is decoded the same way regardless of the platform's default encoding.
# The cells below read the keys 'selected_hashtags', 'selected_mentions' and 'selected_emojis'.
with open('results/selected_elements.json', 'r', encoding='utf-8') as json_file:
    selected_elements = json.load(json_file)

In [155]:
# Extract emojis from the main tweet (extract_emojis returns one tuple per tweet)
df_multilabel_annotations['emojis'] = df_multilabel_annotations['main_tweet'].apply(extract_emojis)

# Create binary columns for each selected element.
# Each call appends one '<column>_<element>' 0/1 column per element of the given list.
df_multilabel_annotations = create_binary_columns(df_multilabel_annotations, 'hashtags', selected_elements['selected_hashtags'])
df_multilabel_annotations = create_binary_columns(df_multilabel_annotations, 'mentions', selected_elements['selected_mentions'])
df_multilabel_annotations = create_binary_columns(df_multilabel_annotations, 'emojis', selected_elements['selected_emojis'])

# Display the resulting frame
df_multilabel_annotations


Unnamed: 0,new_id,component_id,main_tweet,previous_context,posterior_context,Ad Hominem,Appeal to Fear,Appeal to Ridicule,False Dilemma,Hasty Generalization,...,emojis_⤵,emojis_😷,emojis_🥳,emojis_🦠,emojis_😭,emojis_🤥,emojis_😁,emojis_➡,emojis_😉,emojis_🇨🇦
0,144793,249,[user104337]: @user @user ... @user Kyrie Irv...,[user47446]: @user @user @user yeh bringing b...,[user79987]: @user @user ... @user Totally d...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,124801,249,[user79987]: @user @user ... @user Totally di...,[user47446]: @user @user @user yeh bringing b...,[user104337]: @user @user ... @user That's s...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,83279,249,[user104337]: @user @user ... @user That's so...,[user104337]: @user @user ... @user Kyrie Irv...,[user79987]: @user @user ... @user The unint...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,124800,249,[user79987]: @user @user ... @user The uninte...,[user79987]: @user @user ... @user Totally di...,[user1779]: @user @user ... @user facts. if ...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,165415,249,[user47446]: @user @user ... @user It's been ...,[user79987]: @user @user ... @user The uninte...,[user1779]: @user @user ... @user this shit'...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2911,152989,108897,[user16135]: @user just stop! Go grab a drink...,,,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2912,334394,118011,[user39585]: This is in reference to the @use...,,,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2913,194751,119026,[user20600]: Christina Cuomo Says She Took Cl...,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2914,269651,120326,[user23717]: Moderna To Seek Limited Emergenc...,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Sentiment analysis scores

### VADER

Hutto, C., & Gilbert, E. (2014). VADER: A Parsimonious Rule-Based Model for Sentiment Analysis of Social Media Text. Proceedings of the International AAAI Conference on Web and Social Media, 8(1), 216–225. https://doi.org/10.1609/icwsm.v8i1.14550

Why this sentiment score? 
* Multidimensional (provides positive, negative, neutral, and compound scores).
* Processes raw text (it considers the effect of capital letters, punctuation, emojis...).
* Performs exceptionally well in the social media domain.

In [156]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# A single analyzer instance is created once and reused for every tweet
VADER_analyzer = SentimentIntensityAnalyzer()

def get_vader_scores(text):
    '''
    Computes the VADER polarity scores of a text.

    Args:
    text (str): The text to score.

    Returns:
    pd.Series: The 'neg', 'neu', 'pos' and 'compound' scores, in that order.
    '''
    polarity = VADER_analyzer.polarity_scores(text)
    return pd.Series([polarity[key] for key in ('neg', 'neu', 'pos', 'compound')])

# Score every main tweet and store each score in its own VADER_* column
df_multilabel_annotations[['VADER_neg', 'VADER_neu', 'VADER_pos', 'VADER_compound']] = (
    df_multilabel_annotations['main_tweet'].apply(get_vader_scores)
)


### VAD Lexicon

Mohammad, S. M. (2018). Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20,000 English Words. Proceedings of The Annual Conference of the Association for Computational Linguistics (ACL).


Why this score? 

* Multidimensional 
* Previously used for propaganda detection with good results.

In [157]:
import nltk
# NOTE(review): `stopwords` is imported but not used in the cells visible here -- confirm it is needed.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources: 'punkt' (tokenizer data) and 'stopwords'.
# The recorded output below shows these calls report "already up-to-date" when the data is present.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /user/machaves/home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /user/machaves/home/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [158]:
# Load the VAD lexicon into a dict: word -> (valence, arousal, dominance).
# Each non-empty line is expected to hold four tab-separated fields:
# the word followed by its three scores.
vad_lexicon = {}
with open('NRC-VAD-Lexicon/NRC-VAD-Lexicon.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing empty line), which would otherwise raise IndexError
        if not line.strip():
            continue
        parts = line.strip().split('\t')
        word = parts[0]
        valence = float(parts[1])
        arousal = float(parts[2])
        dominance = float(parts[3])
        vad_lexicon[word] = (valence, arousal, dominance)

In [159]:
# Sample tweet kept for quick manual checks of the function below
text = '[user1779]: @user @user ... @user none can read😂that\'s why i stopped responding. bro said "how is it forgotten" after i said "it WILL BE forgotten" comprehension is tough for them apparently so im done wasting my time. if they can\'t see it\'s gonna be a distraction from reality then 🤷🏾♂ \n '

def get_vad_lexicon_scores(text):
    '''
    Averages the valence, arousal and dominance scores of the words of a text.

    The text is lowercased and tokenized with word_tokenize; every token that is a
    key of vad_lexicon contributes its three scores. Tokens missing from the
    lexicon are ignored. No lemmatization or stemming is applied.

    Args:
    text (str): The text for which to calculate the VAD scores.

    Returns:
    pd.Series: The average valence, arousal, and dominance of the matched tokens,
    in that order. Each value is 0.5 (the neutral score) when no token matched.
    '''
    tokens = word_tokenize(text.lower())

    # Collect the (valence, arousal, dominance) triple of every token found in the lexicon
    matched = [vad_lexicon[token] for token in tokens if token in vad_lexicon]

    # No lexicon word in the text: fall back to the neutral score on every dimension
    if not matched:
        return pd.Series([0.5, 0.5, 0.5])

    # Transpose the list of triples into one sequence per dimension and average each
    valences, arousals, dominances = zip(*matched)
    return pd.Series([np.mean(valences), np.mean(arousals), np.mean(dominances)])

In [160]:
# Compute the average lexicon scores of every main tweet and store them in three VAD_* columns
df_multilabel_annotations[['VAD_valence', 'VAD_arousal', 'VAD_dominance']] = df_multilabel_annotations['main_tweet'].apply(get_vad_lexicon_scores)

## POS tags

We compared the POS tags produced by ```pos_tag``` from ```nltk```, by ```textblob```, and by ```spacy```.
We observed the best results with the last one (```spacy```), so it is the tagger used to build the features below.

In [161]:
# Exploratory cell: print the tags that each of the three taggers assigns to one sample tweet
sentence = '[user101185]: Please DO NOT eat or drink any bleach or cleaners. This will NOT clean your insides or prevent you from getting #COVID19 I understand that @user has suggested this. HE IS WRONG. Do Not harm yourself by ingesting anything but food  and beverage. 🤦🏾♀ \n '

# Example nltk pos_tag (tokenize first, then tag the token list)
import nltk
from nltk.tokenize import word_tokenize
wordtokens = word_tokenize(sentence)
print(nltk.pos_tag(wordtokens),end='\n\n')

# Example textblob (tags are read from the .tags attribute)
from textblob import TextBlob
text_blob = TextBlob(sentence)
pos_tags = text_blob.tags
print(pos_tags, end='\n\n')

# Example spacy (one (text, pos_) pair per token of the processed document)
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(sentence)
print([(token.text, token.pos_)for token in doc])


[('[', 'JJ'), ('user101185', 'JJ'), (']', 'NN'), (':', ':'), ('Please', 'NNP'), ('DO', 'NNP'), ('NOT', 'NNP'), ('eat', 'VB'), ('or', 'CC'), ('drink', 'VB'), ('any', 'DT'), ('bleach', 'NN'), ('or', 'CC'), ('cleaners', 'NNS'), ('.', '.'), ('This', 'DT'), ('will', 'MD'), ('NOT', 'VB'), ('clean', 'VB'), ('your', 'PRP$'), ('insides', 'NNS'), ('or', 'CC'), ('prevent', 'NN'), ('you', 'PRP'), ('from', 'IN'), ('getting', 'VBG'), ('#', '#'), ('COVID19', 'NNP'), ('I', 'PRP'), ('understand', 'VBP'), ('that', 'IN'), ('@', 'NNP'), ('user', 'NN'), ('has', 'VBZ'), ('suggested', 'VBN'), ('this', 'DT'), ('.', '.'), ('HE', 'NNP'), ('IS', 'VBZ'), ('WRONG', 'NNP'), ('.', '.'), ('Do', 'NNP'), ('Not', 'RB'), ('harm', 'VB'), ('yourself', 'PRP'), ('by', 'IN'), ('ingesting', 'VBG'), ('anything', 'NN'), ('but', 'CC'), ('food', 'NN'), ('and', 'CC'), ('beverage', 'NN'), ('.', '.'), ('🤦🏾♀', 'NN')]

[('[', 'JJ'), ('user101185', 'JJ'), (']', 'NN'), ('Please', 'NNP'), ('DO', 'NNP'), ('NOT', 'NNP'), ('eat', 'VB'), ('or

In [162]:
# Exploratory cell: same comparison of the three taggers on a second sample tweet
sentence = '💊The drug fostamatinib could be repurposed to treat acute lung injury arising from #COVID19 infection, says a preprint featuring new research led by @user , with @user collaborators @user & @user 🔗 Read the preprint: \n '

# Example nltk pos_tag (tokenize first, then tag the token list)
import nltk
from nltk.tokenize import word_tokenize
wordtokens = word_tokenize(sentence)
print(nltk.pos_tag(wordtokens),end='\n\n')

# Example textblob (tags are read from the .tags attribute)
from textblob import TextBlob
text_blob = TextBlob(sentence)
pos_tags = text_blob.tags
print(pos_tags, end='\n\n')

# Example spacy (one (text, pos_) pair per token of the processed document)
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(sentence)
print([(token.text, token.pos_)for token in doc])

[('💊The', 'JJ'), ('drug', 'NN'), ('fostamatinib', 'NN'), ('could', 'MD'), ('be', 'VB'), ('repurposed', 'VBN'), ('to', 'TO'), ('treat', 'VB'), ('acute', 'JJ'), ('lung', 'NN'), ('injury', 'NN'), ('arising', 'VBG'), ('from', 'IN'), ('#', '#'), ('COVID19', 'NNP'), ('infection', 'NN'), (',', ','), ('says', 'VBZ'), ('a', 'DT'), ('preprint', 'NN'), ('featuring', 'VBG'), ('new', 'JJ'), ('research', 'NN'), ('led', 'VBN'), ('by', 'IN'), ('@', 'NNP'), ('user', 'NN'), (',', ','), ('with', 'IN'), ('@', 'NNP'), ('user', 'NN'), ('collaborators', 'NNS'), ('@', 'NNP'), ('user', 'NNP'), ('&', 'CC'), ('@', 'NNP'), ('user', 'VBP'), ('🔗', 'NNP'), ('Read', 'NNP'), ('the', 'DT'), ('preprint', 'NN'), (':', ':')]

[('💊The', 'JJ'), ('drug', 'NN'), ('fostamatinib', 'NN'), ('could', 'MD'), ('be', 'VB'), ('repurposed', 'VBN'), ('to', 'TO'), ('treat', 'VB'), ('acute', 'JJ'), ('lung', 'NN'), ('injury', 'NN'), ('arising', 'VBG'), ('from', 'IN'), ('COVID19', 'NNP'), ('infection', 'NN'), ('says', 'VBZ'), ('a', 'DT'), (

In [163]:
import spacy

# Load the spaCy model used by count_pos_tags below.
# The same 'en_core_web_sm' model is also loaded in the example cells above,
# so this rebinds `nlp` to a fresh instance of it.
nlp = spacy.load("en_core_web_sm")

def count_pos_tags(text):
    '''
    Counts how many tokens of each POS tag spaCy finds in a text.

    Args:
    text (str): The text to process with the module-level `nlp` pipeline.

    Returns:
    dict: One entry per tag of the fixed tag list below (in that order), mapping
    the tag to its number of occurrences; tags that do not occur map to 0.

    Raises:
    KeyError: If a token carries a tag that is not in the fixed tag list.
    '''
    # Tag inventory, taken from the spaCy source code:
    # https://github.com/explosion/spaCy/blob/master/spacy/glossary.py
    known_tags = ('ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
                  'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE')

    # Start every counter at zero so the returned dict always has the same keys
    tally = dict.fromkeys(known_tags, 0)

    # Run the pipeline and bump the counter of each token's tag
    for tok in nlp(text):
        tally[tok.pos_] = tally[tok.pos_] + 1

    return tally

# Apply count_pos_tags to the 'main_tweet' column and expand each returned dict into separate columns
# (one column per POS tag, one row per tweet)
pos_counts_df = df_multilabel_annotations['main_tweet'].apply(count_pos_tags).apply(pd.Series)
# Rename columns to add prefix 'POS_'
pos_counts_df = pos_counts_df.add_prefix('POS_')

# Concatenate the original DataFrame with the new POS counts DataFrame (column-wise)
df_multilabel_annotations = pd.concat([df_multilabel_annotations, pos_counts_df], axis=1)

# Display the resulting frame
df_multilabel_annotations

Unnamed: 0,new_id,component_id,main_tweet,previous_context,posterior_context,Ad Hominem,Appeal to Fear,Appeal to Ridicule,False Dilemma,Hasty Generalization,...,POS_NUM,POS_PART,POS_PRON,POS_PROPN,POS_PUNCT,POS_SCONJ,POS_SYM,POS_VERB,POS_X,POS_SPACE
0,144793,249,[user104337]: @user @user ... @user Kyrie Irv...,[user47446]: @user @user @user yeh bringing b...,[user79987]: @user @user ... @user Totally d...,0,0,0,0,0,...,0,4,8,4,9,1,0,8,3,2
1,124801,249,[user79987]: @user @user ... @user Totally di...,[user47446]: @user @user @user yeh bringing b...,[user104337]: @user @user ... @user That's s...,0,0,0,0,0,...,0,0,6,3,12,2,3,8,3,2
2,83279,249,[user104337]: @user @user ... @user That's so...,[user104337]: @user @user ... @user Kyrie Irv...,[user79987]: @user @user ... @user The unint...,0,0,0,0,0,...,0,2,7,4,6,0,0,5,2,2
3,124800,249,[user79987]: @user @user ... @user The uninte...,[user79987]: @user @user ... @user Totally di...,[user1779]: @user @user ... @user facts. if ...,1,0,0,0,0,...,0,5,7,1,7,2,0,11,3,2
4,165415,249,[user47446]: @user @user ... @user It's been ...,[user79987]: @user @user ... @user The uninte...,[user1779]: @user @user ... @user this shit'...,0,0,0,0,0,...,2,3,9,3,7,1,0,6,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2911,152989,108897,[user16135]: @user just stop! Go grab a drink...,,,1,0,1,0,1,...,0,0,4,3,4,1,0,8,4,2
2912,334394,118011,[user39585]: This is in reference to the @use...,,,0,1,0,0,1,...,1,1,6,6,7,0,0,4,3,2
2913,194751,119026,[user20600]: Christina Cuomo Says She Took Cl...,,,0,0,0,0,0,...,0,0,1,7,1,0,0,2,3,2
2914,269651,120326,[user23717]: Moderna To Seek Limited Emergenc...,,,0,0,0,0,0,...,0,1,0,8,1,1,0,2,2,2


# Train, validation, and test split

We split the dataset into train, validation, and test sets. The sampling is done at the level of components, not individual tweets, so that all tweets from the same component end up in the same set. Because of this, the classes cannot be perfectly balanced. However, we check that they are not too unbalanced in our split. 

In [164]:
def train_val_test_split_by_groups(dataset, groups_variable, train_split, val_split, seed):
    """
    Splits a dataset into training, validation, and test sets based on groups.

    The split is drawn over the unique values of `groups_variable`, so all rows
    sharing a group value end up in the same set. The proportions therefore apply
    to groups; the share of rows in each set (printed at the end) can differ.

    Parameters:
    - dataset (pd.DataFrame): The input dataset.
    - groups_variable (str): The name of the variable containing group labels.
    - train_split (float): The proportion of groups to be included in the training set.
    - val_split (float): The proportion of groups to be included in the validation set.
      Use 0 for no validation set.
    - seed (int): Seed for reproducibility.

    The test proportion is 1 - train_split - val_split; it may be 0, in which case
    the test set is empty.

    Returns:
    - train (pd.DataFrame): Training set.
    - val (pd.DataFrame): Validation set.
    - test (pd.DataFrame): Test set.

    Raises:
    - ValueError: If train_split + val_split exceeds 1.
    """

    # Get test split; absorb float round-off (e.g. 1 - 0.8 - 0.2 is a tiny negative number)
    test_split = 1 - train_split - val_split
    if abs(test_split) < 1e-9:
        test_split = 0.0
    if test_split < 0:
        raise ValueError("train_split + val_split must not exceed 1")

    # rows in dataset
    n = len(dataset)

    # get all groups
    groups = np.unique(dataset[groups_variable])

    # Generate random states for each split.
    # The global NumPy seed is (re)set here, as before, so results stay reproducible.
    # dtype=np.int64 keeps the 2**32 bound valid where NumPy's default integer is 32-bit.
    np.random.seed(seed)
    random1 = int(np.random.randint(0, 2**32, dtype=np.int64))
    random2 = int(np.random.randint(0, 2**32, dtype=np.int64))

    # By default every group is a training group; the splits below carve out the other sets
    train_groups = groups
    val_groups = set()
    test_groups = set()

    # Split data into train, validation, and test sets.
    # train_test_split rejects a test_size of 0, so the test split is skipped when no test share is left.
    if test_split > 0:
        train_groups, test_groups = train_test_split(groups, test_size=test_split, random_state=random1)
    if val_split > 0:
        # The validation share is expressed relative to the groups remaining after the test split
        train_groups, val_groups = train_test_split(train_groups, test_size=val_split / (train_split + val_split),
                                                   random_state=random2)

    # Create subsets based on selected groups
    train = dataset[dataset[groups_variable].isin(set(train_groups))]
    val = dataset[dataset[groups_variable].isin(set(val_groups))]
    test = dataset[dataset[groups_variable].isin(set(test_groups))]

    # Print the share of rows in each set (values are fractions between 0 and 1)
    print('% of rows in train', len(train) / n)
    print('% of rows in validation', len(val) / n)
    print('% of rows in test', len(test) / n)

    return train, val, test

In [165]:
# Get the train, validation and test sets.
# Groups are the values of 'component_id'; 0.6 / 0.2 leaves the remaining 0.2 for the test set.
df_train, df_val , df_test = train_val_test_split_by_groups(dataset=df_multilabel_annotations, groups_variable='component_id', train_split=0.6, val_split=0.2, seed=42)

% of rows in train 0.6210562414266118
% of rows in validation 0.1886145404663923
% of rows in test 0.19032921810699588


In [166]:
# Check how balanced the datasets are

# Map each split name to its dataframe
dataframes = {"train" : df_train, "validation" : df_val, "test" : df_test}

# Create an empty dataframe to store the means (one row per fallacy label)
fallacies = ['Ad Hominem', 'Appeal to Fear', 'Appeal to Ridicule', 'False Dilemma', 'Hasty Generalization', 'Loaded Language', 'None of the above']
means_df = pd.DataFrame(index=fallacies)

# Calculate means for each dataframe and store them in the means_df
for name, df in dataframes.items():
  # The mean of a 0/1 label column times 100 is the percentage of rows carrying that label
  # (assuming the label columns are numeric 0/1)
  means_df[name] = df[fallacies].mean(axis=0)*100

# Show the table with fallacies as rows and splits as columns
print('Percentage of each fallacy in each dataset:')
print(means_df)

Percentage of each fallacy in each dataset:
                          train  validation       test
Ad Hominem             9.166207    8.727273   8.108108
Appeal to Fear         5.577029    4.909091   5.225225
Appeal to Ridicule     8.117062    8.181818   8.288288
False Dilemma          6.294865    4.727273   5.045045
Hasty Generalization   3.036996    1.818182   4.684685
Loaded Language       15.405853   15.636364  16.576577
None of the above     64.494754   67.818182  65.945946


In [167]:
# Save the datasets as CSV files, without the row index.
# NOTE(review): columns holding tuples (e.g. 'hashtags', 'mentions', 'emojis') are written
# as their string representation and will be read back as strings -- confirm downstream code expects that.
df_train.to_csv("datasets/train_val_test_sets/df_train.csv", index=False)
df_val.to_csv("datasets/train_val_test_sets/df_val.csv", index=False)
df_test.to_csv("datasets/train_val_test_sets/df_test.csv", index=False)