In [1]:
import pickle
import pandas as pd
import networkx as nx
import numpy as np
pd.set_option('display.max_colwidth', None)

# Exclude some topics

After the topic modeling analysis (see `topic_modelling.ipynb`), we identified some topic IDs that are not relevant for our study. 
We exclude components that contain such topics. 

In [2]:
# Load data
# NOTE: pickle can execute arbitrary code while loading; only open files you trust.
# Context managers make sure the file handles are closed once the objects are read.
with open("results/df_tweets.pkl", "rb") as tweets_file:
    df_tweets = pickle.load(tweets_file)
# Load tweet ids to exclude
with open("results/tweets_to_exclude_given_topic.pkl", "rb") as excluded_ids_file:
    tweets_to_exclude_given_topic = pickle.load(excluded_ids_file)

# Keep only the tweets whose id is not in the exclusion list
is_excluded = df_tweets['new_id'].isin(tweets_to_exclude_given_topic)
df_tweets_selected_topics = df_tweets[~is_excluded]
print('Number of tweets before removal', len(df_tweets))
print('Number of tweets after removal', len(df_tweets_selected_topics))

Number of tweets before removal 366946
Number of tweets after removal 273947


In [13]:
# Statistics of graph after excluding topics
# NOTE: pickle can execute arbitrary code while loading; only open files you trust.
with open("results/G_modified_with_attributes.pkl", "rb") as graph_file:
    G_modified_with_attributes = pickle.load(graph_file)
nodes = set(df_tweets_selected_topics['new_id'])
# Restrict to the kept nodes first and convert only that induced subgraph:
# the resulting undirected graph is the same, but the full graph is not copied.
G_filtered_topics = G_modified_with_attributes.subgraph(nodes).to_undirected()
print('Number of nodes:', G_filtered_topics.number_of_nodes())
print('Number of arcs:', G_filtered_topics.number_of_edges())
print('Number of components:',len(set(df_tweets_selected_topics['component_id']))) # This one takes a while to run

Number of nodes: 273947
Number of arcs: 144646
Number of components: 127689


# Sample for annotations

In [12]:
# Desired number of tweets in the sample
desired_sample_size = 3000

# Average tweets by components
avg_component_size = df_tweets_selected_topics.groupby('component_id').size().mean()
# Estimated number of components that I need to sample to get the desired number of tweets
components_sample_size = int(desired_sample_size / avg_component_size)

# Get unique components (np.unique returns them sorted, so the draw below is reproducible)
components = np.unique(df_tweets_selected_topics['component_id'])

# Sample of components (without replacement, fixed seed)
np.random.seed(1232)
sample_components = np.random.choice(components, size=components_sample_size, replace=False)

# Sample dataset
# Select the rows from the topic-filtered frame, i.e. the same population the components
# were drawn from, so that no excluded tweet can end up in the annotation sample.
df_tweets_basic_sample = df_tweets_selected_topics[df_tweets_selected_topics['component_id'].isin(sample_components)]

print('number of components in sample', components_sample_size)
print('number of components in the population', len(components))
print('number of tweets in sample', len(df_tweets_basic_sample))

number of components in sample 1398
number of components in the population 127689
number of tweets in sample 2942


# Extract context information from graph

Thanks to the graph structure we can now extract context information of tweets that preceded and followed the main tweet. 
For each node we extract the text of up to 6 of its first and second degree neighbors, if available. 
If a tweet has more than 6 first- and second-degree neighbors, we include all in-neighbors and the out-neighbors that are closest in time (based on when each tweet was posted).
Remember that in our dataset, a node can have a maximum of 2 in-neighbors.
So for instance, the context of a tweet could include 1 in-neighbor and 5 out-neighbors. 
While extracting the text of first degree neighbors can take some hours, once we include the information of second degree neighbors it can take days. 
Therefore, this context extraction is done only on the sample for annotations.



In [97]:
def get_n_order_neighbors_simplified(graph, node, n):
    '''
    Function to get neighbors of order <= n of a node, ignoring edge direction.

    Parameters:
        graph: networkx graph (directed or undirected) that contains `node`.
        node: identifier of the node whose neighbourhood is requested.
        n: maximum number of hops from `node`.

    Returns:
        Set with every node reachable from `node` in at most `n` hops.
        The node itself (distance 0) is included in the set.
    '''
    # A read-only undirected view gives the same adjacency as graph.to_undirected()
    # without deep-copying the whole graph on every call.
    undirected_view = graph.to_undirected(as_view=True)
    # Breadth-first search limited to n hops; the keys are the reached nodes.
    hops_by_node = dict(nx.single_source_shortest_path_length(undirected_view, node, cutoff=n))
    return set(hops_by_node)

def concatenate_text_in_chronological_order(graph, node_id, datetime_attribute = 'created_at', text_attribute = 'text', user_attribute = 'anonymised_user_id', max_neighbors=6):
    '''
    Given a graph it concatenates the text of a node and of its first and second degree
    neighbors in chronological order.

    Parameters:
        graph: networkx graph whose nodes carry the datetime, text and user attributes.
        node_id: identifier of the main tweet.
        datetime_attribute: node attribute holding the posting time (parsed with pd.to_datetime).
        text_attribute: node attribute holding the tweet text.
        user_attribute: node attribute holding the (anonymised) author id.
        max_neighbors: maximum number of neighbors to take into account. It takes the n oldest.

    Returns:
        A single string in which every tweet is wrapped as
        '[context] [user]: text \\n [/context]' and the main tweet as
        '[main_tweet] [user]: text \\n [/main_tweet]', joined by a space and ordered by posting time.
    '''

    def _tagged(tag, tweet_id):
        # Wrap one tweet in the opening/closing markers used by the annotation format
        attributes = graph.nodes[tweet_id]
        return f'[{tag}] [{attributes[user_attribute]}]: {attributes[text_attribute]} \n [/{tag}]'

    # An isolated tweet has no context: return it alone (its datetime is never read)
    if graph.degree(node_id) == 0:
        return _tagged('main_tweet', node_id)

    def _posted_at(tweet_id):
        return pd.to_datetime(graph.nodes[tweet_id][datetime_attribute])

    # Get the neighbors up to order 2 of the current tweet_id
    neighbors = get_n_order_neighbors_simplified(graph, node_id, n = 2) - {node_id}

    # Parse every timestamp once; both sorts below reuse the parsed values
    posted_at = {tweet_id: _posted_at(tweet_id) for tweet_id in neighbors}
    posted_at[node_id] = _posted_at(node_id)

    # Keep only the 'max_neighbors' oldest neighbors
    context_ids = sorted(neighbors, key=posted_at.__getitem__)[:max_neighbors]

    # Place the main tweet among its context. It is appended last and the sort is stable,
    # so a neighbor with the same timestamp stays before the main tweet.
    thread_ids = sorted(context_ids + [node_id], key=posted_at.__getitem__)

    # Join the texts to create the concatenated text
    return ' '.join(
        _tagged('main_tweet' if tweet_id == node_id else 'context', tweet_id)
        for tweet_id in thread_ids
    )



In [98]:
# Since we sampled entire components, we can extract a subgraph that contains those components. This makes the process of text concatenation much faster.
nodes_in_sample = set(df_tweets_basic_sample['new_id'])
# Take the induced subgraph first and convert only that small graph to undirected:
# same nodes and edges as before, but the full graph is no longer deep-copied.
G_modified_sample = G_modified_with_attributes.subgraph(nodes_in_sample).to_undirected()

In [99]:
# Sort sample by component and created date
df_tweets_basic_sample = (
    df_tweets_basic_sample[['new_id', 'text', 'component_id', 'created_at']]
    .sort_values(by=['component_id', 'created_at'])
)
# Get context information from graph
df_tweets_basic_sample['text_and_context'] = df_tweets_basic_sample['new_id'].apply(
    lambda tweet_id: concatenate_text_in_chronological_order(G_modified_sample, tweet_id)
)

# Save result (the context manager closes the file so the pickle is fully flushed to disk)
with open("results/df_tweets_basic_sample.pkl", "wb") as sample_file:
    pickle.dump(df_tweets_basic_sample, sample_file)

# Reload the saved sample. NOTE: only unpickle files you trust.
with open("results/df_tweets_basic_sample.pkl", "rb") as sample_file:
    df_tweets_basic_sample = pickle.load(sample_file)


# Format of the dataset for annotation tool

In [100]:
# Work on an explicit copy so the column assignments below do not write into a slice
# of df_tweets_basic_sample (avoids SettingWithCopyWarning)
df_annotations_set = df_tweets_basic_sample[['new_id', 'text', 'text_and_context']].copy()

# Remove [/context]
# regex=False: the marker must be matched literally. Interpreted as a regex it would be
# a character class, and the default of `regex` differs between pandas versions.
df_annotations_set['text_and_context'] = df_annotations_set['text_and_context'].str.replace('[/context]', '', regex=False)

# Anonymize tagged users in tweet's text
df_annotations_set['text_and_context'] = df_annotations_set['text_and_context'].str.replace(r'@\w+\s?', '@user ', regex=True)
# Collapse runs of four or more mentions
df_annotations_set['text_and_context'] = df_annotations_set['text_and_context'].str.replace(r'(@user\s*){4,}', '@user @user ... @user ', regex=True)

# Split the text into three columns
# Raw string keeps the backslashes for the regex engine; at most two splits yield
# the text before the main tweet, the main tweet itself, and the text after it.
df_annotations_set[['previous_context', 'main_tweet', 'posterior_context']] = df_annotations_set['text_and_context'].str.split(r'\[main_tweet\]|\[/main_tweet\]', expand=True, n=2)
# The split consumed the opening marker, so put it back in front of the main tweet
df_annotations_set['main_tweet'] = '[main_tweet]' + df_annotations_set['main_tweet']
# Drop the text_and_context
df_annotations_set = df_annotations_set.drop(columns=['text_and_context', 'text'])

# Check for NAs
print('Are there NAs? \n', df_annotations_set.isna().any())

# Check for the percent of the sample without context tweets
# (the printed value is a fraction between 0 and 1)
has_no_context = (df_annotations_set['previous_context'] == '') & (df_annotations_set['posterior_context'] == '')
print('% of tweets without any context', has_no_context.sum() / len(df_annotations_set))

# Save as csv
df_annotations_set.to_csv('results/df_final_annotations_set.csv', index=False)
df_annotations_set

Are there NAs? 
 new_id               False
previous_context     False
main_tweet           False
posterior_context    False
dtype: bool
% of tweets without any context 0.2957171991842284


Unnamed: 0,new_id,previous_context,main_tweet,posterior_context
6887,106213,,"[main_tweet] [user47446]: @user @user @user yeh bringing back an organization with mostly black athletes that distracts people from the hard truths of the world is wrong, but supporting riots that distract from your movement is all right. He's just wrong cuz of his skin color, which he can't change, it's okay. \n","[context] [user104337]: @user @user ... @user Kyrie Irving just doesn't get it. Nobody is against fighting police brutality. We are not a race of quitters, we find ways to affect change from the inside-out. These owners don't care if you boycott, helps them tear up your contract for next season. \n [context] [user79987]: @user @user ... @user Totally disagree here. You hit these owners (and more importantly advertisers) right where it hurts. The only thing they care about is $$$. Period. Kyrie understands that going back & playing sends a message that ""Some"" BLM more than others. He is completely on point imo. \n"
49569,144793,"[context] [user47446]: @user @user @user yeh bringing back an organization with mostly black athletes that distracts people from the hard truths of the world is wrong, but supporting riots that distract from your movement is all right. He's just wrong cuz of his skin color, which he can't change, it's okay. \n","[main_tweet] [user104337]: @user @user ... @user Kyrie Irving just doesn't get it. Nobody is against fighting police brutality. We are not a race of quitters, we find ways to affect change from the inside-out. These owners don't care if you boycott, helps them tear up your contract for next season. \n","[context] [user79987]: @user @user ... @user Totally disagree here. You hit these owners (and more importantly advertisers) right where it hurts. The only thing they care about is $$$. Period. Kyrie understands that going back & playing sends a message that ""Some"" BLM more than others. He is completely on point imo. \n [context] [user104337]: @user @user ... @user That's so unintelligent. You can't hurt somebody that never cared about you in the 1st place. NBA owners will use this boycott as an excuse to tear up the contract for next season too. They became rich before being a part of the NBA. \n"
27444,124801,"[context] [user47446]: @user @user @user yeh bringing back an organization with mostly black athletes that distracts people from the hard truths of the world is wrong, but supporting riots that distract from your movement is all right. He's just wrong cuz of his skin color, which he can't change, it's okay. \n [context] [user104337]: @user @user ... @user Kyrie Irving just doesn't get it. Nobody is against fighting police brutality. We are not a race of quitters, we find ways to affect change from the inside-out. These owners don't care if you boycott, helps them tear up your contract for next season. \n","[main_tweet] [user79987]: @user @user ... @user Totally disagree here. You hit these owners (and more importantly advertisers) right where it hurts. The only thing they care about is $$$. Period. Kyrie understands that going back & playing sends a message that ""Some"" BLM more than others. He is completely on point imo. \n",[context] [user104337]: @user @user ... @user That's so unintelligent. You can't hurt somebody that never cared about you in the 1st place. NBA owners will use this boycott as an excuse to tear up the contract for next season too. They became rich before being a part of the NBA. \n [context] [user79987]: @user @user ... @user The unintelligent thing to do would be to go on living like nothing happened. If you have leverage use it. See how quickly a rich ass owner who doesn't care will hold on to something that doesn't generate profits. Contracts don't mean a damn thing anymore...haven't in years. \n
362345,83279,"[context] [user104337]: @user @user ... @user Kyrie Irving just doesn't get it. Nobody is against fighting police brutality. We are not a race of quitters, we find ways to affect change from the inside-out. These owners don't care if you boycott, helps them tear up your contract for next season. \n [context] [user79987]: @user @user ... @user Totally disagree here. You hit these owners (and more importantly advertisers) right where it hurts. The only thing they care about is $$$. Period. Kyrie understands that going back & playing sends a message that ""Some"" BLM more than others. He is completely on point imo. \n",[main_tweet] [user104337]: @user @user ... @user That's so unintelligent. You can't hurt somebody that never cared about you in the 1st place. NBA owners will use this boycott as an excuse to tear up the contract for next season too. They became rich before being a part of the NBA. \n,"[context] [user79987]: @user @user ... @user The unintelligent thing to do would be to go on living like nothing happened. If you have leverage use it. See how quickly a rich ass owner who doesn't care will hold on to something that doesn't generate profits. Contracts don't mean a damn thing anymore...haven't in years. \n [context] [user1779]: @user @user ... @user facts. if a plethora of star players boycott coming back so they can have time to protest and raise awareness for something more important than bball, shit's gonna get fixed quick. those big players have endorsements and stuff as income so the games are nothing but entertainment \n"
27443,124800,"[context] [user79987]: @user @user ... @user Totally disagree here. You hit these owners (and more importantly advertisers) right where it hurts. The only thing they care about is $$$. Period. Kyrie understands that going back & playing sends a message that ""Some"" BLM more than others. He is completely on point imo. \n [context] [user104337]: @user @user ... @user That's so unintelligent. You can't hurt somebody that never cared about you in the 1st place. NBA owners will use this boycott as an excuse to tear up the contract for next season too. They became rich before being a part of the NBA. \n",[main_tweet] [user79987]: @user @user ... @user The unintelligent thing to do would be to go on living like nothing happened. If you have leverage use it. See how quickly a rich ass owner who doesn't care will hold on to something that doesn't generate profits. Contracts don't mean a damn thing anymore...haven't in years. \n,"[context] [user1779]: @user @user ... @user facts. if a plethora of star players boycott coming back so they can have time to protest and raise awareness for something more important than bball, shit's gonna get fixed quick. those big players have endorsements and stuff as income so the games are nothing but entertainment \n [context] [user47446]: @user @user ... @user It's been what..2, 3 weeks and what has happened? Protests overshadowed by everything other than sports, which haven't been allowed. Why ask players to sacrifice their paycheck now also? Not all of them have Supermax deals \n [context] [user104337]: @user @user ... @user We got some crab-mentality going on here, in the hood. The Clippers, Celtics, Lakers, and Milwaukee Bucks will not fall for this. I GUARANTEE YOU !!! They will rejoin protests in these city streets after the NBA Playoffs are completed. \n"
...,...,...,...,...
235486,312886,,[main_tweet] [user114952]: THIS WAS A BILL GATES EVENT FROM OCTOBER 2019 \n,
55441,150094,,[main_tweet] [user28348]: @user @user @user But at least the president tells the truth a gives great advice. Pass the syringe of bleach I need another anti Covid shot. \n,
378436,97804,,"[main_tweet] [user67894]: @user for your concern. An Ivermectin COVID treatment hearing took place and Sen. Peters postured a barrage of lies, tainting analysis for its members. This is of live & death importance, please call and attend a follow up hearing. \n",
114123,203129,,[main_tweet] [user54507]: Do you think Trump needs to be prosecuted for crimes against humanity? \n,
