# __Network Creation and Analysis__

In [1]:
import pandas as pd
import networkx as nx
from datetime import datetime, timedelta
import random

### Data

In [2]:
data_path = {
    'tweets': '../data/tweets/',
    'network': '../data/network/'
}

In [3]:
tweets_df = pd.read_csv(data_path['tweets']+'merged_data.csv')
tweets_df = tweets_df.query('is_reply==False and is_quote==False')

tweets_df.shape

(6979, 14)

In [4]:
tweets_df.sample(5)

Unnamed: 0,created_at,tweet_id,tweet_content,user,user_info,has_mention,mentions,is_reply,reply_to,is_quote,quoted_from,is_retweet,retweeted_from,hashtags
269,2022-11-23 01:44:39,1595231825957380102,Subindo pra trend:\n- Bolsonaro 51%\n- Mula de...,ALutaPeloBrasil,"{'id': 1595207720419971077, 'name': 'LUTApeloB...",False,,False,,False,,True,"{'user': 'DCDM115', 'user_id': 134986098320492...","['#SOSFORCASARMADAS', '#BolsonaroReeleito']"
4013,2022-11-22 22:17:09,1595179606423781381,O LULA TÁ ELEITO! https://t.co/sHXCsF4dHA,madelcostas,"{'id': 2176156801, 'name': 'Maria del Carmen',...",False,,False,,False,,True,"{'user': 'lazarorosa25', 'user_id': 545643493,...",
2791,2022-11-22 22:12:49,1595178516382978049,Maior verdade #BolsonaroReeleito 🇧🇷🇧🇷🇧🇷 https:...,lzaana_hotmail,"{'id': 333755785, 'name': 'Ana maria', 'descri...",False,,False,,False,,True,"{'user': 'BrunoCalixto22', 'user_id': 13734622...",['#BolsonaroReeleito']
4215,2022-11-22 22:42:40,1595186027630665729,O LULA TÁ ELEITO\nO LULA VAI SUBIR A RAMPA DO ...,CrisSilvaSJC,"{'id': 1090562052169625600, 'name': 'CrisSilva...",False,,False,,False,,False,,
6957,2022-11-22 22:57:05,1595189656257236998,O LULA TÁ ELEITO! https://t.co/sHXCsF4dHA,santossonia407,"{'id': 1577837580439228418, 'name': 'Sonia San...",False,,False,,False,,True,"{'user': 'lazarorosa25', 'user_id': 545643493,...",


In [5]:
tweets_df.sort_values(by='created_at', inplace=True)

In [6]:
tweets_df.head(1)

Unnamed: 0,created_at,tweet_id,tweet_content,user,user_info,has_mention,mentions,is_reply,reply_to,is_quote,quoted_from,is_retweet,retweeted_from,hashtags
5832,2022-11-22 19:28:22,1595137129977638912,Bora ganhar também o 4º turno dessas eleições....,_Janoninho,"{'id': 1563897907446747136, 'name': 'Janoninho...",False,,False,,False,,False,,


In [7]:
tweets_df.tail(1)

Unnamed: 0,created_at,tweet_id,tweet_content,user,user_info,has_mention,mentions,is_reply,reply_to,is_quote,quoted_from,is_retweet,retweeted_from,hashtags
218,2022-11-23 02:58:59,1595250529164304390,Bolsonaro 51%\n#BolsonaroReeleito \n\nNão me C...,guiferreiracode,"{'id': 1497584853184921604, 'name': 'Guilherme...",False,,False,,False,,True,"{'user': 'KaiolOof', 'user_id': 15031679523712...",['#BolsonaroReeleito']


In [8]:
# converting date to GMT-3

def convert_2_gmt_br(date):
    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    date = date - timedelta(hours=3)
    date = datetime.strftime(date, '%Y-%m-%d %H:%M:%S')
    return date

tweets_df['created_at'] = tweets_df['created_at'].apply(lambda x: convert_2_gmt_br(str(x)))

print(f'''
Min date/time: {tweets_df['created_at'].min()}
Max date/time: {tweets_df['created_at'].max()}
''')


Min date/time: 2022-11-22 16:28:22
Max date/time: 2022-11-22 23:58:59



### Network

Converting data to graph

In [9]:
twitter_graph = nx.DiGraph()

# for idx, row in tweets_df.iterrows():
#     user = row['user']
#     tweet = row['tweet_id']
#     timestamp = row['created_at']
#     if row['is_retweet']==True:
#         twitter_graph.add_node(user, node_type='user')
#         twitter_graph.add_node(tweet, node_type='tweet')
#         twitter_graph.add_edge(user, tweet, edge_type='retweet', timestamp=timestamp)
#     else:
#         twitter_graph.add_node(user, node_type='user')
#         twitter_graph.add_node(tweet, node_type='tweet')
#         twitter_graph.add_edge(tweet, user, edge_type='tweet', timestamp=timestamp)

for idx, row in tweets_df.iterrows():
    user = eval(row['user_info'])['id']
    tweet = row['tweet_id']
    timestamp = row['created_at']
    twitter_graph.add_node(user, node_type='user')
    twitter_graph.add_node(tweet, node_type='tweet')
    twitter_graph.add_edge(tweet, user, edge_type='tweet', timestamp=timestamp)

for idx, row in tweets_df.query('is_retweet==True').iterrows():
    user = eval(row['user_info'])['id']
    tweet = eval(row['retweeted_from'])['tweet_id']
    timestamp = row['created_at']
    if twitter_graph.has_node(tweet):
        twitter_graph.add_node(user, node_type='user')
        twitter_graph.add_node(tweet, node_type='tweet')
        twitter_graph.add_edge(user, tweet, edge_type='retweet', timestamp=timestamp)

nx.write_gexf(twitter_graph, data_path['network']+'raw_twitter_graph.gexf')

In [10]:
len(twitter_graph.nodes())

8260

In [11]:
len(twitter_graph.edges())

8890

In [12]:
list(twitter_graph.edges(data=True))[:5]

[(1563897907446747136,
  1595137129977638912,
  {'edge_type': 'retweet', 'timestamp': '2022-11-22 18:37:33'}),
 (1595137129977638912,
  1563897907446747136,
  {'edge_type': 'tweet', 'timestamp': '2022-11-22 16:28:22'}),
 (1273599615535853569,
  1595137129977638912,
  {'edge_type': 'retweet', 'timestamp': '2022-11-22 16:29:08'}),
 (1595137323385376768,
  1273599615535853569,
  {'edge_type': 'tweet', 'timestamp': '2022-11-22 16:29:08'}),
 (1594892901355511808,
  1595137129977638912,
  {'edge_type': 'retweet', 'timestamp': '2022-11-22 16:29:32'})]

#### Converting graph

In [13]:
out_degrees = dict(twitter_graph.out_degree())
in_degrees = dict(twitter_graph.in_degree())
print('[Original Twitter graph]: {} nodes, {} edges'.format(len(twitter_graph), len(twitter_graph.edges())))

[Original Twitter graph]: 8260 nodes, 8890 edges


In [14]:
min_retweets_users = [x for x, data in twitter_graph.nodes(data=True)
        if out_degrees[x] >= 6 and data['node_type']=='user']

min_retweets_tweets = [x for x, data in twitter_graph.nodes(data=True)
        if in_degrees[x] >= 6 and data['node_type']=='tweet']

min_tweets_users = [x for x, data in twitter_graph.nodes(data=True)
        if in_degrees[x] >= 3 and data['node_type']=='user']

allowed_tweets = set()
for tweet, user in twitter_graph.in_edges(min_tweets_users):
    allowed_tweets.add(tweet)

allowed_tweets = list(set(min_retweets_tweets).intersection(allowed_tweets))

allowed_users = set()
for user, tweet in twitter_graph.in_edges(allowed_tweets):
    allowed_users.add(user)

allowed_users = list(set(min_retweets_users).intersection(allowed_users))

#allowed_nodes = set(min_retweets_users + min_retweets_tweets + min_tweets_users)
allowed_nodes = set(allowed_users + allowed_tweets + min_tweets_users)

densified_twitter_graph = nx.subgraph(twitter_graph, allowed_nodes).copy()
print('[Densified Twitter graph]: {} nodes, {} edges\n'.format(len(densified_twitter_graph), len(densified_twitter_graph.edges())))


[Densified Twitter graph]: 827 nodes, 1455 edges



In [15]:
timestamps = list(nx.get_edge_attributes(twitter_graph, 'timestamp').values())
timestamps.sort()

In [16]:
int((len(timestamps)-1) * 0.2)

1777

In [17]:
mode = 'target_network'
scenario = 'hard'
from tqdm import tqdm
if mode == 'target_network':
    cutoff_idx = int((len(timestamps)-1) * 0.2)
    cutoff_timestamp = timestamps[cutoff_idx]

    reduced_edges = [(x,y,z) for x,y,z in densified_twitter_graph.edges(data=True) 
            if z['timestamp'] <= cutoff_timestamp]

    reduced_graph = nx.DiGraph(reduced_edges)
    node_types = nx.get_node_attributes(twitter_graph, 'node_type')
    nx.set_node_attributes(reduced_graph, node_types, 'node_type')
    print('[Temporal Split Graph] {} nodes, {} edges'.format(len(reduced_graph), len(reduced_graph.edges())))

    if scenario == 'hard':
        all_users = [str(x) for x,y in densified_twitter_graph.nodes(data=True) if y['node_type']=='user']
        #all_tweets = [str(x) for x,y in densified_twitter_graph.nodes(data=True) if y['node_type']=='tweet']

        new_users_at_t = [x for x in all_users if x not in reduced_graph]

        new_retweets_at_t = []
        new_retweeting_users_at_t = []
        for head, tail, data in tqdm(densified_twitter_graph.edges(data=True)):
            if data['edge_type'] == 'retweet' and head in new_users_at_t:
                new_retweets_at_t.append((head,tail,data))
                new_retweeting_users_at_t.append(head)

        new_retweet_graph = nx.DiGraph(new_retweets_at_t)
        trimmed_edges = []
        for user in tqdm(new_users_at_t):
            user_retweets = list(new_retweet_graph.edges(user, data=True))
            if len(user_retweets) > 0:
                random_retweet = random.choice(user_retweets)
                trimmed_edges.append(random_retweet)

        tweets_at_t = [(head,tail,y) for head,tail,y in densified_twitter_graph.edges(data=True) if y['edge_type']=='tweet']

        reduced_graph.update(trimmed_edges)
        reduced_graph.update(tweets_at_t)
        nx.set_node_attributes(reduced_graph, node_types, 'node_type')
        print('[Scenario graph] {} nodes, {} edges'.format(len(reduced_graph), len(reduced_graph.edges())))

    ''' Filter '''
    min_retweets_users = [x for x, data in reduced_graph.nodes(data=True)
        if out_degrees[x] >= 6 and data['node_type']=='user']

    min_retweets_tweets = [x for x, data in reduced_graph.nodes(data=True)
        if in_degrees[x] >= 6 and data['node_type']=='tweet']

    min_tweets_users = [x for x, data in reduced_graph.nodes(data=True)
        if in_degrees[x] >= 3 and data['node_type']=='user']

    allowed_nodes = set(min_retweets_users + min_retweets_tweets + min_tweets_users)

    reduced_graph = nx.subgraph(reduced_graph, allowed_nodes).copy()

    largest_cc = max(nx.connected_components(reduced_graph.to_undirected()), key=len)
    reduced_graph = reduced_graph.subgraph(largest_cc).copy()
    print('[Final Starting graph] {} nodes, {} edges'.format(len(reduced_graph), len(reduced_graph.edges())))

    target_graph = nx.subgraph(densified_twitter_graph, reduced_graph.nodes).copy()
    print('[Final Target graph] {} nodes, {} edges'.format(len(target_graph), len(target_graph.edges())))


elif mode == 'temporal_bins':
    temporal_bins = [0.3, 0.3, 0.4]
    cutoff_timestamps = [timestamps[0]]
    for i, temporal_cutoff in enumerate(temporal_bins):
        temporal_cutoff = sum(temporal_bins[:i+1])
        cutoff_idx = int((len(timestamps)-1) * temporal_cutoff)
        cutoff_timestamp = timestamps[cutoff_idx]
        cutoff_timestamps.append(cutoff_timestamp)

    cutoff_ranges = list(zip(cutoff_timestamps[:-1], cutoff_timestamps[1:]))

    temporal_graphs = []

    running_full_graph = nx.DiGraph()
    running_trimmed_graph = nx.DiGraph()
    # TODO: This could be done better. 
    # This version doesn't include previous edges of newly added users
    for i, (from_, to_) in enumerate(cutoff_ranges):
        reduced_edges = [(x,y,z) for x,y,z in twitter_graph.edges(data=True) 
            if z['timestamp'] >= from_ and z['timestamp'] <= to_]

        reduced_graph = nx.DiGraph(reduced_edges)
        node_types = nx.get_node_attributes(twitter_graph, 'node_type')
        nx.set_node_attributes(reduced_graph, node_types, 'node_type')
        new_nodes = [x for x in reduced_graph.nodes() if x not in running_trimmed_graph.nodes] 
        print('{}: [Pre-filter graph] {} nodes, {} edges'.format(i, len(reduced_graph), len(reduced_graph.edges())))

        running_full_graph.update(reduced_graph)
        out_degrees = dict(running_full_graph.out_degree())
        in_degrees = dict(running_full_graph.in_degree()) 
        print('{}: [Running full graph] {} nodes, {} edges'.format(i, len(running_full_graph), len(running_full_graph.edges())))

        ''' Filter '''
        min_retweets_users = [x for x, data in reduced_graph.nodes(data=True)
            if out_degrees[x] >= 6 and data['node_type']=='user']

        min_retweets_tweets = [x for x, data in reduced_graph.nodes(data=True)
            if in_degrees[x] >= 6 and data['node_type']=='tweet']

        min_tweets_users = [x for x, data in reduced_graph.nodes(data=True)
            if in_degrees[x] >= 3 and data['node_type']=='user']

        print('{}: [Filtering] Users: min retweets - {}, min tweets: {} | Tweets: min retweets - {}'.format(
            i, len(min_retweets_users), len(min_tweets_users), len(min_retweets_tweets)
        ))
        allowed_nodes = set(min_retweets_users + min_retweets_tweets + min_tweets_users)
        print("{}: [Combine] Node count: {}".format(i, len(allowed_nodes)))

        reduced_graph = nx.subgraph(reduced_graph, allowed_nodes).copy()

        all_allowed_edges = nx.subgraph(running_full_graph, allowed_nodes)
        for h,t, data in all_allowed_edges.edges(data=True):
            if not running_trimmed_graph.has_edge(h,t):
                reduced_graph.add_edge(h,t,**data)

        largest_cc = max(nx.connected_components(reduced_graph.to_undirected()), key=len)
        reduced_graph = reduced_graph.subgraph(largest_cc).copy()

        running_trimmed_graph.update(reduced_graph)
        temporal_graphs.append(reduced_graph)

        #users = [str(x) for x,y in reduced_twitter_graph.nodes(data=True) if y['node_type']=='user']
        #tweets = [str(x) for x,y in reduced_twitter_graph.nodes(data=True) if y['node_type']=='tweet']
        #allowed_nodes = list(reduced_twitter_graph.nodes)
        print("{}: [Largest CC]: {} nodes, {} edges \n".format(i, len(reduced_graph), len(reduced_graph.edges())))

    running_graph = nx.DiGraph()

    scenario = 'hard'
    edges = dict()
    for t, graph_at_t in enumerate(temporal_graphs):
        if t == 0 or scenario == 'hard':

            users_at_t = [str(x) for x,y in graph_at_t.nodes(data=True) if y['node_type']=='user']
            new_users_at_t = [x for x in users_at_t if x not in running_graph]

            new_retweets_at_t = []
            new_retweeting_users_at_t = []
            for head, tail, data in graph_at_t.edges(data=True):
                if data['edge_type'] == 'retweet' and head in new_users_at_t:
                    new_retweets_at_t.append((head,tail,data))
                    new_retweeting_users_at_t.append(head)


            new_retweet_graph = nx.DiGraph(new_retweets_at_t)
            trimmed_edges = []
            for user in new_users_at_t:
                user_retweets = list(new_retweet_graph.edges(new_retweeting_users_at_t))
                random_retweet = random.choice(user_retweets)
                trimmed_edges.append(random_retweet)

            tweets_at_t = [(head,tail,y) for head,tail,y in graph_at_t.edges(data=True) if y['edge_type']=='tweet']

            trimmed_graph_at_t = nx.DiGraph(new_retweets_at_t+tweets_at_t)


            running_graph.update(graph_at_t)
            print('{}: Running graph: {} nodes, {} edges'.format(t, len(running_graph), len(running_graph.edges())))
            print('{}: Graph at t: {} nodes, {} edges'.format(t, len(graph_at_t), len(graph_at_t.edges())))
            print('{}: Trimmed graph at t: {} nodes, {} edges\n'.format(t, len(trimmed_graph_at_t), len(trimmed_graph_at_t.edges())))

        else:
            ''' '''
            new_nodes = set(graph_at_t.nodes) - set(running_graph.nodes)
            new_graph_at_t = graph_at_t.subgraph(new_nodes)

            running_graph.update(graph_at_t)
            running_graph.remove_nodes_from(n for n in running_graph.copy() if n in new_graph_at_t)
            print('{}: Updated running graph: {} nodes, {} edges'.format(t, len(running_graph), len(running_graph.edges())))

            if scenario == 'easy':
                graph_at_t.remove_nodes_from(new_graph_at_t.copy())
                print('{}: Filtered graph at t: {} nodes, {} edges\n'.format(t, len(graph_at_t), len(graph_at_t.edges())))

                edges[t] = list(graph_at_t.edges())
            if scenario == 'medium': # TODO
                running_users = [str(x) for x,y in running_graph.nodes(data=True) if y['node_type']=='user']
                running_tweets = []
                for head, tail, data in graph_at_t.edges(data=True):
                    if data['edge_type']=='tweet' and tail in running_users:
                        running_tweets.append(head)
                    elif data['edge_type']=='retweet' and head in running_users:
                        running_tweets.append(tail)

                filtered_graph_at_t = graph_at_t.subgraph(running_users+running_tweets)
                print('{}: Filtered graph at t: {} nodes, {} edges\n'.format(t, len(filtered_graph_at_t), len(filtered_graph_at_t.edges())))

                running_graph.update(filtered_graph_at_t)
                print('{}: Updated running graph: {} nodes, {} edges\n'.format(t, len(running_graph), len(running_graph.edges())))

''' Temporal Split of network '''
users = [str(x) for x,y in target_graph.nodes(data=True) if y['node_type']=='user']
tweets = [str(x) for x,y in target_graph.nodes(data=True) if y['node_type']=='tweet']

reduced_user_graph = nx.DiGraph()
for user in users:
    retweet_edges = reduced_graph.out_edges(user)
    for _, retweeted_tweet in retweet_edges:
        tweet_edges = list(reduced_graph.out_edges(retweeted_tweet))
        for _, tweet_author in tweet_edges:
            head = users.index(user)
            tail = users.index(tweet_author)

            reduced_user_graph.add_edge(head, tail) 


''' Remove all nodes with degree greater than '''
remove = [node for node,degree in dict(reduced_user_graph.in_degree()).items() if degree > 50]
print(len(remove))

target_graph.remove_nodes_from(remove)

reduced_graph.remove_nodes_from(remove)
largest_cc = max(nx.connected_components(reduced_graph.to_undirected()), key=len)
reduced_graph = reduced_graph.subgraph(largest_cc).copy()

reduced_user_graph.remove_nodes_from(remove)
largest_cc = max(nx.connected_components(reduced_user_graph.to_undirected()), key=len)
reduced_user_graph = reduced_user_graph.subgraph(largest_cc).copy()

''' Create Result Edges '''
result_edges = []
tweet_start_idx = len(users)
for head, tail in reduced_user_graph.edges:
    result_edges.append((
        head, tail, 'follow',
        '2-chain', 'user', 'user', 0
    ))


for head, tail, data in reduced_graph.edges(data=True):
    edge_type = data['edge_type']

    if edge_type == 'retweet':
        head = users.index(head)
        head_type = 'user'

        tail = tweets.index(tail) + tweet_start_idx
        tail_type = 'tweet'
    elif edge_type == 'tweet':
        head = tweets.index(head) + tweet_start_idx
        head_type = 'tweet'

        tail = users.index(tail)
        tail_type = 'user'
    else:
        raise Exception("Edge type {} not supported".format(edge_type))

    result_edges.append((
        head, tail, edge_type,
        '1-chain', head_type, tail_type, 0
    ))



target_user_graph = nx.DiGraph()
for user in users:
    retweet_edges = target_graph.out_edges(user)
    for _, retweeted_tweet in retweet_edges:
        tweet_edges = list(target_graph.out_edges(retweeted_tweet))
        for _, tweet_author in tweet_edges:
            head = users.index(user)
            tail = users.index(tweet_author)

            target_user_graph.add_edge(head, tail) 

print("Target Twitter Graph: {} nodes, {} edges".format(len(target_graph), len(target_graph.edges())))
print("Reduced Twitter Graph: {} nodes, {} edges".format(len(reduced_graph), len(reduced_graph.edges())))
print("Target User Graph: {} nodes, {} edges".format(len(target_user_graph), len(target_user_graph.edges())))
print("Reduced User Graph: {} nodes, {} edges".format(len(reduced_user_graph), len(reduced_user_graph.edges())))

nx.write_gexf(reduced_graph, os.path.join(data_dir, "reduced_twitter_graph.gexf"))
nx.write_gexf(target_graph, os.path.join(data_dir, "target_twitter_graph.gexf"))

reduced_community_mapper = init_community_mapper(reduced_user_graph)
nx.set_node_attributes(reduced_user_graph, reduced_community_mapper, 'block')
nx.write_gexf(reduced_user_graph, os.path.join(data_dir, "reduced_user_graph.gexf"))

target_community_mapper = init_community_mapper(target_user_graph)
nx.set_node_attributes(target_user_graph, target_community_mapper, 'block')
nx.write_gexf(target_user_graph, os.path.join(data_dir, "target_user_graph.gexf"))

out_edge_df = pd.DataFrame(result_edges, columns=['head', 'tail', 'edge_type', 'query_type', 'head_type', 'tail_type', 'bin'])
out_edge_df.to_csv(os.path.join(data_dir, "edges.csv"))

with open(os.path.join(data_dir, "num_entities.csv"), 'w', newline='') as f:  
    csv_writer = csv.writer(f, delimiter=',')
    csv_writer.writerow(['user', 'tweet'])
    csv_writer.writerow([len(users), len(tweets)])

# TODO: Improve
with open(os.path.join(data_dir, "edge_types.csv"), 'w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',')
    csv_writer.writerow(['edge_type', 'edge_tuple'])
    csv_writer.writerow(['retweet', (0,)])
    csv_writer.writerow(['tweet', (1,)])
    csv_writer.writerow(['follow', (0,1)])

[Temporal Split Graph] 135 nodes, 297 edges


100%|███████████████████████████████████| 1455/1455 [00:00<00:00, 132299.52it/s]
100%|█████████████████████████████████████| 780/780 [00:00<00:00, 322861.65it/s]

[Scenario graph] 159 nodes, 316 edges
[Final Starting graph] 137 nodes, 301 edges
[Final Target graph] 137 nodes, 598 edges
0





ValueError: max() arg is an empty sequence