Sample some cascades to plot the network (for high- and low-quality messages)

In [1]:
## Group & sort by cascade size.
## Merge node uid with original network to get political affiliation
## Save file to .gml

In [2]:
import simsom.utils as utils
import matplotlib.pyplot as plt
import os
import glob
import json
from collections import defaultdict
import pandas as pd
import seaborn as sns

## Reshare cascade

In [3]:
import igraph as ig

In [7]:
# Read in reshare data
verbose_fpath = "results_base/message_info.json.gz"
data = utils.read_json_compressed(verbose_fpath)
print(data.keys())

messages = pd.DataFrame.from_records(data['all_messages'])

dict_keys(['verbose', 'tracktimestep', 'save_message_info', 'output_cascades', 'mu', 'quality', 'diversity', 'discriminative_pow', 'quality_timestep', 'exposure_timestep', 'age_timestep', 'all_messages', 'reshared_messages', 'feeds_message_ids', 'feeds_shares', 'feeds_ages'])


In [8]:
# Read in og graph data
network_path = "data/network_baseline.gml"
net = ig.Graph.Read_GML(network_path)

## Get reshare cascade by message id 

In [9]:
reshares = messages.explode("spread_via_agents")
reshares = reshares.drop_duplicates(subset=["id", "spread_via_agents"])

In [35]:
df = reshares
message_type = "human"  # bot or human
# CASCADE_DIR = os.path.join(
#     "/N/slate/baotruon/marketplace/results", "cascade_net", folder
# )

# if not os.path.exists(CASCADE_DIR):
#     os.makedirs(CASCADE_DIR)

if message_type == "human":
    filtered = df[df["is_by_bot"] == 0]
    sizes = filtered.groupby(
        "id"
    ).spread_via_agents.count()  # return a Series where message id the index
elif message_type == "bot":
    filtered = df[df["is_by_bot"] == 1]
    sizes = filtered.groupby("id").spread_via_agents.count()

# Get the largest cascade
reshare_cascades = sizes.sort_values(ascending=False).reset_index()
largest = reshare_cascades.head(1)
print(f"Id of the message with the largest cascade (size={largest.loc[0, 'spread_via_agents']}): {largest.loc[0, 'id']} ")

message_id = largest.loc[0, 'id']

Id of the message with the largest cascade (size=673): 164 


In [50]:
cascade_fpath = "results_base/reshare.csv"
cascade = pd.read_csv(cascade_fpath, dtype={"source": str, "target": str})
cascade.head()

Unnamed: 0,message_id,timestep,source,target
0,2,1,159,733
1,3,1,193,736
2,5,1,82,91
3,4,1,118,308
4,4,1,118,362


In [51]:
import igraph


def get_reshare_cascade(message_id, cascade_inpath):
    """
    Get reshare cascade by message_id
    cascade_fpath (str): fpath of reshare  data

    """
    cascade = pd.read_csv(cascade_inpath, sep=",")
    cascade_data = cascade[cascade.message_id == message_id]
    # Get vertices
    nodes = list(cascade_data.agent1.unique()) + list(cascade_data.agent2.unique())
    # convert list of lists to list of tuples
    edges = [tuple(pair) for pair in cascade_data[["agent1", "agent2"]].values]

    graph = igraph.Graph(directed=True)
    graph.add_vertices(nodes)
    graph.add_edges(edges)

    # Set attributes of nodes from og graph data
    og_nodes = [i for i in net.vs if i["uid"] in nodes]
    graph.vs["party"] = [
        float(node["party"]) if node["party"] != "None" else 0 for node in og_nodes
    ]
    graph.vs["misinfo"] = [
        float(node["misinfo"]) if node["party"] != "None" else 100 for node in og_nodes
    ]
    graph.vs["bot"] = [int(node["bot"]) for node in og_nodes]
    return graph

In [52]:
cascade.head()

Unnamed: 0,message_id,timestep,source,target
0,2,1,159,733
1,3,1,193,736
2,5,1,82,91
3,4,1,118,308
4,4,1,118,362


In [53]:
reshare_nodes = df[df.id == message_id]["spread_via_agents"].values
cascade_data = cascade[
    (cascade.message_id == message_id)
    & (cascade.source.isin(reshare_nodes))
    & (cascade.target.isin(reshare_nodes))
]

In [54]:
reshare_nodes

array(['0', '539', '1', '550', '814', '386', '276', '29', '51', '41', '4',
       '175', '8', '115', '310', '17', '275', '139', '35', '72', '889',
       '427', '499', '992', '558', '604', '20', '885', '25', '559', '554',
       '28', '372', '772', '984', '445', '18', '53', '109', '250', '2',
       '131', '174', '63', '135', '257', '129', '15', '96', '79', '861',
       '60', '899', '12', '612', '313', '407', '396', '493', '319', '668',
       '10', '289', '147', '254', '86', '337', '133', '367', '52', '749',
       '187', '213', '273', '312', '121', '486', '251', '48', '214',
       '232', '761', '482', '5', '517', '374', '911', '630', '464', '200',
       '303', '640', '198', '363', '87', '675', '238', '786', '75', '390',
       '108', '488', '483', '98', '588', '924', '182', '631', '434',
       '698', '882', '130', '778', '103', '127', '55', '995', '138',
       '681', '170', '535', '728', '359', '101', '651', '739', '471',
       '216', '546', '917', '167', '940', '62', '314', '4

In [55]:
cascade[
    (cascade.message_id == message_id)]

Unnamed: 0,message_id,timestep,source,target
484,164,1,0,1
490,164,1,0,2
498,164,1,0,4
508,164,1,0,5
539,164,1,0,7
...,...,...,...,...
47577,164,16,82,583
47585,164,16,877,972
47590,164,16,82,600
47601,164,16,82,608


In [56]:
reshare_nodes

array(['0', '539', '1', '550', '814', '386', '276', '29', '51', '41', '4',
       '175', '8', '115', '310', '17', '275', '139', '35', '72', '889',
       '427', '499', '992', '558', '604', '20', '885', '25', '559', '554',
       '28', '372', '772', '984', '445', '18', '53', '109', '250', '2',
       '131', '174', '63', '135', '257', '129', '15', '96', '79', '861',
       '60', '899', '12', '612', '313', '407', '396', '493', '319', '668',
       '10', '289', '147', '254', '86', '337', '133', '367', '52', '749',
       '187', '213', '273', '312', '121', '486', '251', '48', '214',
       '232', '761', '482', '5', '517', '374', '911', '630', '464', '200',
       '303', '640', '198', '363', '87', '675', '238', '786', '75', '390',
       '108', '488', '483', '98', '588', '924', '182', '631', '434',
       '698', '882', '130', '778', '103', '127', '55', '995', '138',
       '681', '170', '535', '728', '359', '101', '651', '739', '471',
       '216', '546', '917', '167', '940', '62', '314', '4

In [57]:
cascade_data

Unnamed: 0,message_id,timestep,source,target
484,164,1,0,1
490,164,1,0,2
498,164,1,0,4
508,164,1,0,5
539,164,1,0,7
...,...,...,...,...
47563,164,16,82,283
47577,164,16,82,583
47585,164,16,877,972
47590,164,16,82,600


In [58]:
# Get vertices
nodes = reshare_nodes
# nodes = list(cascade_data.agent1.unique()) + list(cascade_data.agent2.unique())

In [59]:
cascade_data[["source", "target"]].values

array([['0', '1'],
       ['0', '2'],
       ['0', '4'],
       ...,
       ['877', '972'],
       ['82', '600'],
       ['82', '608']], dtype=object)

In [60]:
# convert list of lists to list of tuples
edges = [tuple(pair) for pair in cascade_data[["source", "target"]].values]

In [61]:
import igraph

graph = igraph.Graph(directed=True)
graph.add_vertices(nodes)
graph.add_edges(edges)

In [62]:
net.vs.attributes()

['id', 'party', 'misinfo', 'bot', 'uid']

In [63]:
og_nodes = [i for i in net.vs if i["uid"] in nodes]

In [64]:
VIZ_DIR = "viz"
if not os.path.exists(VIZ_DIR):
    os.makedirs(VIZ_DIR)
# Set attributes of nodes from og graph data
graph.vs["party"] = [
    float(node["party"]) if node["party"] != "None" else 0 for node in og_nodes
]
graph.vs["misinfo"] = [
    float(node["misinfo"]) if node["party"] != "None" else 100 for node in og_nodes
]
graph.vs["bot"] = [int(node["bot"]) for node in og_nodes]
graph.write_gml(os.path.join(VIZ_DIR, f"{message_type}_{message_id}__reshare.gml"))

In [65]:
graph.vcount()

673

In [68]:
# Check that attributes are properly saved
c = ig.Graph.Read_GML(os.path.join(VIZ_DIR, f"{message_type}_{message_id}__reshare.gml"))
c.vs["party"]

[0.188561594202899,
 0.343641935483871,
 -0.202001040118871,
 0.291375342465753,
 0.25182972972973,
 0.188561594202899,
 0.343641935483871,
 -0.202001040118871,
 0.291375342465753,
 0.25182972972973,
 0.188561594202899,
 0.343641935483871,
 -0.202001040118871,
 0.291375342465753,
 0.25182972972973,
 0.188561594202899,
 0.343641935483871,
 -0.202001040118871,
 0.291375342465753,
 0.25182972972973,
 0.188561594202899,
 0.343641935483871,
 -0.202001040118871,
 0.291375342465753,
 0.25182972972973,
 0.188561594202899,
 0.343641935483871,
 -0.202001040118871,
 0.291375342465753,
 0.25182972972973,
 0.188561594202899,
 0.343641935483871,
 -0.202001040118871,
 0.291375342465753,
 0.25182972972973,
 0.188561594202899,
 0.343641935483871,
 -0.202001040118871,
 0.291375342465753,
 0.25182972972973,
 0.188561594202899,
 0.343641935483871,
 -0.202001040118871,
 0.291375342465753,
 0.25182972972973,
 0.188561594202899,
 0.343641935483871,
 -0.202001040118871,
 0.291375342465753,
 0.25182972972973,
