In [1]:
# Clone the coordinatedActivity repository into a `coordinatedActivity`
# directory under the current working directory; the import cell below imports
# its modules by that name.
# NOTE(review): no branch, tag or commit is pinned, so a re-run picks up
# whatever the default branch contains at that time.
!git clone https://github.com/ValeriaPante/coordinatedActivity/

Cloning into 'coordinatedActivity'...
remote: Enumerating objects: 907, done.[K
remote: Counting objects: 100% (385/385), done.[K
remote: Compressing objects: 100% (241/241), done.[K
remote: Total 907 (delta 266), reused 251 (delta 143), pack-reused 522[K
Receiving objects: 100% (907/907), 2.39 MiB | 3.71 MiB/s, done.
Resolving deltas: 100% (601/601), done.


In [1]:
import pathlib
import pandas as pd
import numpy as np
import networkx as nx

from coordinatedActivity import coRetweet, coURL, hashtagSeq, textSimilarity, fastRetweet


[nltk_data] Downloading package stopwords to /home/minici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Dataset to analyse; must be a key of `filename_dict`.
DATASET_NAME = 'UAE_sample'

# Positions of the control file and the IO (treated) file inside each
# `filename_dict` entry.
CONTROL_FILE_IDX = 0
IO_FILE_IDX = 1

# Dataset name -> [control file name, IO file name].
filename_dict = {
    'UAE_sample': [
        'control_driver_tweets_uae_082019.jsonl',
        'uae_082019_tweets_csv_unhashed.csv',
    ],
}

# Raw data is read from <parent of the working directory>/data/raw.
base_dir = pathlib.Path.cwd().parent
data_dir = base_dir.joinpath('data', 'raw')


In [15]:
# Load the two raw inputs of the selected dataset.
# IO file: comma-separated CSV.
io_df = pd.read_csv(
    data_dir / DATASET_NAME / filename_dict[DATASET_NAME][IO_FILE_IDX],
    sep=",",
)
# Control file: JSON Lines (one JSON record per line).
control_df = pd.read_json(
    data_dir / DATASET_NAME / filename_dict[DATASET_NAME][CONTROL_FILE_IDX],
    lines=True,
)


#### CoRetweet Network construction

In [25]:
# Columns handed to coRetweet.coRetweet for each frame.
control_column_names = ['id', 'user', 'retweeted_status']
treated_column_names = ['userid', 'tweetid', 'retweet_tweetid']

# Independent copies restricted to those columns, so the raw frames
# (control_df / io_df) are not touched by anything done to the subsets.
control = control_df[control_column_names].copy()
treated = io_df[treated_column_names].copy()


In [26]:
# Build the co-retweet network from the column subsets prepared in the
# previous cell. The result is used below as a graph (number_of_nodes /
# number_of_edges, nx.compose).
coRetweet_network = coRetweet.coRetweet(control, treated)


In [27]:
# Report the size of the co-retweet graph.
print(
    f'CoRetweet network has {coRetweet_network.number_of_nodes()} nodes and {coRetweet_network.number_of_edges()} edges'
)

CoRetweet network has 4320 nodes and 315393 edges


#### CoURL Network construction

In [28]:
# Columns handed to coURL.coURL for each frame.
control_column_names = ['user', 'entities', 'id']
treated_column_names = ['tweetid', 'userid', 'urls']

# Independent copies restricted to those columns, so the raw frames
# (control_df / io_df) are not touched by anything done to the subsets.
control = control_df[control_column_names].copy()
treated = io_df[treated_column_names].copy()


In [29]:
# Build the co-URL network from the column subsets prepared in the previous
# cell. The result is used below as a graph (number_of_nodes /
# number_of_edges, nx.compose).
coURL_network = coURL.coURL(control, treated)


In [30]:
# Report the size of the co-URL graph.
print(
    f'CoURL network has {coURL_network.number_of_nodes()} nodes and {coURL_network.number_of_edges()} edges'
)


CoURL network has 6112 nodes and 414276 edges


#### FastRetweet Network construction

In [12]:
# Columns selected for the fast-retweet network, per frame.
control_column_names = ['user', 'retweeted_status', 'id']
treated_column_names = ['tweetid', 'userid', 'retweet_tweetid', 'retweet_userid']

# Independent copies restricted to those columns, so the raw frames
# (control_df / io_df) are not touched by anything done to the subsets.
control = control_df[control_column_names].copy()
treated = io_df[treated_column_names].copy()


In [6]:
# Build the fast-retweet network from the column subsets (`control`,
# `treated`) prepared in the previous cell, the same way the other network
# builders in this notebook are called. This call previously received the raw
# control_df / io_df frames, and the recorded run of that version ended in
# "AttributeError: Can only use .dt accessor with datetimelike values".
# NOTE(review): presumably fastRetweet expects exactly the selected columns;
# confirm against the coordinatedActivity documentation and re-run.
fastRetweet_network = fastRetweet.fastRetweet(control, treated)


AttributeError: Can only use .dt accessor with datetimelike values

#### Hashtag Sequence Network construction

In [22]:
# Columns handed to hashtagSeq.hashSeq for each frame.
control_column_names = ['retweeted_status', 'user', 'in_reply_to_status_id', 'full_text', 'id']
treated_column_names = [
    'retweet_tweetid',
    'userid',
    'in_reply_to_tweetid',
    'quoted_tweet_tweetid',
    'tweet_text',
    'tweetid',
]

# Independent copies restricted to those columns, so the raw frames
# (control_df / io_df) are not touched by anything done to the subsets.
control = control_df[control_column_names].copy()
treated = io_df[treated_column_names].copy()


In [23]:
# Build the hashtag-sequence network from the column subsets prepared in the
# previous cell, passing minHashtags=5.
# NOTE(review): the recorded run of this cell ended in
# "NameError: name 'i' is not defined". Nothing in this cell references `i`,
# so the error presumably originates inside hashtagSeq.hashSeq — confirm
# against the library. Until it is resolved, `hashtag_network` is not defined.
hashtag_network = hashtagSeq.hashSeq(control, treated, minHashtags=5)


NameError: name 'i' is not defined

#### Fuse networks

In [31]:
# Fuse the co-retweet and co-URL graphs into one graph holding the nodes and
# edges of both. Only these two networks are fused here.
# NOTE(review): per the networkx documentation for `compose`, attributes of
# the second graph (coURL_network) take precedence where a node or edge exists
# in both — confirm that is the intended handling of any edge attributes.
fused_network = nx.compose(coRetweet_network, coURL_network)


In [32]:
# Report the size of the fused graph. The label previously read
# "CoURL network", although the counts come from `fused_network`.
print(f'Fused network has {fused_network.number_of_nodes()} nodes and {fused_network.number_of_edges()} edges')

CoURL network has 6577 nodes and 518006 edges


In [33]:
# Rank the connected components of the fused graph from largest to smallest
# (stable sort, so equally sized components keep their discovery order).
connected_components_ordered_list = list(nx.connected_components(fused_network))
connected_components_ordered_list.sort(key=len, reverse=True)

# Keep only the largest component, as a standalone nx.Graph built from the
# subgraph so that it can be modified.
fused_network_lcc = nx.Graph(
    fused_network.subgraph(connected_components_ordered_list[0])
)

# Drop self-loops; the edge list is materialised before removal.
fused_network_lcc.remove_edges_from(list(nx.selfloop_edges(fused_network_lcc)))

print(f'nodes: {fused_network_lcc.number_of_nodes()} edges: {fused_network_lcc.number_of_edges()}')


nodes: 5711 edges: 509469


In [56]:
# Remap nodes
# Give every node of the largest component a contiguous integer index, in the
# graph's node iteration order:
#   noderemapping_rev: index -> original node id
#   noderemapping:     original node id -> index
noderemapping_rev = dict(enumerate(fused_network_lcc.nodes()))
noderemapping = {raw_id: idx for idx, raw_id in noderemapping_rev.items()}

# Label vector aligned with the new indices: 1 for nodes whose original id is
# one of the IO user ids, 0 otherwise (stored as floats).
node_labels = np.zeros(len(noderemapping))
all_io_users = set(io_df.userid.astype(str))
# NOTE(review): the membership test compares original node ids against
# *stringified* user ids, so it only matches if the graph's node ids are
# strings too — confirm against how the network builders name their nodes.
for nodeid, raw_nodeid in noderemapping_rev.items():
    node_labels[nodeid] = int(raw_nodeid in all_io_users)
