# __Social Network Analysis by Week__

First Step: creating the week network graph

In [1]:
import pandas as pd
import networkx as nx
import datetime, os

In [2]:
DATA_PATH_TT = 'data/tweets/'
DATA_PATH_NT = 'data/networks/'

In [3]:
def create_networks(tweets_df):
    nodes = []
    edges = []
    for u, rt, qrt, ir, mu in tweets_df[['user', 'retweetedTweet', 'quotedTweet', 'inReplyToUser', 'mentionedUsers']].values:
        user = eval(u)
        retweeted = eval(rt)
        quoted = eval(qrt)
        in_reply = eval(ir)
        mentioned = eval(mu)
        nodes.append((user['username'], user['displayname'], user['description']))
        if retweeted:
            if user['username'] != retweeted['user']['username']:
                edges.append((user['username'], retweeted['user']['username'], 'retweet'))
                nodes.append((retweeted['user']['username'], retweeted['user']['displayname'], retweeted['user']['description']))
        if quoted:
            if user['username'] != quoted['user']['username']:
                edges.append((user['username'], quoted['user']['username'], 'quoted_rt'))
                nodes.append((quoted['user']['username'], quoted['user']['displayname'], quoted['user']['description']))
        if in_reply:
            if user['username'] != in_reply['username']:
                edges.append((user['username'], in_reply['username'], 'reply'))
                nodes.append((in_reply['username'], in_reply['displayname'], in_reply['description']))
        if mentioned:
            for r_user in mentioned:
                if user['username'] != r_user['username']:
                    edges.append((user['username'], r_user['username'], 'mention'))
                    nodes.append((r_user['username'], r_user['displayname'], r_user['description']))
                
    nodes_df = pd.DataFrame(nodes, columns=['Id', 'DisplayName', 'Description'])
    edges_df = pd.DataFrame(edges, columns=['Source', 'Target', 'LinkType'])
    nodes_df.drop_duplicates(subset=['Id'], keep='last', ignore_index=True, inplace=True)
    
    return nodes_df, edges_df

def df_2_networkx(nodes, edges):
    w_edges = []
    G = nx.DiGraph()
    for usr, disp_n, desc in nodes.values:
        G.add_node(str(usr), displayname=str(disp_n), description=str(desc if desc else ''))
    edges_tuples = [tuple(x) for x in edges.to_numpy()]
    edges_count = pd.Series(edges_tuples).value_counts()
    for i in range(edges_count.shape[0]):
        w_edges.append(edges_count.index[i]+(edges_count.values[i],))
    for edge in w_edges:
        G.add_edge(str(edge[0]), str(edge[1]), interac_type=str(edge[2]), weight=int(edge[3]))
        
    G_quoted = nx.DiGraph([(u, v, e) for u,v,e in G.edges(data=True) if e['interac_type'] == 'quoted_rt'])
    G_rtweet = nx.DiGraph([(u, v, e) for u,v,e in G.edges(data=True) if e['interac_type'] == 'retweeted'])
    G_reply = nx.DiGraph([(u, v, e) for u,v,e in G.edges(data=True) if e['interac_type'] == 'reply'])
    G_mention = nx.DiGraph([(u, v, e) for u,v,e in G.edges(data=True) if e['interac_type'] == 'mention'])
    
    return G, G_quoted, G_rtweet, G_reply, G_mention

week_list = [week_dir for week_dir in os.listdir(DATA_PATH_TT) if os.path.isdir(DATA_PATH_TT+week_dir) and not week_dir.endswith('.ipynb_checkpoints')]

week_list.sort()
week_list.remove('week_pr_01')
week_list.insert(week_list.index('week_12')+1, 'week_pr_01')
week_list.remove('week_pr_02')
week_list.insert(week_list.index('week_pr_01')+1, 'week_pr_02')
week_list.remove('week_pr_03')
week_list.insert(week_list.index('week_17')+1, 'week_pr_03')
week_list.remove('week_pr_04')
week_list.insert(week_list.index('week_21')+1, 'week_pr_04')

print(week_list)

['week_01', 'week_02', 'week_03', 'week_04', 'week_05', 'week_06', 'week_07', 'week_08', 'week_09', 'week_10', 'week_11', 'week_12', 'week_pr_01', 'week_pr_02', 'week_13', 'week_14', 'week_15', 'week_16', 'week_17', 'week_pr_03', 'week_18', 'week_19', 'week_20', 'week_21', 'week_pr_04', 'week_22']


In [4]:
for week in week_list:
    print(f'Starting {week}...')
    f_name = [f for f in os.listdir(DATA_PATH_TT+week) if f.startswith('merged_week')][0]
    week_tt_df = pd.read_parquet(DATA_PATH_TT+week+'/'+f_name)
    nodes_df, edges_df = create_networks(week_tt_df)
    print(f'Saving node list in: {DATA_PATH_NT+week}/{week}_nodes_list.parquet')
    nodes_df.to_parquet(DATA_PATH_NT+week+f'/{week}_node_list.parquet', index=False)
    print(f'Creating networkx graph from {week}...')
    G, G_quoted, G_rtweet, G_reply, G_mention = df_2_networkx(nodes_df, edges_df)
    print('Saving complete networkx graph...')
    nx.write_gexf(G, DATA_PATH_NT+week+f'/{week}_complete_network_graph.gexf')
    print('Saving retweet networkx graph...')
    nx.write_gexf(G_rtweet, DATA_PATH_NT+week+f'/{week}_rt_network_graph.gexf')
    print('Saving quoted rt networkx graph...')
    nx.write_gexf(G_quoted, DATA_PATH_NT+week+f'/{week}_quoted_network_graph.gexf')
    print('Saving quoted reply networkx graph...')
    nx.write_gexf(G_reply, DATA_PATH_NT+week+f'/{week}_reply_network_graph.gexf')
    print('Saving quoted mention networkx graph...')
    nx.write_gexf(G_mention, DATA_PATH_NT+week+f'/{week}_mention_network_graph.gexf')
    print(f'Finished {week}\n')
    

Starting week_01...
Saving node list in: data/networks/week_01/week_01_nodes_list.parquet
Creating networkx graph from week_01...
Saving complete networkx graph...
Saving retweet networkx graph...
Saving quoted rt networkx graph...
Saving quoted reply networkx graph...
Saving quoted mention networkx graph...
Finished week_01

Starting week_02...
Saving node list in: data/networks/week_02/week_02_nodes_list.parquet
Creating networkx graph from week_02...
Saving complete networkx graph...
Saving retweet networkx graph...
Saving quoted rt networkx graph...
Saving quoted reply networkx graph...
Saving quoted mention networkx graph...
Finished week_02

Starting week_03...
Saving node list in: data/networks/week_03/week_03_nodes_list.parquet
Creating networkx graph from week_03...
Saving complete networkx graph...
Saving retweet networkx graph...
Saving quoted rt networkx graph...
Saving quoted reply networkx graph...
Saving quoted mention networkx graph...
Finished week_03

Starting week_04

 ## SNA

In [5]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
plt.style.use('seaborn-darkgrid')
plt.close('all')

In [6]:
nodes_count = {
    'complete': [],
    'mention': [],
    'quoted': [],
    'reply': []
}

edges_count = {
    'complete': [],
    'mention': [],
    'quoted': [],
    'reply': []
}

density = {
    'complete': [],
    'mention': [],
    'quoted': [],
    'reply': []
}

for week in week_list:
    print(f'Starting {week}...')
    f_name = [f for f in os.listdir(DATA_PATH_TT+week) if f.startswith('merged_week')][0]
    week_tt_df = pd.read_parquet(DATA_PATH_TT+week+'/'+f_name)
    nodes_df, edges_df = create_networks(week_tt_df)
    print(f'Creating networkx graph from {week}...')
    G, G_quoted, G_rtweet, G_reply, G_mention = df_2_networkx(nodes_df, edges_df)
    #G = nx.read_gexf(DATA_PATH_NT+f'{week}/{week}_complete_network_graph.gexf')
    #G_mention = nx.read_gexf(DATA_PATH_NT+f'{week}/{week}_mention_network_graph.gexf')
    #G_quoted = nx.read_gexf(DATA_PATH_NT+f'{week}/{week}_quoted_network_graph.gexf')
    #G_reply = nx.read_gexf(DATA_PATH_NT+f'{week}/{week}_reply_network_graph.gexf')
    nodes_count['complete'].append(len(G.nodes()))
    nodes_count['mention'].append(len(G_mention.nodes()))
    nodes_count['quoted'].append(len(G_quoted.nodes()))
    nodes_count['reply'].append(len(G_reply.nodes()))
    edges_count['complete'].append(len(G.edges()))
    edges_count['mention'].append(len(G_mention.edges()))
    edges_count['quoted'].append(len(G_quoted.edges()))
    edges_count['reply'].append(len(G_reply.edges()))
    density['complete'].append(format(nx.density(G), 'f'))
    density['mention'].append(format(nx.density(G_mention), 'f'))    
    density['quoted'].append(format(nx.density(G_quoted), 'f'))    
    density['reply'].append(format(nx.density(G_reply), 'f'))    

Starting week_01...
Creating networkx graph from week_01...
Starting week_02...
Creating networkx graph from week_02...
Starting week_03...
Creating networkx graph from week_03...
Starting week_04...
Creating networkx graph from week_04...
Starting week_05...
Creating networkx graph from week_05...
Starting week_06...
Creating networkx graph from week_06...
Starting week_07...
Creating networkx graph from week_07...
Starting week_08...
Creating networkx graph from week_08...
Starting week_09...
Creating networkx graph from week_09...
Starting week_10...
Creating networkx graph from week_10...
Starting week_11...
Creating networkx graph from week_11...
Starting week_12...
Creating networkx graph from week_12...
Starting week_pr_01...
Creating networkx graph from week_pr_01...
Starting week_pr_02...
Creating networkx graph from week_pr_02...
Starting week_13...
Creating networkx graph from week_13...
Starting week_14...
Creating networkx graph from week_14...
Starting week_15...
Creating

In [7]:
for k in nodes_count:
    figure(figsize=(10, 6), dpi=150)
    plt.plot(nodes_count[k], color='red', label='nodes')
    plt.plot(edges_count[k], color='blue', label='edges')
    plt.title(f'{k} network per week nodes and edges comparison')
    plt.xlabel('week')
    plt.ylabel('amount')
    plt.legend()
    plt.savefig(f'plot_{k}_nodes_edges.png')
    plt.close()

In [8]:
figure(figsize=(10, 6), dpi=150)
plt.plot(density['complete'], color='red', label='complete')
plt.plot(density['mention'], color='blue', label='mention')
plt.plot(density['quoted'], color='black', label='quoted')
plt.plot(density['reply'], color='green', label='reply')
plt.title('density per week')
plt.xlabel('week')
plt.ylabel('density')
plt.yticks(rotation = 45)
ax = plt.gca()
for label in ax.get_yaxis().get_ticklabels()[::2]:
   label.set_visible(False)
ax.set_ylabel('density', fontsize = 8)
plt.legend()
plt.savefig(f'plot_week_density.png')
plt.close()

### day

In [9]:
day_list = ['day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'day_7']

nodes_count = {
    'complete': [],
    'mention': [],
    'quoted': [],
    'reply': []
}

edges_count = {
    'complete': [],
    'mention': [],
    'quoted': [],
    'reply': []
}

density = {
    'complete': [],
    'mention': [],
    'quoted': [],
    'reply': []
}

for week in week_list:
    print(f'Starting {week}...')
    for day in day_list:
        print(f'Starting {day}...')
        f_name = [f for f in os.listdir(DATA_PATH_TT+week) if f.startswith(f'merged_{day}')][0]
        day_tt_df = pd.read_parquet(DATA_PATH_TT+week+'/'+f_name)
        nodes_df, edges_df = create_networks(day_tt_df)
        print(f'Creating networkx graph from {week}...')
        G, G_quoted, G_rtweet, G_reply, G_mention = df_2_networkx(nodes_df, edges_df)
        nodes_count['complete'].append(len(G.nodes()))
        nodes_count['mention'].append(len(G_mention.nodes()))
        nodes_count['quoted'].append(len(G_quoted.nodes()))
        nodes_count['reply'].append(len(G_reply.nodes()))
        edges_count['complete'].append(len(G.edges()))
        edges_count['mention'].append(len(G_mention.edges()))
        edges_count['quoted'].append(len(G_quoted.edges()))
        edges_count['reply'].append(len(G_reply.edges()))
        density['complete'].append(format(nx.density(G), 'f'))
        density['mention'].append(format(nx.density(G_mention), 'f'))    
        density['quoted'].append(format(nx.density(G_quoted), 'f'))    
        density['reply'].append(format(nx.density(G_reply), 'f'))

Starting week_01...
Starting day_1...
Creating networkx graph from week_01...
Starting day_2...
Creating networkx graph from week_01...
Starting day_3...
Creating networkx graph from week_01...
Starting day_4...
Creating networkx graph from week_01...
Starting day_5...
Creating networkx graph from week_01...
Starting day_6...
Creating networkx graph from week_01...
Starting day_7...
Creating networkx graph from week_01...
Starting week_02...
Starting day_1...
Creating networkx graph from week_02...
Starting day_2...
Creating networkx graph from week_02...
Starting day_3...
Creating networkx graph from week_02...
Starting day_4...
Creating networkx graph from week_02...
Starting day_5...
Creating networkx graph from week_02...
Starting day_6...
Creating networkx graph from week_02...
Starting day_7...
Creating networkx graph from week_02...
Starting week_03...
Starting day_1...
Creating networkx graph from week_03...
Starting day_2...
Creating networkx graph from week_03...
Starting day

In [10]:
for k in nodes_count:
    figure(figsize=(10, 6), dpi=150)
    plt.plot(nodes_count[k], color='red', label='nodes')
    plt.plot(edges_count[k], color='blue', label='edges')
    plt.title(f'{k} network per day nodes and edges comparison')
    plt.xlabel('day')
    plt.ylabel('amount')
    plt.legend()
    #plt.savefig(f'figs/plot_{k}_nodes_edges_daily.png')
    plt.close()

In [11]:
figure(figsize=(10, 6), dpi=150)
plt.plot(density['complete'], color='red', label='complete')
plt.plot(density['mention'], color='blue', label='mention')
plt.plot(density['quoted'], color='black', label='quoted')
plt.plot(density['reply'], color='green', label='reply')
plt.title('density per day')
plt.xlabel('day')
plt.ylabel('density')
plt.yticks(rotation = 45)
ax = plt.gca()
for label in ax.get_yaxis().get_ticklabels()[::10]:
   label.set_visible(False)
ax.set_ylabel('density', fontsize = 8)
plt.legend()
#plt.savefig('figs/plot_day_density.png')
plt.close()

## Community Detection

In [12]:
from networkx.algorithms.community.centrality import girvan_newman
from networkx.algorithms.community.modularity_max import greedy_modularity_communities
from networkx.algorithms.community.label_propagation import asyn_lpa_communities

In [13]:
G9_mention = nx.read_gexf(DATA_PATH_NT+'week_09/week_09_mention_network_graph.gexf')
print(f'''
Number of nodes: {len(G9_mention.nodes())}
Number of edges: {len(G9_mention.edges())}
''')


Number of nodes: 47293
Number of edges: 105432



In [14]:
G9_reply = nx.read_gexf(DATA_PATH_NT+'week_09/week_09_reply_network_graph.gexf')
print(f'''
Number of nodes: {len(G9_reply.nodes())}
Number of edges: {len(G9_reply.edges())}
''')


Number of nodes: 14915
Number of edges: 19006



In [15]:
#comm_grivan_mention = list(girvan_newman(G))

In [16]:
comm_modularity_mention = list(greedy_modularity_communities(G9_mention, weight='weight'))

In [17]:
len(comm_modularity_mention)

1924

In [18]:
for idx, cluster in enumerate(comm_modularity_mention):
    if len(cluster) >= 10:
        print((idx, len(cluster)))

(0, 8583)
(1, 7530)
(2, 7350)
(3, 5090)
(4, 4568)
(5, 3236)
(6, 1669)
(7, 1515)
(8, 1012)
(9, 670)
(10, 203)
(11, 112)
(12, 106)
(13, 87)
(14, 72)
(15, 66)
(16, 33)
(17, 33)
(18, 33)
(19, 32)
(20, 29)
(21, 23)
(22, 23)
(23, 22)
(24, 22)
(25, 20)
(26, 20)
(27, 19)
(28, 19)
(29, 19)
(30, 19)
(31, 18)
(32, 18)
(33, 18)
(34, 18)
(35, 18)
(36, 17)
(37, 17)
(38, 16)
(39, 15)
(40, 15)
(41, 15)
(42, 14)
(43, 14)
(44, 14)
(45, 14)
(46, 13)
(47, 13)
(48, 13)
(49, 13)
(50, 12)
(51, 12)
(52, 11)
(53, 11)
(54, 11)
(55, 11)
(56, 10)
(57, 10)
(58, 10)
(59, 10)
(60, 10)
(61, 10)
(62, 10)


In [19]:
comm_label_prop_mention =  list(asyn_lpa_communities(G_mention, weight='weight', seed=123))

In [20]:
len(comm_label_prop_mention)

870

In [21]:
for idx, cluster in enumerate(comm_label_prop_mention):
    if len(cluster) >= 10:
        print((idx, len(cluster)))

(2, 11)
(3, 10)
(7, 35)
(14, 13)
(33, 31)
(44, 11)
(52, 13)
(102, 10)
(106, 10)


In [22]:
comm_modularity_reply = list(greedy_modularity_communities(G9_reply, weight='weight'))

In [23]:
len(comm_modularity_reply)

1621

In [24]:
for idx, cluster in enumerate(comm_modularity_reply):
    if len(cluster) >= 10:
        print((idx, len(cluster)))

(0, 2045)
(1, 1592)
(2, 1405)
(3, 1008)
(4, 869)
(5, 697)
(6, 608)
(7, 585)
(8, 424)
(9, 302)
(10, 291)
(11, 242)
(12, 226)
(13, 195)
(14, 143)
(15, 96)
(16, 72)
(17, 72)
(18, 42)
(19, 29)
(20, 29)
(21, 28)
(22, 27)
(23, 26)
(24, 24)
(25, 22)
(26, 19)
(27, 19)
(28, 18)
(29, 18)
(30, 17)
(31, 16)
(32, 15)
(33, 15)
(34, 13)
(35, 12)
(36, 11)
(37, 10)
(38, 10)


In [25]:
node_groups = []
for com in (comm_modularity_reply):
  node_groups.append(list(com))

print(len(node_groups))

1621


In [26]:
node_groups[2][:20]

['jose__nobody',
 'LuizdeSouzaAlve',
 'cezarherrera',
 'mariayjandre',
 'Jos17846367',
 'anttoniofc',
 'ReginaM05144847',
 'GiuliaJules88',
 'srtaempoderada',
 'DaniHel31521687',
 'cury_flavio',
 'MateusLFFerrari',
 'julyavilelaa',
 'Ockto',
 'jackdferreira',
 'GlauPiment',
 'Aeudson',
 'Marcellems',
 'TNaitzhe',
 'danirenucci']

In [27]:
comm_label_prop_reply =  list(asyn_lpa_communities(G_reply, weight='weight', seed=123))

In [28]:
len(comm_label_prop_reply)

133

In [29]:
for idx, cluster in enumerate(comm_label_prop_reply):
    if len(cluster) >= 10:
        print((idx, len(cluster)))