# Interactive Network Visualization with Bokeh

In [1]:
import pandas as pd
import networkx
from networkx.algorithms.community import modularity_max
import matplotlib.pyplot as plt
import numpy as np
from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine, EdgesAndLinkedNodes, NodesAndLinkedEdges, Label
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
from bokeh.palettes import viridis
from bokeh.transform import linear_cmap
from datetime import datetime
from sklearn import preprocessing

In [829]:
# Configure Bokeh so that the show() call in the plotting cell renders inline in this notebook.
output_notebook()

### 0. Configurations for use cases

In [None]:
# Choose a configuration: 0 selects the hashtag network, any other value
# selects the mention network (the following cell branches on `config == 0`).
config = 0 # if 0=Hashtag, else Mention

In [1088]:
# Configurations
# Each preset bundles: input file, subject used in titles/tooltips, minimum
# edge weight for pre-filtering, layout function and its keyword arguments.
hashtag_preset = {
    'path': r'C:\..\data\hashtags_weighted_edges.json',
    'title_subject': 'Hashtag',
    'min_weight': 325,
    'network_layout': networkx.kamada_kawai_layout,
    'layout_args': {'scale': 10, 'weight': 'weight'},
}
mention_preset = {
    'path': r'C:\..\data\mentions_weighted_edges.json',
    'title_subject': 'Mention',
    'min_weight': 400,
    'network_layout': networkx.spring_layout,
    'layout_args': {'scale': 10, 'k': 2, 'iterations': 50, 'seed': 2, 'weight': 'weight'},
}

# config == 0 selects the hashtag network; every other value selects mentions.
preset = hashtag_preset if config == 0 else mention_preset

path = preset['path']
title_subject = preset['title_subject']
min_weight = preset['min_weight']  # for pre-filtering of data
network_layout = preset['network_layout']
layout_args = preset['layout_args']

### 1. Create Network From Pandas DataFrame

In [1089]:
# Load the preprocessed edge list chosen in the configuration cell.
# Later cells read the columns 'source', 'target' and 'weight' from this frame.
df = pd.read_json(path)

### 2. Filter data

In [1090]:
# Filter out relations with a weight less than min_weight.
# Rows are removed by index label; `df` itself is left untouched.
rows_below_threshold = df.index[df.weight < min_weight]
df_filtered = df.drop(index=rows_below_threshold)
print('Shape of data:', df_filtered.shape)

Shape of data: (227, 3)


### 3. Create network graph

In [1091]:
# Create the network graph from the filtered edge list; the 'weight' column
# is carried over as an edge attribute of the same name.
G = networkx.from_pandas_edgelist(
    df_filtered,
    source='source',
    target='target',
    edge_attr='weight',
)

### 4. Remove small components

In [1092]:
# Remove every connected component that has fewer than n_connections nodes
# (so that there will be no free floating mini-graphs disconnected from the network).
n_connections = 4
# Collect the small components first: G must not be mutated while its
# components are still being enumerated.
small_components = [
    nodes for nodes in networkx.connected_components(G)
    if len(nodes) < n_connections
]
for nodes in small_components:
    G.remove_nodes_from(nodes)

### 5. Calculate degree for each node and add as node attribute

In [1093]:
# Store each node's degree (number of incident edges) as node attribute 'degree'.
degrees = dict(G.degree())
networkx.set_node_attributes(G, values=degrees, name='degree')

### 6. Slightly adjust degree so that the nodes with very small degrees are still visible

In [1094]:
# Constant added to every degree so that low-degree nodes still get a visible size.
number_to_adjust_by = 5
adjusted_node_size = {
    node: node_degree + number_to_adjust_by
    for node, node_degree in G.degree()
}
networkx.set_node_attributes(G, values=adjusted_node_size, name='adjusted_node_size')

### 7. Adjust weight so that the width of the edges does not exceed the smaller node size

In [1095]:
# Target range for the rendered edge widths.
weight_min = 1
weight_max = 10

# Rescale the raw edge weights linearly into [weight_min, weight_max].
weight_list = [edge_data['weight'] for *_, edge_data in G.edges(data=True)]
weight_scaler = preprocessing.MinMaxScaler(feature_range=(weight_min, weight_max))
weight_column = np.asarray(weight_list).reshape(-1, 1)
weight_normalized = weight_scaler.fit_transform(weight_column)

# Add normalized edge width. G.edges() is iterated in the same order as when
# weight_list was built, so the scaled rows line up with the edges.
edge_attrs = {
    (start_node, end_node): scaled_row[0]
    for (start_node, end_node), scaled_row in zip(G.edges(), weight_normalized)
}
networkx.set_edge_attributes(G, edge_attrs, "normalized_weight")

### 8. Calculate communities

In [1096]:
# Calculate communities with greedy modularity maximisation.
# The function is reached through the `networkx` package imported in the first
# cell, which keeps all imports in one place and does not bind the short
# module name `community` in the notebook namespace.
communities = networkx.algorithms.community.greedy_modularity_communities(G)
print("Number of communities:", len(communities))

Number of communities: 8


### 9. Add modularity class and color as attributes to network graph

In [1098]:
# The first n communities should be distinguishable by color; every further
# community shares one fallback color.
# NOTE(review): this relies on `communities` being ordered largest first, as
# the networkx docs state for greedy_modularity_communities — confirm.
n_biggest_communities = 8
# Hex notation instead of the CSS name 'black': get_halfway_color parses
# colors as '#rrggbb' and would fail on a named color.
fallback_color = '#000000'

# Number of communities that receive their own viridis color.
n_colored = min(len(communities), n_biggest_communities)
# Reversed viridis palette for the colored communities ...
color_palette = list(viridis(n_colored))[::-1]
# ... padded with the fallback color so that every community index has an entry
# (the pad length follows n_biggest_communities instead of a hard-coded 8).
color_palette.extend([fallback_color] * (len(communities) - n_colored))

# Map every node to its community number and to that community's color.
modularity_class = {}
modularity_color = {}
for community_number, members in enumerate(communities):
    for name in members:
        modularity_class[name] = community_number
        modularity_color[name] = color_palette[community_number]

# Add modularity class and color as node attributes of the network
networkx.set_node_attributes(G, modularity_class, 'modularity_class')
networkx.set_node_attributes(G, modularity_color, 'modularity_color')

### 10. Add edge color

In [1099]:
def get_halfway_color(c1, c2):
    """Return the color halfway between two '#rrggbb' hex color strings.

    The red, green and blue channels are averaged separately with integer
    (floor) division; the result is formatted as a lowercase '#rrggbb' string.
    """
    def channels(hex_color):
        # Characters 1-2, 3-4 and 5-6 hold the red, green and blue bytes.
        return [int(hex_color[start:start + 2], 16) for start in (1, 3, 5)]

    midpoints = [(first + second) // 2 for first, second in zip(channels(c1), channels(c2))]
    return '#' + ''.join('{:02x}'.format(value) for value in midpoints)

# Add edge color: an edge inside one community takes that community's color,
# an edge between two communities takes the color halfway between both.
edge_attrs = {}
for start_node, end_node in G.edges():
    start_color = G.nodes[start_node]['modularity_color']
    end_color = G.nodes[end_node]['modularity_color']
    if start_color == end_color:
        edge_color = start_color
    else:
        edge_color = get_halfway_color(start_color, end_color)
    edge_attrs[(start_node, end_node)] = edge_color
networkx.set_edge_attributes(G, edge_attrs, "edge_color")

### 11. Plot the network graph

In [1100]:
# Choose colors for node and edge highlighting:
# highlighted nodes are filled with node_highlight_color and outlined with the
# selection or hover color; highlighted edges are drawn in the selection or hover color.
node_highlight_color = 'white'
selection_color = 'skyblue'
hover_color = 'red'

# Choose attributes from G network to size and color by — setting manual size (e.g. 10) or color (e.g. 'skyblue') also allowed
# Both names refer to node attributes that earlier cells stored on G.
size_by_this_attribute = 'adjusted_node_size'
color_by_this_attribute = 'modularity_color'

# Choose a title (also reused as the file name stem when the graph is saved)
title = f'{title_subject} Network'

# Establish which categories will appear when hovering over each node.
# "@name" entries read the node attribute of that name.
HOVER_TOOLTIPS = [
        (f"{title_subject}", "@index"),
        ("Degree", "@degree"),
        ("Modularity Class", "@modularity_class"),
        ("Modularity Color", "$color[swatch]:modularity_color"),
]

# Create a plot — set dimensions, toolbar, and title.
# The axis ranges (+-10.1) are slightly wider than the layout scale of 10 set in layout_args.
plot = figure(tooltips = HOVER_TOOLTIPS,
              tools="tap,pan,wheel_zoom,save,reset", active_scroll='wheel_zoom',
              x_range=Range1d(-10.1, 10.1), y_range=Range1d(-10.1, 10.1), title=title, width=1000, height=1000)

# Create a network graph object, positioned by the layout function and
# arguments chosen in the configuration cell and centered on the origin
network_graph = from_networkx(G, network_layout, **layout_args, center=(0, 0))

# Set node sizes from 'adjusted_node_size' and node colors from 'modularity_color' (color as category from attribute)
# NOTE(review): newer Bokeh releases may deprecate Circle(size=...) in favour of Scatter — confirm against the installed version.
network_graph.node_renderer.glyph = Circle(size=size_by_this_attribute, fill_color=color_by_this_attribute, fill_alpha = 1)
# Set node highlight colors (same size as the base glyph so nodes do not jump when highlighted)
network_graph.node_renderer.selection_glyph = Circle(size=size_by_this_attribute, fill_color=node_highlight_color, line_color=selection_color, line_width=2)
network_graph.node_renderer.hover_glyph = Circle(size=size_by_this_attribute, fill_color=node_highlight_color, line_color=hover_color, line_width=2)

# Set edge color, opacity and width from the edge attributes 'edge_color' and 'normalized_weight'
network_graph.edge_renderer.glyph = MultiLine(line_color='edge_color', line_width='normalized_weight', line_alpha=0.3)
# Set edge highlight colors (width unchanged)
network_graph.edge_renderer.selection_glyph = MultiLine(line_color=selection_color, line_width='normalized_weight')
network_graph.edge_renderer.hover_glyph = MultiLine(line_color=hover_color, line_width='normalized_weight')

# Highlight nodes and edges: selecting or hovering a node also highlights its linked edges
network_graph.selection_policy = NodesAndLinkedEdges()
network_graph.inspection_policy = NodesAndLinkedEdges()
plot.renderers.append(network_graph)

# Add labels (font size scaled by node degree).
# Label text and label position are both looked up through the layout mapping
# (node -> (x, y)), so a label can never be paired with another node's
# coordinates, regardless of the iteration order of G.nodes().
graph_layout = network_graph.layout_provider.graph_layout
node_labels = list(graph_layout)

# Rescale the node degrees linearly into [font_size_min, font_size_max] pixels.
font_size_min = 8
font_size_max = 25
font_size_raw = [G.degree(node) for node in node_labels]
font_size_scaler = preprocessing.MinMaxScaler(feature_range=(font_size_min, font_size_max))
font_size_normalized = font_size_scaler.fit_transform(np.asarray(font_size_raw).reshape(-1, 1))
font_size_normalized = [str(label_font_size[0]) + 'px' for label_font_size in font_size_normalized]

# One Label annotation per node, anchored bottom-center at the node position.
labels = []
for node, font_size in zip(node_labels, font_size_normalized):
    label_x, label_y = graph_layout[node]
    label = Label(x=label_x, y=label_y, text=node, level='glyph', text_font_style='bold',
                  background_fill_color='white', background_fill_alpha=.6,
                  text_align='center', text_baseline='bottom', text_font_size=font_size)
    labels.append(label)
    plot.add_layout(label)

show(plot)

## The hashtag network 

#### Data
- The aggregated data contains the hashtags (i.e. #hashtag) in each tweet as edges between the hashtags used in one tweet, e.g. if the hashtags #btw17 and #spd were both used in one tweet it was counted as an edge between those nodes

#### Description of the plot
- The visualization shows the hashtags used. The size of the hashtag label is scaled according to the node size
- The data was filtered by the weight of the edges. The weight of an edge is defined as the number of times two hashtags were used in the same tweet. We enforced a minimum weight to ensure that the network does not get too big, thus focusing on the most important relations (edges with the highest weights)
- The color of the nodes represents different communities within the network which were detected by an algorithm
- The size of a node represents the number of different edges the node has, i.e. the number of different hashtags it was used together with
- The color of an edge also represents the communities of the nodes it connects; if the communities of the connected nodes differ, the edge was given a color between those two community colors
- The width of the edges is given by the weight, i.e. how often the two connected hashtags were used together
- Interactivity: It is possible to zoom in and out, to hover over nodes and to select a node

#### Reason for plot choice (Why a network graph?)
- A network graph allows to make the information which is contained in over 100.000 tweets easily graspable for the viewer. The size of nodes and edges as well as the color give us additional tools to make the visualization more intuitive 
- This type of graph also makes it easy to verify certain assumptions, e.g. someone would assume that political parties and their election slogans are often used together in a tweet
- Another advantage of this network implementation is its interactivity. Problems of static network graphs, like overlapping labels, can easily be overcome by zooming into the network. Moreover, the hovering and selecting functions enable the viewers to explore the data themselves. Especially after giving a more straightforward analysis of the data in the previous parts of our report, this last part is intended to give the viewers the possibility to interact with the data in a less predefined way
- While showing data that was already used in our previous analyses (statistics of the most used hashtags), the hashtag network presents the user with additional information in the form of the hashtags with which the most widespread hashtags were used together (and which are on their own too small to appear in the top ten most used hashtags)

#### Interpretation
- The detected communities/clusters give us a chance to detect topics which were important during the election campaign:
    - Clusters which represent topics like <i>G20</i> (G20 summit), <i>Dieselgate</i> (Diesel emissions scandal), <i>Erdogan</i>, <i>Ehe für alle</i> (same-sex marriage) but also clusters which are dominated by one political party like the <i>AfD</i>
- As we already have seen through the previous analyses the AFD-related hashtags #afd and #traudichdeutschland are along with #btw17 the most connected hashtags in this network
    - Moreover, we can observe that the parties <i>SPD</i>, <i>Grüne</i>, <i>Linke</i>, <i>FDP</i>, <i>CDU</i> and <i>CSU</i> are considered to be in one cluster while the <i>AFD</i> is represented by two clusters
    - This shows how dominating the <i>AfD</i> related tweets were during the election campaign compared to the other political parties

## The mentions network 

#### Data
- The aggregated data contains the mentions (i.e. @screen_name) in each tweet as edges between the accounts mentioned in one tweet, e.g. if the accounts @dielinke and @die_gruenen were both mentioned in one tweet it was counted as an edge between those nodes
- The data does not(!) account for the relationship between the account who created or retweeted the given tweet and the accounts mentioned in the tweet.

#### Description of the plot
- The visualization shows the screen_name of the accounts. The size of the screen_names is scaled according to the node size
- The data was filtered by the weight of the edges. The weight of an edge is defined as the number of times two accounts were mentioned in the same tweet. We enforced a minimum weight to ensure that the network does not get too big, thus focusing on the most important relations (edges with the highest weights)
- The color of the nodes represents different communities within the network which were detected by an algorithm
- The size of the nodes represents the number of different edges the nodes has, i.e. the number of different accounts it was mentioned with
- The color of an edge also represents the communities of the nodes it connects; if the communities of the connected nodes differ, the edge was given a color between those two community colors
- The width of the edges is given by the weight, i.e. how often the two connected accounts were mentioned together
- Interactivity: It is possible to zoom in and out, to hover over nodes and to select a node

#### Reason for plot choice (Why a network graph?)
- A network graph allows to make the information which is contained in over 100.000 tweets easily graspable for the viewer. The size of nodes and edges and also the color give us additional tools to make the visualization more intuitive 
- This type of graph also makes it easy to verify certain assumptions, e.g. someone would assume that politicians and their party are often mentioned together in a tweet. As the network visualization has shown this assumption is right
- Another advantage of this network implementation is its interactivity. Problems of static network graphs, e.g. overlapping labels, can easily be overcome by zooming into the network. Moreover, the hovering and selecting functions enable the viewers to explore the data themselves. Especially after giving a more straightforward analysis of the data in the previous parts of our report, this last part should give the viewers the chance to interact with the data themselves

#### Interpretation
- The detected communities/clusters have a large overlap with the different political parties:
    - The parties <i>FDP, Die Grünen, AFD</i> and <i>Die Linke</i> each have their own cluster while <i>SPD</i> and <i>CDU</i> share a cluster
    - The network also allows us to identify for each political party their most prominent party member: Christian Lindner (<i>FDP</i>), Cem Özdemir (<i>Die Grünen</i>), Beatrix von Storch (<i>AfD</i>) and Martin Schulz (<i>SPD</i>). For <i>Die Linke</i> Sahra Wagenknecht and Dietmar Bartsch are equally prominent. For the <i>CDU</i> no prominent figure can be detected
- In the <i>SPD/CDU</i> cluster we can also identify a group of media companies like Spiegel, Bild, ARD, etc. which are closely connected with each other, which means they were often mentioned in tweets together. This allows the assumption that some tweets were referring to articles written by those companies
- The last cluster can be found around Peter Tauber, who was involved in a long Twitter thread in which he drew anger upon himself because of his statement "Wenn Sie was Ordentliches gelernt haben, brauchen Sie keine drei Minijobs" (If you have learned something proper, you don't need three mini-jobs.)

# Save the network graph

In [674]:
# Timestamp of this run, formatted as day-month-year_hour"h"minute
# (datetime.now() with no argument, i.e. local time); the export cells below
# append it to their file names.
now = datetime.now()
datetime_string = now.strftime("%d-%m-%Y_%Hh%M")

### Save graph as html

In [None]:
# Write the Bokeh plot to "<title>_<timestamp>.html"; the file name is
# relative, so the file is created in the current working directory.
save(plot, filename=f"{title}_{datetime_string}.html")

### Save graph in gephi format

In [None]:
# Export the graph G, with the node and edge attributes added in the cells
# above, to "<title>_<timestamp>.gexf" (GEXF format, e.g. for Gephi).
networkx.write_gexf(G,f"{title}_{datetime_string}.gexf") 