# Ibizagate

Compare Ibiza-related articles to other articles and look at the differences in the number of interactions and in activity (out-degrees).

__Question: Is there a correlation between the increase/decrease of the metrics and the temporal distance to an event (Ibizagate)?__

metrics:
* interactions
* activity (out_degrees)
* centrality TODO

In [1]:
# Reload imported modules automatically before each cell is executed, so that edits to the
# local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [99]:
import pickle
import random
from random import sample

import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from tqdm import tqdm

import read_graph
import utils

## Load Data

In [3]:
# Load the two raw tables every later cell works on, via the project's `utils` helpers.
# all_postings: one row per posting; the filter cell below reads its ArticleRessortName,
#               ArticleTitle, ID_Article, ArticlePublishingDate and ID_Posting columns.
# all_votes:    passed unchanged to read_graph.get_all_users_interactions when the
#               per-article interaction graphs are built.
all_postings = utils.read_all_postings()
all_votes = utils.read_all_votes()

## Filter for relevant posts

* ArticleRessort == FPÖ
* Title contains FPÖ
* Title contains Strache
* Title contains Ibiza

In [86]:
# Substrings (case-sensitive) whose presence in an article title marks its posts as relevant.
# NOTE(review): the markdown list above also names "Title contains FPÖ", but no such title
# filter is applied here -- FPÖ is only matched through the ressort name. Confirm the intent
# before adding it, because it would change the counts printed below.
TITLE_KEYWORDS = ('Ibiza', 'ibiza', 'Strache')

# Boolean mask over all_postings: True for every post that belongs to a relevant article.
selection = all_postings.ArticleRessortName == 'FPÖ'
for keyword in TITLE_KEYWORDS:
    # na=False: a missing title counts as "no match", which keeps the mask strictly boolean
    # so that it can be negated with `~` below.
    selection = selection | all_postings['ArticleTitle'].str.contains(keyword, regex=False, na=False)

# posts
relevant_posts = all_postings[selection]
irrelevant_posts = all_postings[~selection]
print(all_postings.shape, relevant_posts.shape, irrelevant_posts.shape)

# group for articles: number of postings per (article, publishing date), largest first
article_keys = ['ID_Article', 'ArticlePublishingDate']
relevant_articles = (
    relevant_posts.groupby(by=article_keys)['ID_Posting'].count().sort_values(ascending=False)
)
irrelevant_articles = (
    irrelevant_posts.groupby(by=article_keys)['ID_Posting'].count().sort_values(ascending=False)
)
print(relevant_articles.shape, irrelevant_articles.shape)

(739094, 14) (80413, 14) (658681, 14)
(136,) (4215,)


## Calculate Metrics

### relevant articles

In [67]:
# Maps article id -> list with one activity value per detected community of that article.
result = {}

# The index of relevant_articles holds (ID_Article, ArticlePublishingDate) pairs;
# only the article id is needed to select the posts.
relevant_article_ids = [key[0] for key in relevant_articles.index]

for art_idx in tqdm(relevant_article_ids):
    posts = relevant_posts[relevant_posts.ID_Article == art_idx]
    interactions = read_graph.get_all_users_interactions(posts, all_votes, salvage_original_node_ids=True)
    communities = utils.get_communities(interactions, min_size=100)

    # Per community: number of edges divided by number of nodes of the interaction graph
    # restricted to that community (the raw edge count was tried as an alternative metric).
    community_graphs = (interactions.subgraph(com) for com in communities)
    result[art_idx] = [len(graph.edges) / len(graph.nodes) for graph in community_graphs]

100%|██████████| 136/136 [39:00<00:00, 17.21s/it]


### irrelevant articles

In [103]:
# Use a random subsample of the irrelevant articles, since processing all of them
# (4215 in the recorded run) takes too long.
N = 100
SEED = 42  # fixed seed so that a re-run draws the same articles and reproduces the figures

irr_population = list(irrelevant_articles.index)
# Never request more articles than exist; random.sample raises ValueError otherwise.
rnd_irr_articles = random.Random(SEED).sample(irr_population, min(N, len(irr_population)))

# Maps article id -> list with one activity value per detected community of that article.
ir_result = {}

# Each sampled entry is an (ID_Article, ArticlePublishingDate) pair; only the id is needed.
for art_idx in tqdm([key[0] for key in rnd_irr_articles]):
    posts = irrelevant_posts[irrelevant_posts.ID_Article == art_idx]
    interactions = read_graph.get_all_users_interactions(posts, all_votes, salvage_original_node_ids=True)
    communities = utils.get_communities(interactions, min_size=100)

    # Per community: number of edges divided by number of nodes of the interaction graph
    # restricted to that community.
    com_metrics = []
    for com in communities:
        com_interactions = interactions.subgraph(com)
        com_metrics.append(len(com_interactions.edges) / len(com_interactions.nodes))

    ir_result[art_idx] = com_metrics

100%|██████████| 100/100 [28:53<00:00, 17.33s/it]


## Visualization

In [374]:
# prepare viz
# Articles without any detected community have no metric and are left out.
rel_art_ids = [art_id for art_id, metrics in result.items() if len(metrics) > 0]

# Look up date and title once per article (first posting row of each article) instead of
# scanning all_postings twice for every article.
rel_art_meta = (
    all_postings.loc[all_postings.ID_Article.isin(rel_art_ids),
                     ['ID_Article', 'ArticlePublishingDate', 'ArticleTitle']]
    .drop_duplicates(subset='ID_Article')
    .set_index('ID_Article')
)

viz_arr = [
    (rel_art_meta.at[art_id, 'ArticlePublishingDate'].split()[0],  # keep only the date part
     np.mean(result[art_id]),
     rel_art_meta.at[art_id, 'ArticleTitle'])
    for art_id in rel_art_ids]

viz_df = pd.DataFrame(viz_arr, columns=['date', 'mean', 'title'])
viz_df = viz_df.sort_values(by='date')

# One row per date: the article with the highest mean on that day. Selecting the whole row
# via idxmax keeps value and title together; groupby(...).max() would take the maximum of
# each column independently and could pair the value with another article's title.
grouped_viz_df = viz_df.loc[viz_df.groupby('date')['mean'].idxmax()].set_index('date')

In [367]:
# prepare irrelevant data
# Articles without any detected community have no metric and are left out.
irr_art_ids = [art_id for art_id, metrics in ir_result.items() if len(metrics) > 0]

# Look up date and title once per article (first posting row of each article) instead of
# scanning all_postings twice for every article.
irr_art_meta = (
    all_postings.loc[all_postings.ID_Article.isin(irr_art_ids),
                     ['ID_Article', 'ArticlePublishingDate', 'ArticleTitle']]
    .drop_duplicates(subset='ID_Article')
    .set_index('ID_Article')
)

irr_arr = [
    (irr_art_meta.at[art_id, 'ArticlePublishingDate'].split()[0],  # keep only the date part
     np.mean(ir_result[art_id]),
     irr_art_meta.at[art_id, 'ArticleTitle'])
    for art_id in irr_art_ids]

irr_df = pd.DataFrame(irr_arr, columns=['date', 'mean', 'title'])
irr_df = irr_df.sort_values(by='date')

# One row per date: the article with the highest mean on that day. Selecting the whole row
# via idxmax keeps value and title together; groupby(...).max() would take the maximum of
# each column independently and could pair the value with another article's title.
grouped_irr_df = irr_df.loc[irr_df.groupby('date')['mean'].idxmax()].set_index('date')

In [458]:
# prepare displayed articles
# Only the dates selected here get their article title drawn next to the curve; every other
# point receives a single-space placeholder so the label lists stay aligned with the rows.

# other dates that were tried for the related trace: 2019-05-26, 2019-05-20, 2019-05-09
rel_title_selection = grouped_viz_df.index.isin(['2019-05-17', '2019-05-05'])
rel_titles = [
    title if is_shown else ' '
    for title, is_shown in zip(grouped_viz_df['title'], rel_title_selection)
]

# other date that was tried for the unrelated trace: 2019-05-12
irr_title_selection = grouped_irr_df.index.isin(['2019-05-19'])
irr_titles = [
    title if is_shown else ' '
    for title, is_shown in zip(grouped_irr_df['title'], irr_title_selection)
]

In [459]:
# visualize
# Date highlighted with a vertical marker -- presumably the day the Ibiza affair became
# public; confirm against the data source.
EVENT_DATE = '2019-05-17'
RELATED_COLOR = 'rgb(0,100,80)'
UNRELATED_COLOR = 'rgb(80,0,100)'

x = list(grouped_viz_df.index)
y = list(grouped_viz_df['mean'])
ix = list(grouped_irr_df.index)
iy = list(grouped_irr_df['mean'])

fig = go.Figure([
    # per-day value of the related articles, labelled with the selected titles
    go.Scatter(
        x=x,
        y=y,
        line=dict(color=RELATED_COLOR, shape='spline', smoothing=0.5),
        mode='lines+text',
        name='related',
        text=rel_titles
    ),

    # per-day value of the sampled unrelated articles
    go.Scatter(
        x=ix,
        y=iy,
        line=dict(color=UNRELATED_COLOR, shape='spline', smoothing=0.5),
        mode='lines+text',
        name='unrelated',
        opacity=0.5,
        text=irr_titles
    ),

    # horizontal reference line: overall mean of the related trace
    go.Scatter(
        x=x,
        y=[np.mean(y)] * len(x),
        line=dict(color=RELATED_COLOR, dash='dot'),
        mode='lines',
        name='related mean',
        opacity=0.5,
    ),

    # horizontal reference line: overall mean of the unrelated trace, drawn over the
    # unrelated dates (ix) so it spans that trace's own date range
    go.Scatter(
        x=ix,
        y=[np.mean(iy)] * len(ix),
        line=dict(color=UNRELATED_COLOR, dash='dot'),
        mode='lines',
        name='unrelated mean',
        opacity=0.5
    )
])

fig.update_layout(
    yaxis_title='avg interactions/member',
    title='Average Community Activity',
)

# Vertical event marker. An explicit line shape in paper coordinates on the y axis spans
# the full plot height regardless of the data range, instead of a fixed height of 5.
fig.add_shape(
    type='line',
    x0=EVENT_DATE, x1=EVENT_DATE,
    yref='paper', y0=0, y1=1,
    line=dict(color='LightSalmon', width=2),
    opacity=0.8,
    layer='below',
)

fig.update_traces(textposition='top right')

fig.show()