# Ibizagate

compare ibiza relevant articles to other articles, look at differences in #interactions and activity (out_degrees) 

__Question: Is there a correlation between the de/in-crease of metrics and the distance to an event (ibizagate)?__

metrics:
* interactions
* activity (out_degrees)
* centrality TODO

In [2]:
# automatically reload imports before executing any line in case you changed something
%load_ext autoreload
%autoreload 2

In [3]:
import utils
import read_graph
import networkx as nx
import pandas as pd
import pickle
import plotly.express as px
import plotly.graph_objs as go
import numpy as np
from tqdm import tqdm
from random import sample 

## Load Data

In [4]:
all_postings = utils.read_all_postings()
all_votes = utils.read_all_votes()

## Filter for relevant posts

* ArticleRessort == FPÖ
* Title contains FPÖ
* Title contains Strache
* Title contains Ibiza

In [5]:
selection = (
(all_postings.ArticleRessortName == 'FPÖ') |
(all_postings['ArticleTitle'].str.contains('Ibiza', regex=False)) |
(all_postings['ArticleTitle'].str.contains('ibiza', regex=False)) |
(all_postings['ArticleTitle'].str.contains('Strache', regex=False)))

# posts
relevant_posts = all_postings[selection]
irrelevant_posts = all_postings[[not x for x in selection]]
print(all_postings.shape, relevant_posts.shape, irrelevant_posts.shape)

# group for articles
relevant_articles = relevant_posts.groupby(by=['ID_Article', 'ArticlePublishingDate']).count().sort_values(by='ID_Posting', ascending=False)['ID_Posting']
irrelevant_articles = irrelevant_posts.groupby(by=['ID_Article', 'ArticlePublishingDate']).count().sort_values(by='ID_Posting', ascending=False)['ID_Posting']
print(relevant_articles.shape, irrelevant_articles.shape)

(739094, 14) (80413, 14) (658681, 14)
(136,) (4215,)


## Calculate Metrics

### relevant articles

In [12]:
result = {}

# loop over all relevant articles!
for art_idx in tqdm([x[0] for x in relevant_articles.index]):
    posts = relevant_posts[relevant_posts.ID_Article == art_idx]
    interactions = read_graph.get_all_users_interactions(posts, all_votes, salvage_original_node_ids=True)
    communities = utils.get_communities(interactions, min_size=100)
    
    # for com in communies
    com_metrics = []
    for com in communities:
        com_interactions = interactions.subgraph(com)
        metric = len(com_interactions.edges) / len(com_interactions.nodes)
        # metric = len(com_interactions.edges)
        com_metrics.append(metric)
    
    result[art_idx] = com_metrics

100%|██████████| 136/136 [38:05<00:00, 16.81s/it]


### irrelevant articles

In [13]:
# use random subsample of irrelevant articles, since 4215 is too many articles
N = 1000
irr_pool_threshold = 100
irr_pool = irrelevant_articles[irrelevant_articles > irr_pool_threshold]
rnd_irr_articles = sample(list(irr_pool.index), N)

# loop over all irrelevant articles!
ir_result = {}
for art_idx in tqdm([x[0] for x in rnd_irr_articles]):
    posts = irrelevant_posts[irrelevant_posts.ID_Article == art_idx]
    interactions = read_graph.get_all_users_interactions(posts, all_votes, salvage_original_node_ids=True)
    communities = utils.get_communities(interactions, min_size=100)
    
    # for com in communies
    com_metrics = []
    for com in communities:
        com_interactions = interactions.subgraph(com)
        metric = len(com_interactions.edges) / len(com_interactions.nodes)
        # metric = len(com_interactions.edges)
        com_metrics.append(metric)
    
    ir_result[art_idx] = com_metrics

100%|██████████| 1000/1000 [4:32:20<00:00, 16.34s/it] 


## Visualization

In [119]:
# prepare viz (relevant)
viz_arr = [
    (all_postings[all_postings.ID_Article == x]['ArticlePublishingDate'].iloc[0].split()[0],
     np.mean(result[x]), 
     all_postings[all_postings.ID_Article == x]['ArticleTitle'].iloc[0])
    for x in result.keys() if len(result[x]) > 0]

viz_df = pd.DataFrame(viz_arr, columns = ['date', 'val', 'title'])
viz_df = viz_df.sort_values(by='date')

# group by date and min, mean, max
grouped_viz_df = viz_df.groupby('date').max()
grouped_viz_df['mean'] = viz_df.groupby('date').mean().val
grouped_viz_df['minn ']= viz_df.groupby('date').min().val
grouped_viz_df.columns = ['max', 'max_title', 'mean', 'min']
grouped_viz_df.head()

Unnamed: 0_level_0,max,max_title,mean,min
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-05-01,4.806245,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,4.806245,4.806245
2019-05-02,4.367188,Heinz-Christian Strache: Er will halt kein Lul...,4.367188,4.367188
2019-05-03,3.619891,FPÖ-Politikerin ist auf umstrittenes RFJ-Plaka...,3.298258,2.976626
2019-05-05,4.807058,"Salvini, Orbán, Strache – und Kurz?",4.807058,4.807058
2019-05-08,3.084184,Strache gratulierte Sportlern mit Inseraten um...,3.084184,3.084184


In [120]:
# prepare viz (irrelevant)
irr_arr = [
    (all_postings[all_postings.ID_Article == x]['ArticlePublishingDate'].iloc[0].split()[0],
    np.mean(ir_result[x]),
    all_postings[all_postings.ID_Article == x]['ArticleTitle'].iloc[0])
    for x in ir_result if len(ir_result[x]) > 0]

irr_df = pd.DataFrame(irr_arr, columns=['date', 'val', 'title'])
irr_df = irr_df.sort_values(by='date')

# grouped by date and min, mean, max
grouped_irr_df = irr_df.groupby('date').max()
grouped_irr_df['mean'] = irr_df.groupby('date').mean().val
grouped_irr_df['minn ']= irr_df.groupby('date').min().val
grouped_irr_df.columns = ['max', 'max_title', 'mean', 'min']
grouped_irr_df.head()

Unnamed: 0_level_0,max,max_title,mean,min
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-05-01,4.005896,Ärgern und Staunen über Nichtabschaffung der k...,3.416208,2.659206
2019-05-02,4.66875,Was stört Sie am Großraumbüro?,3.370298,1.927495
2019-05-03,4.785981,Österreich droht unter der jetzigen Regierung ...,3.339855,2.243935
2019-05-04,4.845144,Wieso gelten Mao-T-Shirts als Spaßprodukt und ...,3.255261,1.764784
2019-05-05,4.942029,"Österreichische Streamerinnen: ""Als Frau kämpf...",3.181648,2.110263


In [131]:
# prepare displayed articles
rel_title_selection = (
                        (grouped_viz_df.index == '2019-05-17') 
                       | (grouped_viz_df.index == '2019-05-01') 
                       #| (grouped_viz_df.index == '2019-05-05') 
                       #| (grouped_viz_df.index == '2019-05-20') 
                      )
rel_titles = [grouped_viz_df.iloc[x].max_title if rel_title_selection[x] else ' ' for x in range(0,len(grouped_viz_df))]

# prepare displayed irrelevant articles 
irr_title_selection = (
                        (grouped_irr_df.index == '2019-05-25') 
                       #| (grouped_irr_df.index == '2019-05-12') 
                      )
irr_titles = [grouped_irr_df.iloc[x].max_title if irr_title_selection[x] else ' ' for x in range(0,len(grouped_irr_df))]

In [162]:
# visualize
x = list(grouped_viz_df.index)
y = list(grouped_viz_df['mean'])
ix = list(grouped_irr_df.index)
iy = list(grouped_irr_df['mean'])

y_lower = list(grouped_viz_df['min'])
y_upper = list(grouped_viz_df['max'])
iy_lower = list(grouped_irr_df['min'])
iy_upper = list(grouped_irr_df['max'])


fig = go.Figure([
    # means
    go.Scatter(
        x=x,
        y=y,
        line=dict(color='rgb(0,100,80)', shape='spline', smoothing=0.5),
        mode='lines+markers+text',
        name='related',
        text=rel_titles
    ),
    go.Scatter(
        x=ix,
        y=iy,
        line=dict(color='rgb(80,0,100)', shape='spline', smoothing=0.5),
        mode='lines+markers+text',
        name='unrelated',
        opacity=0.5,
        text=irr_titles
    ),
    
    # means
    go.Scatter(
        x=x,
        y=[np.mean(y)] * len(x),
        line=dict(color='rgb(0,100,80)', dash='dot'),
        mode='lines',
        name='related mean',
        opacity=0.5,
    ),
    go.Scatter(
        x=x,
        y=[np.mean(iy)] * len(x),
        line=dict(color='rgb(80,0,100)', dash='dot'),
        mode='lines',
        name='unrelated mean',
        opacity=0.5
    ),
    
    # min/max bands
    go.Scatter(
        x=x+x[::-1], # x, then x reversed
        y=y_upper+y_lower[::-1], # upper, then lower reversed
        fill='toself',
        fillcolor='rgba(0,100,80,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo='skip',
        showlegend=False,
        opacity=0.8
    ),
    go.Scatter(
        x=ix+ix[::-1], # x, then x reversed
        y=iy_upper+iy_lower[::-1], # upper, then lower reversed
        fill='toself',
        fillcolor='rgba(80,0,100,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo='skip',
        showlegend=False,
        opacity=0.2
    )
])

fig.update_layout(
    yaxis_title='avg community out-degree',
    title='Community Activity: Ibizagate related vs unrelated',
)

fig.add_shape(
    x0="2019-05-17", x1="2019-05-17", y0=0, y1=7,
    fillcolor="LightSalmon", 
    opacity=0.8,
    layer="below", 
    line_width=2
)

fig.update_traces(textposition='top center')

fig.show()

In [28]:
import pickle
pickle.dump(ir_result, open('ir_result.p', 'wb'))