In [1]:
# import relevant modules
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
import plotly.express as px
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm
import plotly.graph_objects as go
import glob
import sys
sys.path.append('../scripts/')
from analysis import get_correlation, peak_analysis, peak_ranges
from scipy import stats

In [2]:
# Restrict the analysis to the hashtags listed in the topic file.
topics_df = pd.read_json(
    '../../data/BTW17_Twitter/lda/hashtag_topics.json'
)
hashtags = topics_df['hashtag'].to_list()

In [3]:
# Hashtag time series: one row per date and hashtag with its count.
hashtag_df = pd.read_json(
    '../../data/BTW17_Twitter/hashtags/hashtag_counts.json'
)
hashtag_df.head(n=3)

Unnamed: 0,date,hashtag,count
0,2017-05-29,150jahrekapital,1
1,2017-05-29,a19,1
2,2017-05-29,abschiebung,14


In [4]:
# Politician metadata: keep only name, party and gender. Names are lower-cased
# and the columns renamed so that 'queryterm' can be used as merge key later.
persons_df = (
    pd.read_csv('../../data/BTW17_Suggestions/btw_politicians_demographic.csv')
    .drop(columns=['Unnamed: 0', 'Born', 'Bundesland', 'Age'])
    .assign(Name=lambda frame: frame['Name'].map(lambda name: name.lower()))
    .rename(columns={'Name': 'queryterm', 'Party': 'party', 'Gender': 'gender'})
)
persons_df.head(3)

Unnamed: 0,queryterm,party,gender
0,wolfgang stefinger,CSU,male
1,kai whittaker,CDU,male
2,katrin albsteiger,CSU,female


In [5]:
# Mapping from cluster id to its category; the 'suggestion' column of the
# file is not needed for the analysis.
cluster_cat = pd.read_csv(
    '../../data/BTW17_Suggestions/suggestions/cluster_categories.csv',
    sep=',',
).drop(columns='suggestion')
cluster_cat.head(3)

Unnamed: 0,cluster,category
0,-1,Rauschen
1,0,Rauschen
2,1,Personen


In [6]:
# load suggestions timeseries
tmp = pd.read_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
# reduce timestamps to calendar dates so that rows are grouped per day
tmp['date'] = pd.to_datetime(tmp['date']).dt.date
suggestions_df = pd.DataFrame()
# count rows per (date, queryterm, suggestion) and store the result under four
# fixed column names.
# NOTE(review): the assignment is positional, so it presumably relies on the
# parquet file having exactly one column besides the three group keys — confirm.
suggestions_df[['date', 'queryterm', 'suggestion', 'count']] = tmp.groupby(['date', 'queryterm', 'suggestion'], as_index=False).count()
# attach party and gender; the left join keeps query terms without metadata
suggestions_df = suggestions_df.merge(persons_df, how='left', on='queryterm')

In [7]:
# load vector similarities
similarity_df = pd.read_json(
    '../../data/BTW17_Suggestions/suggestions/vector_similarity.json'
)
# every row carries the same list of relevant hashtags (exploded later on)
similarity_df['hashtags'] = [hashtags] * len(similarity_df)
# suggestions are stored as token lists; join them into one string per row
similarity_df['suggestion'] = similarity_df['suggestion'].map(' '.join)

In [8]:
# Join the cluster of each suggestion and aggregate the counts per
# date / politician / party / gender / cluster.
suggestions_df = suggestions_df.merge(similarity_df, how='inner', on='suggestion')
# Select the 'count' column explicitly: the first positional argument of
# GroupBy.sum() is `numeric_only`, so `.sum('count')` never selected a column
# and only worked because a non-empty string is truthy.
suggestions_df = (
    suggestions_df
    .groupby(['date', 'queryterm', 'party', 'gender', 'cluster'], as_index=False)['count']
    .sum()
)
suggestions_df.head(3)

Unnamed: 0,date,queryterm,party,gender,cluster,count
0,2017-05-29,achim post,SPD,male,2,4
1,2017-05-29,achim post,SPD,male,5,12
2,2017-05-29,achim post,SPD,male,75,4


In [9]:
# Remodel the similarities to one row per (cluster, hashtag): explode the
# parallel list columns, then average the score over all suggestions of a cluster.
similarity_df = similarity_df.set_index(['suggestion', 'cluster']).apply(pd.Series.explode).reset_index()
# exploded values are objects; convert them so they can be averaged
similarity_df['similarity_scores'] = pd.to_numeric(similarity_df['similarity_scores'])
# Select the score column explicitly: the first positional argument of
# GroupBy.mean() is `numeric_only`, so `.mean('similarity_scores')` did not
# select anything and only worked because a non-empty string is truthy.
similarity_df = (
    similarity_df
    .groupby(['cluster', 'hashtags'], as_index=False)['similarity_scores']
    .mean()
)
similarity_df = similarity_df.merge(cluster_cat, how='left', on='cluster')
similarity_df.head(3)

Unnamed: 0,cluster,hashtags,similarity_scores,category
0,0,afdwählen,0.032451,Rauschen
1,0,afghanistan,-0.03211,Rauschen
2,0,altersarmut,-0.032286,Rauschen


In [10]:
# Keep only cluster/hashtag pairs whose mean similarity reaches the threshold.
MIN_SIMILARITY = 0.4
sim_df = similarity_df[similarity_df['similarity_scores'] >= MIN_SIMILARITY].reset_index(drop=True)

# Daily suggestion counts per cluster: overall, per party and per gender.
# The 'count' column is selected explicitly; `.sum('count')` would pass the
# string as the `numeric_only` flag instead of naming a column.
cluster_df = (
    suggestions_df.groupby(['date', 'cluster'], as_index=False)['count'].sum()
    .rename(columns={'count': 'cluster_count'})
)

cluster_party_df = (
    suggestions_df.groupby(['date', 'party', 'cluster'], as_index=False)['count'].sum()
    .rename(columns={'count': 'cluster_count'})
)

cluster_gender_df = (
    suggestions_df.groupby(['date', 'gender', 'cluster'], as_index=False)['count'].sum()
    .rename(columns={'count': 'cluster_count'})
)

# distinguish the hashtag counts from the cluster counts
hashtag_df = hashtag_df.rename(columns={'count': 'hashtag_count'})

In [11]:
# Delays (in days) to test: weekly steps from 0 up to and including 70.
delays = list(range(0, 71, 7))

In [12]:
# One correlation frame per delay, in the same order as `delays`.
dfs = [
    get_correlation(delay, hashtag_df, cluster_df, cluster_gender_df, cluster_party_df, sim_df)
    for delay in delays
]

  0%|          | 0/3462 [00:00<?, ?it/s]

  0%|          | 0/3462 [00:00<?, ?it/s]

  0%|          | 0/3462 [00:00<?, ?it/s]

  0%|          | 0/3462 [00:00<?, ?it/s]

  0%|          | 0/3462 [00:00<?, ?it/s]

  0%|          | 0/3462 [00:00<?, ?it/s]

  0%|          | 0/3462 [00:00<?, ?it/s]

  0%|          | 0/3462 [00:00<?, ?it/s]

  0%|          | 0/3462 [00:00<?, ?it/s]

  0%|          | 0/3462 [00:00<?, ?it/s]

  0%|          | 0/3462 [00:00<?, ?it/s]

In [13]:
# Persist every correlation frame under the delay it was computed for.
for position, frame in enumerate(dfs):
    frame.to_json(f'../../data/Analysis/df_{delays[position]}_delays.json')

In [14]:
# Reload the stored correlation frames, one per entry of `delays` and in the
# same order, so that dfs[i] always belongs to delays[i] in the cells below.
# A plain glob on '*.json' returns files in arbitrary order and would also
# pick up the peak_df_*_range.json files saved to the same folder, which have
# no 'pearsonr' column.
input_files = [f'../../data/Analysis/df_{delay}_delays.json' for delay in delays]

dfs = []
for file in input_files:
    data = pd.read_json(file)
    data = data.merge(cluster_cat, how='left', on='cluster')
    # Keep non-negative correlations only. A stricter alternative would also
    # require p_value <= 0.05 and restrict to gender == 'all' / party == 'all'.
    data = data[data['pearsonr'] >= 0]
    dfs.append(data)

In [15]:
# A later cell indexes the Antique palette up to position 11, i.e. it needs at
# least 12 entries. `colors` is the very list object stored in
# px.colors.qualitative.Antique, so the extension below is also visible through
# that attribute. The length check keeps a re-run of this cell from doubling
# the list again.
MIN_PALETTE_SIZE = 12
colors = px.colors.qualitative.Antique
if len(colors) < MIN_PALETTE_SIZE:
    colors.extend(list(colors))

### Deskriptives
Outputs:
* DBSCAN Größe der Klassen und Cluster, Boxplots 

In [16]:
# Load the clustered suggestions and attach the category of each cluster.
cluster_df = pd.read_json(
    '../../data/BTW17_Suggestions/suggestions/cluster.json'
).merge(cluster_cat, how='left', on='cluster')

# Size of every cluster, i.e. how often each cluster id occurs.
cluster_sizes = cluster_df['cluster'].value_counts()
tmp = pd.DataFrame({'Cluster': cluster_sizes.index,
                    'Clustergröße': cluster_sizes.values})
tmp = tmp.merge(cluster_cat, how='left', left_on='Cluster', right_on='cluster')
tmp = tmp[tmp['category'] != 'Rauschen']

# Number of distinct clusters per category; shown in the axis label.
clusters_per_category = (
    cluster_df
    .groupby('category', as_index=False)['cluster']
    .nunique()
    .sort_values(by='cluster', ascending=False)
)
# both frames have a 'cluster' column, so the merge yields cluster_x / cluster_y
tmp = tmp.merge(clusters_per_category, on='category')
tmp['category'] = [
    f'{category} ({n_clusters} Cluster)'
    for category, n_clusters in zip(tmp['category'], tmp['cluster_y'])
]
tmp = tmp.rename(columns={'category': 'Kategorie'})

fig = px.box(
    tmp[tmp['Kategorie'] != 'Rauschen'],
    x='Kategorie',
    y='Clustergröße',
    color='Kategorie',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig.update_layout(font={'family': 'Computer Modern', 'color': 'black', 'size': 15})
fig.show()

In [17]:
# preview of the clustered suggestions after the category merge
cluster_df.head(3)

Unnamed: 0,t-SNE(x),t-SNE(y),suggestion,cluster,vector,category
0,-4.129993,-11.599258,"[büro, lorenz, caffier]",-1,"[63.0744781494, -21.0450344086]",Rauschen
1,-2.404372,-19.263092,"[peter, uldall, juhl]",-1,"[-33.2196960449, 26.409576416]",Rauschen
2,-52.736607,1.273095,"[cloud, 7]",-1,"[-42.6088409424, 38.0669670105]",Rauschen


In [18]:
# t-SNE projection of all suggestions, coloured by cluster category.
fig = px.scatter(
    cluster_df,
    x='t-SNE(x)',
    y='t-SNE(y)',
    color='category',
    hover_name='suggestion',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig.update_layout(font={'family': 'Computer Modern', 'color': 'black', 'size': 15})
fig.show()

In [19]:
# Number of relevant combinations per delay (similarity >= 0.4, r >= 0),
# i.e. the number of rows left in each loaded frame.
num = [len(frame) for frame in dfs]

fig = px.line(
    x=delays,
    y=num,
    labels={'x': 'Delay', 'y': 'Anzahl relevanter Kombinationen'},
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig.update_layout(font={'family': 'Computer Modern', 'color': 'black', 'size': 15})
fig.show()

In [20]:
# Mean similarity of every hashtag (rows) with every cluster category (columns).
sim_plot = pd.crosstab(
    index=similarity_df['hashtags'],
    columns=similarity_df['category'],
    values=similarity_df['similarity_scores'],
    aggfunc='mean',
)

similarity_heatmap = go.Heatmap(
    z=sim_plot,
    x=sim_plot.columns,
    y=sim_plot.index,
    colorscale=px.colors.sequential.RdBu,
)

fig = go.Figure()
fig.add_trace(similarity_heatmap)
fig.update_layout(font={'family': 'Computer Modern', 'color': 'black', 'size': 15})
fig.show()

### Wie lange dauert die Diffusion im Durchschnitt und nach den jeweiligen Dimensionen?
Messung: TLCC mit Pearson R und p-Wert (stats.combine_pvalues)

#### Betrachtung im Durchschnitt:
Outputs:
* Tabelle: Spalten: *R, p*, Zeilen: *Delay*

In [21]:
# Mean Pearson r and combined p-value per delay, using only the rows that
# are aggregated over all genders and all parties.
rows = []
for position, frame in enumerate(dfs):
    overall = frame[(frame['gender'] == 'all') & (frame['party'] == 'all')]
    combined_p = stats.combine_pvalues(overall['p_value'].to_numpy())[1]
    rows.append({
        'Delay': delays[position],
        'Pearson R': round(overall['pearsonr'].mean(), 3),
        'P-Wert': round(combined_p, 3),
    })

tmp = pd.DataFrame(rows, columns=['Delay', 'Pearson R', 'P-Wert'])
tmp

Unnamed: 0,Delay,Pearson R,P-Wert
0,0,0.122,0.0
1,7,0.123,0.0
2,14,0.113,0.0
3,21,0.128,0.0
4,28,0.135,0.0
5,35,0.135,0.0
6,42,0.156,0.0
7,49,0.158,0.0
8,56,0.155,0.0
9,63,0.154,0.0


In [22]:
# Mean correlation as a function of the delay (table from the previous cell).
fig = px.line(
    tmp,
    x='Delay',
    y='Pearson R',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig.update_layout(font={'family': 'Computer Modern', 'color': 'black', 'size': 15})
fig.show()

Die Verschiebung dauert ca. 39-56 Tage, allerdings bei sehr geringer Korrelation.
Es gibt nur wenige Ausnahmen:

In [31]:
# Scatter plot of the high performers: rows of the eighth frame (index 7)
# aggregated over all genders and parties with a correlation of at least 0.5.
selected = dfs[7]
is_high_performer = (
    (selected['gender'] == 'all')
    & (selected['party'] == 'all')
    & (selected['pearsonr'] >= 0.5)
)
tmp = selected[is_high_performer]

fig = px.scatter(
    tmp,
    x='pearsonr',
    y='similarity_scores',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)

fig.update_layout(font={'family': 'Computer Modern', 'color': 'black', 'size': 15})
fig.update_yaxes(title_text='Similarity Score')
fig.update_xaxes(title_text='Korrelation')
fig.show()

#### Betrachtung nach Dimensionen: Kategorie der Suchvorschläge, Gender, Partei:
Outputs:
* Heatmap: Kennzahlen: *R, p*, Spalten: *Dimension*, Zeilen: *Delay*

In [24]:
# Mean Pearson r and combined p-value per delay and suggestion category,
# using only the rows aggregated over all genders and all parties.
rows = []
for position, frame in enumerate(dfs):
    overall = frame[(frame['gender'] == 'all') & (frame['party'] == 'all')]
    for category in set(similarity_df['category']):
        subset = overall[overall['category_x'] == category]
        rows.append({
            'Delay': delays[position],
            'Kategorie': category,
            'Pearson R': subset['pearsonr'].mean(),
            'P-Wert': stats.combine_pvalues(subset['p_value'].to_numpy())[1],
        })

# combinations without any data produce NaN and are dropped
tmp = pd.DataFrame(rows, columns=['Delay', 'Kategorie', 'Pearson R', 'P-Wert']).dropna()

fig = make_subplots(rows=1, cols=2, subplot_titles=('Pearson R', 'P-Werte'),
                    shared_yaxes=True, horizontal_spacing=0.15)

# left panel: correlation (its colour bar sits between the panels),
# right panel: p-values with the reversed colour scale
panels = [
    ('Pearson R', px.colors.sequential.RdBu, {'colorbar_x': 0.45}),
    ('P-Wert', px.colors.sequential.RdBu_r, {}),
]
for panel_col, (measure, scale, extra) in enumerate(panels, start=1):
    fig.add_trace(
        go.Heatmap(z=tmp[measure], x=tmp['Kategorie'], y=tmp['Delay'],
                   colorscale=scale, **extra),
        row=1, col=panel_col,
    )

fig.update_layout(font={'family': 'Computer Modern', 'color': 'black', 'size': 15})
fig.show()

In [25]:
# Mean Pearson r and combined p-value per delay and gender, using the rows
# that are split by gender but aggregated over all parties.
rows = []
for position, frame in enumerate(dfs):
    by_gender = frame[(frame['gender'] != 'all') & (frame['party'] == 'all')]
    for gender in set(suggestions_df['gender']):
        subset = by_gender[by_gender['gender'] == gender]
        rows.append({
            'Delay': delays[position],
            'Gender': gender,
            'Pearson R': subset['pearsonr'].mean(),
            'P-Wert': stats.combine_pvalues(subset['p_value'].to_numpy())[1],
        })

# combinations without any data produce NaN and are dropped
tmp = pd.DataFrame(rows, columns=['Delay', 'Gender', 'Pearson R', 'P-Wert']).dropna()

fig = make_subplots(rows=1, cols=2, subplot_titles=('Pearson R', 'P-Werte'),
                    shared_yaxes=True, horizontal_spacing=0.15)

# left panel: correlation (its colour bar sits between the panels),
# right panel: p-values with the reversed colour scale
panels = [
    ('Pearson R', px.colors.sequential.RdBu, {'colorbar_x': 0.45}),
    ('P-Wert', px.colors.sequential.RdBu_r, {}),
]
for panel_col, (measure, scale, extra) in enumerate(panels, start=1):
    fig.add_trace(
        go.Heatmap(z=tmp[measure], x=tmp['Gender'], y=tmp['Delay'],
                   colorscale=scale, **extra),
        row=1, col=panel_col,
    )

fig.update_layout(font={'family': 'Computer Modern', 'color': 'black', 'size': 15})
fig.show()

In [26]:
# Mean Pearson r and combined p-value per delay and party, using the rows
# that are split by party but aggregated over all genders.
rows = []
for position, frame in enumerate(dfs):
    by_party = frame[(frame['gender'] == 'all') & (frame['party'] != 'all')]
    for party in set(suggestions_df['party']):
        subset = by_party[by_party['party'] == party]
        rows.append({
            'Delay': delays[position],
            'Parteien': party,
            'Pearson R': subset['pearsonr'].mean(),
            'P-Wert': stats.combine_pvalues(subset['p_value'].to_numpy())[1],
        })

# combinations without any data produce NaN and are dropped
tmp = pd.DataFrame(rows, columns=['Delay', 'Parteien', 'Pearson R', 'P-Wert']).dropna()

fig = make_subplots(rows=1, cols=2, subplot_titles=('Pearson R', 'P-Werte'),
                    shared_yaxes=True, horizontal_spacing=0.15)

# left panel: correlation (its colour bar sits between the panels),
# right panel: p-values with the reversed colour scale
panels = [
    ('Pearson R', px.colors.sequential.RdBu, {'colorbar_x': 0.45}),
    ('P-Wert', px.colors.sequential.RdBu_r, {}),
]
for panel_col, (measure, scale, extra) in enumerate(panels, start=1):
    fig.add_trace(
        go.Heatmap(z=tmp[measure], x=tmp['Parteien'], y=tmp['Delay'],
                   colorscale=scale, **extra),
        row=1, col=panel_col,
    )

fig.update_layout(font={'family': 'Computer Modern', 'color': 'black', 'size': 15})
fig.show()

#### Betrachtung der Zeiträume um die Peaks TBD

In [27]:
# Load the hashtag peak dates and reshape them to one row per hashtag and
# peak window (see the preview below).
peaks_df = pd.read_json('../../data/BTW17_Twitter/peaks/peak_dates.json')
# presumably every peak contributes 7 entries to 'lda_dates' — TODO confirm
peaks_df['num_peaks'] = peaks_df.apply(lambda x: len(x['lda_dates']) / 7, axis=1)
# peak_ranges (imported from ../scripts/analysis) is applied row by row and
# fills the start/end columns
peaks_df[['peak_start', 'peak_end']] = peaks_df.apply(peak_ranges, axis=1)
# helper columns are not needed once the ranges exist
peaks_df.drop(columns=['index', 'num_peaks', 'lda_dates'], inplace=True)
# explode the remaining columns so that each peak of a hashtag gets its own row
peaks_df = peaks_df.set_index(['hashtag']).apply(pd.Series.explode).reset_index()
peaks_df.head(3)

Unnamed: 0,hashtag,peak_start,peak_end
0,afghanistan,2017-05-29,2017-06-04
1,afghanistan,2017-08-22,2017-08-28
2,armut,2017-07-03,2017-07-09


In [28]:
# Daily suggestion count per cluster. The 'count' column is selected
# explicitly; `.sum('count')` would pass the string as the `numeric_only`
# flag of GroupBy.sum() instead of naming a column.
cluster_ts_df = suggestions_df.groupby(['date', 'cluster'], as_index=False)['count'].sum()
cluster_ts_df.head(3)

Unnamed: 0,date,cluster,count
0,2017-05-29,0,5774
1,2017-05-29,1,646
2,2017-05-29,2,1449


In [29]:
# Run the peak analysis for every test range except the first delay and
# report the share of significantly positive results per range.
analysis_dfs = []

for test_range in tqdm(delays[1:]):
    tmp = pd.DataFrame(data=peak_analysis(test_range, sim_df, peaks_df, cluster_ts_df))
    significant_positive = tmp[(tmp['p'] <= 0.05) & (tmp['t'] > 0)]
    sig_results = round(len(significant_positive) / len(tmp) * 100, 2)
    print(f'Anteil signifikant positiver Ergebnisse bei einer Range von {test_range} Tagen: {sig_results}%')
    # overall result across all hashtags and categories
    print(tmp[(tmp['hashtag']=='all')&(tmp['category']=='all')])
    analysis_dfs.append(tmp)

  0%|          | 0/10 [00:00<?, ?it/s]

Anteil signifikant positiver Ergebnisse bei einer Range von 7 Tagen: 1.15%
  hashtag category  test_range      t      p
0     all      all           7 -0.115  0.908
Anteil signifikant positiver Ergebnisse bei einer Range von 14 Tagen: 2.87%
  hashtag category  test_range      t      p
0     all      all          14 -0.086  0.931
Anteil signifikant positiver Ergebnisse bei einer Range von 21 Tagen: 6.9%
  hashtag category  test_range      t      p
0     all      all          21 -0.105  0.916
Anteil signifikant positiver Ergebnisse bei einer Range von 28 Tagen: 7.47%
  hashtag category  test_range      t      p
0     all      all          28  0.596  0.551
Anteil signifikant positiver Ergebnisse bei einer Range von 35 Tagen: 7.47%
  hashtag category  test_range      t      p
0     all      all          35  0.572  0.567
Anteil signifikant positiver Ergebnisse bei einer Range von 42 Tagen: 10.34%
  hashtag category  test_range      t      p
0     all      all          42  0.925  0.355
Antei

In [38]:
# Persist every peak-analysis frame under the test range it was computed for.
# analysis_dfs[i] was built with delays[i + 1] (the first delay is skipped),
# so the file name has to use delays[1:] — indexing with delays[i] labelled
# each file with the previous range (e.g. the 7-day result as "0").
for test_range, frame in zip(delays[1:], analysis_dfs):
    frame.to_json(f'../../data/Analysis/peak_df_{test_range}_range.json')

In [109]:
# Collect, per category and test range, the significant (p <= 0.05) results
# of the peak analysis for plotting.
categories = cluster_cat['category'].unique().tolist()

SIGNIFICANCE_LEVEL = 0.05

records = []
for category in categories:
    for frame in analysis_dfs:
        match = frame[frame['category'] == category]
        # Only an unambiguous single result row can be plotted; categories
        # without a row (or with several) are skipped, as before.
        if len(match) != 1:
            continue
        row = match.iloc[0]
        try:
            record = {
                'category': category,
                'test_range': int(row['test_range']),
                't': float(row['t']),
                'p': float(row['p']),
            }
        except (TypeError, ValueError):
            # non-numeric or missing values cannot be plotted
            continue
        # `not (p <= level)` also excludes NaN p-values, which the former
        # `p > 0.05` check let through as if they were significant.
        if not record['p'] <= SIGNIFICANCE_LEVEL:
            continue
        records.append(record)

plot_df = pd.DataFrame(records, columns=['category', 'test_range', 't', 'p'])

In [114]:
# One bar chart of the significant t-statistics per suggestion category,
# arranged row by row on a 4x3 grid. The same list drives subplot titles and
# traces, so a title can never end up above another category's bars.
PLOT_CATEGORIES = ['Personen', 'Wirtschaft', 'Politik',
                   'Orte', 'Organisationen', 'Berufe',
                   'Justiz', 'Medien', 'Sport',
                   'Kultur', 'Privatleben', 'Medizin']
N_COLS = 3
N_ROWS = -(-len(PLOT_CATEGORIES) // N_COLS)  # ceiling division

# Wrap around the palette instead of relying on it having been extended
# elsewhere: there are more categories than base colours.
palette = px.colors.qualitative.Antique

fig = make_subplots(rows=N_ROWS, cols=N_COLS, shared_yaxes='all', shared_xaxes='all',
                    subplot_titles=PLOT_CATEGORIES)

for position, category in enumerate(PLOT_CATEGORIES):
    subset = plot_df[plot_df['category'] == category]
    fig.add_trace(
        go.Bar(x=subset['test_range'],
               y=subset['t'],
               name=category,
               marker_color=palette[position % len(palette)]),
        row=position // N_COLS + 1,
        col=position % N_COLS + 1,
    )

# axis titles only on the outer subplots: left column and bottom row
fig.update_yaxes(title='t-Statistik', col=1)
fig.update_xaxes(title='Test Range', row=N_ROWS)

fig.update_layout(font=dict(family='Computer Modern', color='black', size=15),
                  template='simple_white', showlegend=False)
fig.show()