In [6]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
# Load the politicians' metadata and keep only name, party and gender.
# The columns are renamed to the lower-case names used by the suggestions
# frame, which is later merged with this one on `queryterm`.
persons_df = (
    pd.read_csv('../../data/BTW17_Suggestions/btw_politicians_demographic.csv')
    .drop(columns=['Unnamed: 0', 'Born', 'Bundesland', 'Age'])
    .rename(columns={'Name': 'queryterm', 'Party': 'party', 'Gender': 'gender'})
)
# Vectorised lower-casing: unlike a per-element `x.lower()` call it keeps a
# missing name as NaN instead of raising.
persons_df['queryterm'] = persons_df['queryterm'].str.lower()
persons_df.head(3)

Unnamed: 0,queryterm,party,gender
0,wolfgang stefinger,CSU,male
1,kai whittaker,CDU,male
2,katrin albsteiger,CSU,female


In [3]:
# Load the suggestions time series and reduce every timestamp to its calendar date.
suggestions_raw = pd.read_parquet(
    '../../data/BTW17_Suggestions/processed/suggestions.parquet'
).assign(date=lambda frame: pd.to_datetime(frame['date']).dt.date)

# Count the entries per (date, queryterm, suggestion). The aggregated frame is
# expected to have exactly one value column next to the three keys; it is
# labelled `count` (a different column count raises a ValueError).
group_keys = ['date', 'queryterm', 'suggestion']
daily_counts = suggestions_raw.groupby(group_keys, as_index=False).count()
daily_counts.columns = group_keys + ['count']

# Attach party and gender of the queried politician; unmatched terms keep NaN.
suggestions_df = daily_counts.merge(persons_df, how='left', on='queryterm')
suggestions_df.head(3)

Unnamed: 0,date,queryterm,suggestion,count,party,gender
0,2017-05-29,achim post,achim post,12,SPD,male
1,2017-05-29,achim post,achim postbank,4,SPD,male
2,2017-05-29,achim post,achim postert,16,SPD,male


In [11]:
# Bar colours, one per party. The keys are readability labels only; the plots
# receive the plain list below and match colours to bars purely by position,
# so the insertion order of this dict must not be changed.
# NOTE(review): presumably this order mirrors the party order of the grouped
# frames that are plotted - verify against the data.
_color_by_label = {
    'afd': 'rgb(0,158,224)',
    'cdu': 'rgb(50,48,46)',
    'csu': 'rgb(0,128,201)',
    'dielinke': 'rgb(182,28,62)',
    'fdp': 'rgb(255,237,0)',
    'grüne': 'rgb(70,150,43)',
    'parteilos': 'rgb(203,166,115)',
    'spd': 'rgb(227,0,15)',
    'fraktionslos': 'rgb(173,185,202)',
}
party_colors = list(_color_by_label.values())

In [35]:
# Three bar charts per party: number of distinct query terms, share of female
# query terms, and number of distinct suggestions.

# Distinct query terms per party and gender.
terms_by_gender = suggestions_df.groupby(['party', 'gender'], as_index=False)['queryterm'].nunique()

# Female share per party = female query terms / all query terms of that party.
# Rows of other genders contribute 0 to the numerator, so a party without a
# 'female' row ends up with a share of 0.
party_key = terms_by_gender['party']
female_terms = (
    terms_by_gender['queryterm']
    .where(terms_by_gender['gender'] == 'female', 0)
    .groupby(party_key, sort=False)
    .sum()
)
all_terms = terms_by_gender['queryterm'].groupby(party_key, sort=False).sum()
f_percentages = (
    (female_terms / all_terms)
    .rename('percentages')
    .rename_axis('party')
    .reset_index()
)

# Distinct query terms and distinct suggestions per party.
by_party = suggestions_df.groupby(['party'], as_index=False)
terms = by_party['queryterm'].nunique()
suggs = by_party['suggestion'].nunique()

fig = make_subplots(rows=1, cols=3, horizontal_spacing=0.15)

# (x values, y values, y-axis title) for subplot columns 1 to 3.
panels = [
    (terms['party'], terms['queryterm'], 'Anzahl Suchterme'),
    (f_percentages['party'], f_percentages['percentages'], 'Frauenanteil der Suchterme'),
    (suggs['party'], suggs['suggestion'], 'Anzahl Suchvorschläge'),
]
for col_no, (x_values, y_values, y_title) in enumerate(panels, start=1):
    # party_colors is applied to the bars by position.
    fig.add_trace(go.Bar(x=x_values, y=y_values,
                         marker_color=party_colors, showlegend=False), row=1, col=col_no)
    fig.update_yaxes(title=y_title, row=1, col=col_no)
    fig.update_xaxes(title='Parteien', row=1, col=col_no)

fig.update_layout(font=dict(family='Computer Modern', color='black', size=15), template='simple_white')
fig.show()

In [47]:
# Load the peak dates per hashtag and derive the number of peaks from the
# length of each `lda_dates` list.
# NOTE(review): the divisor 7 presumably means one peak covers seven listed
# dates - confirm against the peak detection step.
peaks_df = (
    pd.read_json('../../data/BTW17_Twitter/peaks/peak_dates.json')
    .assign(num_peaks=lambda frame: frame['lda_dates'].map(len) / 7)
)
peaks_df.head(3)

Unnamed: 0,index,hashtag,lda_dates,num_peaks
0,9,afghanistan,"[2017-05-29, 2017-05-30, 2017-05-31, 2017-06-0...",2.0
1,21,armut,"[2017-07-03, 2017-07-04, 2017-07-05, 2017-07-0...",2.0
2,41,bayern,"[2017-05-31, 2017-06-01, 2017-06-02, 2017-06-0...",10.0


In [49]:
# Summary figures: number of distinct hashtags and mean number of peaks.
hashtag_count = peaks_df["hashtag"].nunique()
mean_peak_count = peaks_df["num_peaks"].mean()
print(f'Anzahl Hashtags: {hashtag_count}\nMittlere Anzahl Peaks: {mean_peak_count}')

Anzahl Hashtags: 162
Mittlere Anzahl Peaks: 5.697530864197531


In [50]:
# Horizontal bar chart: number of detected peaks per hashtag, sorted ascending.
#
# `peaks_df` itself is left untouched. Renaming its columns in place would make
# this cell fail with a KeyError on a second run (the `num_peaks` column would
# be gone) and would break re-running earlier cells that read `hashtag` and
# `num_peaks`. The German axis/hover titles are supplied through `labels`.
peaks_sorted = peaks_df.sort_values(by='num_peaks')
fig = px.bar(peaks_sorted, x='num_peaks', y='hashtag',
             labels={'hashtag': 'Hashtag', 'num_peaks': 'Anzahl detektierter Peaks'},
             template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

In [52]:
# LDA results: `output_df` carries a topic id and topic score per row,
# `topic_df` is merged on `topic` further down and supplies `topic_words`.
output_df = pd.read_json('../../data/BTW17_Twitter/lda/preprocessed_lda_tweets_topics.json')
topic_df = pd.read_json('../../data/BTW17_Twitter/lda/topics.json')

# Show a preview so the cell actually produces the table recorded as its output.
# NOTE(review): the recorded preview is assumed to be of `output_df` - confirm.
output_df.head()

Unnamed: 0,tags,topic,topic_score
0,btw17,62,0.220584
1,btw17,31,0.363188
2,btw17,62,0.220584
3,btw17,31,0.362581
4,btw17,62,0.220584


In [83]:
# Mean topic score per topic, joined with the topic table.
# Only `topic_score` is aggregated: the recorded preview of the LDA output
# shows a string column (`tags`), and a frame-wide `.mean()` over such a column
# raises a TypeError on pandas >= 2.0 (older versions silently dropped it).
tmp_df = output_df.groupby('topic', as_index=False)['topic_score'].mean()
tmp_df = tmp_df.merge(topic_df, how='left', on='topic')

# The three topics with the highest / lowest mean score.
top3 = tmp_df.nlargest(n=3, columns='topic_score').reset_index(drop=True)
flop3 = tmp_df.nsmallest(n=3, columns='topic_score').reset_index(drop=True)


def _print_topics(header, topics):
    """Print `header`, then one line per row: topic id, mean score, topic words."""
    print(header)
    for topic, score, words in zip(topics['topic'], topics['topic_score'], topics['topic_words']):
        print(topic, score, words)


_print_topics('TOP 3', top3)
_print_topics('\nFLOP 3', flop3)

TOP 3
73 0.30100924521780004 ['fakenews', 'klimaschutz', 'klimawandel', 'altersarmut', 'wirtschaft', 'darumgrün', 'echt', 'afd', 'mittel', 'trump']
31 0.2965577789727347 ['gar', 'freuen', 'traudichdeutschland', 'afd', 'korrekt', 'flug', 'karte', 'partei', 'rote', 'ergebnis']
60 0.2958901687605438 ['erdogan', 'seehofer', 'türkei', 'obergrenze', 'mögen', 'csu', 'demokratische', 'idee', 'unfassbar', 'warum']

FLOP 3
0 0.13697400066138934 ['ehefüralle', 'nrw', 'stegner', 'ltnrw', 'lang', 'sofort', 'steigen', 'schande', 'diplomatisch', 'gefährden']
56 0.15150339751681424 ['petry', 'frauke', 'schicksahlswahl', 'entsenden', 'nonsens', 'btw', 'grüner', 'afd', 'traudichdeutschland', 'sichern']
64 0.16918780593581553 ['aktuell', 'linksextremisten', 'anmelden', 'afd', 'neckarschiffahrt', 'startpunkt', 'btw', 'heidelberg', 'traudichdeutschland', 'teil']
