In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the topic table: rows are keyed by the `id` column and `date` is parsed
# into datetimes so it can be sorted and grouped chronologically later on.
df = pd.read_csv(
    'transformed_data.csv',
    parse_dates=['date'],
    index_col='id',
)
df.head()

Unnamed: 0_level_0,Unnamed: 0,date,party,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
DE-0190001007,0,2017-10-24,DIE LINKE,0.00054,0.072428,0.00054,0.00054,0.72396,0.182041,0.018873,0.00054,0.00054
DE-0190001008,1,2017-10-24,CDU/CSU,0.000505,0.000505,0.029021,0.000505,0.967441,0.000505,0.000505,0.000506,0.000505
DE-0190001010,2,2017-10-24,BÜNDNIS 90/DIE GRÜNEN,0.00052,0.000519,0.028013,0.000519,0.96835,0.00052,0.00052,0.000519,0.000519
DE-0190002005,3,2017-11-21,CDU/CSU,0.000678,0.069938,0.000678,0.000678,0.925317,0.000678,0.000678,0.000678,0.000678
DE-0190002015,4,2017-11-21,AfD,0.148448,0.775312,0.000473,0.000473,0.000473,0.073402,0.000473,0.000473,0.000473


In [3]:
# 'Unnamed: 0' is a leftover positional counter from the CSV (0, 1, 2, ... in
# the preview above); drop it and order the rows chronologically.
# errors='ignore' plus rebinding (instead of inplace=True) keeps this cell
# re-runnable: a second execution no longer raises KeyError for the
# already-removed column.
df = (
    df
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .sort_values(by='date')
)

## Summary statistics

In [4]:
# Number of rows per party, most frequent party first.
df.loc[:, 'party'].value_counts()

CDU/CSU                  2101
SPD                      1547
AfD                      1285
BÜNDNIS 90/DIE GRÜNEN    1188
FDP                      1186
DIE LINKE                1163
Name: party, dtype: int64

In [5]:
# Number of distinct dates on which each party appears in the data.
df.groupby('party')['date'].nunique()

party
AfD                      198
BÜNDNIS 90/DIE GRÜNEN    193
CDU/CSU                  203
DIE LINKE                190
FDP                      189
SPD                      195
Name: date, dtype: int64

In [6]:
# Total number of distinct dates in the data set (missing dates are not counted).
df['date'].nunique(dropna=True)

207

## Topics

In [None]:
# Show the first page of the topic/word overview PDF inline.
from pdf2image import convert_from_path

# Only page 1 is displayed, so only page 1 is rasterised; rendering further
# pages at 500 dpi would be wasted work and memory.
img = convert_from_path('topics_words.pdf', dpi=500, first_page=1, last_page=1)[0]
fig, ax = plt.subplots(figsize=(15, 10))
ax.imshow(img)

In [None]:
# Human-readable names for the nine generic topic columns (Topic0 .. Topic8).
# NOTE(review): the labels are presumably derived from the topic/word overview
# shown above -- confirm against that figure.
columns_mapper = dict(
    Topic0='International',
    Topic1='Military',
    Topic2='EU/Economy',
    Topic3='Social',
    Topic4='Decisions/Law',
    Topic5='Democracy/Freedom',
    Topic6='German History',
    Topic7='Ecology',
    Topic8='Health/Pandemic',
)

In [None]:
# Replace the generic TopicN column names with their readable labels.
# Rebinding instead of inplace=True avoids mutating a frame that earlier cells
# have already displayed; columns not present in the mapper are left unchanged.
df = df.rename(columns=columns_mapper)

In [None]:
# Summary statistics of the numeric (topic) columns only. Selecting them
# explicitly keeps the output identical across pandas versions: newer releases
# would otherwise also summarise the datetime `date` column.
df.select_dtypes(include='number').describe()

## All parties combined (averages per date)

In [None]:
# Mean topic weight per date across all parties.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# the string `party` column silently and raises TypeError instead.
grouped_df = df.groupby('date').mean(numeric_only=True)

In [None]:
# Number of topic columns in the per-date averages.
n_topics = grouped_df.shape[1]

In [None]:
# Smoothing window, counted in rows of grouped_df (i.e. distinct dates present
# in the data), not in calendar days.
n_days = 30
# rolling(n).mean() leaves only the first n-1 rows NaN (incomplete windows).
# Trim exactly those rows; tail(-n_days) would also discard the first fully
# populated window.
rolling_df = grouped_df.rolling(n_days).mean().iloc[n_days - 1:]

In [None]:
# Display the smoothed per-date topic averages for a visual sanity check.
rolling_df

In [None]:
# One line per topic: rolling mean of the topic weight over time.
fig, ax = plt.subplots(figsize=(30, 10))
time_axis = rolling_df.index.values

for topic_name, smoothed in rolling_df.items():
    ax.plot(time_axis, smoothed.values, label=topic_name)

ax.legend()

In [None]:
# Data for the cumulative stacked area plot: `topics` holds the legend labels
# and `y` holds one row of smoothed values per topic, in the same order.
topics = list(rolling_df.columns)
rows = [rolling_df[name].values for name in topics]
y = np.vstack([rows])

In [None]:
# Stacked area chart of all topics, using the `y` rows and `topics` labels
# prepared in the previous cell.
fig, ax = plt.subplots(figsize=(30, 10))
stack_x = rolling_df.index.to_numpy()
ax.stackplot(stack_x, y, labels=topics)
ax.legend(loc='upper left')

In [None]:
def plot_topic_stack(frame, topic_cols, figsize=(30, 10)):
    """Draw a stacked area chart of selected columns of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Values to plot; its index supplies the x axis.
    topic_cols : sequence of str
        Names of the columns to stack, in stacking order. They are also used
        as legend labels.
    figsize : tuple, default (30, 10)
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    (fig, ax)
        The created matplotlib figure and axes.
    """
    fig, ax = plt.subplots(figsize=figsize)
    stacked = np.vstack([frame[col].values for col in topic_cols])
    ax.stackplot(frame.index.values, stacked, labels=list(topic_cols))
    ax.legend(loc='upper left')
    return fig, ax


# Plot just a few topics, which makes it easier to compare how individual
# topics behave relative to each other. The data is kept local to the helper so
# the notebook-level `y` / `topics` used by the all-topics stack plot are not
# overwritten by this cell.
topic_cols = ['International', 'Health/Pandemic']
fig, ax = plot_topic_stack(rolling_df, topic_cols)

In [None]:
# Rolling mean of the 'Health/Pandemic' topic on its own.
fig, ax = plt.subplots(figsize=(30, 10))

# The line must carry a label; otherwise ax.legend() has no entries to show
# and matplotlib warns about it.
ax.plot(
    rolling_df.index.values,
    rolling_df['Health/Pandemic'].values,
    label='Health/Pandemic',
)
ax.legend()

## Differences between parties

In [None]:
# Mean topic weight per party. numeric_only=True restricts the average to the
# topic columns; newer pandas versions no longer drop non-numeric columns such
# as `date` automatically.
df.groupby('party').mean(numeric_only=True)

In [None]:
# Per-party mean, standard deviation and median of every numeric (topic)
# column. The columns are selected explicitly so the datetime `date` column is
# never part of the aggregation, whatever the pandas version.
df.groupby('party')[
    df.select_dtypes(include='number').columns.tolist()
].agg(['mean', 'std', 'median'])

In [None]:
from sklearn.decomposition import PCA

# Mean topic profile per party (one row per party), computed once and reused
# for both the PCA input and the point labels. numeric_only=True keeps the
# datetime `date` column out of the matrix handed to PCA.
party_means = df.groupby('party').mean(numeric_only=True)
X = party_means.to_numpy()
pca = PCA(n_components=2)
X_r = pca.fit_transform(X)

fig, ax = plt.subplots()

# x axis: negated second principal component; y axis: first principal component.
for party, x_r in zip(party_means.index, X_r):
    ax.scatter(-x_r[1], x_r[0], label=party)
ax.legend()
# NOTE(review): the axis names below are an interpretation of the components,
# not something PCA provides; the sign of a component carries no meaning by
# itself -- confirm the labels against the component loadings.
ax.set_xlabel('"left -> right"')
ax.set_ylabel('"liberal -> authoritarian"')
ax.set_title("PCA of parties' topics")

In [None]:
# Topic names ranked by how strongly they load on each principal component.
# NOTE(review): assumes the value order of columns_mapper matches the column
# order of the matrix the PCA was fitted on -- confirm.
topic_names = list(columns_mapper.values())


def _by_abs_loading(component):
    """Return the topic names sorted by decreasing absolute loading."""
    order = np.argsort(np.abs(component))[::-1]
    return [topic_names[i] for i in order]


axis0 = _by_abs_loading(pca.components_[0])
axis1 = _by_abs_loading(pca.components_[1])

In [None]:
# Topics ordered by decreasing absolute loading on the first principal component.
axis0

In [None]:
# Topics ordered by decreasing absolute loading on the second principal component.
axis1

In [None]:
# One subplot per topic, showing the rolling mean of that topic's weight for
# each selected party.
topic_cols = list(columns_mapper.values())
# Use df['party'].unique() here to overlay every party.
parties = ['AfD']

# Smoothing window, counted in distinct dates on which the party has rows.
n_days = 50

# One row of axes per topic instead of a hard-coded 9; squeeze=False + ravel
# keeps `ax` a flat array even when there is a single topic.
fig, ax = plt.subplots(len(topic_cols), figsize=(30, 100), squeeze=False)
ax = ax.ravel()

# The per-date averages do not depend on the topic being drawn, so compute
# them once per party rather than once per (topic, party) pair. Selecting the
# topic columns before mean() keeps the string `party` column out of the
# aggregation, which would raise TypeError on pandas >= 2.0.
rolled_by_party = {}
for party in parties:
    daily = df.loc[df['party'] == party].groupby('date')[topic_cols].mean()
    # rolling(n).mean() leaves only the first n-1 rows NaN; trim exactly those.
    rolled_by_party[party] = daily.rolling(n_days).mean().iloc[n_days - 1:]

for topic_col, axis in zip(topic_cols, ax):
    for party, rolled in rolled_by_party.items():
        # x and y come from the same frame, so their lengths always match.
        axis.plot(rolled.index.values, rolled[topic_col].values, label=party)
    axis.legend()
    axis.set_title(topic_col)