In [1]:
import csv
import io
import requests
import pandas as pd

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.conf import SparkConf

from retrieve_data import DataRetrieverOverAPI

DAWUM_API_URL = "https://api.dawum.de/"

In [2]:
spark = SparkSession.builder \
                    .appName("Spark Basics") \
                    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/13 21:57:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
dawum_api = DataRetrieverOverAPI(DAWUM_API_URL)
data = dawum_api.data

In [4]:
dawum_database = data['Database']
dawum_parliaments = data['Parliaments']
dawum_institutes = data['Institutes']
dawum_taskers = data['Taskers']
dawum_methods = data['Methods']
dawum_parties = data['Parties']
dawum_surveys = data['Surveys']

print(dawum_parliaments)

{'0': {'Shortcut': 'Bundestag', 'Name': 'Bundestag', 'Election': 'Bundestagswahl'}, '8': {'Shortcut': 'Mecklenburg-Vorpommern', 'Name': 'Landtag von Mecklenburg-Vorpommern', 'Election': 'Landtagswahl in Mecklenburg-Vorpommern'}, '16': {'Shortcut': 'Thüringen', 'Name': 'Thüringischer Landtag', 'Election': 'Landtagswahl in Thüringen'}, '6': {'Shortcut': 'Hamburg', 'Name': 'Hamburgische Bürgerschaft', 'Election': 'Bürgerschaftswahl in Hamburg'}, '4': {'Shortcut': 'Brandenburg', 'Name': 'Brandenburgischer Landtag', 'Election': 'Landtagswahl in Brandenburg'}, '15': {'Shortcut': 'Schleswig-Holstein', 'Name': 'Landtag von Schleswig-Holstein', 'Election': 'Landtagswahl in Schleswig-Holstein'}, '14': {'Shortcut': 'Sachsen-Anhalt', 'Name': 'Landtag von Sachsen-Anhalt', 'Election': 'Landtagswahl in Sachsen-Anhalt'}, '13': {'Shortcut': 'Sachsen', 'Name': 'Sächsischer Landtag', 'Election': 'Landtagswahl in Sachsen'}, '7': {'Shortcut': 'Hessen', 'Name': 'Hessischer Landtag', 'Election': 'Landtagswah

In [5]:
df_database = spark.read.json(spark.sparkContext.parallelize([dawum_database]))
df_database = df_database.withColumn("license_name", F.col("License.Name")) \
                    .withColumn("license_shortcut", F.col("License.Shortcut")) \
                    .withColumn("license_link", F.col("License.Link")) \
                    .drop("License") \
                    .withColumnRenamed("Author", "author") \
                    .withColumnRenamed("Last_Update", "last_update") \
                    .withColumnRenamed("Publisher", "publisher") \
                    
df_database.show()


                                                                                

+--------------------+--------------------+---------+--------------------+----------------+--------------------+
|              author|         last_update|publisher|        license_name|license_shortcut|        license_link|
+--------------------+--------------------+---------+--------------------+----------------+--------------------+
|Dipl.-Jur. Philip...|2025-02-13T20:48:...| dawum.de|ODC Open Database...|        ODC-ODbL|https://opendatac...|
+--------------------+--------------------+---------+--------------------+----------------+--------------------+



In [6]:
rdd = spark.sparkContext.parallelize(dawum_parliaments.items())  # Parallelize the dictionary items

df_parliaments = rdd.map(lambda x: (x[0], x[1]['Name'], x[1]['Shortcut'], x[1]['Election'])).toDF(["parliament_id", "parliament_name", "parliament_shortcut", "parliament_election"])

df_parliaments.show()

+-------------+--------------------+--------------------+--------------------+
|parliament_id|     parliament_name| parliament_shortcut| parliament_election|
+-------------+--------------------+--------------------+--------------------+
|            0|           Bundestag|           Bundestag|      Bundestagswahl|
|            8|Landtag von Meckl...|Mecklenburg-Vorpo...|Landtagswahl in M...|
|           16|Thüringischer Lan...|           Thüringen|Landtagswahl in T...|
|            6|Hamburgische Bürg...|             Hamburg|Bürgerschaftswahl...|
|            4|Brandenburgischer...|         Brandenburg|Landtagswahl in B...|
|           15|Landtag von Schle...|  Schleswig-Holstein|Landtagswahl in S...|
|           14|Landtag von Sachs...|      Sachsen-Anhalt|Landtagswahl in S...|
|           13| Sächsischer Landtag|             Sachsen|Landtagswahl in S...|
|            7|  Hessischer Landtag|              Hessen|Landtagswahl in H...|
|           10|Landtag von Nordr...|Nordrhein-Westfa

In [7]:
rdd = spark.sparkContext.parallelize(dawum_institutes.items())  # Parallelize the dictionary items
df_institutes = rdd.map(lambda x: (x[0], x[1]["Name"])).toDF(["institute_id", "institute_name"])
df_institutes.show()

+------------+--------------------+
|institute_id|      institute_name|
+------------+--------------------+
|           1|     Infratest dimap|
|          22|            pollytix|
|          13|              YouGov|
|           2|               Forsa|
|           5|                INSA|
|           4|                 GMS|
|           6|Forschungsgruppe ...|
|          17|               Ipsos|
|          24|Institut Wahlkrei...|
|           3|      Verian (Emnid)|
|           7|Trend Research Ha...|
|           9|          Allensbach|
|          16|               Civey|
|          25|          IFM Berlin|
|          21|      Policy Matters|
|          18| Universität Hamburg|
|          15|         Mentefactum|
|          20|            IM Field|
|          12|              uniQma|
|          23|           Conoscope|
+------------+--------------------+
only showing top 20 rows



In [8]:
rdd = spark.sparkContext.parallelize(dawum_taskers.items())
df_taskers = rdd.map(lambda x: (x[0], x[1]["Name"])).toDF(["tasker_id", "tasker_name"])
df_taskers.show()

+---------+--------------------+
|tasker_id|         tasker_name|
+---------+--------------------+
|       10|     ARD-Tagesthemen|
|       67|            pollytix|
|       43|              YouGov|
|       63|          RTL / n-tv|
|        4|                BILD|
|        3|     BILD am Sonntag|
|       38|      Ostsee-Zeitung|
|        7|                 GMS|
|        5|  ZDF-Politbarometer|
|       13|               Ipsos|
|       80|FUNKE Medien Thür...|
|       39|                 NDR|
|       97|Institut Wahlkrei...|
|       64|               FOCUS|
|      112|Märkische Allgeme...|
|      120|Radio Hamburg / D...|
|        6|Frankfurter Allge...|
|       62|  Sächsische Zeitung|
|      121|      Osthessen|News|
|      100|  NRW-Tageszeitungen|
+---------+--------------------+
only showing top 20 rows



In [9]:
rdd = spark.sparkContext.parallelize(dawum_methods.items())
df_methods = rdd.map(lambda x: (x[0], x[1]["Name"])).toDF(["method_id", "method_name"])
df_methods.show()

+---------+----------------+
|method_id|     method_name|
+---------+----------------+
|        4|Telefon & Online|
|        3|          Online|
|        1|     Telefonisch|
|        2|      Persönlich|
|        0|       Unbekannt|
+---------+----------------+



In [28]:
rdd = spark.sparkContext.parallelize(dawum_parties.items())
df_parties = rdd.map(lambda x: (x[0], x[1]["Name"], x[1]["Shortcut"])).toDF(["party_id", "party_name", "party_shortcut"])
df_parties.show(df_parties.count())


+--------+--------------------+----------------+
|party_id|          party_name|  party_shortcut|
+--------+--------------------+----------------+
|       7|Alternative für D...|             AfD|
|      11|   Bayernpartei e.V.|    Bayernpartei|
|      14|Brandenburger Ver...|          BVB/FW|
|       4|Bündnis 90/Die Gr...|           Grüne|
|      23|Bündnis Sahra Wag...|             BSW|
|      21|bunt.saar – sozia...|       bunt.saar|
|      22|Bürger für Thüringen|            BfTh|
|      16|       Bürger in Wut|             BIW|
|       1|Christlich Demokr...|         CDU/CSU|
|     101|Christlich Demokr...|             CDU|
|     102|Christlich-Sozial...|             CSU|
|       5|           Die Linke|           Linke|
|      17|Familienpartei De...|         Familie|
|       3|Freie Demokratisc...|             FDP|
|       8|        Freie Wähler|    Freie Wähler|
|       9|Nationaldemokrati...|             NPD|
|      12|Ökologisch-Demokr...|             ÖDP|
|      13|Partei für

In [33]:
rdd = spark.sparkContext.parallelize(dawum_surveys.items())
df_surveys = rdd.map(lambda x: (x[0], 
                                x[1]["Parliament_ID"],
                                x[1]["Institute_ID"],
                                x[1]["Tasker_ID"],
                                x[1]["Method_ID"],
                                x[1]["Date"], 
                                x[1]["Survey_Period"]["Date_Start"], 
                                x[1]["Survey_Period"]["Date_End"],
                                x[1]["Surveyed_Persons"],
                                x[1]["Results"]
                                )) \
                        .toDF(["survey_id", 
                               "parliament_id",
                               "institute_id",
                               "tasker_id",
                               "method_id",
                               "survey_publish_date",
                               "survey_start_date", 
                               "survey_end_date",
                               "total_surveyees",
                               "results"])

df_surveys = df_surveys.select(F.col("survey_id"), 
                               F.col("parliament_id"),
                               F.col("institute_id"),
                               F.col("tasker_id"),
                               F.col("method_id"),
                               F.explode(df_surveys.results).alias("party_id", "survey_result_by_percent"),
                               F.col("survey_publish_date"),
                               F.col("survey_start_date"), 
                               F.col("survey_end_date"),
                               F.col("total_surveyees") 
                               )

df_surveys = df_surveys.fillna({"survey_result_by_percent": 0})
df_surveys.show()

+---------+-------------+------------+---------+---------+--------+------------------------+-------------------+-----------------+---------------+---------------+
|survey_id|parliament_id|institute_id|tasker_id|method_id|party_id|survey_result_by_percent|survey_publish_date|survey_start_date|survey_end_date|total_surveyees|
+---------+-------------+------------+---------+---------+--------+------------------------+-------------------+-----------------+---------------+---------------+
|     3751|            0|           1|       10|        4|       0|                       0|         2025-02-13|       2025-02-10|     2025-02-12|           1579|
|     3751|            0|           1|       10|        4|      23|                       0|         2025-02-13|       2025-02-10|     2025-02-12|           1579|
|     3751|            0|           1|       10|        4|       1|                      32|         2025-02-13|       2025-02-10|     2025-02-12|           1579|
|     3751|           