# Preparation

In [1]:
import itertools
import sqlite3
import urllib.request

import lxml
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Root URL of the site and the sub-path under which the question pages live.
base = 'https://www.einbuergerungstest-online.eu/'
question_subsite = 'fragen/'

# URL suffixes of the paginated question pages: the first page has an empty
# suffix (not '1'); the remaining ones are '2' through '10'.
pages = [''] + [str(page_number) for page_number in range(2, 11)]

In [3]:
def get_questions(soup, category = "Allgemein"):
    """Extract every question found on a parsed question page.

    The soup is only read, never modified, so the function can be called
    repeatedly on the same object.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of one question page.
    category : str, optional
        Label stored with every question of this page (default "Allgemein").

    Returns
    -------
    list of tuple
        One ``(question_id, text, image_link, category)`` tuple per
        ``div`` with class ``questions-question-text``, in the order
        ``find_all`` returns them. ``image_link`` is the ``href`` taken from
        the third part of a question, or ``None`` when the question consists
        of two parts only.

    Raises
    ------
    ValueError
        If a question element has neither two nor three child elements.
    """
    question_divs = soup.find_all("div", {"class": "questions-question-text"})

    clean = []
    for div in question_divs:
        # Every direct child of the div is one part of the question; keep the
        # children of each part so they can be inspected below.
        parts = [list(part.children) for part in div.children]

        if len(parts) == 2:
            qid, qtext = parts
            qlink = None
        else:
            # Three parts expected here (id, text, link); any other count
            # fails the unpacking with a ValueError.
            qid, qtext, qlink_part = parts
            qlink = qlink_part[0]['href']

        # The first node of the text part is either a bare string
        # (bs4's NavigableString subclasses str) or a tag wrapping the text.
        first_node = qtext[0]
        if isinstance(first_node, str):
            rawtext = str(first_node)
        else:
            rawtext = first_node.text

        clean.append((str(qid[0]), rawtext, qlink, category))
    return clean

In [4]:
def get_answers(soup, state = False):
    """Extract the answer options, with truth flags, of one question page.

    The soup is only read, never modified, so the function can be called
    repeatedly on the same object.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of one question page.
    state : bool, optional
        ``True`` for a state-level page (10 questions), ``False`` for a
        national page (30 questions). Selects which ``<li>`` elements are
        treated as answers.

    Returns
    -------
    numpy.ndarray
        String array of shape ``(n_questions, 4, 2)``. ``[..., 0]`` holds the
        answer text and ``[..., 1]`` the truth flag as ``"1"`` or ``"0"``.

    Raises
    ------
    ValueError
        If the page yields fewer than ``n_questions * 4`` answers.
    """
    items = list(soup.find_all("li"))

    # Keep only the <li> elements that hold answers. The offsets are tied to
    # the page layout (leading/trailing list items that are not answers).
    if state:
        answer_items, n_questions = items[25:-3], 10
    else:
        answer_items, n_questions = items[35:-12], 30

    # A bare string child is an unmarked answer (flag 0); an answer wrapped
    # in a tag carries the site's highlighting of the correct option (flag 1).
    # bs4's NavigableString subclasses str, hence the isinstance check.
    clean = []
    for item in answer_items:
        content = list(item.children)[0]
        if isinstance(content, str):
            clean.append((str(content), 0))
        else:
            clean.append((content.text, 1))

    n_answers = n_questions * 4
    if len(clean) < n_answers:
        raise ValueError(
            "expected at least {} answers on the page, found {}".format(n_answers, len(clean))
        )
    # dtype=str makes the text/flag coercion to strings explicit.
    return np.array(clean[:n_answers], dtype=str).reshape(n_questions, 4, 2)

# Scraping national-level questions

In [5]:
# One entry per scraped page; the state-level cell appends to the same lists.
questions = []
answers = []

for page in pages:
    url_to_scrape = base + question_subsite + page
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops on an HTTP error instead of parsing an error page.
    r = requests.get(url_to_scrape, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")

    answers.append(get_answers(soup))
    questions.append(get_questions(soup))

# Scraping state-level questions

In [6]:
# URL suffix of every state page and, at the same position, the category
# label stored with that state's questions.
states = ["bw", "by", "be", "bb", "hb", "hh", "he","mv","ni","nw", "rp","sl","sn","st","sh","th"]
category = ["Baden-Württemberg",
          "Bayern",
          "Berlin",
          "Brandenburg",
          "Bremen",
          "Hamburg",
          "Hessen",
          "Mecklenburg-Vorpommern",
          "Niedersachsen",
          "Nordrhein-Westfalen",
          "Rheinland-Pfalz",
          "Saarland",
          "Sachsen",
          "Sachsen-Anhalt",
          "Schleswig-Holstein",
          "Thüringen"]

# The two lists are paired by position, so they must stay the same length.
assert len(states) == len(category), "states and category must have the same length"

for state, state_category in zip(states, category):
    url_to_scrape = base + question_subsite + state
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops on an HTTP error instead of parsing an error page.
    r = requests.get(url_to_scrape, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")

    questions.append(get_questions(soup, state_category))
    answers.append(get_answers(soup, state=True))
    


# Scraping Images

In [7]:
# Flatten the per-page lists into one list of question tuples
# (id, text, image link, category).
questions_flat = list(itertools.chain(*questions))

# The third field of each tuple is the image link; questions without an
# image carry None there and are skipped.
images = [question[2] for question in questions_flat if question[2]]

for image_i in images:
    # The local file name is "q" + the fourth "/"-separated piece of the link
    # (assumes links of the form "/<dir>/<dir>/<file>" — TODO confirm).
    urllib.request.urlretrieve(base + image_i, "q" + image_i.split("/")[3])

# Saving Results to DB

In [8]:
# Open (or create) the SQLite database file and get a cursor on it.
conn = sqlite3.connect('AllGermanCitizenship.db')
c = conn.cursor()

# One DDL statement per table; IF NOT EXISTS lets the cell be re-run safely.
table_definitions = (
    "CREATE TABLE IF NOT EXISTS Question ([ID] INTEGER PRIMARY KEY,[Text] VARCHAR(1000), [Image] VARCHAR(250), [Category] VARCHAR(250))",
    '''CREATE TABLE IF NOT EXISTS Answer ([ID] INTEGER PRIMARY KEY, [QuestionID] INTEGER, [Text] VARCHAR(1000),  [True] BOOL)''',
    '''CREATE TABLE IF NOT EXISTS ProgressEvent ([generated_id] INTEGER PRIMARY KEY, [QuestionID] INTEGER, [Success] BOOL)''',
)
for ddl in table_definitions:
    c.execute(ddl)

conn.commit()

In [9]:
# add QUESTIONS to database
# Values are bound through "?" placeholders so quotes inside a question text
# cannot break (or inject into) the statement. A missing image link is stored
# as SQL NULL, not as the text "None".
question_rows = [
    (str(qid), str(text), None if image is None else str(image), str(cat))
    for qid, text, image, cat in questions_flat
]
c.executemany(
    'INSERT INTO Question (ID, Text, Image, Category) VALUES (?, ?, ?, ?)',
    question_rows,
)

conn.commit()

# add ANSWERS to database
# Stack the per-page (n_questions, 4, 2) arrays along the first axis. Passing
# the list straight to vstack also works when pages hold different numbers of
# questions, which wrapping them in a single np.array would not.
all_answers = np.vstack(answers)

# QuestionID is the 1-based position of the question in scraping order.
answer_rows = [
    (question_number, str(text), int(is_true))
    for question_number, options in enumerate(all_answers, start=1)
    for text, is_true in options
]
c.executemany(
    'INSERT INTO Answer (QuestionID, Text, [True]) VALUES (?, ?, ?)',
    answer_rows,
)

conn.commit()

# Sanity Check

In [10]:
# Open a new connection (and cursor) to the database file.
conn = sqlite3.connect('AllGermanCitizenship.db')
c = conn.cursor()
c.execute('Select * from Question')

# Read both content tables into DataFrames for the checks that follow.
df_questions = pd.read_sql(sql='Select * from Question', con=conn)
df_answers = pd.read_sql(sql='Select * from Answer', con=conn)

In [11]:
# First rows of the Question table (last expression -> rich notebook display).
df_questions.head()

   ID                                               Text Image   Category
0   1  In Deutschland dürfen Menschen offen etwas geg...  None  Allgemein
1   2  In Deutschland können Eltern bis zum 14. Leben...  None  Allgemein
2   3  Deutschland ist ein Rechtsstaat. Was ist damit...  None  Allgemein
3   4  Welches Recht gehört zu den Grundrechten in De...  None  Allgemein
4   5  Wahlen in Deutschland sind frei. Was bedeutet ...  None  Allgemein


In [12]:
# First rows of the Answer table (last expression -> rich notebook display).
df_answers.head()

   ID  QuestionID                               Text  True
0   1           1       hier Religionsfreiheit gilt.     0
1   2           1       die Menschen Steuern zahlen.     0
2   3           1  die Menschen das Wahlrecht haben.     0
3   4           1        hier Meinungsfreiheit gilt.     1
4   5           2    Geschichtsunterricht teilnimmt.     0


In [13]:
# Per question: how many answer rows it has and how many of them are flagged true.
answer_stats_sql = 'Select Question.ID, Count(Answer.ID), SUM(Answer.True) from Question inner join Answer on Answer.QuestionID = Question.ID group by Question.ID'
df = pd.read_sql(answer_stats_sql, conn)

In [14]:
# Every question should have exactly four answer options, exactly one of them
# flagged true. Averages alone can hide offsetting errors (e.g. one question
# with 3 answers and another with 5), so check each row explicitly.
assert (df['Count(Answer.ID)'] == 4).all(), "a question does not have exactly 4 answers"
assert (df['SUM(Answer.True)'] == 1).all(), "a question does not have exactly 1 true answer"

df.mean(), df.median()

(ID                  230.5
 Count(Answer.ID)      4.0
 SUM(Answer.True)      1.0
 dtype: float64, ID                  230.5
 Count(Answer.ID)      4.0
 SUM(Answer.True)      1.0
 dtype: float64)

In [15]:
# Display the Question table as stored in the database.
pd.read_sql(sql='Select * from Question', con=conn)

Unnamed: 0,ID,Text,Image,Category
0,1,In Deutschland dürfen Menschen offen etwas geg...,,Allgemein
1,2,In Deutschland können Eltern bis zum 14. Leben...,,Allgemein
2,3,Deutschland ist ein Rechtsstaat. Was ist damit...,,Allgemein
3,4,Welches Recht gehört zu den Grundrechten in De...,,Allgemein
4,5,Wahlen in Deutschland sind frei. Was bedeutet ...,,Allgemein
5,6,Wie heißt die deutsche Verfassung?,,Allgemein
6,7,"Welches Recht gehört zu den Grundrechten, die ...",,Allgemein
7,8,Was steht nicht im Grundgesetz von Deutschland?,,Allgemein
8,9,Welches Grundrecht gilt in Deutschland nur für...,,Allgemein
9,10,Was ist mit dem deutschen Grundgesetz vereinbar?,,Allgemein
