In [3]:
import requests
from scrapy.selector import Selector

In [4]:
# Download the training page; its category drop-down is parsed in the next cell.
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the real form.
response = requests.get("https://www.fragespiel.com/quiz/training.html", timeout=30)
response.raise_for_status()

In [5]:
# Getting categories using scrapy.
# The page is parsed once and every option node of the "kat[]" drop-down is
# read as a unit, so a label can never be paired with another option's value
# (two independent getall() lists drift apart as soon as one option lacks
# either its text or its value attribute).
option_nodes = Selector(text=response.text).xpath('//select[@name="kat[]"]/option')

category_pairs = []
for option in option_nodes:
    option_value = option.xpath('@value').get()
    if option_value is None:
        # No id to post back to the server, so the option cannot be queried.
        continue
    category_pairs.append((option_value, option.xpath('text()').get(default="")))

cat_id = [value for value, _ in category_pairs]
cat_str = [label for _, label in category_pairs]
# Maps category id (string, as found in the form) -> displayed category label.
categories = dict(category_pairs)
print(categories)

{'11': 'Astronomie', '6': 'Biologie', '26': '- Pflanzen', '24': '- Tier', '5': 'Chemie', '39': 'DDR', '4': 'Erotik', '34': 'Film & TV', '8': 'Geographie', '3': 'Geschichte', '30': 'Kunst', '13': 'Literatur', '35': '- Mythen & Sagen', '36': 'Mathematik', '23': 'Medizin', '2': 'Mode & Lifestyle', '33': 'Musik', '32': 'Philosophie', '12': 'Physik', '10': 'Politik', '17': 'Religon', '1': 'Sport', '20': '- Fußball', '37': 'Technik', '19': '- Computer', '7': 'Verschiedenes', '31': '- Bauwerke', '16': '- Deutschland', '9': '- Film & Musik', '15': '- Österreich', '25': '- Speisen & Getränke', '14': '- Wissenschaft', '18': 'Wirtschaft'}


In [6]:
# Difficulty levels: numeric id used in requests -> German display label.
levels = dict(enumerate(("sehr leicht", "leicht", "mittel", "schwer", "sehr schwer")))

In [7]:
import json 

def get_questions(category, level, timeout=30):
    """Request one batch of quiz questions for a category/difficulty pair.

    Posts the training form and extracts the JSON document that the returned
    page embeds in its third script element (after the text ``json' : '``).

    Args:
        category: Category id, sent as the single entry of ``kat[]``.
        level: Difficulty id, sent as the single entry of ``level[]``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload. The notebook reads its ``"questions"`` entry.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page does not contain the expected script element
            or marker, or the embedded text is not valid JSON.
    """
    data = {"play": True, "kat[]": [category], "level[]": [level], "anzahl": 300}
    response = requests.post(
        "https://www.fragespiel.com/quiz/training.html", data=data, timeout=timeout
    )
    response.raise_for_status()

    js_questions = Selector(text=response.text).xpath('//script[3]/text()').get()
    if js_questions is None:
        raise ValueError("training page has no third script element with inline text")

    # The payload sits between the marker  json' : '  and the next  ',  in the script.
    parts = js_questions.split("json' : '")
    if len(parts) < 2:
        raise ValueError("question JSON marker not found in the page script")
    json_questions = parts[1].split("',")[0]
    return json.loads(json_questions)

In [12]:
# Once fewer distinct questions than this have been collected for a
# category/level pair, a single request is taken to have returned the whole
# pool and no further requests are made (assumed per-request cap of the site).
QUESTIONS_PER_REQUEST_CAP = 150
# Number of extra requests still allowed after a request brought nothing new.
# The budget is shared across one category/level pair and is not refilled
# when a later request does bring new questions.
MAX_EMPTY_RETRIES = 5
ANSWER_KEYS = ("a", "b", "c", "d")

questions = {"questions": []}
for cat in categories:
    for level in levels:
        # Ids already stored for this category/level pair. A set gives
        # constant-time lookups and is updated as soon as a question is
        # accepted, so a repeat inside the same response is skipped as well.
        qids = set()
        retries = MAX_EMPTY_RETRIES
        while True:
            qs = get_questions(cat, level)
            new_questions = []
            for q in qs["questions"]:
                # Skip question if it is already known
                if q["id"] in qids:
                    continue
                qids.add(q["id"])
                # add category and level info
                q["category"] = cat
                q["level"] = level
                # remove weird number: keep only the first element of each answer
                for answer_key in ANSWER_KEYS:
                    q[answer_key] = q[answer_key][0]
                new_questions.append(q)
            questions["questions"].extend(new_questions)

            # If there are fewer than 150 questions, we don't need to request
            # more than once.
            if len(qids) < QUESTIONS_PER_REQUEST_CAP:
                break
            # Nothing new: stop once the retry budget is used up, otherwise
            # spend one retry and ask again.
            if not new_questions:
                if retries <= 0:
                    break
                retries -= 1
        print(categories[cat], " -- ", levels[level], " -- ", len(qids))

Astronomie  --  sehr leicht  --  23
Astronomie  --  leicht  --  27
Astronomie  --  mittel  --  35
Astronomie  --  schwer  --  34
Astronomie  --  sehr schwer  --  0
Biologie  --  sehr leicht  --  125
Biologie  --  leicht  --  203
Biologie  --  mittel  --  188
Biologie  --  schwer  --  121
Biologie  --  sehr schwer  --  20
- Pflanzen  --  sehr leicht  --  10
- Pflanzen  --  leicht  --  13
- Pflanzen  --  mittel  --  20
- Pflanzen  --  schwer  --  7
- Pflanzen  --  sehr schwer  --  1
- Tier  --  sehr leicht  --  41
- Tier  --  leicht  --  32
- Tier  --  mittel  --  24
- Tier  --  schwer  --  17
- Tier  --  sehr schwer  --  3
Chemie  --  sehr leicht  --  76
Chemie  --  leicht  --  165
Chemie  --  mittel  --  196
Chemie  --  schwer  --  118
Chemie  --  sehr schwer  --  15
DDR  --  sehr leicht  --  4
DDR  --  leicht  --  25
DDR  --  mittel  --  80
DDR  --  schwer  --  81
DDR  --  sehr schwer  --  20
Erotik  --  sehr leicht  --  18
Erotik  --  leicht  --  28
Erotik  --  mittel  --  17
Erotik 

In [17]:
# Unwrap the plain list of questions from the {"questions": [...]} container.
# The isinstance guard makes the cell safe to re-run: once `questions` is
# already the list, indexing it with a string key would raise a TypeError.
if isinstance(questions, dict):
    questions = questions["questions"]
print(len(questions))

13819


In [18]:
# There are duplicates, remove them from the category with highest amount of questions
import numpy as np

# category frequency: category id -> number of questions currently counted for it
cats_dict = {}
for q in questions:
    cats_dict[q['category']] = cats_dict.get(q['category'], 0) + 1

# question id -> index (into `questions`) of the copy that is kept
unique_ids = {}

# Per-category count of dropped duplicates, indexed by the integer category id.
# The array needs max_id + 1 slots so that the largest id is itself a valid
# index; integer dtype because these are counts.
deleted_cats = np.zeros(max(int(c) for c in categories) + 1, dtype=int)

for i, q in enumerate(questions):
    qid = q["id"]
    if qid not in unique_ids:
        unique_ids[qid] = i
        continue

    # found a duplicate, check which duplicate to remove based on category frequency
    other_i = unique_ids[qid]
    this_cat = q["category"]
    other_cat = questions[other_i]["category"]
    if cats_dict[this_cat] < cats_dict[other_cat]:
        # The current copy sits in the smaller category: keep it, drop the earlier one.
        unique_ids[qid] = i
        dropped_cat = other_cat
    else:
        # Keep the earlier copy (also on a tie) and drop the current one.
        dropped_cat = this_cat
    deleted_cats[int(dropped_cat)] += 1
    cats_dict[dropped_cat] -= 1

for i, n in enumerate(deleted_cats):
    if n > 0:
        print("deleted", n, "from", categories[str(i)])
questions = [questions[i] for i in unique_ids.values()]

13819
12764
deleted 357.0 from Sport
deleted 4.0 from Mode & Lifestyle
deleted 81.0 from Geschichte
deleted 11.0 from Chemie
deleted 40.0 from Biologie
deleted 105.0 from Verschiedenes
deleted 57.0 from Geographie
deleted 11.0 from - Film & Musik
deleted 13.0 from Politik
deleted 3.0 from Astronomie
deleted 36.0 from Physik
deleted 10.0 from Literatur
deleted 37.0 from - Wissenschaft
deleted 7.0 from - Österreich
deleted 226.0 from - Deutschland
deleted 10.0 from Religon
deleted 2.0 from Wirtschaft
deleted 5.0 from - Computer
deleted 10.0 from - Fußball
deleted 3.0 from Medizin
deleted 13.0 from - Speisen & Getränke
deleted 1.0 from Kunst
deleted 10.0 from Musik
deleted 3.0 from Mathematik


In [19]:
# Number of questions left at this point in the notebook.
n_questions = len(questions)
print(n_questions)

12764


In [30]:
# Replace HTML entities with unicode in the title and in each of the four answers.
import html

TEXT_FIELDS = ("title", "a", "b", "c", "d")
for question in questions:
    for field in TEXT_FIELDS:
        decoded = html.unescape(question[field])
        question[field] = decoded


In [31]:
# Write the question list to questions.json as indented UTF-8 JSON;
# ensure_ascii=False keeps non-ASCII characters readable instead of \u-escaped.
with open('questions.json', 'w', encoding='utf8') as out_file:
    out_file.write(json.dumps(questions, indent=2, ensure_ascii=False))

Check whether repeating the same request returns different questions when a category/level combination has more than 150 questions.

In [17]:
# Send the identical request twice and count how many question ids the two
# responses have in common.
test_cat, test_lvl = 18, 2

qs1 = get_questions(test_cat, test_lvl)
ids1 = [entry['id'] for entry in qs1['questions']]

qs2 = get_questions(test_cat, test_lvl)
ids2 = [entry['id'] for entry in qs2['questions']]

shared_ids = set(ids1).intersection(ids2)
print('The number of intersecting elements is: ', len(shared_ids))

The number of intersecting elements is:  130


That is good: the two responses are not identical, so we can simply keep repeating the same request until it returns no new questions.