# Simple statistics for the "Du bist Am Zug" project

## Project links
https://dubistamzug.net/en/

https://www.instagram.com/dubistamzugberlin/

https://www.facebook.com/dubistamzugberlin

## Getting .kml map
This notebook is tailored to the 2024 version of the map; some things may differ in future versions.
Just change the `mid` value to the proper one: the code will download the map and store it locally in the `map.kml` file.

In [1]:
import re
import logging
import urllib.request
import xml.etree.ElementTree as ET

from collections import Counter


# styleUrl values of the yellow and the red dots in the .kml file
STYLE_YELLOW = "#icon-1899-FFD600"
STYLE_RED = "#icon-1899-A52714"


# Google map id and url that is used to download it

#mid = "1jXqAMP9-YYyS75qjMMC6zf45UsSkVIs"  # Week 1&2
#mid = "1wRl3iviQosW3gyxi463XCI5FKFUd5h4"  # Week 3
mid = "1mdOJB9W1bScXAMSuZki-BhHPtFyZVFE"  # Week 4
download_url = f"https://www.google.com/maps/d/kml?mid={mid}&forcekml=1"
view_url = f"https://www.google.com/maps/d/u/0/viewer?mid={mid}&ll=52.530777634910116%2C13.465575394245812&z=10"
print("Google Maps view:", view_url)

# download the map into that file; the same `file_name` is used for both the
# download and the parsing so that the two can never point to different files
file_name = "map.kml"
urllib.request.urlretrieve(download_url, file_name)

# parse the downloaded .kml, `root` is the top-level element of the document
tree = ET.parse(file_name)
root = tree.getroot()

Google Maps view: https://www.google.com/maps/d/u/0/viewer?mid=1mdOJB9W1bScXAMSuZki-BhHPtFyZVFE&ll=52.530777634910116%2C13.465575394245812&z=10


In [2]:
def norm_text(text: str) -> str:
    """
    Strip html tags from the text and repair known typos of the source file

    Every tag is replaced by a space, the "8Foto" typo is turned back into "(Foto"
    and whitespace runs are squeezed into a single space.
    A description that consists only of digits, dots and whitespace (i.e. bare
    coordinates) is returned as an empty line.
    """
    without_tags = re.sub(r"<.*?>", " ", text).strip()
    fixed_typos = without_tags.replace("8Foto", "(Foto")
    squeezed = re.sub(r"\s+", " ", fixed_typos)

    # nothing but coordinates (or nothing at all) -> there is no real description
    if re.fullmatch(r"[\d\.\s]*", squeezed):
        return ""

    return squeezed.strip()

def get_spotter(text: str) -> str:
    """
    Try to extract spotter from the description

    The spotter is expected inside the first pair of parentheses, optionally
    prefixed with "Foto"/"Fot"/"Bild" (any case of the first letter, optional colon).
    If the parenthesis is never closed, the second space-separated word after "("
    is used. When nothing can be extracted a warning is logged and an empty
    line is returned.

    Examples:
        >>> get_spotter("Author Name (Foto SpotterName)   52.45022583 13.50795078")
        'SpotterName'
        >>> get_spotter("Anonym (Bild SpotterName SpotterSurname)  52.55495479 13.39554122")
        'SpotterName SpotterSurname'
        >>> get_spotter("Author Name, 1, 4 (Foto Spotter1) (Foto2 Spotter2)  52.49294, 13.3868")
        'Spotter1'
        >>> get_spotter("Author Name (Foto SpotterName 52.45022583 13.50795078")
        'SpotterName'
    """
    if "(" in text and ")" in text:
        # drop the "Foto"/"Bild" markers, then take what is left inside the first "(...)"
        cleaned = re.sub(r"[Ff]oto?:?", "", text)
        cleaned = re.sub(r"[Bb]ild:?", "", cleaned)
        return cleaned.split("(")[1].split(")")[0].strip()

    if "(" in text:
        # unclosed parenthesis: the first word is assumed to be the Foto/Bild marker,
        # the second one is the spotter
        words = text.split("(")[1].split(" ")
        if len(words) > 1:
            return words[1].strip()
        # a single word after "(" used to raise IndexError; treat it as unparsable

    logging.warning("No rules to parse correctly: %s", text)
    return ""

def parse_description(text: str) -> dict:
    """
    Parse description into a dictionary with various helper fields

    Returned keys:
        raw: the text exactly as it was passed in
        norm: the normalized text (empty line if there is no real description)
        poster_by: any text before the first '(' (only when norm is not empty)
        spotted_by: result of get_spotter (only when norm is not empty)
    """
    # an empty <description> element has no text at all (None), treat it as an empty line
    norm = norm_text(text or "")
    res = {"raw": text, "norm": norm}
    if norm:
        res["poster_by"] = norm.split("(")[0].strip()  # any text before first '('
        res["spotted_by"] = get_spotter(norm)
    return res

def xml_to_dict(element):
    """
    Recursively converts an XML element and its children to a dictionary.
    Helper function for debug purposes

    The result always has a single key, the tag of the element. Its value is:
        - None for an element without attributes, children and text
        - the stripped text for an element that has only text
        - otherwise a dictionary with '@<name>' keys for attributes, one key per
          child tag (a list when the tag is repeated) and '#text' for non-blank text
    """
    # attributes are kept in the result even when the element has children too
    content = {'@' + name: value for name, value in element.attrib.items()}

    # children with the same tag are grouped into a list, in document order
    children = list(element)
    grouped = {}
    for child in children:
        for tag, value in xml_to_dict(child).items():
            if tag not in grouped:
                grouped[tag] = value
            elif isinstance(grouped[tag], list):
                grouped[tag].append(value)
            else:
                grouped[tag] = [grouped[tag], value]
    content.update(grouped)

    text = element.text.strip() if element.text else None

    if children or element.attrib:
        # blank text between child tags is just formatting, skip it
        if text:
            content['#text'] = text
        return {element.tag: content}

    # plain leaf: only the text (or None) is needed
    return {element.tag: text}

def placemark_to_dict(placemark: ET.Element) -> dict:
    """
    Convert <Placemark> structure into the dictionary

    Uses the module-level `namespace` mapping to resolve the 'kml:' prefix.

    Returned keys:
        name: stripped text of <name>
        style: text of <styleUrl>
        description: <description> text parsed by parse_description
        coordinates: first two comma-separated values of <Point>/<coordinates>, as strings

    Raises:
        AttributeError: if <name>, <description> or <coordinates> is missing
            (find() returns None for them)
    """
    name = placemark.find('kml:name', namespace).text.strip()
    style_url = placemark.find('kml:styleUrl', namespace).text
    description = placemark.find('kml:description', namespace)
    # an existing but empty <description> has text None, parse it as an empty line
    parsed_description = parse_description(description.text or "")
    coordinates = placemark.find('kml:Point/kml:coordinates', namespace).text.strip()
    return {
        "name": name,
        "style": style_url,
        "description": parsed_description,
        "coordinates": coordinates.split(",")[:2],  # drop the third value
    }


# parse everything

namespace = {'kml': 'http://www.opengis.net/kml/2.2'}

# map title with the characters  \ / : * ? " < > |  replaced by '_'
document_name = re.sub(
    r'[\\/:*?"<>|]', '_', root.find("kml:Document/kml:name", namespace).text
)

unmarked = 0
locations = []
for placemark in root.findall('.//kml:Placemark', namespace):
    try:
        parsed = placemark_to_dict(placemark)
    except AttributeError:
        # some expected element is missing: dump the placemark and go on with the rest
        logging.exception("%s", xml_to_dict(placemark))
    else:
        locations.append(parsed)


ERROR:root:{'{http://www.opengis.net/kml/2.2}Placemark': {'{http://www.opengis.net/kml/2.2}name': '52.50944968\t13.4514068', '{http://www.opengis.net/kml/2.2}styleUrl': '#icon-1899-A52714-nodesc', '{http://www.opengis.net/kml/2.2}Point': {'{http://www.opengis.net/kml/2.2}coordinates': '13.4514068,52.50944968,0'}}}
Traceback (most recent call last):
  File "/var/folders/sz/ytnvld_d379ddpw15vl0xn000000gn/T/ipykernel_45539/2994371934.py", line 108, in <module>
    locations.append(placemark_to_dict(placemark))
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/sz/ytnvld_d379ddpw15vl0xn000000gn/T/ipykernel_45539/2994371934.py", line 86, in placemark_to_dict
    parsed_description = parse_description(description.text)
                                           ^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'text'
ERROR:root:{'{http://www.opengis.net/kml/2.2}Placemark': {'{http://www.opengis.net/kml/2.2}name': 'Punkt 1176', '{http://www.opengis.net/km

In [3]:
# Sanity check: every yellow location that has a description must have
# both the poster_by and the spotted_by fields filled in
for location in locations:
    info = location["description"]
    if location["style"] == STYLE_YELLOW and info["norm"]:
        if not (info["spotted_by"] and info["poster_by"]):
            logging.warning(f"No poster/spotter for {location['name']}: {info}")



In [4]:
# Typo hunt: list every yellow location whose description
# contains neither "(Foto " nor "(Bild "
for location in locations:
    if location["style"] == STYLE_YELLOW:
        text = location["description"]["norm"]
        if "(Foto " not in text and "(Bild " not in text:
            print("-", location["name"], "DESCR:", re.sub(r"\s+", "  ", text))

- Bachstr.-ggü. 1 Mittelinsel vor Altonaer Str. DESCR: Anonym  (Fot  ftravelerty)  52.51864624  13.33963776
- Alt-Karow-8-9 vor Str. 72 staw. DESCR: 
- Wartenberger Str.-ggü. 24 vor ggü. Gehrenseestr. Rtg. Rhinstr. DESCR: Ilia  Saraantsev  (Fotostricktdagegen)  52.55092966  13.50942449
- Müggelheimer Damm-231 hinter Müggellandstr. Rtg. Köpenick DESCR: Angela  Grasser  (LEXI:VERSUM)  52.415272  13.65663683
- Mehringdamm-21 vor Blücherstr. DESCR: Eva  Mauermann  (Fot  angiie_pamela_photography)  52.4957106  13.38982103
- Waldemarstr.-104 vor Manteuffelstr. DESCR: Michael  Otto  (angiie_pamela_photography)  52.50121577  13.42768154
- Marktstr.-2-3 vor Schreiberhauer Str. DESCR: Alexandra  Benkenstein  (FotoTim)  52.50303388  13.47421921
- Allee der Kosmonauten-136 Mittelinsel hinter ggü. Poelchaustr. staw. DESCR: Emiliy  Püttner  (stricktdagegen)  52.53387612  13.55257366
- Allee der Kosmonauten-27 Mittelinsel vor Rhinstr. stew. DESCR: Stefanie  Schairer  (stricktdagegen)  52.52593712  13

In [5]:
# print("Locations with the same name if any")
# name_counter = Counter(l["name"] for l in locations)
# for name, count in name_counter.most_common():
#     if count > 1:
#         print("-", name, ":", count)
# print()

print("Locations with the same coord if any")
# the key is the stored coordinate pair in reversed order, joined with a space
coord_counter = Counter(" ".join(reversed(l["coordinates"])) for l in locations)
for coord, count in coord_counter.most_common():
    # most_common() goes from the highest count down, so nothing repeated is left after this
    if count <= 1:
        break
    print("-", coord, ":", count)

Locations with the same coord if any
- 52.524755 13.419534 : 3
- 52.49275208 13.37978268 : 3
- 52.47776355 13.44843583 : 3
- 52.53951645 13.42375374 : 3
- 52.54602051 13.42736244 : 3
- 52.59558105 13.33253002 : 3
- 52.4903717 13.35981655 : 3
- 52.55523682 13.19875431 : 3
- 52.42760129 13.32636989 : 3
- 52.44025794 13.38698984 : 3
- 52.46853638 13.37458515 : 3
- 52.5256 13.4203 : 2
- 52.571215 13.410703 : 2
- 52.608166 13.429106 : 2
- 52.4982 13.6145 : 2
- 52.568516 13.542488 : 2
- 52.432094 13.534826 : 2
- 52.50925633 13.38857755 : 2
- 52.52478694 13.41948458 : 2


In [6]:
# Put the names (or any part of a name) of the posters you are interested in
# into the list below. The lookup is a case-insensitive substring search.
# It exists only because searching for a name or substring in google maps
# doesn't really work well for now (imho)
POSTERS_TO_CHECK = [
    "Nosyrev",
    "Kaltauskaite",
    "Dvayaitca",
    "Holubeva",
    "Pasichnyk",
    "Saliukhina",
]
found_posters = []
for location in locations:
    info = location["description"]
    if info["norm"]:
        author = info["poster_by"].lower()
        found_posters.extend(
            (wanted, location) for wanted in POSTERS_TO_CHECK if wanted.lower() in author
        )

# grouped by the searched name, every match comes with a link to the exact point
for wanted, location in sorted(found_posters, key=lambda pair: pair[0]):
    print(wanted)
    print("  ", location["name"])
    print("  ", location["description"]["norm"])
    lon, lat = location["coordinates"]
    print(f"   https://www.google.com/maps?q={lat},{lon}")

Dvayaitca
   Am Bhf. Grunewald- vor Auerbacher Str. 2
   Paska Dvayaitca (Foto Svenja) 52.4873694 13.26339587
   https://www.google.com/maps?q=52.487369,13.263396
Kaltauskaite
   Ringstr.- hinter Kadettenweg
   Liuba Kaltauskaite (Foto Svenja) 52.43798828 13.29550648
   https://www.google.com/maps?q=52.437988,13.295506
Nosyrev
   Landsberger Allee- hinter Raoul-Wallenberg-Str. 74
   Grigory Nosyrev (Foto stricktdagegen) 52.54486465 13.56416512
   https://www.google.com/maps?q=52.544865,13.564165
Nosyrev
   Reginhardstr.-59 vor Mickestr.
   Jens L. Heinrich Foto grigorynosyrev) 52.56232906 13.37314537
   https://www.google.com/maps?q=52.562329,13.373145
Pasichnyk
   Berliner Allee-210 hinter Bernkasteler Str. staw.
   Anastasiia Pasichnyk (Foto Stefanie J.) 52.55745316 13.46674633
   https://www.google.com/maps?q=52.557453,13.466746
Pasichnyk
   Lindenstr.-116 hinter Brandesstr.
   Anastasiia Pasichnyk (Foto sabineberlin.de) 52.49893951 13.39549828
   https://www.google.com/maps?q=52.49

# Statistics

In [7]:
print("Different dots on the map count")
# one line per colour: how many parsed locations use exactly that style
for label, style in (("Red:", STYLE_RED), ("Yellow:", STYLE_YELLOW)):
    print(label, sum(location["style"] == style for location in locations))

Different dots on the map count
Red: 550
Yellow: 876


In [8]:
print("Different styles of dots, sanity check:")
# how many parsed locations there are for every style value found in the map
style_counts = Counter(location["style"] for location in locations)
print(style_counts)

Different styles of dots, sanity check:
Counter({'#icon-1899-FFD600': 876, '#icon-1899-A52714': 550, '#icon-1899-BDBDBD': 13, '#icon-1899-FFEA00': 1, '#icon-1899-880E4F': 1})


In [9]:
# Sanity check: names of the yellow locations whose description is empty
print("Yellow locations without any description:")
for location in locations:
    if location["style"] == STYLE_YELLOW and not location["description"]["norm"]:
        print(location["name"])

Yellow locations without any description:
Alt-Karow-8-9 vor Str. 72 staw.
Hohenstaufenstr.-60 hinter Eisenacher Str.
Torweg-43 vor Hackbuschstr.
Schwedenstr.- vor Osloer Str. staw.
Granitzstr.- vor Retzbacher Weg
Am Treptower Park-30 vor Herkomer Str.


In [10]:
# Sanity check: red locations that have some description
print("Red locations with non-empty description:")
for location in locations:
    text = location["description"]["norm"]
    if location["style"] == STYLE_RED and text:
        print(location["name"], "DESCR", text)

Red locations with non-empty description:


In [11]:
print("Most common posters")
# yellow locations only; a description without the poster_by field is counted as NOT_FOUND
yellow_descriptions = (l["description"] for l in locations if l["style"] == STYLE_YELLOW)
posters_by_stat = Counter(d.get("poster_by", "NOT_FOUND") for d in yellow_descriptions)
for poster_by, count in posters_by_stat.most_common(10):
    print(f"{count}: {poster_by}")

print()
# every yellow location is counted exactly once, so the sum of the counts is their total
print(len(posters_by_stat), "unique posters out of", sum(posters_by_stat.values()), "in total")

print()
print("<how_many_times_poster_was_spotted>: <posters_that_were_spotted_that_many_times>")
poster_frequencies = Counter(posters_by_stat.values())
for freq in sorted(poster_frequencies, reverse=True):
    print(f"{freq}: {poster_frequencies[freq]}")


Most common posters
81: Anonym
6: NOT_FOUND
4: Yaroslav Rashevskyi
3: Konrad Rempe
3: Anne Berndt
3: Claudia Drechsler
3: Dorothea Heiß
3: Johannes Pol
3: Janine Pommerenke
3: Gaby Fricke

582 unique posters out of 876 in total

<how_many_times_poster_was_spotted>: <posters_that_were_spotted_that_many_times>
81: 1
6: 1
4: 1
3: 17
2: 172
1: 390


In [12]:
n = 10
print(f"Top {n} spotters:")
# yellow locations only; a description without the spotted_by field is counted as NOT_FOUND
yellow_descriptions = (l["description"] for l in locations if l["style"] == STYLE_YELLOW)
spotted_by_stat = Counter(d.get("spotted_by", "NOT_FOUND") for d in yellow_descriptions)
for spotted_by, count in spotted_by_stat.most_common(n):
    print(f"{count}: {spotted_by}")

print()
print(len(spotted_by_stat), "unique spotters spotted", sum(spotted_by_stat.values()), "posters")

print()
print("Spotter statistics:")
print("<number_of_spotted_posters>: <spotters_that_spotted_that_many_posters>")
spotter_frequencies = Counter(spotted_by_stat.values())
for freq in sorted(spotter_frequencies, reverse=True):
    print(f"{freq}: {spotter_frequencies[freq]}")

Top 10 spotters:
167: Svenja
114: Anke
96: Tim
76: angiie_pamela_photography
69: Peter
66: stricktdagegen
43: sabineberlin.de
34: Stefanie J.
31: grigorynosyrev
14: the_karambolage

64 unique spotters spotted 876 posters

Spotter statistics:
<number_of_spotted_posters>: <spotters_that_spotted_that_many_posters>
167: 1
114: 1
96: 1
76: 1
69: 1
66: 1
43: 1
34: 1
31: 1
14: 1
11: 2
10: 1
9: 1
8: 3
6: 4
5: 3
4: 3
3: 3
2: 7
1: 27


In [13]:
from pathlib import Path

import pandas as pd

# one row per parsed location, sorted by the location name
df = pd.DataFrame(
    {
        "name": [l["name"] for l in locations],
        "description": [l["description"]["norm"] for l in locations],
        "coordinates": [" ".join(l["coordinates"]) for l in locations],
        "style": [l["style"] for l in locations],
    }
).sort_values("name")

# saving file to .csv format
# ".csv" is appended to the map title: with_suffix() would replace everything
# after the last dot of the title (and raises an error for an empty title)
df.to_csv(Path(f"{document_name}.csv"), index=False)

