# Data Preparation

This notebook connects to the Neo4j workspace, creates the nodes for stops and schools from the properties given in the JSON files, and then links each stop to the lines that serve it (read from the CSV file). Running the whole notebook takes around one hour.

## Paths + Imports

In [17]:
from neomodel import db, config
from credentials import Credentials
from models import *
import json, csv

In [18]:
# Input files, relative to the notebook's working directory.
stops_paths = "../data/haltestellen-wien.json"
schools_paths = "../data/schulen-wien.json"
lines_paths = '../data/haltestellen-linien.csv'

# JSON text is UTF-8 by specification. Opening with an explicit encoding keeps
# names containing umlauts/ß intact on platforms whose default encoding differs.
with open(stops_paths, 'r', encoding='utf-8') as file:
    stops_data = json.load(file)

with open(schools_paths, 'r', encoding='utf-8') as file:
    schools_data = json.load(file)

In [19]:
# Tell neomodel which database to talk to; the URI is supplied by the
# project's Credentials helper so it is not hard-coded in the notebook.
config.DATABASE_URL = Credentials.getNeo4JDatabaseURI()

## Insert data into Graph DB

### Remove and clean data if it exists

In [4]:
def clear_neo4j_database():
    """Delete every node in the connected database, including its relationships."""
    wipe_all_nodes = 'MATCH (n) DETACH DELETE n'
    db.cypher_query(wipe_all_nodes)

def clear_class_registry():
    """Empty neomodel's private node-class registry (``db._NODE_CLASS_REGISTRY``).

    NOTE(review): presumably done so that re-importing the model classes in a
    running kernel does not clash with earlier registrations - confirm, since
    this touches a private attribute of the ``db`` object.
    """
    registry = db._NODE_CLASS_REGISTRY
    registry.clear()

# DESTRUCTIVE: removes every node and relationship from the connected database
# before the import cells below repopulate it.
clear_neo4j_database()
clear_class_registry()

### Populate the graph with school data

In [5]:
def populate_schools(schools_data):
    """Create a School node for every feature that is not in the graph yet.

    Args:
        schools_data: Parsed JSON dict with a ``'features'`` list. Every feature
            must provide ``'id'``, ``'properties'`` (with at least ``'ADRESSE'``,
            ``'ART'`` and ``'NAME'``) and ``'geometry'['coordinates']``.

    Raises:
        KeyError: If a feature lacks one of the required keys.
    """
    for count, feature in enumerate(schools_data['features'], start=1):
        print(f"running instance {count}..")
        properties = feature['properties']
        # Index 0 is stored as longitude and index 1 as latitude below.
        geometry = feature['geometry']['coordinates']

        # Look the school up by its feature id so re-runs do not create duplicates.
        school_node = School.nodes.get_or_none(school_id=feature['id'])
        if not school_node:
            school_node = School(
                school_id=feature['id'],
                address=properties['ADRESSE'],
                type=str(properties['ART']),
                # Optional properties: .get() yields None when the key is absent.
                type_text=properties.get('ART_TXT'),
                care=properties.get('BETREUUNG'),
                phone=properties.get('TELEFON'),
                website=properties.get('WEBLINK1'),
                location_latitude=geometry[1],
                location_longitude=geometry[0],
                name=properties['NAME']
            ).save()
            print(f"School {school_node.name} ADDED.")
        else:
            # Report the actual id next to the name; the message used to print
            # the school's name while labelling it as the ID.
            print(f"School with ID {feature['id']} ({school_node.name}) ALREADY EXISTED.\n")

In [6]:
# Import the schools loaded from the schools JSON file; prints one status line per feature.
populate_schools(schools_data)

running instance 1..
School GRg 3 Kundmanngasse 20-22 added.
running instance 2..
School VS Kolonitzgasse 15 added.
running instance 3..
School SPR Landstraßer Hauptstraße 146 added.
running instance 4..
School SPR Landstraßer Hauptstraße 146 added.
running instance 5..
School GRg 3 Hagenmüllergasse 30 added.
running instance 6..
School PMS Erdbergstraße 70 added.
running instance 7..
School VS Petrusgasse 10 added.
running instance 8..
School PNMS Schützengasse 31 added.
running instance 9..
School VS Reisnerstraße 43 added.
running instance 10..
School PMS Fasangasse 4 added.
running instance 11..
School pRgORg 3 Schützengasse 31 added.
running instance 12..
School ORg 3 Ballsportgymnasium Wien added.
running instance 13..
School PVS Apostelgasse 5 added.
running instance 14..
School PVS Rennweg 31 added.
running instance 15..
School PVS Sebastianplatz 3 added.
running instance 16..
School GTVS Landstraßer Hauptstraße 146 added.
running instance 17..
School Saudische Schule in Wien d

In [4]:
def populate_stops(stops_data):
    """Create a Stop node for each feature whose id is not stored yet.

    ``stops_data`` is the parsed JSON dict; each entry of its ``'features'``
    list supplies ``'id'``, ``'properties'`` (``'BEZEICHNUNG'``, ``'WL_NUMMER'``)
    and ``'geometry'['coordinates']``. A status line is printed per feature.
    """
    for position, stop_feature in enumerate(stops_data['features'], start=1):
        print(f"running instance {position}..")
        attrs = stop_feature['properties']
        # Index 0 is stored as longitude and index 1 as latitude below.
        coords = stop_feature['geometry']['coordinates']
        feature_id = stop_feature['id']

        # Already imported on an earlier run: report it and move on.
        if Stop.nodes.get_or_none(stop_id=feature_id):
            print(f"Stop {attrs['BEZEICHNUNG']} ALREADY EXISTED.")
            continue

        Stop(
            stop_id=feature_id,
            name=attrs['BEZEICHNUNG'],
            wl_number=attrs['WL_NUMMER'],
            stop_latitude=coords[1],
            stop_longitude=coords[0]
        ).save()
        print(f"Stop {attrs['BEZEICHNUNG']} ADDED.")
            
# Import the stops loaded from the stops JSON file; prints one status line per feature.
populate_stops(stops_data)


running instance 1..
Stop Absberggasse already in graph, not added.
running instance 2..
Stop Achengasse already in graph, not added.
running instance 3..
Stop Ada-Christen-Gasse already in graph, not added.
running instance 4..
Stop Adam-Betz-Gasse already in graph, not added.
running instance 5..
Stop Adamovichgasse already in graph, not added.
running instance 6..
Stop Adolf-Loos-Gasse already in graph, not added.
running instance 7..
Stop Adolf-Unger-Gasse already in graph, not added.
running instance 8..
Stop Afritschgasse already in graph, not added.
running instance 9..
Stop Agavenweg already in graph, not added.
running instance 10..
Stop Agnesgasse already in graph, not added.
running instance 11..
Stop Aignerstraße already in graph, not added.
running instance 12..
Stop Auf der Schmelz already in graph, not added.
running instance 13..
Stop Morzinplatz already in graph, not added.
running instance 14..
Stop Alaudagasse already in graph, not added.
running instance 15..
Stop A

## Add connections from Stops to the corresponding lines (e.g. U6, 13A, 62, …)

In [20]:
def classify_line(line_name):
    """Map a line name to a transport-type label.

    The rules are tried in this order and the first match wins:

    1. ``"metro"``    - starts with "U" and the second character is a digit
                        from 1 to 6 (only that one character is inspected, so
                        e.g. "U12" also counts as metro).
    2. ``"tram"``     - a single alphabetic character, or an all-digit name
                        below 75.
    3. ``"bus"``      - digits below 100 followed by "A" or "B", or an
                        all-digit name of 100 or more.
    4. ``"nightbus"`` - starts with "N".

    Anything else returns ``"NA"``; that includes all-digit names from 75 to 99.
    """
    # Metro: "U" plus a digit 1-6 directly behind it.
    if line_name.startswith("U") and len(line_name) > 1:
        second = line_name[1]
        if second.isdigit() and int(second) in range(1, 7):
            return "metro"

    only_digits = line_name.isdigit()

    # Tram: one letter (this also catches a lone "N" or "U") or a number below 75.
    if len(line_name) == 1 and line_name.isalpha():
        return "tram"
    if only_digits and int(line_name) < 75:
        return "tram"

    # Bus: number below 100 with an A/B suffix, or a plain number from 100 up.
    head, tail = line_name[:-1], line_name[-1:]
    if len(line_name) > 1 and head.isdigit() and int(head) < 100 and tail in ("A", "B"):
        return "bus"
    if only_digits and int(line_name) >= 100:
        return "bus"

    # Night bus: anything left that starts with "N".
    if line_name.startswith("N"):
        return "nightbus"

    return "NA"

In [23]:
def process_stop_lines(csv_file):
    """Connect Stop nodes to the Line nodes listed for them in a CSV file.

    Each row must provide the columns ``HTXT`` (stop name) and ``HLINIEN``
    (comma-separated line names). Rows whose stop name is not found in the
    graph are reported and skipped. Missing Line nodes are created with a type
    from ``classify_line``; missing ``has_line`` relationships are created.
    A status line is printed for every step.

    Args:
        csv_file: Path of the CSV file to read.

    Raises:
        KeyError: If a row lacks the ``HTXT`` or ``HLINIEN`` column.
    """
    # newline='' is what the csv module requires for files handed to a reader.
    # Explicit UTF-8 keeps stop names with umlauts identical on every platform,
    # so they can match the names stored in the graph.
    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for count, row in enumerate(reader, start=1):
            print(f"run {count}")

            stop_name = row['HTXT'].strip()
            # Split on the comma alone and strip each entry, so spacing around
            # the separator does not matter. Empty entries are dropped, since
            # an empty HLINIEN cell would otherwise create a Line named ''.
            raw_names = (row['HLINIEN'] or '').split(",")
            lines = [name.strip() for name in raw_names if name.strip()]

            # Find the Stop node by name.
            # NOTE(review): get_or_none presumably still raises when several
            # Stop nodes share this name - confirm that stop names are unique.
            stop_node = Stop.nodes.get_or_none(name=stop_name)
            if not stop_node:
                print(f"Stop {stop_name} NOT FOUND in the database.")
                continue

            # Process each line in HLINIEN
            for line_name in lines:
                # Create or get the Line node
                line_node = Line.nodes.get_or_none(name=line_name)

                if not line_node:
                    line_type = classify_line(line_name)

                    line_node = Line(
                        name = line_name,
                        type = line_type
                    ).save()
                    print(f"Line {line_name} ADDED.")

                    # The node is still stored with type "NA"; flag it for review.
                    if line_type == "NA":
                        print(f'line_type of {line_name} NOT KNOWN!')

                else:
                    print(f"Line {line_name} ALREADY EXISTED.")

                # Create the relationship from Stop to Line
                if not stop_node.has_line.is_connected(line_node):
                    stop_node.has_line.connect(line_node)
                    print(f"Line {line_name} at stop {stop_name} ADDED.")
                else:
                    print(f"Line {line_name} at stop {stop_name} ALREADY EXISTED.")

# Link the stops to their lines using the stop/line CSV file; prints progress for every row.
process_stop_lines(lines_paths)

run 1
Line N71 ALREADY EXISTED.
Line N71 at stop Molitorgasse ALREADY EXISTED.
run 2
Line 71 ALREADY EXISTED.
Line 71 at stop Molitorgasse ALREADY EXISTED.
run 3
Line N71 ALREADY EXISTED.
Line N71 at stop Molitorgasse ALREADY EXISTED.
run 4
Line N71 ALREADY EXISTED.
Line N71 at stop Molitorgasse ALREADY EXISTED.
run 5
Line 71 ALREADY EXISTED.
Line 71 at stop Molitorgasse ALREADY EXISTED.
run 6
Line 13A ALREADY EXISTED.
Line 13A at stop Mommsengasse ALREADY EXISTED.
run 7
Line 450 ALREADY EXISTED.
Line 450 at stop Mondweg ALREADY EXISTED.
Line 50B ALREADY EXISTED.
Line 50B at stop Mondweg ALREADY EXISTED.
Line 49A ALREADY EXISTED.
Line 49A at stop Mondweg ALREADY EXISTED.
run 8
Line 450 ALREADY EXISTED.
Line 450 at stop Mondweg ALREADY EXISTED.
Line 50B ALREADY EXISTED.
Line 50B at stop Mondweg ALREADY EXISTED.
Line 49A ALREADY EXISTED.
Line 49A at stop Mondweg ALREADY EXISTED.
run 9
Line 450 ALREADY EXISTED.
Line 450 at stop Mondweg ALREADY EXISTED.
Line 49A ALREADY EXISTED.
Line 49A a