## Import

In [1]:
# sys is only needed to extend the module search path below.
import sys

# Add the project's custom-modules folder to the import path
# (presumably where pymongo_get_database and get_api_call_time live — TODO confirm).
# NOTE(review): absolute, machine-specific path; the notebook cannot import these
# modules on another machine unless this exact folder exists.
sys.path.append(r'C:\Users\Marco\Documents\GitHub\GeoSpatial-analysis\facility-location-Bergen\src\facility_location_Bergen\custome_modules')

In [47]:
# Get the database using the method we defined in pymongo_test_insert file
import os
import pytz
import copy
import json
import geojson
import numpy as np
import pandas as pd
from dateutil import parser
from pymongo import GEOSPHERE
from pymongo_get_database import get_database
from get_api_call_time import get_api_call_time
from geojson import GeometryCollection, LineString
from kedro.extras.datasets.json import JSONDataSet

## Connect to MongoDB

### Retrieve the database and the collection

In [129]:
def retrieve_database_and_collections(db_name: str, collections_name: list):
    """Open a database and look up the raw/processed collections by name.

    The arguments are validated before ``get_database`` is called, so an
    invalid ``collections_name`` fails immediately instead of after a database
    handle has already been requested.

    Args:
        db_name: Name passed to ``get_database``.
        collections_name: List of exactly two collection names (raw data and
            processed data).

    Returns:
        A ``(db, collections)`` tuple, where ``collections`` maps every
        requested name to ``db[name]``.

    Raises:
        TypeError: If ``collections_name`` is not a list with exactly two
            elements. ``TypeError`` is kept for a wrong length as well, so
            existing callers that catch it keep working.
    """
    # Validate first: nothing below should run for malformed input.
    if not (isinstance(collections_name, list) and len(collections_name) == 2):
        raise TypeError('collections_name must be a list with two elements (raw data and processed data)')

    db = get_database(db_name)
    collections = {collection_name: db[collection_name] for collection_name in collections_name}
    return db, collections

Define the database name and the collection names

In [130]:
# Database to use and the days (DD_MM_YYYY) whose collections are handled.
db_name = "facility_location_Bergen"
days = ["20_04_2023"]

# All raw-data collection names first, then all processed-data ones.
collections_names = [
    f"{prefix}{day}"
    for prefix in ("raw_data_", "processed_data_")
    for day in days
]

In [132]:
# Fetch the database handle and the collections named above.
db, collections = retrieve_database_and_collections(
    db_name=db_name,
    collections_name=collections_names,
)

In [133]:
def take_empty_collections(collections: dict):
    """Return the subset of ``collections`` that currently hold no documents.

    Args:
        collections: Mapping of collection name to a collection object that
            exposes ``count_documents``.

    Returns:
        A new dict with the same name/collection pairs, restricted to the
        collections for which ``count_documents({})`` is 0.
    """
    return {
        name: handle
        for name, handle in collections.items()
        if handle.count_documents({}) == 0
    }

Filter the collections to only retrieve the ones that are empty

In [134]:
# Only collections without documents still need to be filled.
empty_collections = take_empty_collections(collections=collections)

### Insert documents in the collections

In [29]:
# Folder that holds the raw files; compose_url_to_raw_data lists its sub-folders
# and picks those whose first 10 characters are one of the requested days.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this folder exists.
root_dir = r"C:\Users\Marco\Documents\GitHub\GeoSpatial-analysis\facility-location-Bergen\data\01_raw\Bergen"

In [55]:
def compose_url_to_raw_data(empty_collections, root_dir: str = root_dir):
    """Build the paths of every file stored in the day folders of the given collections.

    The day is read from the last 10 characters of each collection name. A
    sub-folder of ``root_dir`` is selected when its first 10 characters equal
    one of those days.

    Args:
        empty_collections: Mapping whose keys are collection names ending with the day.
        root_dir: Folder containing the per-day sub-folders.

    Returns:
        List of ``root_dir/<day folder>/<file>`` paths, in ``os.listdir`` order.
    """
    wanted_days = np.unique([name[-10:] for name in empty_collections.keys()])

    file_urls = []
    for entry in os.listdir(root_dir):
        if entry[:10] not in wanted_days:
            continue
        day_folder = os.path.join(root_dir, entry)
        for file_name in os.listdir(day_folder):
            file_urls.append(os.path.join(day_folder, file_name))
    return file_urls

Compose the paths of the raw JSON files

In [56]:
# Paths of the raw files belonging to the days of the still-empty collections.
urls = compose_url_to_raw_data(empty_collections=empty_collections)

In [58]:
def from_urls_to_JSONDataSet(urls: list):
    """Wrap every path in a ``JSONDataSet``.

    Args:
        urls: File paths of the JSON files.

    Returns:
        List with one ``JSONDataSet(filepath=path)`` per input path, in input order.
    """
    return [JSONDataSet(filepath=path) for path in urls]

In [95]:
def _raw_data_key(url: str) -> str:
    """Return the ``raw_data`` key for one file path: its fixed-width tail without ``.json``."""
    # Tail length per time-of-day keyword, checked in this order. Every length is
    # the keyword length plus 32 — presumably the fixed-width remainder of the
    # file name, ".json" included (TODO confirm against the files on disk).
    tail_lengths = {"afternoon": 41, "midday": 38, "morning": 39}
    for keyword, tail_length in tail_lengths.items():
        if keyword in url:
            return url[-tail_length:].removesuffix(".json")
    raise ValueError(
        f"cannot derive a key for {url!r}: expected 'afternoon', 'midday' or 'morning' in the path"
    )


def load_raw_data(urls: list):
    """Load every raw JSON file and index the content by a key taken from its path.

    Args:
        urls: Paths of the JSON files. Each path must contain ``afternoon``,
            ``midday`` or ``morning``.

    Returns:
        Dict mapping the key derived from each path to the loaded content.
        If two paths yield the same key, the later one wins.

    Raises:
        ValueError: If a path contains none of the expected keywords.
            Previously such a path either raised ``UnboundLocalError`` or was
            silently stored under the key of the preceding file.
    """
    # Derive (and thereby validate) all keys before any file is opened.
    keys = [_raw_data_key(url) for url in urls]
    datasets = from_urls_to_JSONDataSet(urls)

    raw_data = {}
    for key, dataset in zip(keys, datasets):
        raw_data[key] = dataset.load()

    return raw_data

Load the raw JSON files

In [96]:
# Content of every raw file, indexed by the key derived from its path.
raw_data = load_raw_data(urls=urls)

In [100]:
def get_time_from_raw_data(raw_data: dict):
    """Map every key of ``raw_data`` to ``get_api_call_time(key)``.

    Only the keys are used; the loaded file content is ignored.

    Args:
        raw_data: Dict whose keys are passed to ``get_api_call_time``.

    Returns:
        Dict with the same keys, each mapped to the result of ``get_api_call_time``.
    """
    return {name: get_api_call_time(name) for name in raw_data.keys()}

Get the API call times from the JSON file names

In [111]:
# One API call time per loaded file, derived from the file's key.
api_call_times = get_time_from_raw_data(raw_data=raw_data)

In [116]:
def modify_raw_data_time_fields(raw_data: dict, api_call_times: dict):
    """Attach the parsed ``sourceUpdated`` time and the API call time to the documents.

    Each returned document is a deep copy, so ``raw_data`` is left untouched
    and can still be inserted unchanged into the raw collection afterwards.

    Args:
        raw_data: Dict mapping a file key to its loaded content. Each content
            must provide ``"sourceUpdated"`` (a date string) and ``"results"``
            (an iterable of dicts).
        api_call_times: Dict mapping the same keys to the API call time.

    Returns:
        Dict mapping each key to one entry of ``results`` with
        ``"sourceUpdated"`` set to the parsed datetime and ``"api_call_time"``
        set to ``api_call_times[key]``. Keys whose ``results`` is empty are absent.

    NOTE(review): every entry of ``results`` is written to the same key, so only
    the LAST entry of each file survives. That looks unintended, but fixing it
    would change the return shape that the downstream steps rely on, so the
    behaviour is kept here.
    """
    time_processed_collections_documents = {}

    for key, value in raw_data.items():
        dt = parser.parse(value["sourceUpdated"])

        for location in value["results"]:
            # Work on a copy: the original code edited the dicts inside raw_data.
            document = copy.deepcopy(location)
            document["sourceUpdated"] = dt
            document["api_call_time"] = api_call_times[key]
            time_processed_collections_documents[key] = document

    return time_processed_collections_documents

In [118]:
def modify_time_processed_data_geometry_field(input_data: dict):
    """Add a GeoJSON ``geometry`` to every document and flatten its ``location`` field.

    For each document the links under ``location.shape.links`` become a
    ``GeometryCollection`` of ``LineString`` objects (coordinates as
    ``(lng, lat)``). The fields of ``location`` are then moved to the top
    level, and ``location`` and ``shape`` are removed.

    The documents in ``input_data`` are deep-copied first, so the caller's
    dicts are not modified.

    Args:
        input_data: Dict mapping a key to a document that contains
            ``document["location"]["shape"]["links"]``, where every link has a
            ``"points"`` list of dicts with ``"lat"`` and ``"lng"``.

    Returns:
        Dict mapping the same keys to the new, flattened documents.
    """
    geometry_processed_collections_documents = {}

    for key, value in input_data.items():
        # Really copy the document; the original code only aliased it and so
        # popped 'location' from the caller's data as well.
        new_document = copy.deepcopy(value)
        geometry_processed_collections_documents[key] = new_document

        # extract the links field from the document
        links = new_document['location']['shape']['links']
        # create the geometry field (in order to comply with the GeoJSON format)
        new_document["geometry"] = GeometryCollection(
            [LineString([(point['lng'], point['lat']) for point in link['points']]) for link in links])

        # bring embedded fields to the top level
        for field in new_document['location']:
            new_document[field] = new_document['location'][field]

        # remove the fields that are now duplicated or replaced by 'geometry'
        new_document.pop('location')
        new_document.pop('shape')

    return geometry_processed_collections_documents

In [121]:
def process_raw_data(raw_data: dict, api_call_times: dict):
    """Run the two processing steps on the raw data: time fields first, then geometry.

    Args:
        raw_data: Loaded raw files, passed to ``modify_raw_data_time_fields``.
        api_call_times: API call time per key, passed to ``modify_raw_data_time_fields``.

    Returns:
        The result of ``modify_time_processed_data_geometry_field`` applied to
        the time-processed documents.
    """
    return modify_time_processed_data_geometry_field(
        modify_raw_data_time_fields(raw_data, api_call_times)
    )

Process raw data 

In [122]:
# Documents for the processed collections, built from the raw files.
processed_collections_documents = process_raw_data(
    raw_data=raw_data,
    api_call_times=api_call_times,
)

In [143]:
def insert_documents_in_the_collections(raw_data: dict, processed_collections_documents: dict, collections: dict):
    """Insert the raw and the processed documents into the matching collections.

    A collection whose name contains ``"raw"`` receives the values of
    ``raw_data``. Otherwise, one whose name contains ``"processed"`` receives
    the values of ``processed_collections_documents``. Any other collection is
    left alone.

    Nothing is inserted when the corresponding document list is empty, because
    pymongo's ``insert_many`` rejects an empty list. This keeps a re-run with
    nothing left to load from failing.

    Args:
        raw_data: Dict whose values are the raw documents.
        processed_collections_documents: Dict whose values are the processed documents.
        collections: Dict mapping collection names to collection objects that
            expose ``insert_many``.

    NOTE(review): all documents go into every matching collection; the day in
    the collection name is not compared with the documents.
    """
    raw_documents = list(raw_data.values())
    processed_documents = list(processed_collections_documents.values())

    for name, collection in collections.items():
        if "raw" in name:
            documents = raw_documents
        elif "processed" in name:
            documents = processed_documents
        else:
            continue

        # insert_many raises on an empty batch, so skip it.
        if documents:
            collection.insert_many(documents)

Insert the data in the collections

In [None]:
# Insert only into the collections that were found empty above: the files were
# loaded for exactly those collections, so passing every collection would add
# duplicate documents to the ones that are already populated.
insert_documents_in_the_collections(raw_data, processed_collections_documents, empty_collections)