## Import

In [1]:
import sys

# Make the local `modules` folder importable (presumably where `mongo_db` lives — confirm).
# NOTE(review): machine-specific absolute Windows path; the notebook only runs where this
# directory exists. Consider a path relative to the repository instead.
sys.path.append('C:\\Users\\Marco\\Documents\\GitHub\\GeoSpatial-analysis\\modules')

In [2]:
import json
import geojson
import numpy as np
import pandas as pd
import pymongo as pm
import geopandas as gpd
from mongo_db import retrieve_database_and_collections, take_empty_collections

## Retrieve DB and Collections

In [42]:
# Identifiers passed to retrieve_database_and_collections to select the data to inspect.
# NOTE(review): `day` looks like a DD_MM_YYYY date — confirm against mongo_db.
day = "20_04_2023"
db_name = "facility_location_Bergen"

In [24]:
# Open the database and pick the first (and only requested) collection: "processed".
db, collections = retrieve_database_and_collections(db_name, day, ["processed"])
collection_names = list(collections.keys())
collection = collections[collection_names[0]]

## Query Data

We want to verify that each road segment has been retrieved the same number of times, which should be equal to "n" (the number of API calls made to collect the data).

In [25]:
# Every distinct value of the "geometry" field, i.e. one entry per distinct road segment.
distinct_geometry_values = collection.distinct("geometry")

In [26]:
len(distinct_geometry_values)

924

In [28]:
# One count query per distinct geometry; position i matches distinct_geometry_values[i].
documents_per_geometry = [
    collection.count_documents({"geometry": geometry})
    for geometry in distinct_geometry_values
]
geometries_count = pd.Series(documents_per_geometry)

There are 924 distinct geometries (i.e. road segments), but:
- only 847 of the 924 geometries have exactly "n" documents associated with them
- 4 geometries have more than "n" documents associated with them (probably both directions of each geometry have been collected)
- the remaining 73 geometries have fewer than "n" documents associated with them (i.e. the road segment was not collected on every API call)

In [29]:
geometries_count.value_counts().sort_index()

1       24
2       12
3        3
4        4
5        4
6        4
7        2
10       5
11       3
14       1
20       1
51       1
53       1
54       1
57       1
58       2
60       4
63     847
126      4
Name: count, dtype: int64

We explore the 4 geometries that have more than "n" documents associated with them.

In [30]:
# n is the number of API calls made to collect the data; a road segment that was
# returned by every call therefore has exactly n documents.
n = 63

In [31]:
# Positions (into distinct_geometry_values) of the geometries with more than n documents.
has_more_than_n = geometries_count > n
g_gtn_index = geometries_count[has_more_than_n].index

In [32]:
# Geometries ("g") with a document count greater than n ("gtn"), looked up by position.
g_gtn = [distinct_geometry_values[i] for i in g_gtn_index]

In [34]:
# One cursor per over-represented geometry, selecting all of its documents.
g_gtn_cursors = [collection.find({"geometry": geometry}) for geometry in g_gtn]

In [35]:
# Distinct "description" values found among the documents of each over-represented geometry.
g_gtn_distinct_description = [
    geometry_cursor.distinct("description") for geometry_cursor in g_gtn_cursors
]

In [36]:
g_gtn_distinct_description

[['Avkjøring mot Sentrum', 'Knappetunnelen'],
 ['Avkjøring mot Flesland', 'Straume'],
 ['Avkjøring mot Sentrum', 'Knappetunnelen'],
 ['Avkjøring mot Flesland', 'Straume']]

We notice that data has probably been collected in both directions for those 4 geometries.

In [37]:
# For every (geometry, description) pair, report whether it has exactly n documents.
for geometry, distinct_description in zip(g_gtn, g_gtn_distinct_description):
    for description in distinct_description:
        pair_query = {"geometry": geometry, "description": description}
        has_exactly_n = collection.count_documents(pair_query) == n
        print(description, f"is equal to {n}: ", has_exactly_n)

Avkjøring mot Sentrum is equal to 63:  True
Knappetunnelen is equal to 63:  True
Avkjøring mot Flesland is equal to 63:  True
Straume is equal to 63:  True
Avkjøring mot Sentrum is equal to 63:  True
Knappetunnelen is equal to 63:  True
Avkjøring mot Flesland is equal to 63:  True
Straume is equal to 63:  True


Still, we are going to check whether or not the documents are duplicated when excluding the fields "_id" and "description".

In [38]:
# For each over-represented geometry build two cursors, one per description
# (first description -> cursor0, second description -> cursor1).
geometry_description_pairs = list(zip(g_gtn, g_gtn_distinct_description))

cursor0 = [
    collection.find({"geometry": geometry, "description": descriptions[0]})
    for geometry, descriptions in geometry_description_pairs
]
cursor1 = [
    collection.find({"geometry": geometry, "description": descriptions[1]})
    for geometry, descriptions in geometry_description_pairs
]

In [39]:
# Fields compared between the two documents of each pair; only "currentFlow" is active.
# Other candidates, currently disabled: "sourceUpdated", "api_call_time", "geometry", "length".
keys = ["currentFlow"]

We can see that the field "currentFlow" is not always the same across time, so we can't say that the documents are duplicates even if the "geometry" field is the same.

In [40]:
# Walk the two cursors of each geometry in lockstep and count how many document pairs
# differ on the compared fields.
# NOTE(review): documents are paired by position and neither query specifies a sort,
# so this assumes both cursors return their documents in the same order.
for first_cursor, second_cursor in zip(cursor0, cursor1):
    matching_pairs = sum(
        [first_doc[key] for key in keys] == [second_doc[key] for key in keys]
        for first_doc, second_doc in zip(first_cursor, second_cursor)
    )

    print("distinct documents", n - matching_pairs)

distinct documents 4
distinct documents 15
distinct documents 8
distinct documents 33


## Data Cleaning

### Remove data outside desired boundaries

The retrieved data lie within the bounding box BBOX = 'bbox:5.1334,60.2603,5.5563,60.5567', but we are only interested in the city of Bergen, so we are going to remove the data outside the city boundaries.

In [56]:
# Same values as in the "Retrieve DB and Collections" section, repeated here
# (presumably so the cleaning section can be run on its own — confirm).
day = "20_04_2023"
db_name = "facility_location_Bergen"

In [69]:
def filter_data_geographically(polygon: list):
    """Copy the processed documents lying inside ``polygon`` into the clean collection.

    The database and collections are located through the notebook-level
    variables ``db_name`` and ``day``.

    Parameters
    ----------
    polygon : list
        Exterior ring of the area of interest, as a list of
        ``[longitude, latitude]`` pairs (GeoJSON coordinate order). The ring
        should be closed, i.e. the first vertex repeated as the last one.

    Returns
    -------
    None
        Matching documents are inserted into the clean collection as a side
        effect; nothing is inserted when no document matches.
    """
    # Build the GeoJSON polygon from the argument (one exterior ring, no holes).
    # The previous version ignored the argument and always used the global
    # `bergen_polygon_vertex`.
    area_of_interest = geojson.Polygon([polygon])

    # retrieve database and collections
    _, collections = retrieve_database_and_collections(db_name, day, ["processed", "clean"])
    key_list = list(collections.keys())

    processed_collection = collections[key_list[0]]
    clean_collection = collections[key_list[1]]

    # 2dsphere index on the field queried with $geoWithin below.
    processed_collection.create_index([("geometry", pm.GEOSPHERE)])

    documents_inside = list(processed_collection.find(
        {"geometry": {"$geoWithin": {"$geometry": area_of_interest}}}))

    # insert_many raises on an empty batch, so only insert when something matched.
    if documents_inside:
        clean_collection.insert_many(documents_inside)

In [67]:
# Vertices of the area to keep around Bergen, given as a closed ring: the first
# vertex is repeated as the last one. Passed to filter_data_geographically.
# NOTE(review): pairs appear to be [longitude, latitude] (GeoJSON order) — confirm.
bergen_polygon_vertex = [
     [5.161214, 60.372825],
     [5.211224, 60.398977],
     [5.255800, 60.409478],
     [5.240007, 60.479588],
     [5.259292, 60.528707],
     [5.322314, 60.545026],
     [5.542953, 60.421316],
     [5.486513, 60.348389],
     [5.343004, 60.257903],
     [5.256487, 60.240867],
     [5.227651, 60.242074],
     [5.190497, 60.291077],
     [5.197846, 60.325154],
     [5.183965, 60.337078],
     [5.169675, 60.340815],
     [5.161214, 60.372825]]

In [70]:
filter_data_geographically(bergen_polygon_vertex)

### Remove non complete data

Because we need to analyze the data in a time series fashion, we can get rid of road segments that have fewer than "n" documents associated with them.

Therefore we are going to keep only the road segments that have exactly "n" or "2n" documents associated with them.

In [77]:
def keep_common_road_segments_across_time(day, db_name):
    """Delete from the clean collection the road segments that are incomplete across time.

    A geometry is kept only when the number of clean documents carrying it is
    a multiple of ``n``, where ``n`` is the number of documents in the raw
    collection (presumably one raw document per API call — confirm). Every
    document whose geometry is not kept is deleted from the clean collection.

    Parameters
    ----------
    day : str
        Day identifier forwarded to ``retrieve_database_and_collections``.
    db_name : str
        Database name forwarded to ``retrieve_database_and_collections``.

    Raises
    ------
    ValueError
        If the raw collection is empty. Without this guard no geometry would
        qualify and the whole clean collection would be deleted.
    """
    # retrieve database and collections
    _, collections = retrieve_database_and_collections(db_name, day, ["raw", "clean"])
    key_list = list(collections.keys())

    raw_collection = collections[key_list[0]]
    clean_collection = collections[key_list[1]]

    n = raw_collection.count_documents({})
    if n == 0:
        raise ValueError(
            "raw collection is empty: cannot determine the expected number of "
            "documents per road segment"
        )

    # One count query per distinct geometry; keep those seen a multiple of n times
    # (n for one direction, 2n when both directions were collected).
    geometry_to_keep = [
        geometry
        for geometry in clean_collection.distinct("geometry")
        if clean_collection.count_documents({"geometry": geometry}) % n == 0
    ]

    clean_collection.delete_many({"geometry": {"$nin": geometry_to_keep}})

In [78]:
keep_common_road_segments_across_time(day, db_name)

### Remove unwanted fields

In [None]:
def remove_unnecessary_field(day, db_name):
    """Remove the nested ``currentFlow.subSegments`` field from the clean collection.

    Only documents that actually contain the field are updated.
    """
    # retrieve database and collections
    _, collections = retrieve_database_and_collections(db_name, day, ["clean"])
    clean_collection = collections[list(collections.keys())[0]]

    has_sub_segments = {'currentFlow.subSegments': {'$exists': True}}
    drop_sub_segments = {'$unset': {'currentFlow.subSegments': ''}}
    clean_collection.update_many(has_sub_segments, drop_sub_segments)

In [None]:
remove_unnecessary_field(day, db_name)