## Import

In [1]:
import os
import sys

# Directory that holds the project's helper modules; presumably this is where
# pymongo_get_database lives — verify against the repository layout.
# Set the GEOSPATIAL_MODULES_DIR environment variable to run the notebook on
# another machine; the default keeps the original hard-coded location.
MODULES_DIR = os.environ.get(
    "GEOSPATIAL_MODULES_DIR",
    'C:\\Users\\Marco\\Documents\\GitHub\\GeoSpatial-analysis\\modules',
)

# Append only once so that re-running this cell does not grow sys.path.
if MODULES_DIR not in sys.path:
    sys.path.append(MODULES_DIR)

In [2]:
import json
import geojson
import numpy as np
import pandas as pd
import pymongo as pm
import geopandas as gpd
from pymongo_get_database import get_database

## Retrieve DB and Collections

In [28]:
# Day whose data is analysed; it is appended to the collection names below.
# The value looks like DD_MM_YYYY — TODO confirm the format.
day = "20_04_2023"

In [29]:
# Open the traffic-flow database and pick the raw and processed collections
# that belong to the selected day.
db_name = "bergenTrafficFlowData"
db = get_database(db_name)

collection_name_0, collection_name_1 = (
    f"{prefix}_data_{day}" for prefix in ("raw", "processed")
)

collection_raw, collection_processed = db[collection_name_0], db[collection_name_1]

## Query Data

We want to verify that each road segment has been retrieved the same number of times, which should be equal to "n" (the number of API calls made to collect the data).

In [30]:
# Every distinct value of the "geometry" field in the processed collection;
# each one is treated as one road segment in the rest of the notebook.
distinct_geometry_values = collection_processed.distinct("geometry")

In [31]:
# Number of distinct geometries (road segments) found.
len(distinct_geometry_values)

924

In [32]:
# Count the documents attached to each distinct geometry (one query per geometry).
# The resulting Series is positionally aligned with distinct_geometry_values,
# so its index can be used to look the geometries up again.
documents_per_geometry = []
for geometry in distinct_geometry_values:
    documents_per_geometry.append(
        collection_processed.count_documents({"geometry": geometry})
    )

geometries_count = pd.Series(documents_per_geometry)

There are 924 distinct geometries (i.e. road segments), but:
- only 847 of the 924 geometries have exactly "n" documents associated with them
- 4 geometries have more than "n" documents associated with them (probably both directions of each geometry have been collected)
- the remaining 73 geometries have fewer than "n" documents associated with them (i.e. the road segment was not collected at every API call)

In [36]:
# Distribution of documents-per-geometry: for each document count (index),
# how many geometries have that many documents (value).
geometries_count.value_counts().sort_index()

1       24
2       12
3        3
4        4
5        4
6        4
7        2
10       5
11       3
14       1
20       1
51       1
53       1
54       1
57       1
58       2
60       4
63     847
126      4
Name: count, dtype: int64

We explore the 4 geometries that have more than "n" documents associated with them.

In [37]:
# n is the number of API calls made to collect the data, i.e. the number of
# documents each road segment is expected to have.
n = 63

In [38]:
# Positions of the geometries that occur in more than n documents.
exceeds_n = geometries_count > n
g_gtn_index = geometries_count[exceeds_n].index

In [39]:
# Map the selected positions back to the geometry documents they refer to.
g_gtn = []
for position in g_gtn_index:
    g_gtn.append(distinct_geometry_values[position])

In [40]:
# One query cursor per over-represented geometry, selecting all of its documents.
g_gtn_cursors = [
    collection_processed.find({"geometry": geometry}) for geometry in g_gtn
]

In [41]:
# Distinct "description" values among the documents of each over-represented geometry.
g_gtn_distinct_description = [
    geometry_cursor.distinct("description") for geometry_cursor in g_gtn_cursors
]

In [42]:
# Show the descriptions found for each over-represented geometry.
g_gtn_distinct_description

[['Avkjøring mot Sentrum', 'Knappetunnelen'],
 ['Avkjøring mot Flesland', 'Straume'],
 ['Avkjøring mot Sentrum', 'Knappetunnelen'],
 ['Avkjøring mot Flesland', 'Straume']]

We notice that data for both directions of those 4 geometries has probably been collected.

In [43]:
# For every (geometry, description) pair, report whether exactly n documents exist.
for geometry, descriptions in zip(g_gtn, g_gtn_distinct_description):
    for description in descriptions:
        pair_filter = {"geometry": geometry, "description": description}
        has_n_documents = collection_processed.count_documents(pair_filter) == n
        print(description, f"is equal to {n}: ", has_n_documents)

Avkjøring mot Sentrum is equal to 63:  True
Knappetunnelen is equal to 63:  True
Avkjøring mot Flesland is equal to 63:  True
Straume is equal to 63:  True
Avkjøring mot Sentrum is equal to 63:  True
Knappetunnelen is equal to 63:  True
Avkjøring mot Flesland is equal to 63:  True
Straume is equal to 63:  True


Still, we are going to check whether or not the documents are duplicated when excluding the fields "_id" and "description".

In [44]:
# For each over-represented geometry build two cursors, one per description:
# cursor0 selects the documents with the first description, cursor1 those with the second.
cursor0, cursor1 = [], []

for geometry, descriptions in zip(g_gtn, g_gtn_distinct_description):
    for cursors, position in ((cursor0, 0), (cursor1, 1)):
        pair_filter = {"geometry": geometry, "description": descriptions[position]}
        cursors.append(collection_processed.find(pair_filter))

In [45]:
# Fields compared between the two documents of each pair. Only "currentFlow" is
# enabled; the original cell left these other candidates commented out:
# "sourceUpdated", "api_call_time", "geometry", "length".
keys = ["currentFlow"]

We can see that the field "currentFlow" is not always the same across time, so we can't say that the documents are duplicates even if the "geometry" field is the same.

In [46]:
# Walk the two cursors of each over-represented geometry pair by pair and count
# how many pairs differ on the fields listed in `keys`.
for (c0, c1) in zip(cursor0, cursor1):
    # Cursors are single-pass: rewind them so that re-running this cell compares
    # the documents again instead of iterating two already-exhausted cursors.
    c0.rewind()
    c1.rewind()

    # NOTE(review): neither cursor is sorted, so documents are paired in the order
    # the server returns them; sorting both by the collection timestamp would make
    # the pairing explicit — confirm the field name before adding a sort.
    differing_pairs = sum(
        1
        for (d0, d1) in zip(c0, c1)
        if [d0[key] for key in keys] != [d1[key] for key in keys]
    )

    # Count mismatches directly rather than as n - matches, so the figure stays
    # correct even if a cursor yields fewer than n documents.
    print("distinct documents", differing_pairs)

distinct documents 4
distinct documents 15
distinct documents 8
distinct documents 33


## Data Cleaning

### Remove incomplete data

Because we need to analyze the data in a time series fashion, we can get rid of road segments that have fewer than "n" documents associated with them.

Therefore we are going to keep only the road segments that have exactly "n" or "2n" documents associated with them.

In [47]:
# Collection that will receive the cleaned documents for the selected day.
collection_name_2 = f"clean_data_{day}"
collection_processed_2 = db[collection_name_2]

In [48]:
# Positions of the geometries whose document count is a whole multiple of n
# (complete series, possibly collected more than once).
is_multiple_of_n = (geometries_count % n) == 0
g_n_index = geometries_count[is_multiple_of_n].index

In [49]:
# Geometry documents of the road segments that are kept.
geometry_to_keep = []
for position in g_n_index:
    geometry_to_keep.append(distinct_geometry_values[position])

In [50]:
# Select every processed document whose geometry is one of the kept geometries.
keep_filter = {"geometry": {"$in": geometry_to_keep}}
cursor = collection_processed.find(keep_filter)

In [51]:
# Rebuild the clean collection from scratch so that this cell can be re-run:
# the documents are copied with their original "_id", so inserting them a
# second time into a non-empty collection would fail with duplicate-key errors.
collection_processed_2.drop()
# rewind() restarts the query in case the cursor was consumed by an earlier run.
collection_processed_2.insert_many(cursor.rewind())

<pymongo.results.InsertManyResult at 0x269ba4891e0>

### Remove data outside desired boundaries

Because the data were retrieved within the bounding box 'bbox:5.1334,60.2603,5.5563,60.5567', but we are only interested in the city of Bergen, we are going to remove the data outside the city boundaries.

In [23]:
# Vertices of the polygon used as the Bergen city boundary.
# Each pair is presumably [longitude, latitude] (GeoJSON order) — the values are
# consistent with the bounding box quoted above. The first and last vertices are
# identical, so the ring is closed.
bergen_polygon_vertex = [
     [5.161214, 60.372825],
     [5.211224, 60.398977],
     [5.255800, 60.409478],
     [5.240007, 60.479588],
     [5.259292, 60.528707],
     [5.322314, 60.545026],
     [5.542953, 60.421316],
     [5.486513, 60.348389],
     [5.343004, 60.257903],
     [5.256487, 60.240867],
     [5.227651, 60.242074],
     [5.190497, 60.291077],
     [5.197846, 60.325154],
     [5.183965, 60.337078],
     [5.169675, 60.340815],
     [5.161214, 60.372825]]

In [52]:
# Wrap the vertex ring in a list: a GeoJSON Polygon is a list of rings, and this
# one has a single ring.
bergen_polygon = geojson.Polygon([bergen_polygon_vertex])

In [53]:
# Show the resulting GeoJSON object.
bergen_polygon

{"coordinates": [[[5.161214, 60.372825], [5.211224, 60.398977], [5.2558, 60.409478], [5.240007, 60.479588], [5.259292, 60.528707], [5.322314, 60.545026], [5.542953, 60.421316], [5.486513, 60.348389], [5.343004, 60.257903], [5.256487, 60.240867], [5.227651, 60.242074], [5.190497, 60.291077], [5.197846, 60.325154], [5.183965, 60.337078], [5.169675, 60.340815], [5.161214, 60.372825]]], "type": "Polygon"}

#### Index data

In [54]:
# Create a 2dsphere index on the "geometry" field of the clean collection.
collection_processed_2.create_index([("geometry", pm.GEOSPHERE)])

'geometry_2dsphere'

In [55]:
# Number of documents whose geometry matches $geoWithin for the Bergen polygon,
# i.e. the documents that are kept.
collection_processed_2.count_documents({"geometry": {"$geoWithin": {"$geometry": bergen_polygon}}})

38430

In [56]:
# Number of documents whose geometry does NOT match $geoWithin for the Bergen
# polygon; this is the same filter the delete step below uses.
collection_processed_2.count_documents({"geometry": {"$not": {"$geoWithin": {"$geometry": bergen_polygon}}}})

15435

Delete all data that are not included in the city boundaries.

In [57]:
# Remove every document whose geometry is not within the Bergen polygon.
outside_bergen = {"geometry": {"$not": {"$geoWithin": {"$geometry": bergen_polygon}}}}
collection_processed_2.delete_many(outside_bergen)

<pymongo.results.DeleteResult at 0x269a7d30490>

### Remove unwanted fields

In [58]:
# Strip the nested "currentFlow.subSegments" field from every document that has it.
has_sub_segments = {'currentFlow.subSegments': {'$exists': True}}
remove_sub_segments = {'$unset': {'currentFlow.subSegments': ''}}
collection_processed_2.update_many(has_sub_segments, remove_sub_segments)

<pymongo.results.UpdateResult at 0x269ba4cf640>