In [2]:
import json
import os

import pandas as pd

In [3]:
# Relative path to the folder holding the raw event files
# (each file contains one JSON document per line).
DATA_FOLDER_PATH = '../data/'

# Parse every file into its own normalized frame and concatenate once at the
# end: calling pd.concat inside the loop re-copies all previously loaded rows
# on every iteration.
file_frames = []

# os.listdir returns entries in arbitrary order; sort so the row order of `df`
# is reproducible across runs and machines.
for file_name in sorted(os.listdir(DATA_FOLDER_PATH)):
    file_path = os.path.join(DATA_FOLDER_PATH, file_name)

    # Sub-directories cannot be opened as files; skip them instead of crashing.
    if not os.path.isfile(file_path):
        continue

    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (such as a trailing newline) are not valid JSON; skip them.
        records = [json.loads(line) for line in f if line.strip()]

    # Files without records contribute no rows or columns, so leave them out.
    if records:
        # Flatten nested objects into dotted column names (e.g. 'data.location.lat').
        file_frames.append(pd.json_normalize(records))

# Each file keeps its own 0-based row index, so index labels repeat across files.
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(file_frames) if file_frames else pd.DataFrame()

## Operating Period
Note: the dataset contains only two `operating_period` rows.

In [4]:
# Columns kept for operating-period events.
operating_period_columns = ['event', 'on', 'at', 'organization_id', 'data.id', 'data.start', 'data.finish']

# Select rows and columns in a single .loc call rather than chaining
# df.loc[rows][cols], which builds an intermediate full-width frame first.
df_operating_period = df.loc[df['on'] == 'operating_period', operating_period_columns]
df_operating_period

Unnamed: 0,event,on,at,organization_id,data.id,data.start,data.finish
0,create,operating_period,2019-06-01T18:17:03.087Z,org-id,op_2,2019-06-01T18:17:04.079Z,2019-06-01T18:22:04.079Z
0,create,operating_period,2019-06-01T18:17:04.086Z,org-id,op_1,2019-06-01T18:23:04.079Z,2019-06-01T18:28:04.079Z


## Vehicle

#### Data Example

In [5]:
# Columns kept for vehicle events.
df_vehicle_columns = ['event', 'on', 'at', 'organization_id', 'data.id', 'data.location.lat', 'data.location.lng', 'data.location.at']

# Select rows and columns in a single .loc call rather than chaining
# df.loc[rows][cols], which builds an intermediate full-width frame first.
df_vehicle = df.loc[df['on'] == 'vehicle', df_vehicle_columns]
df_vehicle

Unnamed: 0,event,on,at,organization_id,data.id,data.location.lat,data.location.lng,data.location.at
0,update,vehicle,2019-06-01T18:17:10.101Z,org-id,bac5188f-67c6-4965-81dc-4ef49622e280,52.45133,13.46045,2019-06-01T18:17:10.101Z
1,update,vehicle,2019-06-01T18:17:10.109Z,org-id,3a3eb23a-f22e-4fe9-8c20-f37220a81909,52.45848,13.52647,2019-06-01T18:17:10.109Z
2,update,vehicle,2019-06-01T18:17:10.109Z,org-id,f0b87796-b25c-40b0-9145-8822509c17e1,52.50309,13.33435,2019-06-01T18:17:10.109Z
3,update,vehicle,2019-06-01T18:17:10.111Z,org-id,9152c5d8-79cf-4fe3-96ad-359abb08a729,52.50536,13.51655,2019-06-01T18:17:10.111Z
4,update,vehicle,2019-06-01T18:17:10.111Z,org-id,f06eb89c-ada0-41cb-bdd1-0a60398f901b,52.49697,13.44936,2019-06-01T18:17:10.111Z
...,...,...,...,...,...,...,...,...
0,update,vehicle,2019-06-01T18:29:02.185Z,org-id,d57e9c3e-3479-47cd-9b20-9e317616a3f0,52.43576,13.41212,2019-06-01T18:29:02.185Z
1,deregister,vehicle,2019-06-01T18:29:02.186Z,org-id,cf25c1c6-1889-4dd1-95df-18141c4c746c,,,
0,update,vehicle,2019-06-01T18:29:03.188Z,org-id,d57e9c3e-3479-47cd-9b20-9e317616a3f0,52.43604,13.41287,2019-06-01T18:29:03.188Z
0,update,vehicle,2019-06-01T18:29:04.195Z,org-id,d57e9c3e-3479-47cd-9b20-9e317616a3f0,52.43606,13.41305,2019-06-01T18:29:04.195Z


#### Data Info

In [6]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
df_vehicle.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35349 entries, 0 to 0
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   event              35349 non-null  object 
 1   on                 35349 non-null  object 
 2   at                 35349 non-null  object 
 3   organization_id    35349 non-null  object 
 4   data.id            35349 non-null  object 
 5   data.location.lat  35149 non-null  float64
 6   data.location.lng  35149 non-null  float64
 7   data.location.at   35149 non-null  object 
dtypes: float64(2), object(6)
memory usage: 2.4+ MB


#### Null Counts

In [7]:
# Count the missing values in each column of the vehicle events.
df_vehicle.isna().sum()

event                  0
on                     0
at                     0
organization_id        0
data.id                0
data.location.lat    200
data.location.lng    200
data.location.at     200
dtype: int64

Location values are not expected for `register` and `deregister` events, so the null check below is restricted to `update` events.

In [8]:
# Per-column count of missing values, restricted to 'update' events.
(
    df_vehicle[df_vehicle['event'].eq('update')]
    .isna()
    .sum()
)

event                0
on                   0
at                   0
organization_id      0
data.id              0
data.location.lat    0
data.location.lng    0
data.location.at     0
dtype: int64