# Downloading and processing the data for one district

https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-update-by-query.html to update and clean the data ?

In [156]:
zipcode = 75001

### Imports & connection to Elasticsearch

In [157]:
from elasticsearch import Elasticsearch
from ssl import create_default_context
import pandas as pd
import numpy as np
import sys
import datetime

with open('connect.txt') as f:
    str = f.readlines()

es = Elasticsearch(str)

### Change maximum size setting
By default the query returns the first 10,000 hits

In [158]:
es.indices.put_settings(
    index="carshare_car_history",
    body={
        "index.max_result_window": 500000
    }

)

{'acknowledged': True}

## Query
Fetch all data in given district. This should be repeated for each zipcode.

In [159]:
result = es.search(
  index="carshare_car_history",
  body = {
  "size": 200000,
  "query": {

      "bool": {
        "must": {
          "match_all": {}
        },
        "filter": [
    {
    "geo_shape": {
      "ignore_unmapped": "true",
      "location": {
        "relation": "INTERSECTS",
        "shape": {
          "coordinates": [
            [
              [
                1.87411, 49.08001
              ],
              [
                1.87411,
                48.64617
              ],
              [
                2.72473,
                48.64617
              ],
              [
                2.72473,
                49.08001
              ],
              [
                1.87411,
                49.08001
              ]
            ]
          ],
          "type": "Polygon"
        }
      }
    }
  },
  
  {
      "match_phrase": {
        "brand": "Zity"
      }    
  },
  
{
      "match_phrase": {
        "zipcode": zipcode
      }    
  },

  {
      "range": { 
        "last_update": {
          "gte": "2020-01-01T00:00:00",
          "lte": "2022-05-09T00:00:00" # fetch data until May 10th, 2022
          }
        }}
        ]
  
    }

  }
},
request_timeout=30 # default timeout is 10sec
 
)

In [160]:
print(len(result['hits']['hits']))

17318


## Convert data to dataframe

In [161]:
df = pd.json_normalize(result['hits']['hits'])
df = df[['_source.end__date', '_source.car_plate_number', '_source.status', '_source.group_id', '_source.duration', '_source.distance', '_source.location', '_source.zipcode', '_source.battery', '_source.end_battery', '_source.start_date']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17318 entries, 0 to 17317
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   _source.end__date         17318 non-null  object 
 1   _source.car_plate_number  17318 non-null  object 
 2   _source.status            17318 non-null  object 
 3   _source.group_id          17318 non-null  object 
 4   _source.duration          17318 non-null  float64
 5   _source.distance          17318 non-null  float64
 6   _source.location          17318 non-null  object 
 7   _source.zipcode           17318 non-null  object 
 8   _source.battery           17215 non-null  float64
 9   _source.end_battery       17223 non-null  float64
 10  _source.start_date        17318 non-null  object 
dtypes: float64(4), object(7)
memory usage: 1.5+ MB


### Utils

In [162]:
month_duration_dict = {"Jan": 31, "Feb": 28, "Mar": 31, "Apr": 30, "May": 31, "Jun": 30, "Jul": 31, "Aug": 31, "Sep": 30, "Oct": 31, "Nov": 30, "Dec": 31}

durations = np.array(list(month_duration_dict.values()))
cum_durations = np.cumsum(durations)
cum_dict = {i+1: cum_durations[i] for i in range(len(cum_durations))}
cum_dict[0]=0

week_dict = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6}

monthdict = {"Jan": 0, "Feb": 1, "Mar": 2, "Apr": 3, "May": 4, "Jun": 5, "Jul": 6, "Aug": 7, "Sep": 8, "Oct": 9, "Nov": 10,  "Dec": 11}

inv_month = {v: k for k, v in monthdict.items()}

### Map values

In [163]:
df['_source.group_id'].replace("Zity", "Client", inplace=True) # Map Zity to Client
df['_source.status'].replace("BOOKED_PARKED", "BOOKED", inplace=True) # Map BOOKED_PARKED to BOOKED (interpolation is performed later)
df['_source.group_id'].replace("Zity Corporate", "Defleeted", inplace=True) # Map Zity Corporate to Defleeted
df.rename(columns = {'_source.end__date':'end_date', '_source.car_plate_number':'car_plate_number', '_source.status':'status', '_source.group_id':'group_id', '_source.duration':'kibana_duration', '_source.distance':'distance', '_source.location':'location', '_source.zipcode':'zipcode', '_source.start_date':'start_date'}, inplace = True)

In [164]:
df

Unnamed: 0,end_date,car_plate_number,status,group_id,kibana_duration,distance,location,zipcode,_source.battery,_source.end_battery,start_date
0,2022-02-18T18:54:27,FZ-411-RF,FREE,Client,1366.000000,43.0,"48.86922836,2.32788754",75001,75.0,48.0,2022-02-17T20:25:38
1,2022-02-18T18:54:28,GB-101-JY,BOOKED,Client,111.000000,10.0,"48.86750031,2.32961774",75001,68.0,58.0,2022-02-18T17:21:44
2,2022-02-18T18:55:28,FZ-203-RF,FREE,Client,4277.000000,54.0,"48.86305237,2.33929276",75001,43.0,9.0,2022-02-15T19:56:59
3,2022-02-18T18:53:52,FB-266-YK,BOOKED,Client,387.000000,3.0,"48.86306763,2.341676",75001,50.0,48.0,2022-02-18T12:47:45
4,2022-02-18T19:40:23.7527763+01:00,GB-067-JY,BOOKED,Client,47.000000,4.0,"48.86037445,2.33242726",75001,40.0,37.0,2022-02-18T18:53:57
...,...,...,...,...,...,...,...,...,...,...,...
17313,2021-12-27T20:09:29.277Z,FB-555-TV,BOOKED,Maintenance,3.999056,0.0,"48.8653,2.3333",75001,15.0,18.0,2021-12-27T20:05:29.243Z
17314,2021-12-27T20:14:37.303Z,FC-656-BK,FREE,Maintenance,9.131059,0.0,"48.8662,2.3338",75001,100.0,100.0,2021-12-27T20:05:29.433Z
17315,2021-12-28T22:38:58.153Z,FZ-431-RF,BOOKED,Client,39.996372,10.0,"48.8582,2.3460",75001,39.0,33.0,2021-12-28T21:58:58.330Z
17316,2021-12-12T00:59:35.947Z,FZ-805-RF,BOOKED,Client,3.997915,0.0,"48.8655,2.3335",75001,36.0,35.0,2021-12-12T00:55:36.067Z


In [165]:
df['end_date_time']= pd.to_datetime(df['end_date'], infer_datetime_format=True, utc=True)

In [166]:
df = df.sort_values(by=["car_plate_number", "end_date_time"], ascending = False)
df.reset_index(drop=True, inplace = True)

### Splitting and creating columns

In [167]:
df['delta_battery']=df['_source.end_battery']-df['_source.battery']
df.drop('_source.end_battery', axis=1, inplace=True)
df.drop('_source.battery', axis=1, inplace=True)
df[['latitude', 'longitude']] = df['location'].str.split(',', expand=True)
df.drop('location', axis=1, inplace=True)
df['kibana_duration'] = pd.to_numeric(df['kibana_duration'], errors='coerce')
df['kibana_duration'] = df['kibana_duration'].replace(np.nan, 0)
df["kibana_duration"] = df["kibana_duration"].astype(int)
#df['kibana_duration'] = df['kibana_duration'].astype(str)
#df[['kibana_duration', 'trash']] = df['kibana_duration'].str.split('.', expand=True)
#df.drop('trash', axis=1, inplace=True)
df[['end_date', 'end_time']] = df['end_date'].str.split('T', expand=True)
df[['end_time', 'trash']] = df['end_time'].str.split('.', expand=True)
df.drop('trash', axis=1, inplace=True)
df[['end_year', 'end_month', "end_day_number"]] = df['end_date'].str.split('-', expand=True)
df.drop('end_date', axis=1, inplace=True)
df[['end_hour', 'end_minutes', "end_seconds"]] = df['end_time'].str.split(':', expand=True)
df.drop('end_time', axis=1, inplace=True)
df.drop('end_seconds', axis=1, inplace=True)
df[['start_date', 'start_time']] = df['start_date'].str.split('T', expand=True)
df[['start_time', 'trash']] = df['start_time'].str.split('.', expand=True)
df.drop('trash', axis=1, inplace=True)
df[['start_year', 'start_month', "start_day_number"]] = df['start_date'].str.split('-', expand=True)
df.drop('start_date', axis=1, inplace=True)
df[['start_hour', 'start_minutes', "start_seconds"]] = df['start_time'].str.split(':', expand=True)
df.drop('start_time', axis=1, inplace=True)
df.drop('start_seconds', axis=1, inplace=True)
df.head()

Unnamed: 0,car_plate_number,status,group_id,kibana_duration,distance,zipcode,end_date_time,delta_battery,latitude,longitude,end_year,end_month,end_day_number,end_hour,end_minutes,start_year,start_month,start_day_number,start_hour,start_minutes
0,GD-998-TQ,FREE,Client,1018,16.0,75001,2022-05-07 12:37:45.087793600+00:00,-8.0,48.86552811,2.34012246,2022,5,7,14,37,2022,5,6,21,39
1,GD-866-TQ,BOOKED,Client,93,1.0,75001,2022-03-05 21:06:02.773000+00:00,-1.0,48.881073,2.35801792,2022,3,5,21,6,2022,3,5,20,43
2,GD-866-TQ,BOOKED,Client,104,4.0,75001,2022-03-05 20:43:01.096000+00:00,-4.0,48.85855484,2.34379292,2022,3,5,20,43,2022,3,5,20,3
3,GD-866-TQ,FREE,Client,153,0.0,75001,2022-03-05 20:03:00.196000+00:00,0.0,48.85855484,2.34379292,2022,3,5,20,3,2022,3,5,18,43
4,GD-791-TQ,FREE,Client,85160,1440.0,75001,2022-05-04 14:47:34.498102900+00:00,-14.0,48.85845566,2.34678626,2022,5,4,16,47,2022,3,6,13,27


### Change types

In [168]:
df['distance']= pd.to_numeric(df['distance'], errors='coerce')
df['distance'] = df['distance'].fillna(0)
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')
df['zipcode'] = pd.to_numeric(df['zipcode'], errors='coerce')
df['kibana_duration'] = pd.to_numeric(df['kibana_duration'], errors='coerce')
df['end_year'] = pd.to_numeric(df['end_year'], errors='coerce')
df['end_month'] = pd.to_numeric(df['end_month'], errors='coerce')
df['end_day_number'] = pd.to_numeric(df['end_day_number'], errors='coerce')
df['end_hour'] = pd.to_numeric(df['end_hour'], errors='coerce')
df['end_minutes'] = pd.to_numeric(df['end_minutes'], errors='coerce')
df['start_year'] = pd.to_numeric(df['start_year'], errors='coerce')
df['start_month'] = pd.to_numeric(df['start_month'], errors='coerce')
df['start_day_number'] = pd.to_numeric(df['start_day_number'], errors='coerce')
df['start_hour'] = pd.to_numeric(df['start_hour'], errors='coerce')
df['start_minutes'] = pd.to_numeric(df['start_minutes'], errors='coerce')

### Calculate end time since 2020

In [169]:
df['time_since_2020'] = (df['end_year']-2020)*365*24*60 + ((df['end_month']-1).map(cum_dict)+df['end_day_number'])*24*60 + df['end_hour']*60 + df['end_minutes']

### Sort by end date

In [170]:
df

Unnamed: 0,car_plate_number,status,group_id,kibana_duration,distance,zipcode,end_date_time,delta_battery,latitude,longitude,...,end_month,end_day_number,end_hour,end_minutes,start_year,start_month,start_day_number,start_hour,start_minutes,time_since_2020
0,GD-998-TQ,FREE,Client,1018,16.0,75001,2022-05-07 12:37:45.087793600+00:00,-8.0,48.865528,2.340122,...,5,7,14,37,2022,5,6,21,39,1234957
1,GD-866-TQ,BOOKED,Client,93,1.0,75001,2022-03-05 21:06:02.773000+00:00,-1.0,48.881073,2.358018,...,3,5,21,6,2022,3,5,20,43,1144626
2,GD-866-TQ,BOOKED,Client,104,4.0,75001,2022-03-05 20:43:01.096000+00:00,-4.0,48.858555,2.343793,...,3,5,20,43,2022,3,5,20,3,1144603
3,GD-866-TQ,FREE,Client,153,0.0,75001,2022-03-05 20:03:00.196000+00:00,0.0,48.858555,2.343793,...,3,5,20,3,2022,3,5,18,43,1144563
4,GD-791-TQ,FREE,Client,85160,1440.0,75001,2022-05-04 14:47:34.498102900+00:00,-14.0,48.858456,2.346786,...,5,4,16,47,2022,3,6,13,27,1230767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17313,FA-018-MP,BOOKED,Maintenance,0,0.0,75001,2021-07-24 23:05:45.493000+00:00,0.0,48.865200,2.333200,...,7,24,23,5,2021,7,24,23,4,822185
17314,FA-018-MP,BOOKED,Maintenance,21,4.0,75001,2021-03-06 01:37:33.823000+00:00,-2.0,48.865330,2.333310,...,3,6,1,37,2021,3,6,1,16,619297
17315,FA-018-MP,FREE,Maintenance,93,0.0,75001,2021-03-06 01:16:05.483000+00:00,65.0,48.865330,2.333310,...,3,6,1,16,2021,3,5,23,42,619276
17316,FA-018-MP,RESERVED,Client,4,0.0,75001,2020-12-18 08:28:27.090000+00:00,-0.3,48.858340,2.347230,...,12,18,8,28,2020,12,18,8,23,507388


## Calculation of the durations

Only the end dates are reliable. The start date for a given segment corresponds to the last end date for the same plate.

In [171]:
def latest_segment(index):
    # returns index of the latest segment for the same plate
    
    plate_segments = list(df.loc[df['car_plate_number']==df.loc[index, 'car_plate_number']].index)
    previous_segments = plate_segments[plate_segments.index(index)+1:]
    if not len(previous_segments):
        return -1
    return previous_segments[0]

In [172]:
def next_segment(index):
    plate_segments = list(df.loc[df['car_plate_number']==df.loc[index, 'car_plate_number']].index)
    if plate_segments.index(index)==0:
        return -1
    return plate_segments[plate_segments.index(index)-1]

In [173]:
def duration(index):
    # returns the duration for a given index, only takes into account days and time
    
    previous_index = latest_segment(index)
    if previous_index==-1:
        return 0
    return df.loc[index, 'time_since_2020'] - df.loc[previous_index, 'time_since_2020']

Durations should be calculated on data not restricted to a specific district otherwise it makes no sense to look for the last occurence of the same plate as the car have travelled tto another district !

In [174]:
#df['duration']=df.index.map(duration)
df['duration']=df['kibana_duration'] 
df

Unnamed: 0,car_plate_number,status,group_id,kibana_duration,distance,zipcode,end_date_time,delta_battery,latitude,longitude,...,end_day_number,end_hour,end_minutes,start_year,start_month,start_day_number,start_hour,start_minutes,time_since_2020,duration
0,GD-998-TQ,FREE,Client,1018,16.0,75001,2022-05-07 12:37:45.087793600+00:00,-8.0,48.865528,2.340122,...,7,14,37,2022,5,6,21,39,1234957,1018
1,GD-866-TQ,BOOKED,Client,93,1.0,75001,2022-03-05 21:06:02.773000+00:00,-1.0,48.881073,2.358018,...,5,21,6,2022,3,5,20,43,1144626,93
2,GD-866-TQ,BOOKED,Client,104,4.0,75001,2022-03-05 20:43:01.096000+00:00,-4.0,48.858555,2.343793,...,5,20,43,2022,3,5,20,3,1144603,104
3,GD-866-TQ,FREE,Client,153,0.0,75001,2022-03-05 20:03:00.196000+00:00,0.0,48.858555,2.343793,...,5,20,3,2022,3,5,18,43,1144563,153
4,GD-791-TQ,FREE,Client,85160,1440.0,75001,2022-05-04 14:47:34.498102900+00:00,-14.0,48.858456,2.346786,...,4,16,47,2022,3,6,13,27,1230767,85160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17313,FA-018-MP,BOOKED,Maintenance,0,0.0,75001,2021-07-24 23:05:45.493000+00:00,0.0,48.865200,2.333200,...,24,23,5,2021,7,24,23,4,822185,0
17314,FA-018-MP,BOOKED,Maintenance,21,4.0,75001,2021-03-06 01:37:33.823000+00:00,-2.0,48.865330,2.333310,...,6,1,37,2021,3,6,1,16,619297,21
17315,FA-018-MP,FREE,Maintenance,93,0.0,75001,2021-03-06 01:16:05.483000+00:00,65.0,48.865330,2.333310,...,6,1,16,2021,3,5,23,42,619276,93
17316,FA-018-MP,RESERVED,Client,4,0.0,75001,2020-12-18 08:28:27.090000+00:00,-0.3,48.858340,2.347230,...,18,8,28,2020,12,18,8,23,507388,4


## Next segment group-id column

In [175]:
def next_group_id(index):
    next_index = next_segment(index)
    if next_index == -1:
        return "no next plate"
    else :
        return df.loc[next_index, 'group_id']
df['next_group_id2']=df.index.map(next_group_id)

In [176]:
df["next_plate"] = df['car_plate_number'].shift(1)
df["last_plate"] = df['car_plate_number'].shift(-1)
df["next_group_id"] = df["group_id"].shift(1)
df.loc[df["next_plate"]!=df["car_plate_number"], 'next_group_id'] = "no next plate"

In [177]:
df.loc[df['next_group_id']!=df['next_group_id2']]

Unnamed: 0,car_plate_number,status,group_id,kibana_duration,distance,zipcode,end_date_time,delta_battery,latitude,longitude,...,start_month,start_day_number,start_hour,start_minutes,time_since_2020,duration,next_group_id2,next_plate,last_plate,next_group_id


## Keep only "CLIENT" Group-ids

In [178]:
df = df[df['group_id']=="Client"]

## Correct bugs

### Fake booking BUG

Some segments have a "Booked" status but no distance is covered. The status of these fake entries is changed to "FREE"

Unit test : index #255 & #256 (GB-102-JY)

In [179]:
def new_status(index):
    status = df.loc[index, 'status']
    distance = df.loc[index, 'distance']
    if (status=="BOOKED" or status=="BOOKED_PARK") and distance==0:
        return "FREE"
    return status

df['status']=df.index.map(new_status)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['status']=df.index.map(new_status)


### Missing locations BUG

In April, 2022, the locations are missins. The corresponding rows should be deleted

In [180]:
df = df[df['latitude'].notna()]
df = df[df['longitude'].notna()]

### Fake year BUG
Some data are from 1753

In [181]:
df = df[df['end_year']>2000]
df.reset_index(drop=True, inplace = True)

In [209]:
df['last_plate'] = df['car_plate_number'].shift(-1)

## Interpolation of useless segments

For a given plate, a FREE segment should be followed in time by a BOOKED or BOOKED_PARK segment and vice-versa. 

Unit test : index #253 -> #283 (GB-102-JY) #628 (GB-029-JY)

TODO : CHANGE START TIMES...

In [182]:
def has_changed_status(index):
    last_index = latest_segment(index)
    if last_index==-1:
        return True
    return df.loc[index, 'status']!=df.loc[last_index, 'status']

In [183]:
df['has_changed_status']=df.index.map(has_changed_status)

In [210]:
df['last_status'] = df['status'].shift(-1)
df['Status_has_changed'] = df['status']!=df['last_status']
df.loc[df["last_plate"]!=df["car_plate_number"], 'Status_has_changed'] = True

In [211]:
(df.loc[df['has_changed_status']!=df['Status_has_changed']])

Unnamed: 0,car_plate_number,status,group_id,kibana_duration,distance,zipcode,end_date_time,delta_battery,latitude,longitude,...,time_since_2020,duration,next_group_id2,next_plate,last_plate,next_group_id,has_changed_status,last_status,Status_has_changed,consecutive


In [212]:
def new_feature(index, feature_name):
    # returns the feature value for the last segment which status didn't change
    
    if df.loc[index, 'Status_has_changed']:
        return df.loc[index, feature_name]
    else :
        index_list = list(df.loc[df['car_plate_number']==df.loc[index, 'car_plate_number']].index)
        bool_list = list(df.loc[df['car_plate_number']==df.loc[index, 'car_plate_number']]['Status_has_changed'])
        ind = index_list.index(index)
        while not bool_list[ind] and ind < len(index_list):
            ind+=1 # Looking for the last segment for which the status has changed
        return df.loc[index_list[ind], feature_name]
    
def new_feature_cum(index, feature_name):
    # returns the sum of all values of the feature on the segments to interpolate
    if df.loc[index, 'Status_has_changed']:
        return df.loc[index, feature_name]
    else :
        value = int(df.loc[index, feature_name])
        index_list = list(df.loc[df['car_plate_number']==df.loc[index, 'car_plate_number']].index)
        bool_list = list(df.loc[df['car_plate_number']==df.loc[index, 'car_plate_number']]['Status_has_changed'])
        ind = index_list.index(index)
        while not bool_list[ind] and ind < len(index_list):
            ind+=1
            value += int(df.loc[index_list[ind], feature_name])
        return value

In [213]:
features_to_change = ['latitude', 'longitude']
features_to_cumulate = ['distance', 'delta_battery']

for name in features_to_change :
    df['new_'+name] = df.index.map(lambda x: new_feature(x, name))
    

for name in features_to_cumulate :
    df['new_'+name] = df.index.map(lambda x: new_feature_cum(x, name))

In [None]:
for name in features_to_change :
    df[name] = df['new_'+name]
    df.drop('new_'+name, axis=1, inplace=True)

for name in features_to_cumulate :
    df[name] = df['new_'+name]
    df.drop('new_'+name, axis=1, inplace=True)

In [214]:
# Delete lines
def to_keep(index):
    next_index = next_segment(index)
    if next_index == -1 or df.loc[next_index, 'Status_has_changed']:
        return True
    return False

In [219]:
df['to_keep'] = df['Status_has_changed'].shift(1)
df.loc[df["next_plate"]!=df["car_plate_number"], 'to_keep'] = True

In [218]:
# df["to_keep"]=df.index.map(to_keep)
df = df[df["to_keep"]]
df.drop("to_keep", axis=1, inplace=True)
df.drop("Status_has_changed", axis=1, inplace=True)
# UNIT TEST : GB-102-JY (done!)

'df\ndf = df[df["to_keep"]]\ndf.drop("to_keep", axis=1, inplace=True)\ndf.drop("Status_has_changed", axis=1, inplace=True)'

## Start date column

$startdate = endate - duration$

In [220]:
(df.loc[df['to_keep']!=df['to_keep2']])

Unnamed: 0,car_plate_number,status,group_id,kibana_duration,distance,zipcode,end_date_time,delta_battery,latitude,longitude,...,has_changed_status,last_status,Status_has_changed,consecutive,new_latitude,new_longitude,new_distance,new_delta_battery,to_keep2,to_keep


In [None]:
def start_date(index):
    time = df.loc[index, 'time_since_2020'] - df.loc[index, 'kibana_duration'] # Start time since 2020
    year = 2020 + time // (365*24*60)
    time = time % (365*24*60)
    i = 0
    while cum_durations[i]< (time // (24*60)) and i < 12:
        i+=1
    month = inv_month[i]
    day = time // (24*60) - cum_durations[i-1] if i else time // (24*60)
    time = time % (24*60)
    hour = time // 60
    minute = time % 60
    return year, month, day, hour, minute
  
    
df['year']=df.index.map(lambda x: start_date(x)[0])
df['month']=df.index.map(lambda x: start_date(x)[1])
df['day_number']=df.index.map(lambda x: start_date(x)[2])
df['hour']=df.index.map(lambda x: start_date(x)[3])
df['minute']=df.index.map(lambda x: start_date(x)[4])

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24958 entries, 0 to 31538
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_plate_number  24958 non-null  object 
 1   status            24958 non-null  object 
 2   group_id          24958 non-null  object 
 3   kibana_duration   24958 non-null  int64  
 4   distance          24958 non-null  float64
 5   zipcode           24958 non-null  int64  
 6   delta_battery     24957 non-null  float64
 7   latitude          24958 non-null  float64
 8   longitude         24958 non-null  float64
 9   end_year          24958 non-null  float64
 10  end_month         24958 non-null  float64
 11  end_day_number    24958 non-null  float64
 12  end_hour          24958 non-null  float64
 13  end_minutes       24958 non-null  float64
 14  start_year        24958 non-null  int64  
 15  start_month       24958 non-null  int64  
 16  start_day_number  24958 non-null  int64 

In [None]:
df['end_month'].describe()

count    24958.000000
mean         6.989983
std          3.476269
min          1.000000
25%          4.000000
50%          7.000000
75%         10.000000
max         12.000000
Name: end_month, dtype: float64

### Day of the week
scripted_day_of_week contains many nans, so we recalculate the day of week with the datetime module. Actually __scripted day of week__ is mostly wrong as there are 23,191 entries with __scripted day of week__ $\neq$ __day of week__.

In [None]:
def get_week_day(index):
    day = int(df.loc[index, 'day_number'])
    month = int(monthdict[df.loc[index, 'month']]+1)
    year = int(df.loc[index, 'year'])
    if day==0:
        # Bug: 179 entries with date 2021-01-0, scripted_day_of_week was indicating 3 in isoweekday
        return 2
    return datetime.date(year, month, day).weekday()

df['day_of_week']=df.index.map(get_week_day)

In [None]:
print(np.count_nonzero(df['hour']-df['start_hour']))
print((df['hour']-df['start_hour']).describe())

2899
count    24958.000000
mean        -0.037463
std          2.218139
min        -23.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         23.000000
dtype: float64


In [None]:
df.describe()

Unnamed: 0,kibana_duration,distance,zipcode,delta_battery,latitude,longitude,end_year,end_month,end_day_number,end_hour,...,start_day_number,start_hour,start_minutes,time_since_2020,duration,year,day_number,hour,minute,day_of_week
count,24958.0,24958.0,24958.0,24957.0,24958.0,24958.0,24958.0,24958.0,24958.0,24958.0,...,24958.0,24958.0,24958.0,24958.0,24958.0,24958.0,24958.0,24958.0,24958.0,24958.0
mean,155.305113,-6936.905,75008.0,-1.692217,48.87427,2.313007,2021.024,6.989983,15.775182,14.090352,...,15.82747,13.849667,29.177057,823072.3,155.305113,2021.025723,15.706467,13.812205,29.298501,3.215282
std,1582.557831,784575.9,0.0,6.181302,0.005219,0.009891,0.352929,3.476269,8.9163,5.253418,...,8.929829,5.141637,17.352806,171359.3,1582.557831,0.350243,8.919571,5.134294,17.363375,2.000544
min,0.0,-10233000.0,75008.0,-322.0,48.796696,2.214271,2020.0,1.0,1.0,0.0,...,1.0,0.0,0.0,473367.0,0.0,2020.0,0.0,0.0,0.0,0.0
25%,10.0,0.0,75008.0,-2.0,48.8706,2.3036,2021.0,4.0,8.0,11.0,...,8.0,10.0,14.0,695061.8,10.0,2021.0,8.0,10.0,14.0,2.0
50%,24.0,0.0,75008.0,0.0,48.874214,2.3139,2021.0,7.0,16.0,15.0,...,16.0,15.0,29.0,830492.5,24.0,2021.0,16.0,15.0,29.0,3.0
75%,82.0,4.0,75008.0,0.0,48.8786,2.3221,2021.0,10.0,24.0,18.0,...,24.0,17.0,44.0,957401.5,82.0,2021.0,24.0,17.0,44.0,5.0
max,155209.0,18026820.0,75008.0,150.0,48.962547,2.483372,2022.0,12.0,31.0,23.0,...,31.0,23.0,59.0,1237004.0,155209.0,2022.0,31.0,23.0,59.0,6.0


In [None]:
# remove wrong dates (2021/01/00),
df = df[df['day_number']!=0]
df = df[df['end_day_number']!=0]
# drop duplicate columns
# df.drop("duration", axis=1, inplace=True)

# Change types
df['day_number'] = df['day_number'].astype(int)
df['year'] = df['year'].astype(int)

## Export csv

In [None]:
df.to_csv(f"{zipcode}.csv")