# Downloading and processing the data for one district

https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-update-by-query.html to update and clean the data ?

In [6]:
zipcode = 75015

### Imports & connection to Elasticsearch

In [7]:
from elasticsearch import Elasticsearch
from ssl import create_default_context
import pandas as pd
import numpy as np
import sys
import datetime

with open('connect.txt') as f:
    str = f.readlines()

es = Elasticsearch(str)

### Change maximum size setting
By default the query returns the first 10,000 hits

In [8]:
es.indices.put_settings(
    index="carshare_car_history",
    body={
        "index.max_result_window": 500000
    }

)

{'acknowledged': True}

## Query
Fetch all data in 75015 district. This should be repeated for each zipcode.

In [9]:
result = es.search(
  index="carshare_car_history",
  body = {
  "size": 200000,
  "query": {

      "bool": {
        "must": {
          "match_all": {}
        },
        "filter": [
    {
    "geo_shape": {
      "ignore_unmapped": "true",
      "location": {
        "relation": "INTERSECTS",
        "shape": {
          "coordinates": [
            [
              [
                1.87411, 49.08001
              ],
              [
                1.87411,
                48.64617
              ],
              [
                2.72473,
                48.64617
              ],
              [
                2.72473,
                49.08001
              ],
              [
                1.87411,
                49.08001
              ]
            ]
          ],
          "type": "Polygon"
        }
      }
    }
  },
  
  {
      "match_phrase": {
        "brand": "Zity"
      }    
  },
  
{
      "match_phrase": {
        "zipcode": zipcode
      }    
  },

  {
      "range": { 
        "last_update": {
          "gte": "2020-01-01T00:00:00",
          "lte": "2022-05-09T00:00:00" # fetch data until May 10th, 2022
          }
        }}
        ]
  
    }

  }
},
request_timeout=30 # default timeout is 10sec
 
)

In [4]:
print(len(result['hits']['hits']))

116730


## Convert data to dataframe

In [7]:
df = pd.json_normalize(result['hits']['hits'])
df = df[['_source.end__date', '_source.car_plate_number', '_source.status', '_source.group_id', '_source.duration', '_source.distance', '_source.location', '_source.zipcode', '_source.battery', '_source.end_battery']]
df

Unnamed: 0,_source.end__date,_source.car_plate_number,_source.status,_source.group_id,_source.duration,_source.distance,_source.location,_source.zipcode,_source.battery,_source.end_battery
0,2021-12-10T20:51:13.600Z,FZ-162-RF,BOOKED,Zity,9.999589,2.0,"48.8342,2.3084",75015,20.0,19.0
1,2021-12-26T14:39:03.447Z,GB-996-JX,FREE,Zity,1124.272803,0.0,"48.8510,2.2844",75015,33.0,33.0
2,2021-12-15T18:51:54.027Z,FB-627-JR,BOOKED_PARKED,Zity,3.994547,0.0,"48.8299,2.2682",75015,86.0,86.0
3,2021-12-26T17:25:06.463Z,GB-972-JX,BOOKED,Zity,16.015235,3.0,"48.8471,2.2851",75015,22.0,20.0
4,2021-12-14T14:41:30.530Z,FB-726-JR,FREE,Maintenance,10.002449,0.0,"48.8386,2.2721",75015,96.0,96.0
...,...,...,...,...,...,...,...,...,...,...
116725,2022-05-08T21:21:43.7907247+02:00,FZ-989-RF,FREE,Client,125.000000,1.0,"48.8390274,2.2878294",75015,100.0,99.0
116726,2022-05-08T21:21:46.3787991+02:00,GB-033-JY,BOOKED_PARKED,Client,39.000000,0.0,"48.84534454,2.29740763",75015,73.0,73.0
116727,2022-05-08T21:22:05.2249618+02:00,FZ-910-RF,FREE,Client,39.000000,2.0,"48.8347168,2.28977203",75015,78.0,77.0
116728,2022-05-08T21:22:26.8359542+02:00,GB-067-JY,BOOKED,Client,39.000000,10.0,"48.83942795,2.30699611",75015,24.0,17.0


In [8]:
print(df['_source.group_id'].value_counts())
print(df['_source.status'].unique())
print((df['_source.end__date'].str[:7].value_counts()))
print(len(df['_source.car_plate_number'].unique()))

Zity              85100
Maintenance       16241
Client             7498
Battery            4132
Workshop           3622
Zity Corporate       81
Airport              42
14                    9
MAINTENANCE           5
Name: _source.group_id, dtype: int64
['BOOKED' 'FREE' 'BOOKED_PARKED' 'RESERVED' 'UNRESPONSIVE']
2021-12    10267
2021-07    10204
2021-05     9800
2021-08     9768
2021-10     9546
2021-09     9472
2021-04     9254
2021-06     9180
2021-11     8908
2021-03     6474
2021-02     6244
2020-12     4660
2021-01     3284
2022-01     3144
2022-02     3065
2022-03     1175
2022-05      779
2020-11      735
1752-12      733
2022-04       18
1753-01       16
0001-01        4
Name: _source.end__date, dtype: int64
790


### Utils

In [9]:
month_duration_dict = {"Jan": 31, "Feb": 28, "Mar": 31, "Apr": 30, "May": 31, "Jun": 30, "Jul": 31, "Aug": 31, "Sep": 30, "Oct": 31, "Nov": 30, "Dec": 31}

durations = np.array(list(month_duration_dict.values()))
cum_durations = np.cumsum(durations)
cum_dict = {i+1: cum_durations[i] for i in range(len(cum_durations))}
cum_dict[0]=0

week_dict = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6}

monthdict = {"Jan": 0, "Feb": 1, "Mar": 2,
        "Apr": 3,
        "May": 4,
        "Jun": 5,
        "Jul": 6,
        "Aug": 7,
        "Sep": 8,
        "Oct": 9,
        "Nov": 10, 
        "Dec": 11
    }

inv_month = {v: k for k, v in monthdict.items()}

### Map values

In [10]:
df['_source.group_id'].replace("Zity", "Client", inplace=True) # Map Zity to Client
df['_source.group_id'].replace("Zity Corporate", "Defleeted", inplace=True) # Map Zity Corporate to Defleeted
df.rename(columns = {'_source.end__date':'end_date', '_source.car_plate_number':'car_plate_number', '_source.status':'status', '_source.group_id':'group_id', '_source.duration':'kibana_duration', '_source.distance':'distance', '_source.location':'location', '_source.zipcode':'zipcode'}, inplace = True)

### Splitting and creating columns

In [11]:
df['delta_battery']=df['_source.end_battery']-df['_source.battery']
df.drop('_source.end_battery', axis=1, inplace=True)
df.drop('_source.battery', axis=1, inplace=True)
df[['latitude', 'longitude']] = df['location'].str.split(',', expand=True)
df.drop('location', axis=1, inplace=True)
df['kibana_duration'] = df['kibana_duration'].astype(str)
df[['kibana_duration', 'trash']] = df['kibana_duration'].str.split('.', expand=True)
df.drop('trash', axis=1, inplace=True)
df[['end_date', 'end_time']] = df['end_date'].str.split('T', expand=True)
df[['end_time', 'trash']] = df['end_time'].str.split('.', expand=True)
df.drop('trash', axis=1, inplace=True)
df[['end_year', 'end_month', "end_day_number"]] = df['end_date'].str.split('-', expand=True)
df.drop('end_date', axis=1, inplace=True)
df[['end_hour', 'end_minutes', "end_seconds"]] = df['end_time'].str.split(':', expand=True)
df.drop('end_time', axis=1, inplace=True)
df.drop('end_seconds', axis=1, inplace=True)
df

Unnamed: 0,car_plate_number,status,group_id,kibana_duration,distance,zipcode,delta_battery,latitude,longitude,end_year,end_month,end_day_number,end_hour,end_minutes
0,FZ-162-RF,BOOKED,Client,9,2.0,75015,-1.0,48.8342,2.3084,2021,12,10,20,51
1,GB-996-JX,FREE,Client,1124,0.0,75015,0.0,48.8510,2.2844,2021,12,26,14,39
2,FB-627-JR,BOOKED_PARKED,Client,3,0.0,75015,0.0,48.8299,2.2682,2021,12,15,18,51
3,GB-972-JX,BOOKED,Client,16,3.0,75015,-2.0,48.8471,2.2851,2021,12,26,17,25
4,FB-726-JR,FREE,Maintenance,10,0.0,75015,0.0,48.8386,2.2721,2021,12,14,14,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116725,FZ-989-RF,FREE,Client,125,1.0,75015,-1.0,48.8390274,2.2878294,2022,05,08,21,21
116726,GB-033-JY,BOOKED_PARKED,Client,39,0.0,75015,0.0,48.84534454,2.29740763,2022,05,08,21,21
116727,FZ-910-RF,FREE,Client,39,2.0,75015,-1.0,48.8347168,2.28977203,2022,05,08,21,22
116728,GB-067-JY,BOOKED,Client,39,10.0,75015,-7.0,48.83942795,2.30699611,2022,05,08,21,22


### Change types

In [12]:
df['distance']= pd.to_numeric(df['distance'], errors='coerce')
df['distance'] = df['distance'].fillna(0)
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')
df['zipcode'] = pd.to_numeric(df['zipcode'], errors='coerce')
df['kibana_duration'] = pd.to_numeric(df['kibana_duration'], errors='coerce')
df['end_year'] = pd.to_numeric(df['end_year'], errors='coerce')
df['end_month'] = pd.to_numeric(df['end_month'], errors='coerce')
df['end_day_number'] = pd.to_numeric(df['end_day_number'], errors='coerce')
df['end_hour'] = pd.to_numeric(df['end_hour'], errors='coerce')
df['end_minutes'] = pd.to_numeric(df['end_minutes'], errors='coerce')

### Calculate end time since 2020

In [13]:
df['time_since_2020'] = (df['end_year']-2020)*365*24*60 + ((df['end_month']-1).map(cum_dict)+df['end_day_number'])*24*60 + df['end_hour']*60 + df['end_minutes']

### Sort by end date

In [14]:
df = df.sort_values(by='time_since_2020', ascending = False)
df.reset_index(drop=True, inplace = True)

## Calculation of the durations

Only the end dates are reliable. The start date for a given segment corresponds to the last end date for the same plate.

In [15]:
def latest_segment(index):
    # returns index of the latest segment for the same plate
    
    plate_segments = list(df.loc[df['car_plate_number']==df.loc[index, 'car_plate_number']].index)
    previous_segments = plate_segments[plate_segments.index(index)+1:]
    if not len(previous_segments):
        return -1
    return previous_segments[0]

In [16]:
def next_segment(index):
    plate_segments = list(df.loc[df['car_plate_number']==df.loc[index, 'car_plate_number']].index)
    if plate_segments.index(index)==0:
        return -1
    return plate_segments[plate_segments.index(index)-1]

In [17]:
def duration(index):
    # returns the duration for a given index, only takes into account days and time
    
    previous_index = latest_segment(index)
    if previous_index==-1:
        return 0
    return df.loc[index, 'time_since_2020'] - df.loc[previous_index, 'time_since_2020']

Durations should be calculated on data not restricted to a specific district otherwise it makes no sense to look for the last occurence of the same plate as the car have travelled tto another district !

In [18]:
#df['duration']=df.index.map(duration)
df['duration']=df['kibana_duration'] 
df

Unnamed: 0,car_plate_number,status,group_id,kibana_duration,distance,zipcode,delta_battery,latitude,longitude,end_year,end_month,end_day_number,end_hour,end_minutes,time_since_2020,duration
0,FZ-431-RF,FREE,Client,142,0.0,75015,0.0,48.841396,2.302687,2022,5,9,1,46,1237066,142
1,GB-797-JX,FREE,Client,162,0.0,75015,54.0,48.828094,2.271857,2022,5,9,1,23,1237043,162
2,FZ-797-RF,FREE,Client,118,4.0,75015,-2.0,48.840595,2.288074,2022,5,9,1,21,1237041,118
3,GD-930-TQ,FREE,Battery,594,4.0,75015,69.0,48.828358,2.271328,2022,5,9,1,21,1237041,594
4,GB-952-JX,BOOKED,Client,37,5.0,75015,-2.0,48.850243,2.348434,2022,5,9,1,21,1237041,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116725,GB-829-JX,BOOKED,Client,0,0.0,75015,-95.0,48.843810,2.315260,1752,12,31,23,50,-140333770,0
116726,GB-045-JY,FREE,Maintenance,0,0.0,75015,0.0,48.842910,2.321860,1,1,1,0,0,-1061184960,0
116727,GB-805-JX,BOOKED,Client,0,0.0,75015,-24.0,48.846610,2.286550,1,1,1,0,0,-1061184960,0
116728,FZ-675-RF,BOOKED,Client,0,0.0,75015,-58.0,48.840690,2.290980,1,1,1,0,0,-1061184960,0


## Keep only "CLIENT" Group-ids

In [19]:
df = df[df['group_id']=="Client"]

## Corrrect bugs

### Fake booking BUG

Some segments have a "Booked" status but no distance is covered. The status of these fake entries is changed to "FREE"

Unit test : index #255 & #256 (GB-102-JY)

In [20]:
def new_status(index):
    status = df.loc[index, 'status']
    distance = df.loc[index, 'distance']
    if (status=="BOOKED" or status=="BOOKED_PARK") and distance==0:
        return "FREE"
    return status

df['status']=df.index.map(new_status)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['status']=df.index.map(new_status)


### Missing locations BUG

In April, 2022, the locations are missins. The corresponding rows should be deleted

In [21]:
df = df[df['latitude'].notna()]
df = df[df['longitude'].notna()]

### Fake year BUG
Some data are from 1753

In [22]:
df = df[df['end_year']>2000]

## Interpolation of useless segments

For a given plate, a FREE segment should be followed in time by a BOOKED or BOOKED_PARK segment and vice-versa. To do before the calculation of the durations.

Unit test : index #253 -> #283 (GB-102-JY) #628 (GB-029-JY)

In [23]:
def has_changed_status(index):
    last_index = latest_segment(index)
    if last_index==-1:
        return True
    return df.loc[index, 'status']!=df.loc[last_index, 'status']

In [24]:
df['Status_has_changed']=df.index.map(has_changed_status)

In [25]:
def new_feature(index, feature_name):
    # returns the feature value for the last segment which status didn't change
    
    if df.loc[index, 'Status_has_changed']:
        return df.loc[index, feature_name]
    else :
        index_list = list(df.loc[df['car_plate_number']==df.loc[index, 'car_plate_number']].index)
        bool_list = list(df.loc[df['car_plate_number']==df.loc[index, 'car_plate_number']]['Status_has_changed'])
        ind = index_list.index(index)
        while not bool_list[ind] and ind < len(index_list):
            ind+=1 # Looking for the last segment for which the status has changed
        return df.loc[index_list[ind], feature_name]
    
def new_feature_cum(index, feature_name):
    # returns the sum of all values of the feature on the segments to interpolate
    if df.loc[index, 'Status_has_changed']:
        return df.loc[index, feature_name]
    else :
        value = int(df.loc[index, feature_name])
        index_list = list(df.loc[df['car_plate_number']==df.loc[index, 'car_plate_number']].index)
        bool_list = list(df.loc[df['car_plate_number']==df.loc[index, 'car_plate_number']]['Status_has_changed'])
        ind = index_list.index(index)
        while not bool_list[ind] and ind < len(index_list):
            ind+=1
            value += int(df.loc[index_list[ind], feature_name])
        return value

In [26]:
features_to_change = ['latitude', 'longitude']
features_to_cumulate = ['distance', 'delta_battery']

for name in features_to_change :
    df['new_'+name] = df.index.map(lambda x: new_feature(x, name))
    

for name in features_to_cumulate :
    df['new_'+name] = df.index.map(lambda x: new_feature_cum(x, name))

In [27]:
for name in features_to_change :
    df[name] = df['new_'+name]
    df.drop('new_'+name, axis=1, inplace=True)

for name in features_to_cumulate :
    df[name] = df['new_'+name]
    df.drop('new_'+name, axis=1, inplace=True)

In [28]:
# Delete lines
def to_keep(index):
    next_index = next_segment(index)
    if next_index == -1 or df.loc[next_index, 'Status_has_changed']:
        return True
    return False

In [29]:
df["to_keep"]=df.index.map(to_keep)
df
df = df[df["to_keep"]]
df.drop("to_keep", axis=1, inplace=True)
df.drop("Status_has_changed", axis=1, inplace=True)
# UNIT TEST : GB-102-JY (done!)

## Start date column

$startdate = endate - duration$

In [30]:
def start_date(index):
    time = df.loc[index, 'time_since_2020'] - df.loc[index, 'kibana_duration'] # Start time since 2020
    year = 2020 + time // (365*24*60)
    time = time % (365*24*60)
    i = 0
    while cum_durations[i]< (time // (24*60)) and i < 12:
        i+=1
    month = inv_month[i]
    day = time // (24*60) - cum_durations[i-1] if i else time // (24*60)
    time = time % (24*60)
    hour = time // 60
    minute = time % 60
    return year, month, day, hour, minute
  
    
df['year']=df.index.map(lambda x: start_date(x)[0])
df['month']=df.index.map(lambda x: start_date(x)[1])
df['day_number']=df.index.map(lambda x: start_date(x)[2])
df['hour']=df.index.map(lambda x: start_date(x)[3])
df['minute']=df.index.map(lambda x: start_date(x)[4])

In [31]:
df.drop(['end_year', 'end_month', 'end_day_number', 'time_since_2020'], axis=1, inplace=True)

### Day of the week
scripted_day_of_week contains many nans, so we recalculate the day of week with the datetime module. Actually __scripted day of week__ is mostly wrong as there are 23,191 entries with __scripted day of week__ $\neq$ __day of week__.

In [35]:
def get_week_day(index):
    day = df.loc[index, 'day_number']
    month = monthdict[df.loc[index, 'month']]+1
    year = df.loc[index, 'year']
    if day==0:
        # Bug: 179 entries with date 2021-01-0, scripted_day_of_week was indicating 3 in isoweekday
        return 2
    return datetime.date(year, month, day).weekday()

df['day_of_week']=df.index.map(get_week_day)

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79714 entries, 0 to 115976
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_plate_number  79714 non-null  object 
 1   status            79714 non-null  object 
 2   group_id          79714 non-null  object 
 3   kibana_duration   79714 non-null  int64  
 4   distance          79714 non-null  float64
 5   zipcode           79714 non-null  int64  
 6   delta_battery     79713 non-null  float64
 7   latitude          79714 non-null  float64
 8   longitude         79714 non-null  float64
 9   end_hour          79714 non-null  int64  
 10  end_minutes       79714 non-null  int64  
 11  duration          79714 non-null  int64  
 12  year              79714 non-null  int64  
 13  month             79714 non-null  object 
 14  day_number        79714 non-null  int64  
 15  hour              79714 non-null  int64  
 16  minute            79714 non-null  int64

## Export csv

In [39]:
df.to_csv("75015.csv")

df.info()

### Compare durations

In [278]:
print((df["kibana_duration"]-df["duration"]).describe())
print(len(df[abs(df["kibana_duration"]-df["duration"])>1]))

count     79108.000000
mean      -2104.852619
std        7883.341602
min     -211148.000000
25%         -73.000000
50%           0.000000
75%           0.000000
max       70377.000000
dtype: float64
24245


In [280]:
print(df['duration'].describe())
print(df['kibana_duration'].describe())


count     79108.000000
mean       2319.927479
std        8049.247828
min           0.000000
25%          10.000000
50%          35.000000
75%         715.000000
max      211289.000000
Name: duration, dtype: float64
count     79108.000000
mean        215.074860
std        1894.580198
min           0.000000
25%           9.000000
50%          21.000000
75%          96.000000
max      154397.000000
Name: kibana_duration, dtype: float64


In [286]:
df2 = df[df['kibana_duration']<3000]

In [287]:
print(df2['duration'].describe())
print(df2['kibana_duration'].describe())

count     78691.000000
mean       2263.086960
std        7841.666596
min           0.000000
25%          10.000000
50%          34.000000
75%         676.000000
max      211289.000000
Name: duration, dtype: float64
count    78691.000000
mean       161.281252
std        369.997596
min          0.000000
25%          9.000000
50%         21.000000
75%         94.000000
max       2996.000000
Name: kibana_duration, dtype: float64


In [289]:
print((df2["kibana_duration"]-df2["duration"]).describe())
print(len(df2[abs(df2["kibana_duration"]-df2["duration"])>5]))

count     78691.000000
mean      -2101.805708
std        7802.988566
min     -211148.000000
25%         -71.000000
50%           0.000000
75%           0.000000
max        2905.000000
dtype: float64
2159


In [301]:
df3=df2[abs(df2["kibana_duration"]-df2["duration"])>5]
df2["difference"]=df2["kibana_duration"]-df2["duration"]
df3["difference"]=df3["kibana_duration"]-df3["duration"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["difference"]=df2["kibana_duration"]-df2["duration"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3["difference"]=df3["kibana_duration"]-df3["duration"]


In [302]:
df2.groupby(by=['year', 'month'])["difference"].mean()

year  month
2020  Dec      -3932.295925
      Nov      -4364.131498
2021  Apr      -1576.518218
      Aug      -1647.472214
      Dec      -1980.908713
      Feb      -1781.128933
      Jan      -4210.909416
      Jul      -1339.882976
      Jun      -1222.672934
      Mar      -2306.742976
      May      -1373.527765
      Nov      -1880.541496
      Oct      -2074.633855
      Sep      -1627.188889
2022  Apr     -38155.285714
      Feb      -5906.954279
      Jan      -3938.447429
      Mar     -10609.586552
      May       -319.746114
Name: difference, dtype: float64