<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Meaningful-clustering" data-toc-modified-id="Meaningful-clustering-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Meaningful clustering</a></span></li></ul></div>

In [None]:
#%matplotlib notebook
%matplotlib inline

import sklearn.cluster as skcl;
import pandas as pd;
import matplotlib.pyplot as plt
import utils.feature_normalization as fn
import filemanagement.directories as dirs
from bisect import bisect
import seaborn as sns
import numpy as np
import re

In [None]:
# Read the tab-separated feature table from the interim directory and keep
# only the rows whose longest stop duration is non-negative.
features = (
    pd.read_csv( dirs.interim_file_path( "features.txt" ) , sep="\t" )
      .query( 'longest_stop_s >= 0' )
)
features.head( 3 )

In [None]:
# Transposed preview: each column becomes a row, so columns that the regular
# head() view may truncate stay visible, and the notebook's rich display is
# used instead of print().
features.head( 3 ).T

## Meaningful clustering

Clustering based on meaningful boundaries.

First, define some charging-time thresholds. According to https://www.clippercreek.com/wp-content/uploads/2016/04/TIME-TO-CHARGE-20170706_FINAL-LOW-RES.jpg, full charging times range from 2 to 70 hours, depending on the vehicle and the charging station.

In [None]:
# Charging-time thresholds in seconds: powers of two (in hours) that do not
# exceed one day, i.e. 2h, 4h, 8h and 16h.
charge_time_thresholds = np.array(
    [ 3600 * hours
      for hours in ( 2 ** exponent for exponent in range( 1 , 10 ) )
      if hours <= 24 ] )
print( charge_time_thresholds )

Then define the driving-range thresholds.
- Range of the Nissan Leaf (most common EV): 135 km https://en.wikipedia.org/wiki/Nissan_Leaf
- Range of the Tesla 85D: 270 miles = 434 km https://www.tesla.com/fr_CH/blog/driving-range-model-s-family?redirect=no

Range depends on many factors, so we simply use a few thresholds, doubling from 50 km up to 400 km.

In [None]:
# Range thresholds in metres: 50 km doubled three times (50, 100, 200, 400 km).
range_thresholds = np.array( [ 50 * 1000 * ( 1 << doubling ) for doubling in range( 4 ) ] )
# Shown in kilometres for readability.
print( range_thresholds / 1000 )

Now generate one label per (range, charge time) combination and assign the matching label to each row.

In [None]:
def find_threshold( t , thresholds , unit ):
    """Return the text label of bin number ``t`` among ``thresholds``.

    ``t`` is a bin index as produced by ``bisect`` on ``thresholds``:
    0 is the bin below the first threshold, ``len(thresholds)`` is the
    open-ended bin above the last one.

    Parameters
    ----------
    t : int
        Bin index, between 0 and ``len(thresholds)`` inclusive.
    thresholds : sequence
        Bin boundaries, in the unit named by ``unit``.
    unit : str
        Unit suffix appended after the upper (or only) bound.

    Returns
    -------
    str
        ``"[0 , <first><unit>]"``, ``"[<low> , <high><unit>]"`` or
        ``"> <last><unit>"``.
    """
    # Open-ended top bin: only a lower bound exists.
    if t == len( thresholds ):
        return "> " + str( thresholds[ t - 1 ] ) + unit

    upper = str( thresholds[ t ] ) + unit + "]"
    # The first bin starts at a literal 0 rather than at a threshold.
    lower = "0" if t == 0 else str( thresholds[ t - 1 ] )
    return "[" + lower + " , " + upper

def comp_label( range_classes , charge_time_classes ):
    """Build one combined ``range_...-time_...`` label per pair of bin indices.

    Parameters
    ----------
    range_classes : iterable of int
        Bin indices relative to ``range_thresholds``.
    charge_time_classes : iterable of int
        Bin indices relative to ``charge_time_thresholds``.

    Returns
    -------
    numpy.ndarray
        One label string per (range class, charge time class) pair, with the
        range expressed in km and the charge time in hours.
    """
    # Convert the module-level thresholds (metres / seconds) to the display units once.
    range_bounds_km = range_thresholds / 1000
    charge_bounds_h = charge_time_thresholds / 3600

    labels = [
        "range_" + find_threshold( range_class , range_bounds_km , "km" )
        + "-time_" + find_threshold( time_class , charge_bounds_h , "h" )
        for range_class , time_class in zip( range_classes , charge_time_classes )
    ]
    return np.array( labels )

# Classify every row of `features`:
#  1. bin index of the longest trip / longest 9-16 stop among the thresholds
#     (bisect returns how many thresholds are <= the value),
#  2. a readable label for each of the two bin indices,
#  3. one combined label per row.
pred_meaning = (
    features
    .assign(
        range_class = [ bisect( range_thresholds , trip_m )
                        for trip_m in features.longest_trip_m ] ,
        charge_time_class = [ bisect( charge_time_thresholds , stop_s )
                              for stop_s in features.longest_stop_9_16_s ] )
    .assign(
        range_label = lambda df: [ find_threshold( c , range_thresholds / 1000 , "km" )
                                   for c in df.range_class ] ,
        charge_time_label = lambda df: [ find_threshold( c , charge_time_thresholds / 3600 , "h" )
                                         for c in df.charge_time_class ] )
    .assign( label = lambda df: comp_label( df.range_class , df.charge_time_class ) )
)


In [None]:

# Bare expression instead of print(): the notebook renders the frame as a
# rich table rather than as wrapped plain text.
pred_meaning.head()


In [None]:
# Number of rows in each (range label, charge time label) combination.
crosstab_clusters = pd.crosstab(
    index=pred_meaning[ "range_label" ] ,
    columns=pred_meaning[ "charge_time_label" ] ,
)
crosstab_clusters

PSI wants to see when the cars are parked during the day.
We should:
- visualize the number of cars parked per time of day (TOD), faceted by class
- export a table containing the number of parked cars per time bin per class


In [None]:
# Per-time-bin columns: those whose name starts with "parked_s" or "distance_m".
parked_and_dist_columns = [
    column for column in pred_meaning.columns.values
    if column.startswith( ( "parked_s" , "distance_m" ) )
]

def decode_time( type ):
    """Build a decoder extracting a time value from an interval-suffixed name.

    The names handled by the returned function embed an interval written as
    ``[low;high]``, e.g. ``"parked_s_[0;3600]"``.

    Parameters
    ----------
    type : str
        Which value to extract: ``'low'`` (interval start), ``'high'``
        (interval end) or ``'middle'`` (mean of the two bounds).

    Returns
    -------
    callable
        ``f(name) -> float``. ``f`` raises ``NameError`` if ``type`` is not
        one of the three supported values, and ``ValueError`` if ``name``
        contains no bracketed interval.
    """
    def f( name ):
        # Raw string: "\[" in a regular literal is an invalid escape sequence
        # (SyntaxWarning on recent Python versions).
        match = re.search( r"\[(.*)\]" , name )
        if match is None:
            raise ValueError( "no [low;high] interval found in " + repr( name ) )

        interval = match.group(1).split(';')

        low = float(interval[0])
        high = float(interval[1])

        if (type == 'middle'): return ( low + high ) / 2.0
        if (type == 'low'): return low
        if (type == 'high'): return high

        raise NameError( type )

    return f

# Average every per-time-bin column within each (range, charge time) class.
# The grouping keys must be a list: current pandas versions interpret a tuple
# as ONE single key (older ones emitted a FutureWarning), which fails here.
class_keys = [ "range_label" , "charge_time_label" ]
rows_per_class = pred_meaning.groupby( class_keys )

parktime_per_group = rows_per_class[parked_and_dist_columns].aggregate( np.average )
# Number of rows in each class; aligned on the (range_label, charge_time_label) index.
parktime_per_group['n'] = rows_per_class.size()
parktime_per_group = parktime_per_group.reset_index()

# Long format: one row per (class, original per-bin column).
parktime_per_group = pd.melt( parktime_per_group ,
                              id_vars=['range_label', 'charge_time_label', 'n'], value_vars=parked_and_dist_columns,
                              var_name="variable", value_name="value" )

# Decode the "[low;high]" suffix of the column name into explicit time columns...
parktime_per_group['interval_start_s'] = parktime_per_group["variable"].apply( decode_time( 'low' ) )
parktime_per_group['interval_end_s'] = parktime_per_group["variable"].apply( decode_time( 'high' ) )
parktime_per_group['time_of_day_s'] = parktime_per_group["variable"].apply( decode_time( 'middle' ) )
# ...then strip that suffix so that only the measure name remains
# (raw string: "\[" is an invalid escape sequence in a regular literal).
parktime_per_group['variable'] = parktime_per_group['variable'].apply(lambda s: re.search(r'(.*)_\[.*\]', s).group(1))

# Back to one row per (class, time bin), with one column per measure.
parktime_per_group = parktime_per_group.set_index([c for c in parktime_per_group.columns.values if c != 'value'])\
                            .unstack('variable').reset_index()
# Flatten the two-level column index produced by unstack: keep the measure
# name for value columns and the plain name for the former index columns.
parktime_per_group.columns = [' '.join(c).strip().split(' ')[-1] for c in parktime_per_group.columns]

# Convenience columns in more readable units.
parktime_per_group['time_of_day_h'] = parktime_per_group['time_of_day_s'] / 3600.0
parktime_per_group['parked_time_min'] = parktime_per_group['parked_s'] / 60.0
parktime_per_group['distance_km'] = parktime_per_group['distance_m'] / 1000.0
parktime_per_group


In [None]:
# To get nice plots: order categories in a meaningful way

def numeric_range( r ):
    """Return a numeric sort key for an interval label.

    Parameters
    ----------
    r : str
        Either an open-ended label starting with ``">"`` or a label of the
        form ``"[<low> , <high>...]"``.

    Returns
    -------
    float
        ``inf`` for open-ended labels (so they sort last), otherwise the
        lower bound of the interval.
    """
    if r.startswith( ">" ): return float("inf")

    # Raw string: "\[" in a regular literal is an invalid escape sequence
    # (SyntaxWarning on recent Python versions).
    low = re.search( r"\[(.*)," , r ).group(1)
    return float(low)
    
    
# Distinct range labels, sorted by the numeric key of each label
# (see numeric_range) so that the facets appear in increasing order.
range_ordered = sorted( set( parktime_per_group.range_label ) , key=numeric_range )
range_ordered


In [None]:
# Distinct charge-time labels, sorted by the numeric key of each label
# (see numeric_range) so that the facets appear in increasing order.
charge_ordered = sorted( set( parktime_per_group.charge_time_label ) , key=numeric_range )
charge_ordered


In [None]:

# Distinct group sizes, in increasing order.
n_levels = sorted( set( parktime_per_group.n ) )

In [None]:
# Cannot get bloody Seaborn to understand that my "hue" variable should be continuous...
# Dirty hack to get this right
def create_palette( ns ):
    """Map every value of ``ns`` to a shade of blue on a logarithmic scale.

    Parameters
    ----------
    ns : iterable of numbers
        Values to colour (duplicates are allowed and map to the same entry).

    Returns
    -------
    dict
        ``{n: colour}`` where the colour is taken from seaborn's "Blues"
        palette at index ``int(log(n + 1))``.
    """
    # Enough shades for the largest index that the lookup below can produce.
    shade_count = int( np.log( max(ns)+2 ) ) + 1
    blues = sns.color_palette( "Blues" , shade_count )

    return { n: blues[ int( np.log(n + 1) ) ] for n in ns }

def annotate(n, **kwargs ):
    """Write ``n=<value>`` into the current axes.

    Meant to be passed to ``FacetGrid.map``: ``n`` is the column of values
    for one facet, of which only the first entry is shown. Extra keyword
    arguments are accepted and ignored.

    Returns the object returned by ``plt.annotate``.
    """
    text = "n=" + str( n.iloc[0] )
    # xy is given in matplotlib's default coordinate system for annotate (data coordinates).
    return plt.annotate( text , xy=(0,1) )

# One facet per (charge time, range) class: average parked time over the day.
# The line colour encodes the group size n through the log-scaled palette.
grid = sns.FacetGrid(
    data=parktime_per_group ,
    col="range_label" , col_order=range_ordered ,
    row="charge_time_label" , row_order=charge_ordered ,
    hue="n" , palette=create_palette( parktime_per_group.n ) ,
    margin_titles=True ,
)
# No legend is added: the group size is written into each facet instead.
grid.map( annotate , "n" )
grid.map( plt.plot , "time_of_day_h" , "parked_time_min" )


In [None]:
# One facet per (charge time, range) class: average travelled distance over the day.
# The line colour encodes the group size n through the log-scaled palette.
grid_d = sns.FacetGrid(
    data=parktime_per_group ,
    col="range_label" , col_order=range_ordered ,
    row="charge_time_label" , row_order=charge_ordered ,
    hue="n" , palette=create_palette( parktime_per_group.n ) ,
    margin_titles=True ,
)
# The group size is written into each facet.
grid_d.map( annotate , "n" )
grid_d.map( plt.plot , "time_of_day_h" , "distance_km" )

In [None]:
# Export both facet plots as PDF files, then the per-class table as CSV,
# all into the final output directory.
for facet_grid , pdf_name in ( ( grid , '002_parked_time_per_class.pdf' ) ,
                               ( grid_d , '002_traveled_distance_per_class.pdf' ) ):
    facet_grid.savefig( dirs.final_file_path( pdf_name ) )

parktime_per_group.to_csv( dirs.final_file_path( '002_agent_classes_for_STEM.csv' ) )
