In [1]:
import pandas as pd
import numpy as np
import functools, operator
from datetime import timedelta

In [2]:
# Load the raw position reports from a CSV in the working directory.
# Later cells read the 'Taxi_id', 'Datetime', 'Longitude', 'Latitude' and
# 'Position_Type' columns from this frame.
df = pd.read_csv('position_reports.csv')

In [29]:
class Position_Set:
    ''' Class for a set of position reports.

    Wraps the columns of a DataFrame of position reports and exposes
    summary statistics over them.

    Attributes set by the constructor:
        entities, longitude, latitude, position_type -- the selected
            DataFrame columns (kept as pandas Series).
        datetime -- the datetime column parsed with pd.to_datetime and
            wrapped in a pandas Index.
        n_positions -- number of rows in the DataFrame.
        first_position_date, last_position_date -- min / max of datetime.
        lat_min, lat_max, long_min, long_max -- coordinate extremes.
    '''

    def __init__(self, df, entity_id_column, datetime_column, longitude_column, latitude_column, position_type_column):
        ''' Build the set from `df`, given the names of the relevant columns.

        Parameters:
            df -- DataFrame holding one position report per row.
            entity_id_column -- name of the column identifying the entity.
            datetime_column -- name of the column with the report time;
                values must be parseable by pd.to_datetime.
            longitude_column, latitude_column -- names of the coordinate columns.
            position_type_column -- name of the column with the report type.
        '''
        self.entities = df[entity_id_column]
        self.datetime = pd.Index(pd.to_datetime(df[datetime_column]))
        self.longitude = df[longitude_column]
        self.latitude = df[latitude_column]
        self.position_type = df[position_type_column]

        # Extremes are computed once here and reused by the accessors below.
        self.n_positions = df.shape[0]
        self.first_position_date = self.datetime.min()
        self.last_position_date = self.datetime.max()
        self.lat_min = self.latitude.min()
        self.long_min = self.longitude.min()
        self.lat_max = self.latitude.max()
        self.long_max = self.longitude.max()

    def n_entities(self):
        ''' Return the number of distinct entity ids. '''
        return self.entities.nunique()

    def unique_entities(self):
        ''' Return the distinct entity ids. '''
        return self.entities.unique()

    def n_position_types(self):
        ''' Return the number of distinct position types. '''
        return self.position_type.nunique()

    def unique_position_types(self):
        ''' Return the distinct position types. '''
        return self.position_type.unique()

    def date_range(self):
        ''' Return (first_position_date, last_position_date). '''
        return (self.first_position_date, self.last_position_date)

    def timespan_days(self):
        ''' Return the number of whole days between the first and last report. '''
        return (self.last_position_date - self.first_position_date).days

    def mean_datetime(self):
        ''' Return the mean report time as a timestamp.

        Computed as the first report time plus the mean offset (in seconds)
        of every report from it.
        '''
        # Vectorised offsets: subtracting a timestamp from the datetime index
        # gives a TimedeltaIndex, so no per-element Python loop is needed.
        offsets = (self.datetime - self.first_position_date).total_seconds()
        mean_offset = float(np.asarray(offsets).mean())
        return self.first_position_date + timedelta(seconds=mean_offset)

    def mean_time_during_day(self):
        ''' Return the mean time of day of the reports as a Timedelta.

        Only the hour, minute and second fields are used; sub-second parts
        are ignored.
        '''
        seconds_of_day = self.datetime.hour*3600 + self.datetime.minute*60 + self.datetime.second
        return pd.to_timedelta(seconds_of_day.values.mean(), unit='s')

    def n_positions_per_day(self):
        ''' Return the number of reports per whole day of the timespan.

        Returns NaN when the reports span less than one whole day, since
        the rate is undefined with a zero-day denominator.
        '''
        days = self.timespan_days()
        if days == 0:
            # .days truncates, so any span under 24h would divide by zero.
            return float('nan')
        return self.n_positions / days

    def bbox(self):
        ''' Return the bounding box as (long_min, lat_min, long_max, lat_max). '''
        return (self.long_min, self.lat_min, self.long_max, self.lat_max)

    def mean_centre(self):
        ''' Return (mean longitude, mean latitude). '''
        return (self.longitude.mean(), self.latitude.mean())

    def __str__(self):
        ''' Return a one-line summary of the set.

        __str__ must return a string; printing here and returning None
        makes str(obj) and print(obj) raise TypeError.
        '''
        return ('This position set has: {} entities, {} reports, that occurred between {} and {} '
                .format(self.n_entities(), self.n_positions, self.first_position_date, self.last_position_date))

# Build the position set from the taxi columns of the loaded reports.
new_set = Position_Set(
    df,
    entity_id_column='Taxi_id',
    datetime_column='Datetime',
    longitude_column='Longitude',
    latitude_column='Latitude',
    position_type_column='Position_Type',
)
# Show the one-line summary of the set.
new_set.__str__()
# Ad-hoc check kept from exploration (row count for a single taxi):
#df[df['Taxi_id'] == 20000589].shape[0]

This position set has: 407 entities, 484243 reports, that occurred between 2013-07-01 00:00:53 and 2013-07-03 08:40:13 
