In [None]:
from xml.etree import ElementTree as ET
import requests
import xml.dom.minidom
from os.path import join, isfile
from csv import DictReader
from datetime import datetime, date
from optparse import OptionParser
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt
import numpy as np
import cartopy
import cartopy.crs as ccrs

In [None]:
%matplotlib inline

In [None]:
# Weekly timestamps from 2012-01 up to (but excluding) 2017-11, generated on
# numpy's week grid and then expressed at day resolution.
dates = np.arange(
    '2012-01',
    '2017-11',
    np.timedelta64(1, 'W'),
    dtype='datetime64[W]',
).astype('datetime64[D]')
dates[:5]

In [None]:
# Directory holding one XML export per weekly date.
input_dir = 'in/Laurin/ShipPivots/All'

# One path per date; the final entry of `dates` is deliberately left out,
# mirroring the original `range(len(dates) - 1)` loop.
filenames = [
    input_dir + '/data_' + str(day) + '.xml'
    for day in dates[:-1]
]
filenames[:5]

In [None]:
# XML namespace prefixes used in the XPath queries below (Atom envelope plus
# the Microsoft ADO data-services payload/metadata namespaces).
namespaces = { 'default' : 'http://www.w3.org/2005/Atom',
               'd': 'http://schemas.microsoft.com/ado/2007/08/dataservices',
               'm': 'http://schemas.microsoft.com/ado/2007/08/dataservices/metadata'}

# (output column, `d:` property tag) pairs extracted from every m:properties node.
_FIELDS = [
    ('Id', 'Id'),
    ('ShipName', 'ShipName'),
    ('CompanyName', 'CompanyName'),
    ('DateTime', 'DateTime'),
    ('State', 'State'),
    ('Lat', 'Latitude'),
    ('Lon', 'Longitude'),
    ('VoyageIdInternal', 'VoyageIdInternal'),
    ('VoyageId', 'VoyageId'),
]


def _property_text(properties, tag):
    """Return the text of the first ``d:<tag>`` descendant of `properties`.

    Returns None when the element is absent or has no text, so that missing
    values become NaN/NaT later instead of raising AttributeError here.
    """
    node = properties.find('.//d:' + tag, namespaces)
    return None if node is None else node.text


records = []
missing_files = []

for filename in filenames:
    # Only the parse itself can legitimately hit a missing file; keep the
    # try block narrow so unrelated errors are not masked.
    try:
        root = ET.parse(filename).getroot()
    except FileNotFoundError:
        missing_files.append(filename)
        continue

    for entry in root.findall('.//default:entry', namespaces):
        for properties in entry.findall('.//m:properties', namespaces):
            # A fresh dict per properties node: sharing one dict per entry
            # would append the same (mutated) object several times.
            records.append(
                {column: _property_text(properties, tag) for column, tag in _FIELDS}
            )

if missing_files:
    print('Skipped {} of {} input files that do not exist'.format(
        len(missing_files), len(filenames)))

# Build the frame once (instead of appending per file) with explicit columns,
# so the cleaning steps below also work when no record was loaded.
data = pd.DataFrame(records, columns=[column for column, tag in _FIELDS])
data['Lat'] = data['Lat'].astype(np.float64)
data['Lon'] = data['Lon'].astype(np.float64)
data['DateTime'] = pd.to_datetime(data['DateTime'])

# Rows without a voyage id or a position are unusable for track building.
data = (
    data
    .dropna(axis=0, how='any', subset=['VoyageIdInternal', 'Lat', 'Lon'])
    .reset_index(drop=True)
)
data['VoyageIdInternal'] = data['VoyageIdInternal'].astype(np.int64)
data[:5]

In [None]:
def create_canvas():
    """Create a large map figure with a grey land layer.

    Returns:
        (fig, ax): a new 60x20 matplotlib figure and an axes using the
        cartopy PlateCarree projection, with the LAND feature already added.
    """
    figure = plt.figure(figsize=[60, 20])
    map_axes = plt.axes(projection=ccrs.PlateCarree())

    # High zorder: presumably so land is drawn on top of plotted tracks.
    land_style = dict(
        zorder=100,
        edgecolor=[0.2, 0.2, 0.2],
        facecolor=[0.5, 0.5, 0.5],
    )
    map_axes.add_feature(cartopy.feature.LAND, **land_style)
    return figure, map_axes

In [None]:
def df_line_plot(data):
    """Draw `data` as Lon/Lat lines on a fresh canvas showing the whole globe.

    `data` is anything exposing a pandas-style ``.plot`` accessor with 'Lon'
    and 'Lat' columns (the notebook also passes a groupby object).
    """
    _, map_axes = create_canvas()
    data.plot(x='Lon', y='Lat', ax=map_axes, legend=None)
    map_axes.set_global()

In [None]:
def df_scatter_plot(data):
    """Draw `data` as a Lon/Lat scatter on a fresh canvas showing the whole globe."""
    _, map_axes = create_canvas()
    data.plot.scatter(x='Lon', y='Lat', ax=map_axes, legend=None)
    map_axes.set_global()

In [None]:
def df_local_plot(data):
    """Draw `data` as Lon/Lat lines on a fresh canvas without forcing a global view.

    Unlike `df_line_plot`, `set_global()` is not called, so the map extent is
    whatever the axes end up with after plotting.
    """
    _, map_axes = create_canvas()
    data.plot(x='Lon', y='Lat', ax=map_axes, legend=None)

In [None]:
def print_full(x):
    """Print `x` with every row visible.

    `display.max_rows` is raised to ``len(x)`` only for the duration of the
    print. The option context restores the caller's previous setting (not the
    pandas default) and does so even if printing raises.

    Args:
        x: a sized pandas object (e.g. Series or DataFrame).
    """
    with pd.option_context('display.max_rows', len(x)):
        print(x)

In [None]:
def compute_deltas(data):
    """Add forward differences of position to `data` and return it.

    For each row, 'DeltaLat' / 'DeltaLon' hold the next row's value minus the
    current row's value; the last row therefore gets NaN. The frame is
    modified in place and the same object is returned.
    """
    for source, target in (('Lat', 'DeltaLat'), ('Lon', 'DeltaLon')):
        following = data[source].shift(-1)
        data[target] = following - data[source]
    return data

In [None]:
def get_tracks(data, thresh):
    """Assign a running 'Track' id that changes around large longitude jumps.

    A row is a "jump" row when ``abs(DeltaLon) > thresh`` (NaN deltas never
    count). The track counter is incremented on every jump row and again on
    the row that follows a jump, and 'Track' is the cumulative sum of those
    increments.
    NOTE(review): because the jump row itself also increments the counter,
    the last point before a jump ends up in a track of its own — confirm this
    is intended.

    The previous implementation shifted a boolean Series, which put NaN in
    the first row: that row got a NaN track id (and was silently dropped by
    any later ``groupby('Track')``) and the column became object dtype. Here
    the first row simply has no predecessor, so the ids are plain integers;
    all other rows keep the same numeric id as before.

    Args:
        data: DataFrame with a 'DeltaLon' column; modified in place.
        thresh: jump threshold, in the units of 'DeltaLon'.

    Returns:
        The same `data` object with the added integer 'Track' column.
    """
    jump = (np.absolute(data['DeltaLon']) > thresh).values.astype(np.int64)

    # Same flags moved one row down; row 0 has no previous row, so it stays 0.
    after_jump = np.zeros_like(jump)
    after_jump[1:] = jump[:-1]

    data['Track'] = np.cumsum(jump + after_jump)
    return data

In [None]:
def drop_short_tracks(data, thresh):
    """Keep only rows whose track has more than `thresh` non-null 'DateTime' values.

    Rows are grouped by the 'Track' column; every group that fails the test
    is removed as a whole. Returns a new DataFrame.
    """
    def _is_long_enough(track):
        return track['DateTime'].count() > thresh

    per_track = data.groupby('Track')
    return per_track.filter(_is_long_enough)

In [None]:
# Collapse rows sharing ship, voyage and timestamp into one mean position.
# Only Lat/Lon are averaged: calling mean() on the whole group would also hit
# the string columns, which raises TypeError on current pandas.
grouped = data.groupby(['ShipName','VoyageIdInternal','DateTime'])
_data = compute_deltas(grouped[['Lat', 'Lon']].mean().reset_index())

# NOTE(review): deltas are taken between consecutive rows of the whole frame,
# so they also span ship/voyage boundaries — confirm this is acceptable.
# Start a new track wherever longitude changes by more than 5 between rows
# (presumably degrees), then keep only tracks with more than 1000 timestamps.
_data = get_tracks(_data, 5)
_data = drop_short_tracks(_data, 1000).reset_index(drop=True)
_data[:5]

In [None]:
# Number of distinct voyages remaining after short tracks were dropped.
_data['VoyageIdInternal'].nunique()

In [None]:
# Number of distinct voyages in the full loaded data set, for comparison.
data['VoyageIdInternal'].nunique()

In [None]:
# Plot the filtered positions grouped by ship, track and voyage on a global map.
df_line_plot(
    _data.groupby(['ShipName', 'Track', 'VoyageIdInternal'])
)

In [None]:
# Group keys of the loaded data when grouped by ship name.
ships_by_name = data.groupby(['ShipName']).groups
ships = list(ships_by_name)
ships