In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from time import process_time
from sklearn.cluster import DBSCAN


In [2]:
#Load zones 20, 19, and 18
t0_start = process_time() 

AIS_raw_20 = pd.read_csv('AIS_2017_12_Zone20.csv', header=0)
AIS_raw_19 = pd.read_csv('AIS_2017_12_Zone19.csv', header=0)
AIS_raw_18 = pd.read_csv('AIS_2017_12_Zone18.csv', header=0)

t0_end = process_time()
load_time = t0_end-t0_start
loaded_rows = len(AIS_raw_20)+len(AIS_raw_19)+len(AIS_raw_18)

In [3]:
#Merge dataframes into a single dataframe
t1_start = process_time() 

zones_comb = pd.concat([AIS_raw_20, AIS_raw_19, AIS_raw_18]).reset_index(drop=True)

t1_end = process_time()
combine_time = t1_end-t1_start

In [4]:
# Clean data frame of the NaN values of status, length, width, and vessel type and only keep non-useful status values.
t2_start = process_time() 

zones_clean = zones_comb[['MMSI','BaseDateTime','LAT','LON','VesselType','Status','Length','Width']].dropna()
zones_clean = zones_clean[zones_clean['Status'].isin({'at anchor', 'moored', 'power-driven vessel pushing ahead or towing alongside', \
                                                    'power-driven vessel towing astern','under way using engine'})].reset_index(drop=True)
t2_end = process_time()
clean_time = t2_end-t2_start

In [5]:
# Further cleaning by dropping the ships that do not have a change in status
t3_start = process_time() 

zones_clean_grouped = zones_clean.groupby("MMSI")['Status'].apply(set).reset_index()

for index,row in zones_clean_grouped.iterrows():
    if len(row['Status']) < 2:
        zones_clean_grouped.drop(index, inplace=True) 
     
useful_ships = zones_clean_grouped['MMSI'].unique().tolist()
zones_useful = zones_clean[zones_clean['MMSI'].isin(useful_ships)]

t3_end = process_time()
clean_time2 = t3_end-t3_start

In [6]:
# Group the data by MMSI (ships) then filter out values of lat and lon where this is not a change in status.
t4_start = process_time() 

zones_ordered = zones_useful.sort_values(['MMSI','BaseDateTime'],ascending=True)
zones_comb_list = zones_ordered.groupby("MMSI").agg({'Status':list, 'LAT':list, 'LON':list, 'BaseDateTime':list, \
                                                    'VesselType':set, 'Length':set, 'Width':set}).reset_index()

def compress_stat(row):
    stat = row['Status']
    time = row['BaseDateTime']
    lon = row['LON']
    lat = row['LAT']
    vals = [(stat[i+1],time[i+1],lon[i+1],lat[i+1]) for i,j,k in zip(np.arange(len(stat)-1), stat[:-1], stat[1:]) if (j!=k or i==0 )]
    return [(j[1],j[2],j[3]) for i,j in enumerate(vals) if j[0] in {'moored','at anchor'}]

zones_comb_list['Status_comp'] = zones_comb_list.apply(lambda row: compress_stat(row), axis=1)
zones_compressed = zones_comb_list.drop(['Status','LAT','LON','BaseDateTime'], axis=1)

for index,rows in zones_compressed.iterrows():  
    if len(rows['Status_comp']) < 2:
        zones_compressed.drop(index, inplace=True)

t4_end = process_time()
group_time = t4_end - t4_start

In [7]:
# Expand the dataframe now with the data only including when a ship enters port
t5_start = process_time() 

zones_expanded = zones_compressed.explode('Status_comp').reset_index(drop=True)
zones_expanded['Date-Time'] = [i[0] for i in zones_expanded['Status_comp'].to_list()]
zones_expanded['LON'] = [i[1] for i in zones_expanded['Status_comp'].to_list()]
zones_expanded['LAT'] = [i[2] for i in zones_expanded['Status_comp'].to_list()]
zones_expanded['Length'] = zones_expanded['Length'].apply(lambda row: list(row)[0])
zones_expanded['Width'] = zones_expanded['Width'].apply(lambda row: list(row)[0])
zones_expanded['VesselType'] = zones_expanded['VesselType'].apply(lambda row: list(row)[0])
zones_expanded = zones_expanded.drop('Status_comp', axis=1)

#Placeholder Carbon calculator
def carbonCalc(length,width,vessel):
    return length*width*vessel

zones_expanded['C02_output'] = zones_expanded.apply(lambda row: carbonCalc(row['Length'],row['Width'],row['VesselType']), axis=1)

t5_end = process_time() 
expand_time = t5_end - t5_start

In [8]:
#Create Ports based on proximity of ships anchoring/mooring near eachother
#Create a port_reference that maps a port to a Lat,Lon

lon = zones_expanded['LON'].to_list()
lat = zones_expanded['LAT'].to_list()
Coords = np.array([[i,j] for i,j in zip(lon,lat)])
clustering = DBSCAN(eps=0.1, min_samples=2).fit(Coords)

zones_expanded['Port_ID'] = clustering.labels_
port_reference = zones_expanded.groupby('Port_ID').mean()[['LON','LAT']]

In [9]:
#Create graph that incluses start port and end port
zones_grouped = zones_expanded.groupby('MMSI').agg({'Port_ID':list}).reset_index()
len_list = zones_grouped['Port_ID'].apply(lambda x: len(x)).to_list()
len_list = np.cumsum(len_list)
len_list = [i-1 for i in len_list][:-1]
zones_graph = pd.DataFrame()
zones_graph['Start_Port'] = zones_expanded['Port_ID'][:-1].to_list()
zones_graph['End_Port'] = zones_expanded['Port_ID'][1:].to_list()
zones_graph['MMSI'] = zones_expanded['MMSI'][:-1].to_list()
zones_graph['Date-Time'] = zones_expanded['Date-Time'][:-1].to_list()
zones_graph['VesselType'] = zones_expanded['VesselType'][:-1].to_list()
zones_graph['ShipLength'] = zones_expanded['Length'][:-1].to_list()
zones_graph['ShipWidth'] = zones_expanded['Width'][:-1].to_list()
zones_graph['C02_output'] = zones_expanded['C02_output'][:-1].to_list()

#Kilometers (as the crow flies)
def distCalc(start,end):
    start_lat = math.radians(port_reference['LAT'][start])    
    end_lat = math.radians(port_reference['LAT'][end])    
    start_lon = math.radians(port_reference['LON'][start])    
    end_lon = math.radians(port_reference['LON'][end])    
    dlon = end_lon - start_lon
    dlat = end_lat - start_lat
    a = math.sin(dlat / 2)**2 + math.cos(start_lat) * math.cos(end_lat) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return 6373.0 * c #6373.0 is the radius of the earth in km

zones_graph = zones_graph.drop(len_list)
zones_graph['DistBetweenPorts'] = zones_graph.apply(lambda row: distCalc(row['Start_Port'],row['End_Port']), axis=1)

In [10]:
zones_expanded.to_csv('Zones_expanded.csv', index=False)
port_reference.to_csv('Port_reference.csv')
zones_graph.to_csv('Zones_graphed.csv', index=False)

In [11]:
print('Load Time:',load_time)
print('Combine Time:',combine_time)
print('Clean Time:',clean_time)
print('Clean Time 2:',clean_time2)
print('Group Time:',group_time)
print('Expand Time:',expand_time)
print('Total Time:',load_time+combine_time+clean_time+clean_time2+group_time+expand_time)

Load Time: 157.609375
Combine Time: 128.921875
Clean Time: 47.46875
Clean Time 2: 19.03125
Group Time: 56.265625
Expand Time: 0.390625
Total Time: 409.6875
