# Libraries

In [6]:
import pandas as pd

# Importing Data

Encounter Data

In [7]:
df_enc = pd.read_csv('data\encounter-events.csv')
df_enc.head()

Unnamed: 0,fishing_vessel_mmsi,transshipment_vessel_mmsi,start_time,end_time,mean_latitude,mean_longitude,duration_hr,median_distance_km,median_speed_knots
0,416565000,354240000,2016-11-18T14:30:00Z,2016-11-19T01:50:00Z,-17.039085,-79.063725,11.333333,0.038188,0.585402
1,412679190,354240000,2016-12-11T14:50:00Z,2016-12-11T19:50:00Z,-20.269608,-79.244953,5.0,0.020033,0.575663
2,440863000,354240000,2017-06-13T12:50:00Z,2017-06-15T01:20:00Z,-62.640767,-60.69024,36.5,0.054992,0.019775
3,416563000,354240000,2016-11-15T11:30:00Z,2016-11-16T04:00:00Z,-17.046586,-79.061923,16.5,0.036427,1.023917
4,441309000,354240000,2017-05-19T00:40:00Z,2017-05-19T20:50:00Z,-46.627878,-60.554922,20.166667,0.034053,0.544031


Loitering Data

In [8]:
df_loit = pd.read_csv('data\loitering-events.csv')
df_loit.head()

Unnamed: 0,transshipment_mmsi,starting_latitude,starting_longitude,ending_latitude,ending_longitude,starting_timestamp,ending_timestamp,median_speed_knots,total_event_duration
0,256064000,12.7938,-69.503235,12.72795,-69.73912,2017-09-06T02:11:32Z,2017-09-06T15:44:15Z,1.091065,13.915556
1,256064000,25.405013,-56.3028,25.39335,-56.48825,2017-07-22T13:19:49Z,2017-07-22T23:59:17Z,1.548335,10.897639
2,256064000,10.693413,-78.5128,10.4845,-78.8843,2017-11-29T14:48:51Z,2017-11-30T07:43:36Z,1.503456,18.001389
3,256064000,11.858933,-75.553383,12.007627,-75.535627,2017-09-07T23:39:03Z,2017-09-08T14:59:26Z,0.650355,16.193611
4,256064000,45.938747,-23.79696,45.97696,-23.449387,2017-08-17T23:18:21Z,2017-08-18T17:56:30Z,1.009874,19.748333


Vessel Data

In [9]:
df_ves = pd.read_csv('data/transshipment-vessels.csv') 

df_ves.head()

Unnamed: 0,mmsi,shipname,callsign,flag,imo,first_timestamp,last_timestamp
0,306117000.0,SIERRALAUREL,PJBQ,ANT,9163403.0,2018-03-29T08:34:21Z,2018-06-30T17:08:41Z
1,306873000.0,SIERRALEYRE,PJJZ,ANT,9135822.0,2012-01-01T01:06:00Z,2012-06-26T08:58:28Z
2,309681.0,GREENBRAZIL,C6WH6,BHS,9045792.0,2018-06-29T10:34:00Z,2018-06-30T23:47:40Z
3,308735000.0,NOVA BRETAGNE,C6JI7,BHS,9000364.0,2012-01-01T00:39:08Z,2013-09-12T10:03:48Z
4,311000682.0,SIERRA LARA,C6DI3,BHS,9120205.0,2017-07-03T10:28:05Z,2018-06-30T23:35:36Z


# Data Transformation

## Individual

### Encounter Dataset

This code:

* removes all entries with duplicate values over all features
* transforms timestamps into datetime objects
* adds a target feature to the dataset

In [12]:
df_enc_mod = df_enc.copy()

df_enc_mod.drop_duplicates(inplace = True) #remove duplicates

df_enc_mod['start_time'] = pd.to_datetime(df_enc['start_time'])#parse time objects 
df_enc_mod['end_time'] = pd.to_datetime(df_enc['end_time'])
df_enc_mod['encounter'] = 1 #Creating target variable
df_enc_mod.info()
df_enc_mod.head()
df_enc_mod.columns

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11583 entries, 0 to 11680
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   fishing_vessel_mmsi        11583 non-null  int64              
 1   transshipment_vessel_mmsi  11583 non-null  int64              
 2   start_time                 11583 non-null  datetime64[ns, UTC]
 3   end_time                   11583 non-null  datetime64[ns, UTC]
 4   mean_latitude              11583 non-null  float64            
 5   mean_longitude             11583 non-null  float64            
 6   duration_hr                11583 non-null  float64            
 7   median_distance_km         11583 non-null  float64            
 8   median_speed_knots         11583 non-null  float64            
 9   encounter                  11583 non-null  int64              
dtypes: datetime64[ns, UTC](2), float64(5), int64(3)
memory usage: 995.4 KB

Index(['fishing_vessel_mmsi', 'transshipment_vessel_mmsi', 'start_time',
       'end_time', 'mean_latitude', 'mean_longitude', 'duration_hr',
       'median_distance_km', 'median_speed_knots', 'encounter'],
      dtype='object')

### Loitering Dataset

This code:

* removes all entries with duplicate values over all features
* transforms timestamps into datetime objects
* adds a target feature to the dataset
* both transforms and renames features to those in the encounter dataset

In [11]:
df_loit_mod =  df_loit.copy()
df_loit_mod.drop_duplicates(inplace = True) #dropping duplicates


df_loit_mod['start_time'] = pd.to_datetime(df_loit['starting_timestamp']) #parsing timestamps

df_loit_mod['end_time'] = pd.to_datetime(df_loit['ending_timestamp'])

df_loit_mod.drop(['starting_timestamp', 'ending_timestamp'], axis=1, inplace=True)


df_loit_mod['mean_latitude'] = (df_loit_mod['starting_latitude']+df_loit_mod['ending_latitude'])/2 #transforming locations to mean
df_loit_mod['mean_longitude'] = (df_loit_mod['starting_longitude']+df_loit_mod['ending_longitude'])/2

df_loit_mod.rename(columns ={'transshipment_mmsi':'transshipment_vessel_mmsi', 'total_event_duration':'duration_hr'}, inplace = True)

df_loit_mod['encounter'] = 0 #creating target variable

df_loit_mod.info()
df_loit_mod.describe()
df_loit_mod.columns

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46385 entries, 0 to 46568
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   transshipment_vessel_mmsi  46385 non-null  int64              
 1   starting_latitude          46385 non-null  float64            
 2   starting_longitude         46385 non-null  float64            
 3   ending_latitude            46385 non-null  float64            
 4   ending_longitude           46385 non-null  float64            
 5   median_speed_knots         46385 non-null  float64            
 6   duration_hr                46385 non-null  float64            
 7   start_time                 46385 non-null  datetime64[ns, UTC]
 8   end_time                   46385 non-null  datetime64[ns, UTC]
 9   mean_latitude              46385 non-null  float64            
 10  mean_longitude             46385 non-null  float64            
 11  en

Index(['transshipment_vessel_mmsi', 'starting_latitude', 'starting_longitude',
       'ending_latitude', 'ending_longitude', 'median_speed_knots',
       'duration_hr', 'start_time', 'end_time', 'mean_latitude',
       'mean_longitude', 'encounter'],
      dtype='object')

### Vessel Dataset

This code:

* removes all entries with duplicate values in mmsi [Team Decision] to avoid issues with future joins over all datasets
* removes all entries with NA or null values in the mmsi feature
* transforms timestamps into datetime objects

In [13]:
df_ves_mod = df_ves.dropna(subset=['mmsi'])
#this is necessary to avoid weird (non sql) behavior from merge (nulls are recognized as keys!)
####While this code creates a warning it is actually given as an example in the pandas documentation:
#https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html

df_ves_mod.drop_duplicates(subset = ['mmsi'], inplace = True) #dropping duplicates in feature mmsi


df_ves_mod['first_timestamp'] = pd.to_datetime(df_ves_mod['first_timestamp']) #parsing timestamps

df_ves_mod['last_timestamp'] = pd.to_datetime(df_ves_mod['last_timestamp'])

df_ves_mod.info()
df_ves_mod.describe()
df_ves_mod.columns


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1039 entries, 0 to 1069
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   mmsi             1039 non-null   float64            
 1   shipname         1038 non-null   object             
 2   callsign         1029 non-null   object             
 3   flag             1039 non-null   object             
 4   imo              991 non-null    float64            
 5   first_timestamp  1039 non-null   datetime64[ns, UTC]
 6   last_timestamp   1039 non-null   datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](2), float64(2), object(3)
memory usage: 64.9+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ves_mod.drop_duplicates(subset = ['mmsi'], inplace = True) #dropping duplicates in feature mmsi
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ves_mod['first_timestamp'] = pd.to_datetime(df_ves_mod['first_timestamp']) #parsing timestamps
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ves_mod['last_timestamp'] = pd.to_datetime(df_ves_mod['last_ti

Index(['mmsi', 'shipname', 'callsign', 'flag', 'imo', 'first_timestamp',
       'last_timestamp'],
      dtype='object')

## Aggregation

This code:

* Aggregates(Stacks) Encounter and Loitering Datasets by keys and adds non matching features as additional columns
* Removes redundant features
* Reinserts the target feature as the first column 

In [20]:
df_agg = pd.concat([df_enc_mod,df_loit_mod], ignore_index=True)
#dropping redundant data:
df_clean = df_agg.copy().drop(['starting_latitude','starting_longitude', 'ending_latitude','ending_longitude'], axis= 1 ) 
#setting target as first column:
col_en =df_clean.pop('encounter')
df_clean.insert(0, col_en.name,col_en)

df_clean.head()
df_clean.info()
df_clean.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57968 entries, 0 to 57967
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   encounter                  57968 non-null  int64              
 1   fishing_vessel_mmsi        11583 non-null  float64            
 2   transshipment_vessel_mmsi  57968 non-null  int64              
 3   start_time                 57968 non-null  datetime64[ns, UTC]
 4   end_time                   57968 non-null  datetime64[ns, UTC]
 5   mean_latitude              57968 non-null  float64            
 6   mean_longitude             57968 non-null  float64            
 7   duration_hr                57968 non-null  float64            
 8   median_distance_km         11583 non-null  float64            
 9   median_speed_knots         57968 non-null  float64            
dtypes: datetime64[ns, UTC](2), float64(6), int64(2)
memory usage: 4.4 MB


Unnamed: 0,encounter,fishing_vessel_mmsi,transshipment_vessel_mmsi,start_time,end_time,mean_latitude,mean_longitude,duration_hr,median_distance_km,median_speed_knots
57963,0,,273349630,2017-09-29 07:55:39+00:00,2017-09-29 22:49:11+00:00,51.945812,155.56233,16.982917,,1.092219
57964,0,,273349630,2016-11-19 08:08:13+00:00,2016-11-20 00:58:25+00:00,48.793993,141.242218,17.053056,,0.764958
57965,0,,273349630,2014-06-27 01:32:36+00:00,2014-06-28 01:21:20+00:00,58.224653,153.102242,25.157083,,0.560717
57966,0,,273349630,2016-10-29 17:22:12+00:00,2016-10-30 21:12:23+00:00,57.452946,150.443527,29.134444,,0.545175
57967,0,,273349630,2015-07-17 18:14:48+00:00,2015-07-19 01:59:49+00:00,57.563541,151.103195,34.060139,,0.45693


##  Merging

This code:

* Merges the aggregated datasets with the vessel data on the key pair 'transshipment_vessel_mmsi' and 'mmsi'
* This is sufficient because no fishing vessels are in the vessel dataset
* manually adds a suffix of '_t' to all features originating from the vessel dataset


In [23]:
df_clean_all = pd.merge(df_clean, df_ves_mod, how ='left', left_on ='transshipment_vessel_mmsi', right_on='mmsi')


####Optional: add suffix to all added variables
df_clean_all.rename(columns= { 'mmsi' : 'mmsi_t', 'shipname': 'shipname_t', 'callsign':'callsign_t', 'flag':'flag_t', 'imo': 'imo_t',\
     'first_timestamp': 'first_timestamp_t', 'last_timestamp':'last_timestamp_t'}, inplace = True)

####This could be use to merge on fishing vessel, this is however not necessary in this context(No fishing vessels, only tranship)

# df_clean_all = pd.merge(df_clean_all, df_ves_mod, how ='left', left_on ='fishing_vessel_mmsi', right_on='mmsi')
# df_clean_all.rename(columns= { 'mmsi' : 'mmsi_f', 'shipname': 'shipname_f', 'callsign':'callsign_f', 'flag':'flag_f', 'imo': 'imo_f',\
#      'first_timestamp': 'first_timestamp_f', 'last_timestamp':'last_timestamp_f'}, inplace = True)

df_clean.head()
df_clean_all.info()
df_clean_all.tail()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57968 entries, 0 to 57967
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   encounter                  57968 non-null  int64              
 1   fishing_vessel_mmsi        11583 non-null  float64            
 2   transshipment_vessel_mmsi  57968 non-null  int64              
 3   start_time                 57968 non-null  datetime64[ns, UTC]
 4   end_time                   57968 non-null  datetime64[ns, UTC]
 5   mean_latitude              57968 non-null  float64            
 6   mean_longitude             57968 non-null  float64            
 7   duration_hr                57968 non-null  float64            
 8   median_distance_km         11583 non-null  float64            
 9   median_speed_knots         57968 non-null  float64            
 10  mmsi_t                     51502 non-null  float64            
 11  sh

Unnamed: 0,encounter,fishing_vessel_mmsi,transshipment_vessel_mmsi,start_time,end_time,mean_latitude,mean_longitude,duration_hr,median_distance_km,median_speed_knots,mmsi_t,shipname_t,callsign_t,flag_t,imo_t,first_timestamp_t,last_timestamp_t
57963,0,,273349630,2017-09-29 07:55:39+00:00,2017-09-29 22:49:11+00:00,51.945812,155.56233,16.982917,,1.092219,273349630.0,ZODIAK,UBFG9,RUS,8712300.0,2012-02-03 11:26:34+00:00,2018-06-30 00:13:21+00:00
57964,0,,273349630,2016-11-19 08:08:13+00:00,2016-11-20 00:58:25+00:00,48.793993,141.242218,17.053056,,0.764958,273349630.0,ZODIAK,UBFG9,RUS,8712300.0,2012-02-03 11:26:34+00:00,2018-06-30 00:13:21+00:00
57965,0,,273349630,2014-06-27 01:32:36+00:00,2014-06-28 01:21:20+00:00,58.224653,153.102242,25.157083,,0.560717,273349630.0,ZODIAK,UBFG9,RUS,8712300.0,2012-02-03 11:26:34+00:00,2018-06-30 00:13:21+00:00
57966,0,,273349630,2016-10-29 17:22:12+00:00,2016-10-30 21:12:23+00:00,57.452946,150.443527,29.134444,,0.545175,273349630.0,ZODIAK,UBFG9,RUS,8712300.0,2012-02-03 11:26:34+00:00,2018-06-30 00:13:21+00:00
57967,0,,273349630,2015-07-17 18:14:48+00:00,2015-07-19 01:59:49+00:00,57.563541,151.103195,34.060139,,0.45693,273349630.0,ZODIAK,UBFG9,RUS,8712300.0,2012-02-03 11:26:34+00:00,2018-06-30 00:13:21+00:00
