# Libraries

In [105]:
import pandas as pd

# Importing Data

Encounter Data

In [106]:
df_enc = pd.read_csv('data\encounter-events.csv')
df_enc.head()

Unnamed: 0,fishing_vessel_mmsi,transshipment_vessel_mmsi,start_time,end_time,mean_latitude,mean_longitude,duration_hr,median_distance_km,median_speed_knots
0,416565000,354240000,2016-11-18T14:30:00Z,2016-11-19T01:50:00Z,-17.039085,-79.063725,11.333333,0.038188,0.585402
1,412679190,354240000,2016-12-11T14:50:00Z,2016-12-11T19:50:00Z,-20.269608,-79.244953,5.0,0.020033,0.575663
2,440863000,354240000,2017-06-13T12:50:00Z,2017-06-15T01:20:00Z,-62.640767,-60.69024,36.5,0.054992,0.019775
3,416563000,354240000,2016-11-15T11:30:00Z,2016-11-16T04:00:00Z,-17.046586,-79.061923,16.5,0.036427,1.023917
4,441309000,354240000,2017-05-19T00:40:00Z,2017-05-19T20:50:00Z,-46.627878,-60.554922,20.166667,0.034053,0.544031


Loitering Data

In [107]:
df_loit = pd.read_csv('data\loitering-events.csv')
df_loit.head()

Unnamed: 0,transshipment_mmsi,starting_latitude,starting_longitude,ending_latitude,ending_longitude,starting_timestamp,ending_timestamp,median_speed_knots,total_event_duration
0,256064000,12.7938,-69.503235,12.72795,-69.73912,2017-09-06T02:11:32Z,2017-09-06T15:44:15Z,1.091065,13.915556
1,256064000,25.405013,-56.3028,25.39335,-56.48825,2017-07-22T13:19:49Z,2017-07-22T23:59:17Z,1.548335,10.897639
2,256064000,10.693413,-78.5128,10.4845,-78.8843,2017-11-29T14:48:51Z,2017-11-30T07:43:36Z,1.503456,18.001389
3,256064000,11.858933,-75.553383,12.007627,-75.535627,2017-09-07T23:39:03Z,2017-09-08T14:59:26Z,0.650355,16.193611
4,256064000,45.938747,-23.79696,45.97696,-23.449387,2017-08-17T23:18:21Z,2017-08-18T17:56:30Z,1.009874,19.748333


Vessel Data

In [108]:
df_ves = pd.read_csv('data/transshipment-vessels.csv') 

df_ves.head()

Unnamed: 0,mmsi,shipname,callsign,flag,imo,first_timestamp,last_timestamp
0,306117000.0,SIERRALAUREL,PJBQ,ANT,9163403.0,2018-03-29T08:34:21Z,2018-06-30T17:08:41Z
1,306873000.0,SIERRALEYRE,PJJZ,ANT,9135822.0,2012-01-01T01:06:00Z,2012-06-26T08:58:28Z
2,309681.0,GREENBRAZIL,C6WH6,BHS,9045792.0,2018-06-29T10:34:00Z,2018-06-30T23:47:40Z
3,308735000.0,NOVA BRETAGNE,C6JI7,BHS,9000364.0,2012-01-01T00:39:08Z,2013-09-12T10:03:48Z
4,311000682.0,SIERRA LARA,C6DI3,BHS,9120205.0,2017-07-03T10:28:05Z,2018-06-30T23:35:36Z


# Data Transformation

## Individual

### Encounter Dataset

This code:

* removes all entries with duplicate values over all features
* removes outliers in duration_hr column
* transforms timestamps into datetime objects
* adds a target feature to the dataset

In [109]:
df_enc_mod = df_enc.copy()

df_enc_mod = df_enc_mod.loc[df_enc_mod['duration_hr'] <= 65]  # remove outliers detected in the EDA file

df_enc_mod.drop_duplicates(inplace = True)  # remove duplicates

print('{} encounterings left in the dataset'.format(df_enc_mod.shape[0]))


11470 encounterings left in the dataset


In [110]:
df_enc_mod['start_time'] = pd.to_datetime(df_enc['start_time'])  # parse time objects
df_enc_mod['end_time'] = pd.to_datetime(df_enc['end_time'])

df_enc_mod['starting_timestamp'] = df_enc_mod['start_time'].apply(lambda x: x.timestamp()) # Changing format of timestamps from object to float
df_enc_mod['ending_timestamp'] = df_enc_mod['end_time'].apply(lambda x: x.timestamp())
df_enc_mod.head()

Unnamed: 0,fishing_vessel_mmsi,transshipment_vessel_mmsi,start_time,end_time,mean_latitude,mean_longitude,duration_hr,median_distance_km,median_speed_knots,starting_timestamp,ending_timestamp
0,416565000,354240000,2016-11-18 14:30:00+00:00,2016-11-19 01:50:00+00:00,-17.039085,-79.063725,11.333333,0.038188,0.585402,1479479000.0,1479520000.0
1,412679190,354240000,2016-12-11 14:50:00+00:00,2016-12-11 19:50:00+00:00,-20.269608,-79.244953,5.0,0.020033,0.575663,1481468000.0,1481486000.0
2,440863000,354240000,2017-06-13 12:50:00+00:00,2017-06-15 01:20:00+00:00,-62.640767,-60.69024,36.5,0.054992,0.019775,1497358000.0,1497490000.0
3,416563000,354240000,2016-11-15 11:30:00+00:00,2016-11-16 04:00:00+00:00,-17.046586,-79.061923,16.5,0.036427,1.023917,1479209000.0,1479269000.0
4,441309000,354240000,2017-05-19 00:40:00+00:00,2017-05-19 20:50:00+00:00,-46.627878,-60.554922,20.166667,0.034053,0.544031,1495154000.0,1495227000.0


In [111]:
df_enc_mod['from_encounter'] = 1  # support variable need later after merging

In [112]:
df_enc_mod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11470 entries, 0 to 11680
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   fishing_vessel_mmsi        11470 non-null  int64              
 1   transshipment_vessel_mmsi  11470 non-null  int64              
 2   start_time                 11470 non-null  datetime64[ns, UTC]
 3   end_time                   11470 non-null  datetime64[ns, UTC]
 4   mean_latitude              11470 non-null  float64            
 5   mean_longitude             11470 non-null  float64            
 6   duration_hr                11470 non-null  float64            
 7   median_distance_km         11470 non-null  float64            
 8   median_speed_knots         11470 non-null  float64            
 9   starting_timestamp         11470 non-null  float64            
 10  ending_timestamp           11470 non-null  float64            
 11  fr

### Loitering Dataset

This code:

* removes all entries with duplicate values over all features
* remove outliers in total_even_duration column
* transforms timestamps into datetime objects
* adds a target feature to the dataset
* both transforms and renames features to those in the encounter dataset

In [113]:
df_loit_mod =  df_loit.copy()

df_loit_mod = df_loit_mod.loc[df_loit_mod['total_event_duration'] <= 200]  # remove outliers detected in the EDA file

df_loit_mod.drop_duplicates(inplace = True)  # dropping duplicates

print('{} loitering events left in the dataset'.format(df_loit_mod.shape[0]))

45919 loitering events left in the dataset


In [114]:
df_loit_mod['start_time'] = pd.to_datetime(df_loit['starting_timestamp'])  # Create new start time column with datetime format
df_loit_mod['end_time'] = pd.to_datetime(df_loit['ending_timestamp'])

df_loit_mod['starting_timestamp'] = df_loit_mod['start_time'].apply(lambda x: x.timestamp()) # Changing format of timestamps from object to float
df_loit_mod['ending_timestamp'] = df_loit_mod['end_time'].apply(lambda x: x.timestamp())
df_loit_mod.head()

Unnamed: 0,transshipment_mmsi,starting_latitude,starting_longitude,ending_latitude,ending_longitude,starting_timestamp,ending_timestamp,median_speed_knots,total_event_duration,start_time,end_time
0,256064000,12.7938,-69.503235,12.72795,-69.73912,1504664000.0,1504713000.0,1.091065,13.915556,2017-09-06 02:11:32+00:00,2017-09-06 15:44:15+00:00
1,256064000,25.405013,-56.3028,25.39335,-56.48825,1500730000.0,1500768000.0,1.548335,10.897639,2017-07-22 13:19:49+00:00,2017-07-22 23:59:17+00:00
2,256064000,10.693413,-78.5128,10.4845,-78.8843,1511967000.0,1512028000.0,1.503456,18.001389,2017-11-29 14:48:51+00:00,2017-11-30 07:43:36+00:00
3,256064000,11.858933,-75.553383,12.007627,-75.535627,1504828000.0,1504883000.0,0.650355,16.193611,2017-09-07 23:39:03+00:00,2017-09-08 14:59:26+00:00
4,256064000,45.938747,-23.79696,45.97696,-23.449387,1503012000.0,1503079000.0,1.009874,19.748333,2017-08-17 23:18:21+00:00,2017-08-18 17:56:30+00:00


We rename the transshipment column make clear that it's a transshipment VESSEL attribute not one of a possible transshipment itself
and rename the duration column to match the format of the encounter data set

In [115]:
df_loit_mod.rename(columns ={'transshipment_mmsi':'transshipment_vessel_mmsi', 'total_event_duration':'duration_hr'}, inplace = True)

df_loit_mod['from_encounter'] = 0  # support variable need later after merging

In [116]:
df_loit_mod.info()
df_loit_mod.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45919 entries, 0 to 46568
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   transshipment_vessel_mmsi  45919 non-null  int64              
 1   starting_latitude          45919 non-null  float64            
 2   starting_longitude         45919 non-null  float64            
 3   ending_latitude            45919 non-null  float64            
 4   ending_longitude           45919 non-null  float64            
 5   starting_timestamp         45919 non-null  float64            
 6   ending_timestamp           45919 non-null  float64            
 7   median_speed_knots         45919 non-null  float64            
 8   duration_hr                45919 non-null  float64            
 9   start_time                 45919 non-null  datetime64[ns, UTC]
 10  end_time                   45919 non-null  datetime64[ns, UTC]
 11  fr

Unnamed: 0,transshipment_vessel_mmsi,starting_latitude,starting_longitude,ending_latitude,ending_longitude,starting_timestamp,ending_timestamp,median_speed_knots,duration_hr,from_encounter
count,45919.0,45919.0,45919.0,45919.0,45919.0,45919.0,45919.0,45919.0,45919.0,45919.0
mean,398441300.0,13.520202,37.65423,13.562817,37.726024,1429944000.0,1430045000.0,1.013651,29.714431,0.0
std,122335200.0,35.43782,109.918414,35.377867,109.880895,54354730.0,54349610.0,0.410779,27.869862,0.0
min,200001100.0,-73.196533,-179.998672,-87.21564,-179.98558,1325378000.0,1325398000.0,0.0,8.0,0.0
25%,273827700.0,-12.585608,-60.478582,-12.4672,-60.397284,1384257000.0,1384337000.0,0.754335,12.842361,0.0
50%,357805000.0,8.77771,35.030396,8.749802,35.255932,1434330000.0,1434418000.0,1.02199,19.552639,0.0
75%,514298000.0,45.622994,150.68013,45.658457,150.68634,1476807000.0,1476885000.0,1.302433,35.419861,0.0
max,800668700.0,80.17616,179.995407,80.41528,179.998611,1516290000.0,1516320000.0,1.996064,199.666806,0.0


### Vessel Dataset

This code:

* removes all entries with duplicate values in mmsi [Team Decision] to avoid issues with future joins over all datasets
* removes all entries with NA or null values in the mmsi feature
* transforms timestamps into datetime objects

In [117]:
df_ves_mod = df_ves.dropna(subset=['mmsi'])
#this is necessary to avoid weird (non sql) behavior from merge (nulls are recognized as keys!)
####While this code creates a warning it is actually given as an example in the pandas documentation:
#https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html

df_ves_mod.drop_duplicates(subset = ['mmsi'], inplace = True) #dropping duplicates in feature mmsi


df_ves_mod['first_timestamp'] = pd.to_datetime(df_ves_mod['first_timestamp']) #parsing timestamps

df_ves_mod['last_timestamp'] = pd.to_datetime(df_ves_mod['last_timestamp'])

df_ves_mod.info()
df_ves_mod.describe()
df_ves_mod.columns


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1039 entries, 0 to 1069
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   mmsi             1039 non-null   float64            
 1   shipname         1038 non-null   object             
 2   callsign         1029 non-null   object             
 3   flag             1039 non-null   object             
 4   imo              991 non-null    float64            
 5   first_timestamp  1039 non-null   datetime64[ns, UTC]
 6   last_timestamp   1039 non-null   datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](2), float64(2), object(3)
memory usage: 64.9+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ves_mod['first_timestamp'] = pd.to_datetime(df_ves_mod['first_timestamp']) #parsing timestamps
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ves_mod['last_timestamp'] = pd.to_datetime(df_ves_mod['last_timestamp'])


Index(['mmsi', 'shipname', 'callsign', 'flag', 'imo', 'first_timestamp',
       'last_timestamp'],
      dtype='object')

## Aggregation

This code:

* Aggregates(Stacks) Encounter and Loitering Datasets by keys and adds non matching features as additional columns
* Removes redundant features
* Reinserts the target feature as the first column 

In [118]:
df_agg = pd.concat([df_enc_mod,df_loit_mod], ignore_index=True)
#dropping redundant data:
df_clean = df_agg.copy().drop(['starting_latitude','starting_longitude', 'ending_latitude','ending_longitude'], axis= 1 ) 
#setting target as first column:
col_en =df_clean.pop('encounter')
df_clean.insert(0, col_en.name,col_en)

df_clean.head()
df_clean.info()
df_clean.tail()

KeyError: 'encounter'

##  Merging

This code:

* Merges the aggregated datasets with the vessel data on the key pair 'transshipment_vessel_mmsi' and 'mmsi'
* This is sufficient because no fishing vessels are in the vessel dataset
* manually adds a suffix of '_t' to all features originating from the vessel dataset


In [None]:
df_clean_all = pd.merge(df_clean, df_ves_mod, how ='left', left_on ='transshipment_vessel_mmsi', right_on='mmsi')


####Optional: add suffix to all added variables
df_clean_all.rename(columns= { 'mmsi' : 'mmsi_t', 'shipname': 'shipname_t', 'callsign':'callsign_t', 'flag':'flag_t', 'imo': 'imo_t',\
     'first_timestamp': 'first_timestamp_t', 'last_timestamp':'last_timestamp_t'}, inplace = True)

####This could be use to merge on fishing vessel, this is however not necessary in this context(No fishing vessels, only tranship)

# df_clean_all = pd.merge(df_clean_all, df_ves_mod, how ='left', left_on ='fishing_vessel_mmsi', right_on='mmsi')
# df_clean_all.rename(columns= { 'mmsi' : 'mmsi_f', 'shipname': 'shipname_f', 'callsign':'callsign_f', 'flag':'flag_f', 'imo': 'imo_f',\
#      'first_timestamp': 'first_timestamp_f', 'last_timestamp':'last_timestamp_f'}, inplace = True)

df_clean.head()
df_clean_all.info()
df_clean_all.tail()