<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [14]</a>'.</span>

In [1]:
# Parameters
config = {
    "notebook": "notebooks/feature_engineering.ipynb",
    "data_path": "C:/Users/nico_/Desktop/ITBA/TFI/global fishing watch/dataset/trollers.csv",
    "output_label": "trollers",
}


In [2]:
# check if config exists
try:
    config
except NameError:
    config_exists = False
else:
    config_exists = True

# make config if it does not exist already (e.g. passed in by papermill)
if not(config_exists):
    # set up some config for the experiment run
    config = {
        "data_path" : "C:/Users/nico_/Desktop/ITBA/TFI/global fishing watch/dataset/trollers.csv",
        "output_label": "trollers"
    }
print(config)

{'notebook': 'notebooks/feature_engineering.ipynb', 'data_path': 'C:/Users/nico_/Desktop/ITBA/TFI/global fishing watch/dataset/trollers.csv', 'output_label': 'trollers'}


In [3]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

df = pd.read_csv(config['data_path'], header="infer", sep=",",error_bad_lines=False, engine ='python')

#Nos quedamos con los puntos que estan mayor a 3 millas nauticas de la costa.
df = df[(df['distance_from_shore'] >= 5556)]

#Nos quedamos solamente con los valores que estamos seguros que esta o no esta pescando.
df = df[(df['is_fishing'] == -1.0)  | (df['is_fishing'] == 1.0) ]

#Renombramos las columnas de la variable a predecir de si esta pescando o no.
df["is_fishing"] = np.where(df["is_fishing"] == 1.0, 'SI', 'NO')

#Dropeamos los que contienen el mismo timestamp.
df = df.drop_duplicates(subset='timestamp', keep="first")

df["timestamp"]=df["timestamp"].astype(int)
#df['date_series'] = pd.to_datetime(df['timestamp'],unit='s')
#df = df.set_index('date_series') 
df['datetime'] = pd.to_datetime(df['timestamp'],unit='s')
df.index = df.datetime
#print(df['date_series'])


#print(df.resample('60Min',on='datetime'))
df = df.resample('5Min',on='datetime').agg('first').dropna()  
df.reset_index(drop=True, inplace=True)
#print(df)

In [4]:
#There are multiple definitions for night, depending if it's for civil (-6°), nautical (-12°) or astronomical (-18°) purposes. 
# Just pick a treshold : if the sun is below, it's nighttime!

import ephem
import math
import datetime

def get_day_night(lat,lon,datetime): 
    sun = ephem.Sun()
    observer = ephem.Observer()
    # ↓ Define your coordinates here ↓
    observer.lat, observer.lon, observer.elevation = lat, lon, 0
    # ↓ Set the time (UTC) here ↓
    #observer.date = datetime.datetime.utcnow()
    observer.date=datetime
    sun.compute(observer)
    current_sun_alt = sun.alt
    if (current_sun_alt*180/math.pi) < -12.0 :
        return 2
    else:
        return 1
    
df['sun_state'] = df.apply(lambda x: get_day_night(x['lat'], x['lon'],x['datetime']), axis=1)

In [5]:
df["sun_state"].value_counts()

1    8934
2    5107
Name: sun_state, dtype: int64

Se pesca mas de noche, que de dia

In [6]:
print(df.groupby(["is_fishing"])[['sun_state']].describe(percentiles=[]))

           sun_state                                   
               count      mean       std  min  50%  max
is_fishing                                             
NO           12755.0  1.369894  0.482795  1.0  1.0  2.0
SI            1286.0  1.302488  0.459514  1.0  1.0  2.0


In [7]:
def haversine_np(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

def distance(x):
    y = x.shift()
    return haversine_np(x['lat'], x['lon'], y['lat'], y['lon']).fillna(0)

df['distance'] = df.groupby('mmsi').apply(distance).reset_index(level=0, drop=True)

In [8]:
#Convierto latitud y longitud a radianes
df['lat'] = np.radians(df['lat'])
df['lon'] = np.radians(df['lon'])

In [9]:
#Calculo la diferencia de tiempo entre puntos consecutivos.
df['timediff']= df.groupby('mmsi')['timestamp'].diff()

In [10]:
#Normalizamos la variable curso
df['course']=df['course']/360.0

In [11]:
#Agregar esto deberia de hacer robusto a mi algoritmo a LAT/LON y timestamps.



#Velocidad entre puntos a partir de la distancia        S = x(t2)-x(t1) / t2 - t1
df['S0']= df['distance']/df['timediff']
#Aceleracion rectilinea         A = S(t2)-S(t1) / t2 - t1
df['A0']= df.groupby('mmsi')['S0'].diff()/df['timediff']
#Jerk                           J = A(t2)-A(t1) / t2 - t1
df['J0']= df.groupby('mmsi')['A0'].diff()/df['timediff']
#Derivada del curso             Phi = C(t2)-C(t1) / t2 - t1
df['C0']= df.groupby('mmsi')['course'].diff()/df['timediff']
#2 Horas deberia de ser suficientemente informativo para saber el comportamiento del buque (paper subrayado)

#Velocidad promedio entre puntos    Vavg = V1 + V2 / 2
df['Vavg']  = df.groupby('mmsi')['speed'].transform(lambda x: x.rolling(2, 1).mean())


#Variacion de velocidad entre puntos Delta_S = V2 - V1 / 2 
df['Delta_S']= (df.groupby('mmsi')['speed'].diff())/2

#Curso promedio entre puntos        Cavg = C1 + C2 / 2
df['Cavg'] = df.groupby('mmsi')['course'].transform(lambda x: x.rolling(2, 1).mean())
#Variacion del curso                DeltaC = C2-C1/2
df['Delta_C']= (df.groupby('mmsi')['course'].diff())/2


In [12]:
number_lags = 6
columns_to_lag = ['speed','course','S0','A0','J0','C0']

#df = pd.DataFrame(data={'vals':[5,4,3,2,1]})
for column in columns_to_lag:
    for lag in range(1, number_lags + 1):
        df[str(column) +'_lag_' + str(lag)] = df[column].shift(lag)


In [13]:
print(df.columns)


Index(['mmsi', 'timestamp', 'distance_from_shore', 'distance_from_port',
       'speed', 'course', 'lat', 'lon', 'is_fishing', 'source', 'datetime',
       'sun_state', 'distance', 'timediff', 'S0', 'A0', 'J0', 'C0', 'Vavg',
       'Delta_S', 'Cavg', 'Delta_C', 'speed_lag_1', 'speed_lag_2',
       'speed_lag_3', 'speed_lag_4', 'speed_lag_5', 'speed_lag_6',
       'course_lag_1', 'course_lag_2', 'course_lag_3', 'course_lag_4',
       'course_lag_5', 'course_lag_6', 'S0_lag_1', 'S0_lag_2', 'S0_lag_3',
       'S0_lag_4', 'S0_lag_5', 'S0_lag_6', 'A0_lag_1', 'A0_lag_2', 'A0_lag_3',
       'A0_lag_4', 'A0_lag_5', 'A0_lag_6', 'J0_lag_1', 'J0_lag_2', 'J0_lag_3',
       'J0_lag_4', 'J0_lag_5', 'J0_lag_6', 'C0_lag_1', 'C0_lag_2', 'C0_lag_3',
       'C0_lag_4', 'C0_lag_5', 'C0_lag_6'],
      dtype='object')


<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [14]:
df = df.dropna()
df.to_csv(f"../df_procesado/{config['data_path']}_fe.csv",sep=",")  

OSError: [Errno 22] Invalid argument: '../df_procesado/C:/Users/nico_/Desktop/ITBA/TFI/global fishing watch/dataset/trollers.csv_fe.csv'

In [None]:
print(df.count())