In [76]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

df = pd.read_csv("C:/Users/nico_/Desktop/ITBA/TFI/global fishing watch/dataset/trollers.csv", header="infer", sep=",",error_bad_lines=False, engine ='python')

#Nos quedamos con los puntos que estan mayor a 3 millas nauticas de la costa.
df = df[(df['distance_from_shore'] >= 5556)]

#Nos quedamos solamente con los valores que estamos seguros que esta o no esta pescando.
df = df[(df['is_fishing'] == -1.0)  | (df['is_fishing'] == 1.0) ]

#Renombramos las columnas de la variable a predecir de si esta pescando o no.
df["is_fishing"] = np.where(df["is_fishing"] == 1.0, 'SI', 'NO')

#Dropeamos los que contienen el mismo timestamp.
df = df.drop_duplicates(subset='timestamp', keep="first")


In [77]:
#There are multiple definitions for night, depending if it's for civil (-6°), nautical (-12°) or astronomical (-18°) purposes. 
# Just pick a treshold : if the sun is below, it's nighttime!

import ephem
import math
import datetime

def get_day_night(lat,lon): 
    sun = ephem.Sun()
    observer = ephem.Observer()
    # ↓ Define your coordinates here ↓
    observer.lat, observer.lon, observer.elevation = lat, lon, 0
    # ↓ Set the time (UTC) here ↓
    observer.date = datetime.datetime.utcnow()
    sun.compute(observer)
    current_sun_alt = sun.alt
    if (current_sun_alt*180/math.pi) < -12.0 :
        return 2
    else:
        return 1
    
df['sun_state'] = df.apply(lambda x: get_day_night(x['lat'], x['lon']), axis=1)

In [78]:
df["sun_state"].value_counts()

1    14859
2     6539
Name: sun_state, dtype: int64

Se pesca mas de noche, que de dia

In [79]:
print(df.groupby(["is_fishing"])[['sun_state']].describe(percentiles=[]))

           sun_state                                   
               count      mean       std  min  50%  max
is_fishing                                             
NO           19530.0  1.328827  0.469799  1.0  1.0  2.0
SI            1868.0  1.062634  0.242368  1.0  1.0  2.0


In [80]:
def haversine_np(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

def distance(x):
    y = x.shift()
    return haversine_np(x['lat'], x['lon'], y['lat'], y['lon']).fillna(0)

df['distance'] = df.groupby('mmsi').apply(distance).reset_index(level=0, drop=True)

In [81]:
#Convierto latitud y longitud a radianes
df['lat'] = np.radians(df['lat'])
df['lon'] = np.radians(df['lon'])

In [82]:
#Calculo la diferencia de tiempo entre puntos consecutivos.
df['timediff']= df.groupby('mmsi')['timestamp'].diff()

In [83]:
#Normalizamos la variable curso
df['course']=df['course']/360.0

In [84]:
#Agregar esto deberia de hacer robusto a mi algoritmo a LAT/LON y timestamps.

#Velocidad entre puntos         S = x(t2)-x(t1) / t2 - t1
df['S0']= df['distance']/df['timediff']
#Aceleracion rectilinea         A = S(t2)-S(t1) / t2 - t1
df['A0']= df.groupby('mmsi')['S0'].diff()/df['timediff']
#Jerk                           J = A(t2)-A(t1) / t2 - t1
df['J0']= df.groupby('mmsi')['A0'].diff()/df['timediff']
#Derivada del curso             Phi = C(t2)-C(t1) / t2 - t1
df['C0']= df.groupby('mmsi')['course'].diff()/df['timediff']
#2 Horas deberia de ser suficientemente informativo para saber el comportamiento del buque (paper subrayado)

#Velocidad promedio entre puntos    Vavg = V1 + V2 / 2
#Curso promedio entre puntos        Cavg = C1 + C2 / 2



In [85]:
number_lags = 6
columns_to_lag = ['speed','course','S0','A0','J0','C0']

#df = pd.DataFrame(data={'vals':[5,4,3,2,1]})
for column in columns_to_lag:
    for lag in range(1, number_lags + 1):
        df[str(column) +'_lag_' + str(lag)] = df[column].shift(lag)


In [86]:
print(df.columns)

Index(['mmsi', 'timestamp', 'distance_from_shore', 'distance_from_port',
       'speed', 'course', 'lat', 'lon', 'is_fishing', 'source', 'sun_state',
       'distance', 'timediff', 'S0', 'A0', 'J0', 'C0', 'speed_lag_1',
       'speed_lag_2', 'speed_lag_3', 'speed_lag_4', 'speed_lag_5',
       'speed_lag_6', 'course_lag_1', 'course_lag_2', 'course_lag_3',
       'course_lag_4', 'course_lag_5', 'course_lag_6', 'S0_lag_1', 'S0_lag_2',
       'S0_lag_3', 'S0_lag_4', 'S0_lag_5', 'S0_lag_6', 'A0_lag_1', 'A0_lag_2',
       'A0_lag_3', 'A0_lag_4', 'A0_lag_5', 'A0_lag_6', 'J0_lag_1', 'J0_lag_2',
       'J0_lag_3', 'J0_lag_4', 'J0_lag_5', 'J0_lag_6', 'C0_lag_1', 'C0_lag_2',
       'C0_lag_3', 'C0_lag_4', 'C0_lag_5', 'C0_lag_6'],
      dtype='object')


In [87]:
df = df.dropna()
df.to_csv('./df_procesado/out.csv')  

In [88]:
print(df.count())

mmsi                   21353
timestamp              21353
distance_from_shore    21353
distance_from_port     21353
speed                  21353
course                 21353
lat                    21353
lon                    21353
is_fishing             21353
source                 21353
sun_state              21353
distance               21353
timediff               21353
S0                     21353
A0                     21353
J0                     21353
C0                     21353
speed_lag_1            21353
speed_lag_2            21353
speed_lag_3            21353
speed_lag_4            21353
speed_lag_5            21353
speed_lag_6            21353
course_lag_1           21353
course_lag_2           21353
course_lag_3           21353
course_lag_4           21353
course_lag_5           21353
course_lag_6           21353
S0_lag_1               21353
S0_lag_2               21353
S0_lag_3               21353
S0_lag_4               21353
S0_lag_5               21353
S0_lag_6      