# This notebook was used to develop and then carry out the non-machine-learning steps in the recreational trip identification process

## Key inputs: 
* Indicators file with columns indicating the predicted vessel class: **Indicators_OurTable.Predictions.csv**
* Data from **Cuebiq's Device table**

## Key outputs:
* Final data set with features for identified recreational fishing trips, including an indication of whether the trip is fully tracked (called interrupted here) and the sample weight for the trip: **DisappearanceIndicators.csv**
* File with all of the stops and trawls within a given trip: **Stop_Trawls_Indicators.csv**

# Spectus Platform Preliminaries

In [None]:
# # Install modules
!pip install tqdm
!pip install statsmodels

# # Suppress all warnings
# import warnings
# warnings.filterwarnings("ignore")

import pandas as pd
import os
import sys
import time
import csv
from datetime import datetime, timedelta
from datetime import datetime
import pytz  
import warnings
import numpy as np
import math
local_timezone = pytz.timezone('US/Central')

import csv
import datetime
import datetime as dt
import geopandas as gpd
import heapq
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pytz
import requests
import seaborn as sns
import time
import math
import statsmodels

from scipy import interpolate
from shapely import wkt
from shapely.geometry import Polygon
from zipfile import ZipFile
from shapely.geometry import Point  # Import the Point class from shapely.geometry
from datetime import datetime
from tqdm import tqdm

class OperationCancelled(Exception):
    """Custom exception type; adds no behavior beyond the built-in Exception."""

# pytz time zone object for 'US/Central' (the same assignment also appears earlier in this cell)
local_timezone = pytz.timezone('US/Central')

### SQL Preliminaries

In [None]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd

class TrinoEngine:
    """Small helper bundling a Trino DB-API cursor and a SQLAlchemy engine.

    Both handles point at localhost:9090 with the "cuebiq" catalog.
    """

    def __init__(self):
        # DB-API connection; only its cursor is kept on the instance.
        trino_connection = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq",
        )
        self.cur = trino_connection.cursor()
        # SQLAlchemy engine handed to pandas in read_sql().
        self.engine = create_engine("trino://localhost:9090/cuebiq/")

    def execute_statement(self, query: str) -> list:
        """
        Run `query` on the cursor and return every fetched row.

        Meant for create and drop statements.
        """
        cursor = self.cur
        cursor.execute(query)
        fetched_rows = cursor.fetchall()
        return fetched_rows

    def read_sql(self, query: str) -> pd.DataFrame:
        """
        Run `query` through pandas on the engine and return the DataFrame.

        Meant for select and insert-into operations.
        """
        result_frame = pd.read_sql(sql=query, con=self.engine)
        return result_frame

# Shared TrinoEngine instance; constructing it opens the Trino connection and creates the SQLAlchemy engine
sql_engine = TrinoEngine()

In [None]:
def get_dates_sequence(date_start, date_end, date_format):
    """Return every calendar day from date_start through date_end as strings.

    Both bounds are parsed with `date_format`, and each returned day is
    formatted with the same pattern. The end date is included. If date_end
    falls before date_start the result is an empty list.
    """
    final_day = datetime.strptime(date_end, date_format)
    first_day = datetime.strptime(date_start, date_format)
    # +1 day so the end date itself is part of the sequence
    day_count = (final_day - first_day + timedelta(days=1)).days

    sequence = []
    for offset in range(day_count):
        current_day = first_day + timedelta(days=offset)
        sequence.append(current_day.strftime(date_format))
    return sequence

### Date parameter for request from Spectus

In [None]:
# strptime/strftime pattern for the date strings below (YYYYMMDD)
date_format = "%Y%m%d"

# Bounds of the requested date range, written in date_format
first_date = "20190101"
last_date_to_compute = "20220422"

# Functions and other objects

### Gulf wkt 

In [None]:
Gulf=  'POLYGON ((-97.1461459379788 25.957168007570218, -96.289339604698 24.85679572942162, -92.46567751398405 24.93226258740249, -86.46809930536585 26.63685607835417, -87.4518747436496 30.2870218479717, -87.56970870420537 30.265986917324028, -87.68688306481553 30.240398581969316, -87.80866540589953 30.223904790285445, -87.86330288865585 30.223335989499247, -87.95809562982397 30.227886303681714, -88.02919018569986 30.21821663439117, -88.04103927834595 30.236417575256624, -87.96007047859824 30.256320996255198, -87.9264980494346 30.259732606445226, -87.87712683007626 30.243810745100816, -87.84223783506322 30.25688960618315, -87.78825863523141 30.25802681616385, -87.76653529871386 30.268829654214812, -87.75929418654121 30.287589653053473, -87.77443469381102 30.308051015443525, -87.80142429372691 30.321121316921534, -87.83236359119138 30.34441622544655, -87.84487096676237 30.372249210920785, -87.87910145959437 30.384175375165654, -87.9185984350809 30.414267849534085, -87.92386469847926 30.462509996885288, -87.94032177159842 30.47215556160502, -87.93505550820048 30.49938493987453, -87.91991500093027 30.529441859299524, -87.9172818692315 30.552120171744733, -87.91991500093027 30.575926698627413, -87.92452298140375 30.609925886488114, -87.92978924480211 30.64051495391115, -87.9475628837711 30.650709160976575, -87.96928622028864 30.651841784313817, -87.99298440558087 30.665432229337696, -88.01141632747473 30.659203512573512, -88.0423556249392 30.638249428634225, -88.05091330296123 30.624088693607646, -88.04696360541267 30.61559125795084, -88.05354643466042 30.590094476474817, -88.07790290287716 30.565157802896906, -88.07592805410286 30.539080789195197, -88.09370063563347 30.456267467088836, -88.09896689903184 30.398370320370773, -88.10620801120405 30.353504810736837, -88.15887064518647 30.323393627447288, -88.19178479142525 30.31032362908067, -88.20429216699586 30.330780246575657, -88.19705105482359 30.353504810736837, -88.28855238136764 30.381335210261483, 
-88.2957934935403 30.364865114151982, -88.31817511298274 30.363161152750493, -88.32014996175704 30.38190309511809, -88.3517475421464 30.390988804016374, -88.39058623470804 30.37224860380745, -88.39256108348229 30.342143188113525, -88.43732432236718 30.34725608152752, -88.44259058576552 30.331348424972347, -88.48077099540268 30.308618718161995, -88.5018360489957 30.312596797505776, -88.5288256489116 30.337030027619335, -88.55384040005282 30.331348424972347, -88.58807111214138 30.335893733464147, -88.60584475111035 30.368840908630105, -88.65784982716818 30.350096668434418, -88.70377316315728 30.333940318201655, -88.76345157602148 30.340304903230376, -88.81215810808295 30.366323547786152, -88.96059706293589 30.3823315816322, -89.10671665911954 30.334299618169936, -89.33169445006897 30.2782325348214, -89.41287200350473 30.192067121425936, -89.44534302487904 30.08977311890679, -89.359526754104 30.069703042406346, -89.20876844058175 30.176028017449696, -89.16006190852028 30.16199165952355, -89.16006190852028 30.08977311890679, -89.13918768049385 30.055651566395866, -89.16238126718959 29.921058052351142, -89.21108779925103 29.7782302647352, -89.24587817929464 29.66946405486304, -89.31545893938255 29.693644509171918, -89.27371048332964 29.629150377050877, -89.30386214603394 29.59688781400496, -89.3850396994697 29.61503677626405, -89.46157853556547 29.532331728567527, -89.3827203407997 29.439460132099583, -89.18979122539224 29.427128425449155, -89.19596940111164 29.394836842780364, -88.95749181835598 29.20950068771296, -89.04027937299146 29.04327521681688, -89.10329676532581 29.036793464759455, -89.13913018449634 28.94816874403513, -89.19473376596763 28.98276303815456, -89.26022242858967 29.027070073382717, -89.29358457747252 29.016265231135534, -89.4010848349841 28.903827915763785, -89.44186079473003 28.94816874403513, -89.39243538897739 29.025989640029877, -89.44680333530505 29.0864764893534, -89.45298151102406 29.14692783767063, -89.42209063242889 29.17066685855066, 
-89.376372132108 29.156639918576943, -89.36525141581359 29.180376694261838, -89.34671688865649 29.20195081707395, -89.37390086182032 29.206265096965808, -89.40232047012778 29.203029404070804, -89.4356826190106 29.20195081707395, -89.46780913274982 29.179297868997324, -89.49499310591362 29.193321712569713, -89.54441851166625 29.2278337721458, -89.63214860687673 29.254788228681946, -89.68404528291667 29.282813330704627, -89.80143062157865 29.305443379957012, -89.85579856790665 29.300055727855437, -89.9076952439466 29.27526886673428, -89.98924716343802 29.21489311194422, -90.05473582606008 29.165272108427317, -90.1572935429963 29.102672297767754, -90.26232567572231 29.06703684192044, -90.2833314731671 29.10914859304988, -90.26479694600961 29.13181701694839, -90.28827401374208 29.15232223739727, -90.29939473003648 29.19547774319095, -90.269739486585 29.23538042243976, -90.29568782460515 29.2310673685935, -90.30927981118687 29.24724038352842, -90.33893505463838 29.26341084252479, -90.393303000966 29.251552755878066, -90.40565935240406 29.30220951059583, -90.44767094729364 29.334530475168393, -90.50203889362164 29.32698983267116, -90.54899302908619 29.3032870410304, -90.57123446167503 29.309751984787326, -90.56629192109963 29.291433580776584, -90.59100462397578 29.241849662499888, -90.72198194921984 29.166349767170132, -90.74669465209598 29.108069019686837, -90.77634989554745 29.09943202507891, -90.84801673388853 29.115625795370804, -90.88755705849043 29.13721352427197, -90.89126396392177 29.16742872854931, -90.91474103165422 29.217048688862008, -90.92956865337999 29.198713674444775, -90.93080428852403 29.16742872854931, -90.97652278884493 29.160954790180483, -91.02965510002853 29.163112814994165, -91.08772995178788 29.17713887061521, -91.15445424955357 29.210577881865433, -91.21005783102524 29.21165637807694, -91.28790284508534 29.240771484217632, -91.33609261569393 29.292511224930067, -91.36327658885774 29.33237606278429, -91.36698404442399 29.461560546417502, 
-91.37316222014341 29.483075264842185, -91.42505889618336 29.473394206552683, -91.48189811279867 29.486302078781378, -91.50784645081843 29.51963980456361, -91.52885224826323 29.514263494469418, -91.5424442348453 29.538992155608895, -91.5424442348453 29.565864276975645, -91.56097876200242 29.62710598028727, -91.58692710002259 29.62603188493061, -91.6351168706312 29.61636451146687, -91.65118012750057 29.642142112800144, -91.63758814091887 29.656102225576717, -91.64129504635021 29.723727639327834, -91.71296188469128 29.718362205888425, -91.76980110130658 29.702264184791886, -91.83034722335326 29.685090118027063, -91.85505992622939 29.662544701481323, -91.845174845079 29.634624327137942, -91.79451380418274 29.609919080630874, -91.75744474986853 29.590580315799386, -91.69689862782151 29.57123784364269, -91.71914006041031 29.53254178333445, -91.74879530386181 29.517489314800784, -91.77103673645024 29.48199963736188, -91.8019276150454 29.470166981654714, -91.8525886559417 29.47662132872088, -91.94649692687157 29.517489314800784, -92.01816376521265 29.556191136499905, -92.081181157547 29.575536491098305, -92.14666982016863 29.57338719024669, -92.2282217396605 29.543292175193457, -92.2620745477254 29.513938983582918, -92.40184398133182 29.53587031357371, -92.53015690398729 29.55580375974114, -92.65388722226176 29.58171136156176, -92.76845233177524 29.613588513778055, -92.90134785881101 29.66735835342932, -92.99758255080252 29.705179169035475, -93.10985635812567 29.739006777997886, -93.2015084457365 29.758900043967557, -93.27024751144474 29.76088915343348, -93.3389865771526 29.740996282222838, -93.43980387352478 29.750943211286994, -93.61623414217547 29.740996282222838, -93.76058618016258 29.715129649330578, -93.8293252458708 29.663376387168597, -93.95534686633557 29.6852752506843, -94.10199020651301 29.65541198179467, -94.53962892485461 29.488014011072636, -94.71376789131534 29.402214927729204, -94.72293005770678 29.324334651639106, -94.9360211614021 29.206402439154715, 
-95.13765575414574 29.060298314788838, -95.37824248412433 28.86583930097359, -95.42635983011992 28.85781253647636, -95.88003908572986 28.63281185841163, -96.17332576608472 28.508049911739334, -96.3657951500674 28.385156078353774, -96.42995161139497 28.326681104162276, -96.5811775559527 28.237899002445175, -96.8538425165949 28.05404703841444, -97.01423366991399 27.871905659410203, -97.18378864850824 27.632633899635536, -97.28460594488008 27.443685488256804, -97.34647110401767 27.286998971872194, -97.37396673030096 27.111736817761667, -97.37167542811062 26.942327753778045, -97.30062980709232 26.653886203530476, -97.27907391837897 26.589650805967054, -97.227579295341 26.438555761793396, -97.19764056101677 26.3183955849306, -97.16770182669217 26.152971688301193, -97.15093613547081 26.036819954392655, -97.1461459379788 25.957168007570218))'
# Gulf_wkt carries the same polygon text as Gulf (defined just above).
# It is aliased rather than repeated so the two names cannot drift apart
# when the Gulf outline is edited.
Gulf_wkt = Gulf
# Coarse five-vertex WKT polygon (first vertex repeated to close the ring); vertices are 'lng lat' pairs
Gulf_with_coast_Coarse = 'POLYGON ((-84.84686264156812 25.68340767152216, -87.43248750342957 30.94498325396067, -95.80214228446721 30.0359530662285, -98.3915463041922 26.769804868308682, -95.4673270575147 24.286120603664912, -84.84686264156812 25.68340767152216))'

### Islands with many pings on them that should be excluded from feature calculation

In [None]:

AlabamaIsland= 'POLYGON ((-88.11218406491841 30.28270197801872, -88.11748790022997 30.284420403489747, -88.1289123735651 30.296676657854306, -88.13525930319557 30.31441063898764, -88.12276411652998 30.344306892409193, -88.1261384293325 30.346573465262267, -88.14068302824425 30.330248934489845, -88.14841474252134 30.326463909723657, -88.14033683208271 30.309429488292594, -88.12233483298868 30.27266101723977, -88.13929844491092 30.259205656184136, -88.17437965596034 30.254122039939332, -88.2182311697725 30.24923753336725, -88.30108761392364 30.233386081193586, -88.32532134524112 30.23258845903686, -88.34528532389731 30.2314917180055, -88.40194641363063 30.215039068387483, -88.40817794454074 30.217232905188055, -88.42087180380169 30.212147117858834, -88.43252707457806 30.211449048099738, -88.44764430697126 30.20376995391028, -88.4625307419233 30.204368346411314, -88.48353330906463 30.209653988836138, -88.51157452826871 30.22032416074346, -88.51088213594514 30.2246118384801, -88.53211550052762 30.228699915260023, -88.57377443864924 30.22780254711091, -88.68005685360352 30.245947643114775, -88.7160612544177 30.256215137818614, -88.73648716752392 30.24843987369816, -88.7441034830807 30.249536425624242, -88.76568304382471 30.244851072739735, -88.77803070692467 30.247742061555087, -88.78576242120177 30.24166091740736, -88.86804184612036 30.248938361474544, -88.8717346051779 30.25133081048743, -88.89908410195018 30.234083937046506, -88.91223955609405 30.225409510211975, -88.9413202941934 30.222318343593045, -88.98551800416709 30.215836746861996, -88.9950960979732 30.208656633512476, -89.06366907456174 30.218130299450735, -89.05593736028412 30.237573354839526, -89.06678484001655 30.251729468053952, -89.08051728776313 30.235080870280683, -89.13729363724818 30.23378483291677, -89.1603733813597 30.22989637955409, -89.12047958599241 30.22005260021551, -89.12145473065867 30.211288948999467, -89.09844131653294 30.20977208396677, -89.10448721346415 30.20286162516733, 
-89.08849484093591 30.203030178716162, -89.0746477866739 30.219378500914573, -88.99020129980069 30.207581121374503, -88.97615921660534 30.20437870267233, -88.95626626541215 30.205221454538503, -88.93988353780959 30.21752474053794, -88.91335960288517 30.224434169581016, -88.90282804048856 30.221737864773814, -88.89112630449237 30.229152525120753, -88.8747438740979 30.238588556974634, -88.86908803503293 30.248529110671157, -88.78542036054029 30.240947271347025, -88.77430371134409 30.238082926154163, -88.75870139668251 30.23572399096632, -88.732957577491 30.23825141935825, -88.69941263547238 30.232690225631984, -88.64343933162394 30.22308510875915, -88.59780256123877 30.21954614500025, -88.57654440751222 30.227466506513878, -88.53129735506896 30.228477021928455, -88.52973712360316 30.21988264595238, -88.52349619773817 30.217523274113134, -88.51160410515517 30.21952563990618, -88.50750849755646 30.207053988309582, -88.46518721903666 30.192389349352695, -88.41935541971861 30.19660354973145, -88.40141275785743 30.214301221513225, -88.34641459868976 30.230142150145852, -88.30409253525568 30.224580980238727, -88.22452073048152 30.239072330381035, -88.17205794743222 30.243958466852533, -88.12642117704708 30.227108695087196, -88.07200810466486 30.2368819144896, -88.07493353866359 30.258783876570888, -88.11218406491841 30.28270197801872))'

# WKT polygon for the GalvestonIsland exclusion area; vertices are 'lng lat' pairs and the ring is closed
GalvestonIsland = 'POLYGON ((-94.79810501429436 29.285782165982226, -94.85587801828437 29.256474439597326, -94.96386123262589 29.194262486622222, -94.98840851114379 29.17802706499387, -94.9832015126703 29.172398185737194, -94.95751050670931 29.185942592884174, -94.93155434112559 29.200608310191058, -94.90207396798999 29.21719558256349, -94.87503715080425 29.23384831840025, -94.84904021158258 29.24836617943305, -94.82392719885846 29.263585041171126, -94.80006095983339 29.275328868980722, -94.79198685607093 29.280744467360933, -94.78065199926338 29.29067598019759, -94.7849675640115 29.294349034860943, -94.79810501429436 29.285782165982226))'

# WKT polygon for the BiscayneTX exclusion area; vertices are 'lng lat' pairs and the ring is closed
BiscayneTX = 'POLYGON ((-94.72630885515686 29.39417826702656, -94.71635868628177 29.38842018852729, -94.70841340220476 29.39890094161126, -94.69809195840361 29.409315928554435, -94.6909634792623 29.41675455187773, -94.6995770582621 29.419341771414878, -94.72630885515686 29.39417826702656))'

# Function to extract coordinates from polygon string
def extract_coordinates(polygon):
    """Parse a single-ring 'POLYGON ((x y, x y, ...))' string into float pairs.

    Parameters
    ----------
    polygon : str
        WKT-style polygon text with one ring, e.g. 'POLYGON ((1 2, 3 4, 1 2))'.

    Returns
    -------
    list of (float, float)
        One (first value, second value) tuple per vertex, in the order given.
        For the polygons in this notebook that is (lng, lat).

    Vertices are split on ',' and each vertex on arbitrary whitespace, so
    spacing or line breaks after a comma do not matter.
    """
    ring_text = polygon.split('((')[1].split('))')[0]
    vertices = []
    for vertex_text in ring_text.split(','):
        # split() with no argument tolerates leading/trailing/multiple whitespace
        parts = vertex_text.split()
        vertices.append((float(parts[0]), float(parts[1])))
    return vertices

# Parse each island outline into its list of (lng, lat) vertices
alabama_coordinates, galveston_coordinates, biscayne_coordinates = (
    extract_coordinates(island_text)
    for island_text in (AlabamaIsland, GalvestonIsland, BiscayneTX)
)

# Pool every vertex, then take the extremes to get the rectangle enclosing all islands
all_coordinates = [*alabama_coordinates, *galveston_coordinates, *biscayne_coordinates]

min_islands_longitude = min(lng for lng, _lat in all_coordinates)
max_islands_longitude = max(lng for lng, _lat in all_coordinates)
min_islands_latitude = min(lat for _lng, lat in all_coordinates)
max_islands_latitude = max(lat for _lng, lat in all_coordinates)


### Function that can be used to create a new variable indicating if the lat & lng are within a polygon

In [None]:
def is_inside_any_polygon(lat, lng, polygons):
    """Return True if the point (lat, lng) lies inside at least one polygon.

    Can be used to create a new True/False variable indicating whether the
    lat & lng of a row are within a polygon.

    Parameters
    ----------
    lat, lng : float
        Latitude and longitude of the point. The point is built as
        Point(lng, lat), i.e. x = longitude, y = latitude.
    polygons : iterable
        Objects exposing `.contains(point)` (e.g. shapely polygons).

    Returns
    -------
    bool
        True as soon as one polygon contains the point, otherwise False
        (also False for an empty `polygons`).

    Usage: suppose that
            1. the polygon you want to use is defined by My_wkt
            2. df contains the columns lat and lng
            3. the new variable is "is_in_wkt"
    polygons = [wkt.loads(My_wkt)]  # Create a list with a single polygon
    df['is_in_wkt'] = df.apply(lambda row: is_inside_any_polygon(row['lat'], row['lng'], polygons), axis=1)
    """
    point = Point(lng, lat)
    return any(polygon.contains(point) for polygon in polygons)


### Function that excludes rows that are within the islands

In [None]:
from shapely.geometry import Polygon, Point  # Import the Point class

# Reformat each island WKT string into a shapely Polygon.
# Order matters: Alabama, Galveston, Biscayne -- the scratch variables
# coordinates_str / coordinates_list end up holding the BiscayneTX values.
island_polygons = []
for island_wkt in (AlabamaIsland, GalvestonIsland, BiscayneTX):
    # Remove the 'POLYGON ((' and '))' wrapper, leaving "lng lat, lng lat, ..."
    coordinates_str = island_wkt.replace('POLYGON ((', '').replace('))', '')
    # Split into vertices and convert each one to a tuple of floats
    coordinates_list = [tuple(map(float, vertex.split())) for vertex in coordinates_str.split(',')]
    # Create a Polygon object using the list of coordinates
    island_polygons.append(Polygon(coordinates_list))

# Individual names for the entries of the list of polygons
alabama_island_polygon, galveston_island_polygon, biscayne_tx_polygon = island_polygons

## , min_islands_latitude, max_islands_latitude, min_islands_longitude, max_islands_longitude
def is_point_outside_all_islands(data_frame):
    """Flag, for rows near the islands, whether the point is outside every island.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain 'lat' and 'lng' columns. NOTE: it is modified in place --
        a boolean column 'in_Gulf_not_Islands_rectangle' is added, True where
        the point lies outside the rectangle enclosing all island polygons.

    Returns
    -------
    pandas.DataFrame
        A copy of ONLY the rows that fall inside the islands' bounding
        rectangle, with an extra boolean column 'in_Gulf_not_Islands' holding
        the result of is_point_outside_all_islands_row for each row. Rows
        outside the rectangle are not returned.
    """
    # True where the point is outside the rectangle enclosing all islands;
    # such points cannot be on an island, so the polygon test is skipped for them.
    outside_rectangle = (
        (data_frame['lat'] < min_islands_latitude)
        | (data_frame['lat'] > max_islands_latitude)
        | (data_frame['lng'] < min_islands_longitude)
        | (data_frame['lng'] > max_islands_longitude)
    )
    data_frame['in_Gulf_not_Islands_rectangle'] = outside_rectangle

    # Keep only the rows inside the rectangle. The explicit copy makes the
    # column assignment below act on an independent frame (no SettingWithCopyWarning).
    filtered_data = data_frame.loc[~outside_rectangle].copy()

    if filtered_data.empty:
        # DataFrame.apply(axis=1) on an empty frame returns a DataFrame, which
        # cannot be assigned to a single column; add an empty boolean column instead.
        filtered_data['in_Gulf_not_Islands'] = pd.Series(dtype=bool, index=filtered_data.index)
    else:
        # Exact polygon test, one row at a time
        filtered_data['in_Gulf_not_Islands'] = filtered_data.apply(is_point_outside_all_islands_row, axis=1)

    return filtered_data

def is_point_outside_all_islands_row(row):
    """Return True when the row's (lng, lat) point is inside none of island_polygons.

    `row` must support row['lat'] and row['lng'].
    """
    location = Point(row['lng'], row['lat'])
    # "outside all islands" == "not inside any island"
    return not any(island.contains(location) for island in island_polygons)

########################### ON Islands
def is_point_on_islands(data_frame):
    """Flag, for rows near the islands, the result of the on-island test.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain 'lat' and 'lng' columns. NOTE: it is modified in place --
        a boolean column 'in_Gulf_not_Islands_rectangle' is added, True where
        the point lies outside the rectangle enclosing all island polygons.

    Returns
    -------
    pandas.DataFrame
        A copy of ONLY the rows that fall inside the islands' bounding
        rectangle, with an extra boolean column holding the result of
        is_point_on_islands_row for each row. NOTE: that column is named
        'in_Gulf_not_Islands' (same name as in is_point_outside_all_islands)
        even though it stores the on-island test. Rows outside the rectangle
        are not returned.
    """
    # True where the point is outside the rectangle enclosing all islands;
    # such points cannot be on an island, so the polygon test is skipped for them.
    outside_rectangle = (
        (data_frame['lat'] < min_islands_latitude)
        | (data_frame['lat'] > max_islands_latitude)
        | (data_frame['lng'] < min_islands_longitude)
        | (data_frame['lng'] > max_islands_longitude)
    )
    data_frame['in_Gulf_not_Islands_rectangle'] = outside_rectangle

    # Keep only the rows inside the rectangle. The explicit copy makes the
    # column assignment below act on an independent frame (no SettingWithCopyWarning).
    filtered_data = data_frame.loc[~outside_rectangle].copy()

    if filtered_data.empty:
        # DataFrame.apply(axis=1) on an empty frame returns a DataFrame, which
        # cannot be assigned to a single column; add an empty boolean column instead.
        filtered_data['in_Gulf_not_Islands'] = pd.Series(dtype=bool, index=filtered_data.index)
    else:
        # Exact polygon test, one row at a time
        filtered_data['in_Gulf_not_Islands'] = filtered_data.apply(is_point_on_islands_row, axis=1)

    return filtered_data

def is_point_on_islands_row(row):
    """Return True when the row's (lng, lat) point is inside at least one island polygon.

    `row` must support row['lat'] and row['lng']. The point is tested against
    every polygon in island_polygons; being on ANY one island counts as
    "on islands". Requiring all polygons to contain the point could never
    succeed for separate islands.
    """
    lat = row['lat']
    lng = row['lng']
    point = Point(lng, lat)
    return any(polygon.contains(point) for polygon in island_polygons)



### Function that finds out if a point is inside the Gulf wkt and not in the islands

In [None]:
from shapely.geometry import Polygon, Point  

# Build the shapely Polygons used by the point-in-Gulf checks below.
# Every WKT literal is a single-ring 'POLYGON ((lng lat, ...))' string, so the
# ring is parsed by stripping the wrapper and splitting on commas.
# Each polygon is built exactly once. The island WKTs are parsed last so the
# scratch variables coordinates_str / coordinates_list finish holding the
# BiscayneTX values.
region_polygons = []
for region_wkt in (Gulf, Gulf_with_coast_Coarse, AlabamaIsland, GalvestonIsland, BiscayneTX):
    # Remove the 'POLYGON ((' and '))' wrapper
    coordinates_str = region_wkt.replace('POLYGON ((', '').replace('))', '')
    # Split into vertices and convert each one to a tuple of floats
    coordinates_list = [tuple(map(float, vertex.split())) for vertex in coordinates_str.split(',')]
    region_polygons.append(Polygon(coordinates_list))

# Gulf outline and the coarse Gulf-with-coast outline
gulf_polygon, gulf_with_coast_polygon = region_polygons[:2]

# Create a list of polygons for the islands, plus individual names for each
island_polygons = region_polygons[2:]
alabama_island_polygon, galveston_island_polygon, biscayne_tx_polygon = island_polygons

# Create a function to check if a point is outside all polygons
def is_point_in_Gulf_not_Islands(row):
    """Return True when the row's point is inside the Gulf polygon and on no island.

    Parameters:
        row: mapping-like object (e.g. a DataFrame row) with 'lat' and 'lng' entries.

    Returns:
        bool: True if gulf_polygon contains Point(lng, lat) and none of the
        polygons in island_polygons contains it; False otherwise.
    """
    lat = row['lat']
    lng = row['lng']
    location = Point(lng, lat)

    # Guard clause: anything outside the Gulf polygon is rejected straight away.
    if not gulf_polygon.contains(location):
        return False

    # Inside the Gulf: accept only if no island polygon contains the point.
    return not any(island.contains(location) for island in island_polygons)

def is_point_in_Gulf_Coast_or_Islands(row):
    """Return True when the row's point lies inside gulf_with_coast_polygon.

    Parameters:
        row: mapping-like object (e.g. a DataFrame row) with 'lat' and 'lng' entries.

    Returns:
        bool: True if gulf_with_coast_polygon contains Point(lng, lat), else False.
        Unlike is_point_in_Gulf_not_Islands, the island polygons are not consulted.
    """
    lat = row['lat']
    lng = row['lng']
    return True if gulf_with_coast_polygon.contains(Point(lng, lat)) else False
    
    

### Function that eliminates pings that are almost certainly erroneous (speed > 60 m.p.h.)

In [None]:
def pingspeed(pings_df):
    """Add time-gap, distance and speed columns for consecutive pings.

    The frame is modified in place (columns are added to the caller's object) and the
    same object is returned. Rows are compared with their immediate neighbours in the
    frame's current order, so the caller is expected to have sorted it already.

    Columns written:
        time_diff_minutes_from_previous: event_timestamp minus the previous row's, / 60
            (0 on the first row).
        time_diff_minutes_to_next: event_timestamp minus the next row's, / 60
            (negative for ascending timestamps; 99999 on the last row).
        dist_fwd / dist_bkwd: haversine distance to the previous / next row
            (NaN on the first / last row).
        ping_speed_fwd / ping_speed_bkwd: |distance / (0.00001 + time gap)|, 0 where
            undefined. The 0.00001 offset avoids division by zero.
        Avg_ping_speed: mean of the two speeds.

    Units: assumes event_timestamp is in epoch seconds, so the speeds are in
    haversine-units per minute (km/min with haversine's default radius) - TODO confirm.

    Parameters:
        pings_df: DataFrame with 'event_timestamp', 'lat' and 'lng' columns.

    Returns:
        The same DataFrame, with the columns above added.
    """
    # Explicit assignment instead of `df[col].fillna(..., inplace=True)`: the chained
    # in-place form no longer updates the frame under pandas Copy-on-Write.
    pings_df['time_diff_minutes_from_previous'] = (pings_df["event_timestamp"].diff()/60.0).fillna(0)
    pings_df['time_diff_minutes_to_next'] = (pings_df["event_timestamp"].diff(-1)/60.0).fillna(99999)

    # "fwd" quantities describe the leg from the previous ping to this one.
    previous_ping = pings_df.shift(1)
    pings_df['dist_fwd'] = haversine(pings_df['lat'], pings_df['lng'], previous_ping['lat'], previous_ping['lng'])
    pings_df['ping_speed_fwd'] = abs(pings_df['dist_fwd']/(0.00001+pings_df['time_diff_minutes_from_previous'])).fillna(0)

    # "bkwd" quantities describe the leg from this ping to the next one.
    next_ping = pings_df.shift(-1)
    pings_df['dist_bkwd'] = haversine(pings_df['lat'], pings_df['lng'], next_ping['lat'], next_ping['lng'])
    pings_df['ping_speed_bkwd'] = abs(pings_df['dist_bkwd']/(0.00001+pings_df['time_diff_minutes_to_next'])).fillna(0)

    pings_df['Avg_ping_speed'] = (pings_df['ping_speed_fwd'] + pings_df['ping_speed_bkwd']) / 2

    return pings_df



In [None]:
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """Great-circle distance between two points using the haversine formula.

    Parameters:
        lat1, lon1: latitude / longitude of the first point(s).
        lat2, lon2: latitude / longitude of the second point(s).
        to_radians: when True (default) the four inputs are treated as degrees and
            converted together with np.radians; pass False if they are already radians.
        earth_radius: sphere radius; the result is in the same unit
            (the default 6371 corresponds to kilometres).

    Returns:
        The distance(s), as earth_radius times the central angle between the points.
    """
    if to_radians:
        # Converted in a single call, exactly as a list of four inputs.
        lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])

    half_dlat = (lat2-lat1)/2.0
    half_dlon = (lon2-lon1)/2.0

    # Haversine of the central angle.
    hav = np.sin(half_dlat)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon)**2

    return earth_radius * 2 * np.arcsin(np.sqrt(hav))

def EliminateErrantPings(pings_df):
    """Iteratively drop pings whose implied travel speed is implausibly high.

    Speeds are computed between neighbouring rows (the frame is assumed to be sorted
    by event_timestamp already - it is not sorted here). While more than two pings
    remain and the largest Avg_ping_speed exceeds 1.6 (about 60 mph when speeds are
    km/min), the ping with the largest average speed is removed and the speeds of its
    two former neighbours are recomputed across the gap.

    Side effects: the speed columns (see below) are added to the caller's frame in
    place. If any ping is dropped, the returned frame is a new object with a fresh
    0..n-1 index; otherwise the input object itself is returned.

    Columns written: time_diff_minutes_from_previous, time_diff_minutes_to_next,
    dist_fwd, ping_speed_fwd, dist_bkwd, ping_speed_bkwd, Avg_ping_speed, row_index
    (the row's position before any drop). Only the two speed columns and
    Avg_ping_speed are kept up to date while pings are being dropped.

    Units: assumes event_timestamp is in epoch seconds, giving speeds in km/min with
    haversine's default radius - TODO confirm.

    Parameters:
        pings_df: DataFrame with 'event_timestamp', 'lat' and 'lng' columns.

    Returns:
        DataFrame without the pings judged to be errant.
    """
    speed_limit_km_per_min = 1.6  # roughly 60 mph (1 mph = 0.0268224 km/min)

    # Explicit assignment instead of `df[col].fillna(..., inplace=True)`: the chained
    # in-place form no longer updates the frame under pandas Copy-on-Write.
    pings_df['time_diff_minutes_from_previous'] = (pings_df["event_timestamp"].diff()/60.0).fillna(0)
    pings_df['time_diff_minutes_to_next'] = (pings_df["event_timestamp"].diff(-1)/60.0).fillna(99999)

    # "fwd" = leg from the previous ping to this one.
    previous_ping = pings_df.shift(1)
    pings_df['dist_fwd'] = haversine(pings_df['lat'], pings_df['lng'], previous_ping['lat'], previous_ping['lng'])
    pings_df['ping_speed_fwd'] = abs(pings_df['dist_fwd']/(0.00001+pings_df['time_diff_minutes_from_previous'])).fillna(0)

    # "bkwd" = leg from this ping to the next one.
    next_ping = pings_df.shift(-1)
    pings_df['dist_bkwd'] = haversine(pings_df['lat'], pings_df['lng'], next_ping['lat'], next_ping['lng'])
    pings_df['ping_speed_bkwd'] = abs(pings_df['dist_bkwd']/(0.00001+pings_df['time_diff_minutes_to_next'])).fillna(0)

    pings_df['Avg_ping_speed'] = (pings_df['ping_speed_fwd'] + pings_df['ping_speed_bkwd']) / 2
    pings_df['row_index'] = pings_df.reset_index().index

    # Everything inside the loop is positional, so it is correct whatever index
    # labels the caller's frame carries.
    fwd_col = pings_df.columns.get_loc('ping_speed_fwd')
    bkwd_col = pings_df.columns.get_loc('ping_speed_bkwd')

    while len(pings_df) > 2 and pings_df['Avg_ping_speed'].max() > speed_limit_km_per_min:
        worst = int(np.nanargmax(pings_df['Avg_ping_speed'].to_numpy()))
        before = worst - 1
        after = worst + 1
        has_before = before >= 0
        has_after = after < len(pings_df)

        if has_before and has_after:
            # The two neighbours become adjacent: recompute the speed of the leg
            # that now joins them, in the same km-per-minute unit as the others.
            row_before = pings_df.iloc[before]
            row_after = pings_df.iloc[after]
            distance = haversine(row_before['lat'], row_before['lng'], row_after['lat'], row_after['lng'])
            minutes = (row_after['event_timestamp'] - row_before['event_timestamp'])/60.0
            new_speed = abs(distance/(0.00001+minutes))
            # That leg is the "to next" leg of the earlier ping and the
            # "from previous" leg of the later ping.
            pings_df.iloc[before, bkwd_col] = new_speed
            pings_df.iloc[after, fwd_col] = new_speed
        elif has_after:
            # First ping dropped: the new first ping has no previous leg.
            pings_df.iloc[after, fwd_col] = 0
        elif has_before:
            # Last ping dropped: the new last ping has no next leg.
            pings_df.iloc[before, bkwd_col] = 0

        keep = np.arange(len(pings_df)) != worst
        pings_df = pings_df.iloc[keep].reset_index(drop=True)
        pings_df['Avg_ping_speed'] = (pings_df['ping_speed_fwd'] + pings_df['ping_speed_bkwd']) / 2

    return pings_df

In [None]:
def EliminateErrantPingsSpeed(pings_df, mph_limit):
    """Iteratively drop pings whose implied travel speed exceeds mph_limit.

    Works on a copy: rows without an event_timestamp are discarded, the remainder is
    sorted by event_timestamp and exact duplicate rows are removed. Speeds between
    neighbouring pings are then computed and, while more than two pings remain and
    the largest Avg_ping_speed exceeds the limit, the ping with the largest average
    speed is removed and the speeds of its two former neighbours are recomputed
    across the gap.

    Columns added to the returned frame: time_diff_minutes_from_previous,
    time_diff_minutes_to_next, dist_fwd, ping_speed_fwd, dist_bkwd, ping_speed_bkwd,
    Avg_ping_speed, row_index (position after sorting, before any drop). Only the two
    speed columns and Avg_ping_speed are kept up to date while pings are dropped.

    Units: assumes event_timestamp is in epoch seconds, giving speeds in km/min with
    haversine's default radius - TODO confirm.

    Parameters:
        pings_df: DataFrame with 'event_timestamp', 'lat' and 'lng' columns.
            It is not modified.
        mph_limit: speed threshold in miles per hour.

    Returns:
        New DataFrame without the pings judged to be errant. Its index is reset to
        0..n-1 whenever at least one ping was dropped.
    """
    km_per_min_limit = mph_limit*(0.0268224)  # 1 mph = 1.609344 km/h = 0.0268224 km/min

    # Rows with no timestamp cannot be ordered or timed, so they are removed up
    # front rather than part-way through the elimination loop.
    pings_df = pings_df[pings_df['event_timestamp'].notna()]
    pings_df = pings_df.sort_values(by='event_timestamp').drop_duplicates().copy()

    # Explicit assignment instead of `df[col].fillna(..., inplace=True)`: the chained
    # in-place form no longer updates the frame under pandas Copy-on-Write.
    pings_df['time_diff_minutes_from_previous'] = abs(pings_df["event_timestamp"].diff()/60.0).fillna(0)
    pings_df['time_diff_minutes_to_next'] = abs(pings_df["event_timestamp"].diff(-1)/60.0).fillna(0)

    # "fwd" = leg from the previous ping to this one.
    previous_ping = pings_df.shift(1)
    pings_df['dist_fwd'] = haversine(pings_df['lat'], pings_df['lng'], previous_ping['lat'], previous_ping['lng'])
    pings_df['ping_speed_fwd'] = abs(pings_df['dist_fwd']/(0.00001+pings_df['time_diff_minutes_from_previous'])).fillna(0)

    # "bkwd" = leg from this ping to the next one.
    next_ping = pings_df.shift(-1)
    pings_df['dist_bkwd'] = haversine(pings_df['lat'], pings_df['lng'], next_ping['lat'], next_ping['lng'])
    pings_df['ping_speed_bkwd'] = abs(pings_df['dist_bkwd']/(0.00001+pings_df['time_diff_minutes_to_next'])).fillna(0)

    pings_df['Avg_ping_speed'] = (pings_df['ping_speed_fwd'] + pings_df['ping_speed_bkwd']) / 2
    pings_df['row_index'] = pings_df.reset_index().index

    # Everything inside the loop is positional: after sorting and de-duplicating,
    # the index labels no longer match row positions.
    fwd_col = pings_df.columns.get_loc('ping_speed_fwd')
    bkwd_col = pings_df.columns.get_loc('ping_speed_bkwd')

    while len(pings_df) > 2 and pings_df['Avg_ping_speed'].max() > km_per_min_limit:
        worst = int(np.nanargmax(pings_df['Avg_ping_speed'].to_numpy()))
        before = worst - 1
        after = worst + 1
        has_before = before >= 0
        has_after = after < len(pings_df)

        if has_before and has_after:
            # The two neighbours become adjacent: recompute the speed of the leg
            # that now joins them, in the same km-per-minute unit as the others.
            row_before = pings_df.iloc[before]
            row_after = pings_df.iloc[after]
            distance = haversine(row_before['lat'], row_before['lng'], row_after['lat'], row_after['lng'])
            minutes = abs(row_after['event_timestamp'] - row_before['event_timestamp'])/60.0
            new_speed = 0
            if distance*minutes > 0:
                new_speed = distance/minutes
            # That leg is the "to next" leg of the earlier ping and the
            # "from previous" leg of the later ping.
            pings_df.iloc[before, bkwd_col] = new_speed
            pings_df.iloc[after, fwd_col] = new_speed
        elif has_after:
            # First ping dropped: the new first ping has no previous leg.
            pings_df.iloc[after, fwd_col] = 0
        elif has_before:
            # Last ping dropped: the new last ping has no next leg.
            pings_df.iloc[before, bkwd_col] = 0

        keep = np.arange(len(pings_df)) != worst
        pings_df = pings_df.iloc[keep].reset_index(drop=True)
        pings_df['Avg_ping_speed'] = (pings_df['ping_speed_fwd'] + pings_df['ping_speed_bkwd']) / 2

    return pings_df

### Function to create CDF of speeds

In [None]:
import matplotlib.pyplot as plt

def CumulativeSpeedsGraph(pings_df):
    """Filter implausibly fast pings, then draw two diagnostic speed plots.

    Steps:
      1. Work on a sorted copy with one row per event_timestamp.
      2. Compute time gaps, distances and speeds between neighbouring pings, plus
         each ping's distance from the first ping.
      3. While more than two pings remain and any forward/backward speed exceeds
         1.60934 (about 60 mph when speeds are km/min), drop the ping with the
         largest Avg_ping_speed and recompute its neighbours' speeds across the gap.
      4. Figure 1: cumulative minutes spent at or below each forward speed (mph).
      5. Figure 2: forward/backward speed (mph) and distance from origin against
         hours since the first ping. plt.show() is called at the end.

    Units: assumes event_timestamp is in epoch seconds, giving speeds in km/min with
    haversine's default radius - TODO confirm.

    Parameters:
        pings_df: DataFrame with 'event_timestamp', 'lat' and 'lng' columns.
            It is not modified. Must contain at least one row.

    Returns:
        DataFrame with columns ping_speed_fwd (mph), time_diff_minutes_from_previous
        and cumulative_time_at_speed_lt, sorted by ping_speed_fwd.
    """
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MultipleLocator

    km_per_min_to_mph = 37.2823       # 1 km/min = 60 km/h, about 37.2823 mph
    speed_limit_km_per_min = 1.60934  # about 60 mph

    pings_df = pings_df.sort_values(by=['event_timestamp'], ascending=[True])
    pings_df = pings_df.drop_duplicates(subset=['event_timestamp']).copy()
    pings_df['hours_since_start'] = (pings_df['event_timestamp'] - pings_df['event_timestamp'].min())/(60*60)

    # Explicit assignment instead of `df[col].fillna(..., inplace=True)`: the chained
    # in-place form no longer updates the frame under pandas Copy-on-Write.
    pings_df['time_diff_minutes_from_previous'] = (pings_df["event_timestamp"].diff()/60.0).fillna(0)
    pings_df['time_diff_minutes_to_next'] = (pings_df["event_timestamp"].diff(-1)/60.0).fillna(99999)

    # "fwd" = leg from the previous ping to this one.
    previous_ping = pings_df.shift(1)
    pings_df['dist_fwd'] = haversine(pings_df['lat'], pings_df['lng'], previous_ping['lat'], previous_ping['lng'])
    pings_df['dist_fwd'] = pings_df['dist_fwd'].fillna(0)
    pings_df['ping_speed_fwd'] = abs(pings_df['dist_fwd']/(0.00001+pings_df['time_diff_minutes_from_previous'])).fillna(0)

    # Distance of every ping from the first ping of the (sorted) series.
    pings_df['lat_origin'] = pings_df['lat'].iloc[0]
    pings_df['lng_origin'] = pings_df['lng'].iloc[0]
    pings_df['dist_from_origin'] =  haversine(pings_df['lat'], pings_df['lng'], pings_df['lat_origin'], pings_df['lng_origin'])

    # "bkwd" = leg from this ping to the next one.
    next_ping = pings_df.shift(-1)
    pings_df['dist_bkwd'] = haversine(pings_df['lat'], pings_df['lng'], next_ping['lat'], next_ping['lng'])
    pings_df['dist_bkwd'] = pings_df['dist_bkwd'].fillna(0)
    pings_df['ping_speed_bkwd'] = abs(pings_df['dist_bkwd']/(0.00001+pings_df['time_diff_minutes_to_next'])).fillna(0)

    pings_df['Avg_ping_speed'] = (pings_df['ping_speed_fwd'] + pings_df['ping_speed_bkwd']) / 2
    pings_df['row_index'] = pings_df.reset_index().index

    ##############################################################################
    ###   Remove errant pings
    ##############################################################################
    # Everything inside the loop is positional: after sorting and de-duplicating,
    # the index labels no longer match row positions.
    fwd_col = pings_df.columns.get_loc('ping_speed_fwd')
    bkwd_col = pings_df.columns.get_loc('ping_speed_bkwd')
    gap_col = pings_df.columns.get_loc('time_diff_minutes_from_previous')

    while len(pings_df) > 2 and pings_df[['ping_speed_fwd', 'ping_speed_bkwd']].max().max() > speed_limit_km_per_min:
        worst = int(np.nanargmax(pings_df['Avg_ping_speed'].to_numpy()))
        before = worst - 1
        after = worst + 1
        has_before = before >= 0
        has_after = after < len(pings_df)

        if has_before and has_after:
            # The two neighbours become adjacent: recompute the leg that now joins
            # them, in the same km-per-minute unit as the other speeds.
            row_before = pings_df.iloc[before]
            row_after = pings_df.iloc[after]
            distance = haversine(row_before['lat'], row_before['lng'], row_after['lat'], row_after['lng'])
            minutes = (row_after['event_timestamp'] - row_before['event_timestamp'])/60.0
            new_speed = 0
            if distance*minutes > 0:
                new_speed = distance/minutes
            # That leg is the "to next" leg of the earlier ping and the
            # "from previous" leg of the later ping.
            pings_df.iloc[before, bkwd_col] = new_speed
            pings_df.iloc[after, fwd_col] = new_speed
            # The cumulative-time plot pairs ping_speed_fwd with this duration,
            # so it has to cover the whole merged leg as well.
            pings_df.iloc[after, gap_col] = minutes
        elif has_after:
            # First ping dropped: the new first ping has no previous leg.
            pings_df.iloc[after, fwd_col] = 0
            pings_df.iloc[after, gap_col] = 0
        elif has_before:
            # Last ping dropped: the new last ping has no next leg.
            pings_df.iloc[before, bkwd_col] = 0

        keep = np.arange(len(pings_df)) != worst
        pings_df = pings_df.iloc[keep].reset_index(drop=True)
        pings_df['Avg_ping_speed'] = (pings_df['ping_speed_fwd'] + pings_df['ping_speed_bkwd']) / 2

    ##############################################################################
    ###   Cumulative Speed Graph
    ##############################################################################
    # Cumulative time (minutes) spent at or below each forward speed (mph).
    fig_columns_df = pd.DataFrame()
    fig_columns_df['ping_speed_fwd'] = pings_df['ping_speed_fwd']*km_per_min_to_mph
    fig_columns_df['time_diff_minutes_from_previous'] = pings_df['time_diff_minutes_from_previous']
    fig_columns_df = fig_columns_df.sort_values(by=['ping_speed_fwd'], ascending=[True])
    fig_columns_df['cumulative_time_at_speed_lt'] = fig_columns_df['time_diff_minutes_from_previous'].cumsum()
    fig_columns_df['cumulative_time_at_speed_lt'] = fig_columns_df['cumulative_time_at_speed_lt'].fillna(0)

    fig, ax = plt.subplots()

    # Plot the data
    ax.plot(fig_columns_df['ping_speed_fwd'], fig_columns_df['cumulative_time_at_speed_lt'])

    # Set the labels for the axes
    ax.set_xlabel('ping_speed_fwd')
    ax.set_ylabel('cumulative_time_at_speed_lt')

    # Start the x-axis at zero and put a major tick every 5 units
    ax.set_xlim(left=0)
    ax.xaxis.set_major_locator(MultipleLocator(5))

    # Enable grid lines
    ax.grid(True)

    ##############################################################################
    ###   Speed and distance from origin over time
    ##############################################################################
    fig_speeds, ax1 = plt.subplots()

    ax1.plot(pings_df['hours_since_start'], km_per_min_to_mph*pings_df['ping_speed_fwd'], marker='o', linestyle='None', label='Ping Speed Forward', color='tab:blue')
    ax1.plot(pings_df['hours_since_start'], km_per_min_to_mph*pings_df['ping_speed_bkwd'], marker='o', linestyle='None', label='Ping Speed Backward', color='tab:orange')
    ax1.set_xlabel('Hours Since Start')
    ax1.set_ylabel('Ping Speed (fwd/bkwd)')
    ax1.tick_params(axis='y')
    ax1.legend(loc='upper left')

    # Secondary y-axis for the distance from the first ping
    ax2 = ax1.twinx()
    ax2.plot(pings_df['hours_since_start'], pings_df['dist_from_origin'], label='Distance from Origin', color='tab:green')
    ax2.set_ylabel('Distance from Origin')
    ax2.tick_params(axis='y')

    # Combine the legends of both y-axes into one
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines + lines2, labels + labels2, loc='upper center')

    plt.title('Ping Speeds and Distance from Origin over Time')

    plt.show()

    return fig_columns_df


### Assorted date & time functions

In [None]:
# THIS CODE GENERATE THE SEASON BASED ON MONTH

# def month_to_season(month_num):
    
#     monthMar= (month_num>1)*(month_num-1) + (month_num<=1)*(11+month_num)
#     season=int((monthMar) / 3)+1
    
#     return season
# month 6 and 1 are wrong

def month_to_season(month_num):
    """Map a zero-based month number (0 = January ... 11 = December) to a season code.

    Season codes:
        1: March - May       (months 2, 3, 4)
        2: June - August     (months 5, 6, 7)
        3: September - November (months 8, 9, 10)
        4: December - February  (months 11, 0, 1)

    Parameters:
        month_num: scalar month number in the range 0..11.

    Returns:
        int: the season code, 1..4.

    Raises:
        ValueError: if month_num is not one of 0..11 (previously this surfaced as an
            UnboundLocalError because no branch assigned a season).
    """
    if month_num in (11, 0, 1):   # DEC - FEB
        return 4
    if month_num in (2, 3, 4):    # MARCH - MAY
        return 1
    if month_num in (5, 6, 7):    # JUNE - AUGUST
        return 2
    if month_num in (8, 9, 10):   # SEP - NOV
        return 3
    raise ValueError(f"month_num must be between 0 (January) and 11 (December), got {month_num!r}")

# Check the season function: month 1 (February, since January = 0) belongs to season 4 (Dec - Feb).
# A bare `season` expression in the middle of a cell displays nothing, so assert instead.
season = month_to_season(1)
assert season == 4, f"month_to_season(1) returned {season}, expected 4"

# note 0 equals Jan and 11 equals december 
def monthofyear(EpochTime):  # January = 0
    """Return the zero-based month (0 = January ... 11 = December) for an epoch time.

    The month is derived arithmetically from the number of whole days since the
    Jan. 1, 2019 base used by epoch_to_days_since_1_1_2019, using fixed 365-day years
    and a one-day correction for every time after Feb 29, 2020. No other leap day is
    accounted for.

    Parameters:
        EpochTime: scalar epoch time in seconds.

    Returns:
        The month number, counted as how many month-start days the day-of-year has
        reached.
    """
    # Zero-based day-of-year on which February, March, ..., December begin
    # in a 365-day year.
    month_start_days = (31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334)
    leap_day_2020_epoch = 1582869600     # Feb 29, 2020

    elapsed_days = epoch_to_days_since_1_1_2019(EpochTime)
    # -1 once the 2020 leap day has passed, otherwise 0.
    leap_shift = -1*(EpochTime > leap_day_2020_epoch)
    whole_years = int((elapsed_days + leap_shift)/365)
    day_in_year = elapsed_days - 365*whole_years + leap_shift

    month = 0
    for first_day in month_start_days:
        month = month + 1*(day_in_year >= first_day)
    return month
        

def epoch_to_season(EpochTime):
    """Return the season code for an epoch time.

    Chains monthofyear (epoch -> zero-based month) and month_to_season
    (month -> season code).
    """
    return month_to_season(monthofyear(EpochTime))

def epoch_to_DOW(EpochTime):
    """Return a day-of-week number in 1..7 for an epoch time.

    Days are counted in whole 24-hour steps from the base instant, and the base day
    is numbered 1; the numbering then runs 2, 3, ... 7 and wraps back to 1.
    NOTE(review): Jan. 1, 2019 was a Tuesday, so 1 presumably means Tuesday and
    7 Monday - confirm against how callers interpret the value.

    Integer arithmetic is used so that times falling exactly on a day boundary are
    not pushed into the previous day by floating-point rounding, and so that times
    before the base still map into 1..7.

    Parameters:
        EpochTime: scalar epoch time in seconds.

    Returns:
        int: day-of-week number, 1..7.
    """
    Base = 1546322400  # Jan. 1, 2019, 12:00 a.m.
    BaseDOW = 1
    seconds_per_day = 24*60*60

    whole_days_since_base = int((EpochTime - Base) // seconds_per_day)
    return BaseDOW + whole_days_since_base % 7

def epoch_to_days_since_1_1_2019(EpochTime):
    """Return the number of whole 24-hour days between the base instant and EpochTime.

    The quotient is truncated toward zero, so times less than one day before the
    base also give 0.

    Parameters:
        EpochTime: scalar epoch time in seconds.

    Returns:
        int: whole days elapsed since the base.
    """
    Base = 1546322400  # Jan. 1, 2019, 12:00 a.m.
    seconds_per_day = 60*60*24
    elapsed_seconds = EpochTime-Base
    return int(elapsed_seconds / seconds_per_day)

def AISdate_to_epoch(AISDate):
    """Convert an AIS timestamp string such as "2019-06-01T16:14:14" to epoch seconds.

    The string is parsed as a naive datetime, so the conversion to epoch uses the
    local timezone of the machine running the code.

    Parameters:
        AISDate (str): timestamp in the format "%Y-%m-%dT%H:%M:%S".

    Returns:
        int: epoch time in whole seconds.
    """
    from datetime import datetime

    parsed = datetime.strptime(AISDate, "%Y-%m-%dT%H:%M:%S")
    return int(parsed.timestamp())

def epoch_to_hour_of_day(EpochTime):
    """Return the hour of day (0..23) for an epoch time, adjusted for daylight saving.

    The hour is counted from the base instant (documented as midnight, Jan. 1, 2019)
    in whole hours, one hour is added while EpochTime falls inside one of the
    hard-coded US Central daylight-saving windows, and the result is wrapped into
    0..23. The wrap matters for the last standard-time hour of a day during daylight
    saving, which is hour 0 of the next day rather than 24.

    Only the 2019-2023 daylight-saving windows are listed; times outside those years
    get no adjustment.

    Parameters:
        EpochTime: scalar epoch time in seconds.

    Returns:
        int: hour of day, 0..23.
    """
    Base = 1546322400  # Jan. 1, 2019, 12:00 a.m.
    # Daylight Savings Time (start, end) points for US Central Time
    dst_windows = (
        (1552204800, 1572768000),  # 2019
        (1583654400, 1604217600),  # 2020
        (1615708800, 1636272000),  # 2021
        (1647158400, 1667721600),  # 2022
        (1678608000, 1699171200),  # 2023
    )

    DayLightSavingsAdjust = 0
    for start, end in dst_windows:
        if start < EpochTime <= end:
            DayLightSavingsAdjust = 1
            break

    # Integer arithmetic avoids floating-point rounding at exact hour boundaries.
    whole_hours_since_base = int((EpochTime - Base) // (60*60))
    HourOfDay = (whole_hours_since_base + DayLightSavingsAdjust) % 24
    return HourOfDay

def epoch_to_date(epoch_time):
    """Format an epoch time as a 'YYYY-MM-DD' string.

    The conversion uses datetime.fromtimestamp, i.e. the local timezone of the
    machine running the code.

    Parameters:
        epoch_time: epoch time in seconds.

    Returns:
        str: the date formatted as '%Y-%m-%d'.
    """
    return datetime.fromtimestamp(epoch_time).strftime('%Y-%m-%d')

def epoch_to_datetime(epoch_time):
    """Convert an epoch time in seconds to a naive datetime in the local timezone."""
    local_moment = datetime.fromtimestamp(epoch_time)
    return local_moment

def date_to_epoch(date_string, date_format='%m/%d/%Y'):
    """
    Convert a date string to an integer epoch time.

    The string is parsed as a naive datetime, so the result depends on the local
    timezone of the machine running the code.

    Parameters:
        date_string (str): The date string to convert.
        date_format (str): strptime format of date_string. Default is '%m/%d/%Y'.

    Returns:
        int: Epoch time in whole seconds.
    """
    return int(datetime.strptime(date_string, date_format).timestamp())

# Set directories to be used

In [None]:
# Each directory below is written with a leading '~', expanded to the user's home
# directory, and then reported as existing or not. Nothing is created here.

#################################################################
# Directory for this output
OurTable_V3_directory = '~/RecFishing/Analysis with Our Tables and V3/Data Files'
# Expand the tilde to the user's home directory
OurTable_V3_directory = os.path.expanduser(OurTable_V3_directory)
# Check to make sure the directory exists
DirExist = os.path.exists(OurTable_V3_directory)
print(OurTable_V3_directory, "exists = " ,DirExist)

#################################################################
# Directory for Groups of V3 Pings
V3_Pings_Groups_directory = '~/RecFishing/Analysis with Our Tables and V3/Data Files/V3_Ping_Groups'
V3_Pings_Groups_directory = os.path.expanduser(V3_Pings_Groups_directory)
print(V3_Pings_Groups_directory, "exists = " ,os.path.exists(V3_Pings_Groups_directory))

#################################################################
# Directory with some core data for analysis
CoreData_Directory = '~/RecFishing/CoreData'
CoreData_Directory = os.path.expanduser(CoreData_Directory)
print(CoreData_Directory, "exists = ", os.path.exists(CoreData_Directory))

#################################################################
# Directory with Original_directory material
Original_directory = '~/RecFishing/DataflowStudioJobs'
Original_directory = os.path.expanduser(Original_directory)
DirExist = os.path.exists(Original_directory)
print(Original_directory, "exists = ", DirExist)


#################################################################
# Directory with previously processed material
Previously_Processed_directory = '~/RecFishing/DataflowStudioJobs/FinalCode - Rec Fishing Identification'
Previously_Processed_directory = os.path.expanduser(Previously_Processed_directory)
DirExist = os.path.exists(Previously_Processed_directory)
print(Previously_Processed_directory, "exists = ", DirExist)

#################################################################
# Directory with Travel Cost files
Travel_Cost_directory = '~/RecFishing/Travel Costs with Dedicated Table/CSV Files'
# Expand the tilde to the user's home directory
Travel_Cost_directory = os.path.expanduser(Travel_Cost_directory)
DirExist = os.path.exists(Travel_Cost_directory)
print(Travel_Cost_directory, "exists = ", DirExist)

#################################################################
# Directory with Weather data and related files
Weather_Data_directory = '~/RecFishing/uploaded_files/Weather Data'
# Expand the tilde to the user's home directory
Weather_Data_directory = os.path.expanduser(Weather_Data_directory)
# Check this directory itself; previously the Travel Cost result was re-printed here.
DirExist = os.path.exists(Weather_Data_directory)
print(Weather_Data_directory, "exists = ", DirExist)


#################################################################
# Directory with other Uploaded data
Uploaded_Data_directory = '~/RecFishing/uploaded_files'
# Expand the tilde to the user's home directory
Uploaded_Data_directory = os.path.expanduser(Uploaded_Data_directory)

####################################################################################
####################  AIS Directory #################################################
AIS_Directory = '~/RecFishing/AIS Files/Data'
AIS_Directory = os.path.expanduser(AIS_Directory)
DirExist = os.path.exists(AIS_Directory)
print(AIS_Directory, "exists = ", DirExist)

# ID_list_RandomSample from ScheduledExecution5.pkl


## Set input and output files to be used

In [None]:
def check_file_existence(file_path):
    """Print a notice when file_path does not exist; stay silent when it does.

    Nothing is raised and nothing is returned, so a missing file only shows up as a
    printed "<path> Does NOT Exist" line.
    """
    if os.path.exists(file_path):
        return
    print(f"{file_path} Does NOT Exist")


# Every name below is a full path built with os.path.join from one of the directory
# variables. Each path (except the log file) is passed to check_file_existence, which
# only prints a "<path> Does NOT Exist" line for missing paths - it never raises, so
# this cell runs to completion even when files are absent.

######################################################################################################################
#########################  Log File  ################
# No existence check for the log file.
Log_filename  =  os.path.join(OurTable_V3_directory, 'Log.txt')

######################################################################################################################
########################## Complete list of randomized IDs - without Bernoulli sampling (740k) #########################
# Earlier location of the same file, kept for reference:
# PKL_File_With_Random_IDs_Filename  =  os.path.join(Original_directory, 'cuebiq_id_list_wo_sampling_740k.pkl')
PKL_File_With_Random_IDs_Filename  =  os.path.join(CoreData_Directory, 'cuebiq_id_list_wo_sampling_740k.pkl')
check_file_existence(PKL_File_With_Random_IDs_Filename)

# Data gathered and used prior to the NOAA Webinar in February 2024
IDs_Used_in_NOAA_Webinar_filename = os.path.join(CoreData_Directory, 'IDs_From_Random_Draw_Prior_to_NOAA_Webinar.csv')
check_file_existence(IDs_Used_in_NOAA_Webinar_filename)
Ping_Used_in_NOAA_Webinar_filename = os.path.join(CoreData_Directory, 'Pings_From_Random_Draw_Prior_to_NOAA_Webinar.csv')
check_file_existence(Ping_Used_in_NOAA_Webinar_filename)

# List of IDs that have been processed for Indicators
AlreadyFullyProcessedIDs_Filename  =  os.path.join(OurTable_V3_directory, 'RandomlyChosenCuebiq_ids.List_of_Processed_ids.csv')
check_file_existence(AlreadyFullyProcessedIDs_Filename)

######################################################################################################################
#########################  ID Checklist with columns for ID, Pings, Indicators Created (TF) & Trips  ################
IDs_Pulled_from_Dedicated_Table_filename  =  os.path.join(OurTable_V3_directory, 'IDs_Pulled_From_Dedicated_Table.csv')
check_file_existence(IDs_Pulled_from_Dedicated_Table_filename)

ID_For_V3_Queries_filename  =  os.path.join(OurTable_V3_directory, 'IDs_from_V3.csv')
check_file_existence(ID_For_V3_Queries_filename)

RecTripRating_filename =  os.path.join(OurTable_V3_directory, 'RecTripRating.csv')
check_file_existence(RecTripRating_filename)

# This file contains information about the rows of Pings_V3_temp_filename that can be used to avoid loading the entire file into a data frame
V3_Pings_Index_filename =  os.path.join(OurTable_V3_directory, 'V3_Pings_File_Index.csv')
check_file_existence(V3_Pings_Index_filename)

ID_Groups_filename = os.path.join(OurTable_V3_directory,'Cuebiq_ID_Groups.csv')
check_file_existence(ID_Groups_filename)

######################################################
# Pings in the OurTable for a single large draw of IDs TEMPORARY FILE
Pings_OurTable_temp_filename = os.path.join(OurTable_V3_directory,'Pings_OurTable_temp.csv')
check_file_existence(Pings_OurTable_temp_filename)

# Pings from V3 corresponding with the IDs found in the OurTable
Pings_V3_temp_filename = os.path.join(OurTable_V3_directory,'Pings_V3_temp.csv')
check_file_existence(Pings_V3_temp_filename)

# Set output file names
Indicators_IDs_checked_filename = os.path.join(OurTable_V3_directory, 'IDs_Checked_Indicators_OurTable.csv')
check_file_existence(Indicators_IDs_checked_filename)

cuebiq_id_list_and_count_filename= os.path.join(OurTable_V3_directory,'cuebiq_id_list_and_count.csv')
check_file_existence(cuebiq_id_list_and_count_filename)

# List of IDs and dates for V3 query Pings in the OurTable for a single large draw of IDs TEMPORARY FILE
OurTable_IDs_and_Dates_filename = os.path.join(OurTable_V3_directory,'OurTable_IDs_and_Dates.csv')
check_file_existence(OurTable_IDs_and_Dates_filename)

##########################################################################################
############################### PINGS FILES   ##############################################
Pings_OurTable_Gulf_filename= os.path.join(OurTable_V3_directory,'Pings_OurTable_Gulf_ALL.csv')
check_file_existence(Pings_OurTable_Gulf_filename)

Pings_V3_Before_After_filename= os.path.join(OurTable_V3_directory,'Pings_V3_Before_After.csv')
check_file_existence(Pings_V3_Before_After_filename)

Pings_OurTable_Gulf_MT19_filename= os.path.join(OurTable_V3_directory,'Pings_OurTable_Gulf_MT19.csv')
check_file_existence(Pings_OurTable_Gulf_MT19_filename)

Pings_OurTable_Coast_filename= os.path.join(OurTable_V3_directory,'Pings_OurTable_Coast.csv')
check_file_existence(Pings_OurTable_Coast_filename)

Pings_OurTable_Outside_our_wkts_filename= os.path.join(OurTable_V3_directory,'Pings_OurTable_Outside_our_wkts.csv')
check_file_existence(Pings_OurTable_Outside_our_wkts_filename)

##########################################################################################
############################### INDICATORS  ##############################################
Indicators_filename = os.path.join(OurTable_V3_directory,'Indicators_OurTable.csv')
check_file_existence(Indicators_filename)

# Indicators file with the predicted vessel class columns (the notebook's key input)
# cuebiq_id_count_filename= os.path.join(EEZ_V3_directory,'cuebiq_id_count_distribution_EEZ_V3.csv')
Indicators_Classified_filename = os.path.join(OurTable_V3_directory,'Indicators_OurTable.Predictions.csv')
check_file_existence(Indicators_Classified_filename)

Rec_Indicators_filename = os.path.join(OurTable_V3_directory,'Rec_Indicators_OurTable.csv')
check_file_existence(Rec_Indicators_filename)

# Step-1 recreational indicators file. Check the Step1 path itself
# (the check previously re-tested Rec_Indicators_filename by mistake).
Rec_Indicators_Step1_filename = os.path.join(OurTable_V3_directory,'Rec_Indicators_OurTable.Step1.csv')
check_file_existence(Rec_Indicators_Step1_filename)

# Remaining indicator, output, weather, map and AIS paths. As above, each path is built
# with os.path.join and passed to check_file_existence, which only prints a notice when
# the path is missing.
V3_Indicators_filename =  os.path.join(OurTable_V3_directory,'V3_indicators.csv')
check_file_existence(V3_Indicators_filename)

Rec_indicators_with_V3_filename = os.path.join(OurTable_V3_directory,'rec_indicators_with_V3.csv')
check_file_existence(Rec_indicators_with_V3_filename)

Indicators_with_V3_indicators_filename= os.path.join(OurTable_V3_directory,'Indicators_with_V3_indicators_indicators.csv')
check_file_existence(Indicators_with_V3_indicators_filename)

# Earlier file name, kept for reference:
# Rec_Indicators_Selected_filename = os.path.join(OurTable_V3_directory,'Rec_Indicators_OurTable_Selected.csv')
Rec_Indicators_Selected_filename = os.path.join(OurTable_V3_directory,'Rec_Indicators_OurTable_Selected_May2024.csv')
check_file_existence(Rec_Indicators_Selected_filename)

Rec_Indicators_Final_All_Exclusions_And_Disappearance_filename = os.path.join(OurTable_V3_directory,'Rec_Indicators_Final_All_Exclusions_And_Disappearance.csv')
check_file_existence(Rec_Indicators_Final_All_Exclusions_And_Disappearance_filename)

# Earlier result paths, kept for reference:
# Sorted_Results_file_path = os.path.join(OurTable_V3_directory,'Indicators_EEZ_and_V3.Predictions.sorted.csv')
# RecFishing_Results_file_path =  os.path.join(OurTable_V3_directory,'RecFishingBoat Predictions.sorted.csv')

# Final trip-level data set (one of the notebook's key outputs)
DisappearanceIndicators_filename = os.path.join(OurTable_V3_directory,'DisappearanceIndicators.csv')
check_file_existence(DisappearanceIndicators_filename)

DisappearanceAnalysis_filename = os.path.join(OurTable_V3_directory,'DisappearanceAnalysis.csv')
check_file_existence(DisappearanceAnalysis_filename)

Stops_Indicators_filename = os.path.join(OurTable_V3_directory,'Stops_Indicators.csv')
check_file_existence(Stops_Indicators_filename)

Trawls_Indicators_filename = os.path.join(OurTable_V3_directory,'Trawls_Indicators.csv')
check_file_existence(Trawls_Indicators_filename)

# Stops and trawls within each trip (one of the notebook's key outputs)
Stop_Trawls_Indicators_filename = os.path.join(OurTable_V3_directory,'Stop_Trawls_Indicators.csv')
check_file_existence(Stop_Trawls_Indicators_filename)
##########################################################################################
############################### WEATHER data files ####################################
Buoys_file_path  = os.path.join(Weather_Data_directory,'Buoys.csv')
check_file_existence(Buoys_file_path)

Weather_file_path  = os.path.join(Weather_Data_directory,'DailyWeatherData.csv')
check_file_existence(Weather_file_path)

##########################################################################################
############################### SUPPLEMENTARY MAP DATA  ############################
# One WKT polygon per line; read by load_polygons_from_wkt
Industrial_polygons_filename  = os.path.join(Uploaded_Data_directory,'Polygons Around Industrial Sites.wkt')
check_file_existence(Industrial_polygons_filename)

##########################################################################################
############################### AIS Files INCLUDING CLASSIFIER ############################
RF_Classfier_filename = os.path.join(AIS_Directory, 'rf_model_AIS_2019.pkl')
check_file_existence(RF_Classfier_filename)

RF_Importance_Factors_filename = os.path.join(AIS_Directory, 'rf_classifier_importance_factors.csv')
check_file_existence(RF_Importance_Factors_filename)

AIS_Predictions_filename = os.path.join(AIS_Directory, 'RandomForest_Predictions2019AISData.csv')
check_file_existence(AIS_Predictions_filename)

AIS_indicators_file_path = os.path.join(AIS_Directory,'Indicators2019_All.C.csv')
check_file_existence(AIS_indicators_file_path)
  

### Dedicated table names for reference
# Dedicated table with all Pings within the Gulf WKT for 1/1/2019 - 4/22/2022
#  Table Name:  dedicated.ScheduledExecution5.DeviceTable   
#  Code used for call:  RecFishing/DataflowStudioJobs/ScheduledEx5-updated.ipynb

# Dedicated table with all Pings within the Gulf WKT AND Origin for 1/1/2019 - 4/22/2022
#  Table Name:  dedicated.ScheduledExecution5_parallel_origin.DeviceTable
#  Code used for call:  RecFishing/DataflowStudioJobs/ScheduledEx5-origin.ipynb


## Map Preliminaries

In [None]:
import geopandas as gpd

def _load_point_layer(shapefile_path):
    """Read a point layer, reproject it to EPSG:4326, and add 'lng'/'lat' columns from the geometry."""
    layer = gpd.read_file(shapefile_path).to_crs(epsg=4326)
    layer['lng'] = layer['geometry'].x
    layer['lat'] = layer['geometry'].y
    return layer


# Bounds of the ping data period (1/1/2019 - 4/22/2022) used to keep only structures
# that were in place at some point during that period.
_period_start = pd.Timestamp('2019-01-01')
_period_end = pd.Timestamp('2022-04-22')

# Platforms: parse the removal/installation dates (unparseable values become NaT)
platforms = _load_point_layer("~/RecFishing/Travel Costs with Dedicated Table/Shapefiles/platform.shx")
for _date_column in ('REMOVAL_DA', 'INSTALL_DA'):
    platforms[_date_column] = pd.to_datetime(platforms[_date_column], errors='coerce', format='%Y-%m-%d')

# Keep platforms never removed, or removed after the period started ...
_still_present = platforms['REMOVAL_DA'].isna() | (platforms['REMOVAL_DA'] > _period_start)
platforms = platforms[_still_present]
# ... and installed before the period ended (rows with a missing install date are dropped)
platforms = platforms[platforms['INSTALL_DA'] < _period_end]

# Louisiana artificial reefs: keep those deployed before the period ended
LA_AR = _load_point_layer("~/RecFishing/Travel Costs with Dedicated Table/Shapefiles/Artificial_Reef_LA_2021_centroid.shx")
LA_AR['Deployment'] = pd.to_datetime(LA_AR['Deployment'], errors='coerce', format='%Y-%m-%d')
LA_AR = LA_AR[LA_AR['Deployment'] < _period_end]

# Texas and Alabama artificial reefs: no date filtering is applied
TX_AR = _load_point_layer("~/RecFishing/Travel Costs with Dedicated Table/Shapefiles/Artificial_Reef_TX_2021_2.shx")
AL_AR = _load_point_layer("~/RecFishing/Travel Costs with Dedicated Table/Shapefiles/AL_AR_2024.shx")

### Function to check if a point is within one of the industrial polygons

In [None]:
from shapely import wkt
from shapely.geometry import Point

####################################################################
##  LOAD INDUSTRIAL POLYGONS
# Load polygons from WKT file
def load_polygons_from_wkt(file_path):
    """
    Load one geometry per line from a WKT text file.

    Parameters:
    file_path (str): Path to a text file with one WKT geometry per line.

    Returns:
    list: The geometries returned by wkt.loads, in file order.

    Blank (whitespace-only) lines are skipped so that a stray empty line in the
    file does not make wkt.loads fail on an empty string.
    """
    polygons = []
    with open(file_path, 'r') as file:
        for line in file:
            wkt_text = line.strip()
            if not wkt_text:
                continue  # nothing to parse on this line
            polygons.append(wkt.loads(wkt_text))
    return polygons

# Load polygons
# Parsed once here so that the point-in-polygon helpers below can reuse the same list
Industrial_polygons = load_polygons_from_wkt(Industrial_polygons_filename)


# ####################################################################
# # Check if a point is within any of the polygons
# def is_point_in_polygons(lat, lng, polygons):
#     point = Point(lng, lat)  # Note that Point takes (lng, lat)
#     for polygon in polygons:
#         if polygon.contains(point):
#             return True
#     return False

# # Path to the WKT file


# Check if a point is within any of the polygons
def is_point_in_polygons(lat, lng, polygons):
    """
    Report whether a coordinate falls inside at least one polygon.

    Parameters:
    lat: Latitude of the location.
    lng: Longitude of the location.
    polygons: Iterable of objects exposing a contains(point) method.

    Returns:
    bool: True as soon as one polygon contains the point, otherwise False.
    """
    # Point is built as (x, y), i.e. (lng, lat)
    location = Point(lng, lat)
    return any(region.contains(location) for region in polygons)

# Check if any point in the DataFrame is within any of the polygons
def any_point_in_polygons(df, polygons):
    """
    Report whether any row of a DataFrame lies inside at least one polygon.

    Parameters:
    df (pandas.DataFrame): Frame with 'lat' and 'lng' columns.
    polygons: Polygons passed through to is_point_in_polygons.

    Returns:
    bool: True at the first row found inside a polygon, False if none (or no rows).
    """
    return any(
        is_point_in_polygons(ping.lat, ping.lng, polygons)
        for ping in df.itertuples(index=False)
    )

####################################################################
# EXAMPLE USAGE FOR A SINGLE POINT
# (latitude = 30.0700 is an alternative value to try)
latitude, longitude = 31.0700, -93.7300
is_within = is_point_in_polygons(latitude, longitude, Industrial_polygons)

# Same sentence in both cases; only the verb phrase changes with the result
membership = "is within" if is_within else "is NOT within"
print(f"The point ({latitude}, {longitude}) {membership} one of the polygons")

####################################################################
# EXAMPLE USAGE FOR A DATA FRAME
# Three sample points with lat and lng columns
data = {
    'lat': [30.075, 29.765, 30.070],
    'lng': [-93.728, -93.882, -93.730],
}
test_df = pd.DataFrame(data)

# True if at least one of the sample points is inside an industrial polygon
result = any_point_in_polygons(test_df, Industrial_polygons)

print("The result of the data frame test is ", result)


# Analysis of potential recreational fishing trips after the ML classification
These steps were taken to analyze the trip. They are retained here to give the reader a sense of the tools that were used to develop the non-ML criteria for trip identification.

## Functions for creating a map

In [None]:
def SelectATrip(cuebiq_id_i, Trip_number_i):
    """
    Look up one specific trip in the global Rec_Indicators_df.

    Parameters:
    cuebiq_id_i: Value to match against the 'cuebiq_id' column.
    Trip_number_i: Value to match against the 'Trip_number' column.

    Returns:
    pandas.DataFrame: One matching row (drawn with sample(n=1) if several rows
    match), or an empty frame when nothing matches. The result is also stored
    in the global random_trip_df.
    """
    global random_trip_df

    is_requested_trip = (
        (Rec_Indicators_df['cuebiq_id'] == cuebiq_id_i)
        & (Rec_Indicators_df['Trip_number'] == Trip_number_i)
    )
    matching_trips = Rec_Indicators_df[is_requested_trip]

    # sample() cannot draw from an empty frame, so an empty match is returned as is
    random_trip_df = matching_trips.sample(n=1) if len(matching_trips) > 0 else matching_trips
    return random_trip_df

In [None]:
def DrawATrip(Rec_Indicators_df):
    """
    Draw one trip at random from a trip-indicator frame and print its identifiers.

    Parameters:
    Rec_Indicators_df (pandas.DataFrame): Frame with 'cuebiq_id', 'Trip_number'
        and 'pct_during_outside_gulf' columns. Must contain at least one row,
        otherwise DataFrame.sample raises.

    Returns:
    pandas.DataFrame: The single sampled row. It is also stored in the global
    random_trip_df.
    """
    global random_trip_df

    random_trip_df = Rec_Indicators_df.sample(n=1)

    # Read each scalar from its own column (not from the row) so that integer ids
    # are not upcast to float, and print the value itself rather than the Series repr.
    text = str(random_trip_df['cuebiq_id'].iloc[0])
    text2 = str(random_trip_df['pct_during_outside_gulf'].iloc[0])
    print(text, random_trip_df['Trip_number'].iloc[0], text2)

    return random_trip_df

In [None]:
# A simple function to create a kernel density plot
import seaborn as sns
import matplotlib.pyplot as plt

def create_kde_plot(df, column):
    """
    Draw and show a filled kernel density estimate for one DataFrame column.

    Parameters:
    df (pandas.DataFrame): Source data.
    column (str): Name of the column whose distribution is plotted.

    Returns:
    None. When `column` is not present in `df`, a message is printed and
    nothing is plotted.
    """
    if column in df.columns:
        # kdeplot draws on the current axes and returns them; label those axes directly
        density_ax = sns.kdeplot(data=df, x=column, fill=True)
        density_ax.set_title(f'Kernel Density Estimate of {column}')
        density_ax.set_xlabel(column)
        density_ax.set_ylabel('Density')
        plt.show()
    else:
        print(f"Column '{column}' does not exist in the DataFrame.")

# Example usage:
# Assuming 'DisappearanceAnalysis_df' is your DataFrame and you want to plot 'crit1'
# create_kde_plot(DisappearanceAnalysis_df, 'crit1')


In [None]:
# A simple function that creates a cumulative CDF
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def plot_cdf(data, column, xlim_max=None, xlim_min=None):
    """
    Plot the empirical cumulative distribution of one DataFrame column.

    Parameters:
    data (pd.DataFrame): Frame holding the values.
    column (str): Column to plot; missing values are dropped first.
    xlim_max (float, optional): Upper limit of the horizontal axis.
    xlim_min (float, optional): Lower limit of the horizontal axis.

    Returns:
    None
    """
    ordered_values = np.sort(data[column].dropna())
    n_values = len(ordered_values)
    # The i-th smallest value sits at cumulative share i / n
    cumulative_share = np.arange(1, n_values + 1) / n_values

    plt.figure(figsize=(8, 6))
    plt.plot(ordered_values, cumulative_share, marker='.', linestyle='none')
    plt.xlabel(column)
    plt.ylabel('CDF')
    plt.title('Cumulative Density Function')

    # Only touch the x-axis when a limit was given; a side left as None keeps its current value
    if xlim_min is not None or xlim_max is not None:
        plt.xlim(left=xlim_min, right=xlim_max)

    plt.grid(True)
    plt.show()



### Function that applies initial non-ML filtering criteria to the Indicators_df
This involves criteria that are based entirely on variables using pings in the Gulf polygon and are not excluded because they are within polygons around islands.   Additional criteria are applied below to trips that pass these criteria.

In [None]:
def NonMLTripFiltering(Indicators_df):
    """
    Apply the non-ML trip filters that only need the trip-level indicator columns.

    The filters are applied in sequence and the remaining row count is printed
    after each one:
      1. final distance from origin / max distance from origin must be < 0.5
         (drops trips that look one-way);
      2. Trip_Duration_t must be at most 72 hours (the column is compared with
         hours*60, i.e. it is treated as minutes);
      3. the larger of the first/last distance from the coast must be < 5;
      4. Trip_Duration_t must be more than 2 hours;
      5. trips longer than 9 hours must reach at least 50 km from the origin;
      6. the max distance from the origin must exceed 0.5 km.

    Parameters:
    Indicators_df (pandas.DataFrame): Trip indicators with the columns
        'Distance_from_origin_t', 'Max_distance_traveled_origin_t',
        'Trip_Duration_t', 'first_distance_from_coast_t' and
        'last_distance_from_coast_t'. The frame is not modified.

    Returns:
    pandas.DataFrame: The rows passing every filter, with the added columns
        'final_over_max', 'Begin_End_Dist_from_Coast_max' and
        'Begin_End_Dist_from_Coast_min'.
    """
    # Kept from the original: silence SettingWithCopyWarning for the session.
    # The class does not exist in every pandas version, hence the getattr guard.
    copy_warning = getattr(pd.errors, 'SettingWithCopyWarning', None)
    if copy_warning is not None:
        warnings.filterwarnings("ignore", category=copy_warning)

    print(len(Indicators_df), "before processing")

    # Work on a copy so that the columns added below never leak into the caller's frame
    Indicators_df = Indicators_df.copy()

    ############################## One-Way Trips ####################################
    # Exclude trips that appear to be one way: the furthest distance from the origin
    # must be at least twice the final distance from the origin.
    # The ratio uses the frame passed in (previously it divided by the global Rec_Indicators_df).
    Indicators_df['final_over_max'] = (
        Indicators_df['Distance_from_origin_t'] / Indicators_df['Max_distance_traveled_origin_t']
    )
    Indicators_df1 = Indicators_df[Indicators_df['final_over_max'] < 0.5]
    print(len(Indicators_df1), "after excluding trips with final distance from origin/ max distance from origin > 0.5")

    ############################## Exclude trips that are too long in duration ####################################
    max_hours = 72
    Indicators_df2 = Indicators_df1[Indicators_df1['Trip_Duration_t'] <= max_hours*60].copy()
    print(f"{len(Indicators_df2)} after limiting to trips less than {max_hours} hours")

    ############################## Start too far from coast ####################################
    coast_distance_columns = ['first_distance_from_coast_t', 'last_distance_from_coast_t']
    Indicators_df2['Begin_End_Dist_from_Coast_max'] = Indicators_df2[coast_distance_columns].max(axis=1)
    Indicators_df2['Begin_End_Dist_from_Coast_min'] = Indicators_df2[coast_distance_columns].min(axis=1)
    min_distance = 5
    Indicators_df3 = Indicators_df2[Indicators_df2['Begin_End_Dist_from_Coast_max'] < min_distance]
    print(len(Indicators_df3), "after limiting to trips where the start & end distances from coast are less than ", min_distance)

    ############################## Exclude trips that are too short in duration ####################################
    min_hours = 2
    Indicators_df4 = Indicators_df3[Indicators_df3['Trip_Duration_t'] > min_hours*60]
    print(f"{len(Indicators_df4)} after limiting to trips that are at least {min_hours} hours in length")

    ############################## Trips of more than 9 hours must go at least 50 km from the origin ####################################
    # A boolean mask replaces the temporary 'Long_Near' column that used to be added and dropped
    long_and_near = (
        (Indicators_df4['Trip_Duration_t'] > 9*60)
        & (Indicators_df4['Max_distance_traveled_origin_t'] < 50)
    )
    Indicators_df5 = Indicators_df4[~long_and_near]
    print(len(Indicators_df5), "after dropping trips more than 9 hours and didn't go at least 50 km from the origin")

    ############################## Exclude trips that do not travel far enough from the origin ####################################
    min_dist = 0.5
    Indicators_df6 = Indicators_df5[Indicators_df5['Max_distance_traveled_origin_t'] > min_dist]
    print(f"{len(Indicators_df6)} after dropping trips that didn't go at least {min_dist} km from the origin")

    return Indicators_df6


# Main Cells to Apply Criteria to select recreational fishing trips

In [None]:
# Load indicators after ML & associated ping 
# Indicators_with_RF_Predictions_df: trip-level indicators including the 'Predicted_Class' column used below
Indicators_with_RF_Predictions_df = pd.read_csv(Indicators_Classified_filename)
# AllPings_OurTable_df: pings from the dedicated Gulf table (reduced to cuebiq_id/lat/lng/event_timestamp later on)
AllPings_OurTable_df = pd.read_csv(Pings_OurTable_Gulf_filename)

## First apply criteria to get a list of potential rec trips

In [None]:
# Reload the step-1 result from disk.
# NOTE(review): this file is written at the end of the next cell, so on a first run it
# may not exist yet; the next cell also rebuilds Rec_Indicators_df from scratch.
Rec_Indicators_df = pd.read_csv(Rec_Indicators_Step1_filename)

In [None]:
# Limit to only ML predictions of 371
Indicators_with_RF_Predictions_df = pd.read_csv(Indicators_Classified_filename)

print(len(Indicators_with_RF_Predictions_df), "trips were categorized with the ML algorithm (including duplicates)")

# Clean up the file: for each (cuebiq_id, Trip_number) keep the row with the lowest total of
# the pct variables, since those had some errors. Sorting by the total first means that
# keep='first' retains the lowest one.
_trip_key = ['cuebiq_id', 'Trip_number']
Indicators_with_RF_Predictions_df = (
    Indicators_with_RF_Predictions_df
    .assign(tot_pct=lambda trips: trips['pct_time_stopped'] + trips['pct_time_trawling'] + trips['pct_time_moving'])
    .sort_values(by=_trip_key + ['tot_pct'])
    .drop_duplicates(subset=_trip_key, keep='first')
)

print(len(Indicators_with_RF_Predictions_df), "trips were categorized with the ML algorithm")
Rec_Indicators_df = Indicators_with_RF_Predictions_df.loc[Indicators_with_RF_Predictions_df['Predicted_Class'] == 371]
n_rec_trips0 = len(Rec_Indicators_df)
print(n_rec_trips0, "trips identified as 371")

# Apply the exclusion criteria that do not require V3 pings, then save the step-1 result
Rec_Indicators_df = NonMLTripFiltering(Rec_Indicators_df)
Rec_Indicators_df.to_csv(Rec_Indicators_Step1_filename, index=False)


In [None]:
# CDF of trip duration for the step-1 trips, with the x-axis capped at 10 hours
df = pd.read_csv(Rec_Indicators_Step1_filename)
# Start/end timestamps are differenced and divided by 60*60 to express the duration in hours
df['Trip_duration_hrs'] = df['timestamp_end_t'].sub(df['timestamp_start_t']).div(60*60)
plot_cdf(df, 'Trip_duration_hrs', xlim_max=10)


### Add any new trips to ID_For_V3_Queries_df for new queries of the complete Cuebiq Device Table (V3)


In [None]:
if os.path.exists(ID_For_V3_Queries_filename):
    ID_For_V3_Queries_df = pd.read_csv(ID_For_V3_Queries_filename)
    trip_key = ['cuebiq_id', 'Trip_number']

    # Left anti-join: flag every trip in Rec_Indicators_df by whether it already
    # appears in the tracking file
    merged_df = Rec_Indicators_df.merge(
        ID_For_V3_Queries_df[trip_key],
        on=trip_key,
        how='left',
        indicator=True,
    )

    # 'left_only' rows are trips not yet tracked; they start out as not pulled
    missing_rows = (
        merged_df.loc[merged_df['_merge'] == 'left_only']
        .drop(columns=['_merge'])
        .assign(V3_pings_pulled=False)
    )

    # Append the new trips and keep only the tracking columns before saving
    columns_to_append = ['cuebiq_id', 'Trip_number', 'timestamp_start_t', 'timestamp_end_t', 'V3_pings_pulled']
    ID_For_V3_Queries_df = pd.concat(
        [ID_For_V3_Queries_df, missing_rows[columns_to_append]],
        ignore_index=True,
    )[columns_to_append]
    ID_For_V3_Queries_df.to_csv(ID_For_V3_Queries_filename, index=False)

# Cell that pulls data from the Cuebiq Device table
### First pull V3 pings in bunches
This will not capture orphan trips that were missed

In [None]:
#### With found start date
# Pull pings from the full Cuebiq Device table (V3) for every trip that has not been pulled yet.
# Trips are grouped into date windows, one SQL query is issued per window, and the pings within
# eight hours of each trip are appended to Pings_V3_Before_After_filename. Progress is saved to
# ID_For_V3_Queries_filename after every window, so the cell can be re-run to resume.
if os.path.exists(ID_For_V3_Queries_filename):
    ID_For_V3_Queries_df = pd.read_csv(ID_For_V3_Queries_filename)
    columns_to_retain = ['cuebiq_id', 'Trip_number', 'timestamp_start_t', 'timestamp_end_t', 'V3_pings_pulled']
    ID_For_V3_Queries_df = ID_For_V3_Queries_df[columns_to_retain]

else:  # First run: seed the tracking table from the step-1 trips, none of which has been pulled
    Indicators_df= pd.read_csv(Rec_Indicators_Step1_filename)
    ID_For_V3_Queries_df = Indicators_df[['cuebiq_id', 'Trip_number', 'timestamp_start_t', 'timestamp_end_t']].copy()
    ID_For_V3_Queries_df['V3_pings_pulled'] = False

# .copy() so that the flag updates below are made on an independent frame
Remaining_ID_For_V3_Queries_df = ID_For_V3_Queries_df[ID_For_V3_Queries_df['V3_pings_pulled'] == False].copy()
if len(Remaining_ID_For_V3_Queries_df)==0:
    print("V3 pings for all trips have been found")
    sys.exit()

min_start_timestamp_processed = Remaining_ID_For_V3_Queries_df['timestamp_start_t'].min()

one_day = 24*60*60
# Begin two days before the earliest trip that still has to be pulled
# (epoch_to_datetime / date_to_epoch are helpers defined elsewhere in the notebook)
start_date= epoch_to_datetime(min_start_timestamp_processed-2*one_day)

end_date = datetime(2022, 5, 1)
eight_hours = 8*60*60

start_time = time.time()

# Find end_date_epoch_time to use at end of queries
end_date_string = end_date.strftime('%m/%d/%Y') + ' 0:00'
end_date_epoch_time = date_to_epoch(end_date_string, date_format='%m/%d/%Y %H:%M')

current_date = start_date
istep = 1
while current_date <= end_date:
    # Move past the days covered by the previous window (istep is that window's length in days)
    current_date += timedelta(days=istep)

    start_date_string = current_date.strftime('%m/%d/%Y') + ' 0:00'
    start_day_epoch_time = date_to_epoch(start_date_string, date_format='%m/%d/%Y %H:%M')

    ########################################################################################################################################################
    # DURING DEBUGGING A LOT OF VARIATIONS WERE TRIED TO FIGURE OUT HOW TO STRATEGICALLY PULL A LIMITED SET OF DATA
    # The final options that worked were
    # nIDs_per_pull=1000
    # n_max_days = 30
    # n_max_day_x_ids = 8000
    ########################################################################################################################################################
    # nIDs_per_pull=100 took 61.3 minutes for a single pull
    # nIDs_per_pull=500 crashed. It appears that the data frame was too large. Rather than reducing this number, I reduced the maximum number of days that can be included
    # nIDs_per_pull=500, n_max_days = 30: 39.8 min to process 64 trips
    # nIDs_per_pull=500, n_max_days = 30: 46.5 min to process 188 trips
    # nIDs_per_pull=500, n_max_days = 30: 56.9 min to process 431 trips
    # nIDs_per_pull=500, n_max_days = 30: 37.3 min to process 652 trips
    # nIDs_per_pull=1000, n_max_days = 30:35.2 min to process 1151 trips
    # nIDs_per_pull=2000, n_max_days = 30: CRASHED too much
    # nIDs_per_pull=2000, n_max_days = 30:limit of 20,000 for IDs*days CRASHED 
    # nIDs_per_pull= 1000 , n_max_days = 30 ID count: 848 Trip Count: 939
    # nIDs_per_pull=2000, n_max_days = 30:limit of 15,000 for IDs*days CRASHED 
    # nIDs_per_pull= 1000 n_max_days = 30, n_max_day_x_ids = 10000 22.2 min to process 780 trips
    # nIDs_per_pull: 1000 n_max_days: 30 n_max_day_x_ids: 10000 39.0 min to process 803 trips
    # nIDs_per_pull: 1000 n_max_days: 30 n_max_day_x_ids: 10000 trips: 968 days x ids: 10440  CRASHED
    ########################################################################################################################################################

    nIDs_per_pull=1000
    n_max_days =  30
    n_max_day_x_ids = 8000
    nIDs = 0
    start_time = time.time()

    # Grow the window one day at a time until it holds nIDs_per_pull distinct ids or one of
    # the limits below is hit. istep is incremented before use, so the first window tried is two days.
    istep = 1
    while nIDs < nIDs_per_pull:
        istep = istep+1
        end_day_epoch_time = start_day_epoch_time +istep*24*60*60
        # Only trips that start and end inside the window are selected
        selected_trips_df = Remaining_ID_For_V3_Queries_df[(Remaining_ID_For_V3_Queries_df['timestamp_start_t'] >= start_day_epoch_time) & 
                                                 (Remaining_ID_For_V3_Queries_df['timestamp_end_t'] <= end_day_epoch_time)]

        unique_cuebiq_ids = selected_trips_df['cuebiq_id'].unique()
        nIDs = len(unique_cuebiq_ids)
        # Each limit forces nIDs above the target so that the loop stops
        if end_day_epoch_time>end_date_epoch_time:
            nIDs = nIDs_per_pull+1
        if istep>n_max_days:  # terminate if there are more than n_max_days days between start and end
            nIDs = nIDs_per_pull+1
        if istep*(len(unique_cuebiq_ids))>n_max_day_x_ids:  # terminate if ids* days > n_max_day_x_ids
            nIDs = nIDs_per_pull+1

    if len(selected_trips_df) == 0:
        print("No trips were found in the ", istep, " days between " , start_day_epoch_time, " and ", end_day_epoch_time)
        # Nothing to query or log for this window
        continue

    print(len(selected_trips_df), " trips were found in the ", istep, " days between " , start_day_epoch_time, " and ", end_day_epoch_time)

    # Narrow the queried dates to the trips that were selected, padded by two days on each side
    start_time_epoch_time_selected = selected_trips_df['timestamp_start_t'].min()
    end_time_epoch_time_selected = selected_trips_df['timestamp_end_t'].max()

    start_date_query= epoch_to_datetime(start_time_epoch_time_selected-2*one_day)
    end_date_query= epoch_to_datetime(end_time_epoch_time_selected+2*one_day)

    dayi_Q = start_date_query.strftime('%Y%m%d')
    end_window_Q = end_date_query.strftime('%Y%m%d')

    # Convert id list to a tuple that can be used in a SQL query
    cuebiq_id_df = pd.DataFrame({'cuebiq_id': unique_cuebiq_ids})
    cuebiq_id_list = cuebiq_id_df['cuebiq_id'].tolist()
    if len(cuebiq_id_list) == 1:
        cuebiq_id_tuple = f"({cuebiq_id_list[0]})"  # Single element, still needs parentheses
    else:
        cuebiq_id_tuple = str(tuple(cuebiq_id_list))  # Multiple elements, convert list to tuple string

    current_time = datetime.now(local_timezone)
    formatted_time = current_time.strftime("%H:%M:%S")
    # Write output for this query
    text_to_write = (
        f"\n\nStarting at{formatted_time} \nids: from {dayi_Q} to {end_window_Q}\n"
        f"# nIDs_per_pull: {nIDs_per_pull}  n_max_days: {n_max_days} n_max_day_x_ids: {n_max_day_x_ids}\n"
        f"# IDs_per_pull: {len(unique_cuebiq_ids)}         days: {istep},      day_x_ids: {istep * len(unique_cuebiq_ids)}  ,   trips: {len(selected_trips_df)}"
    )

    print(text_to_write)
    with open(Log_filename, 'a') as file:
        file.write(text_to_write)

    ###########  Start Query for this set of ID's and dates
    specific_cuebiq_id_query = f"""
                    SELECT *
                    FROM paas_cda_pe_v3.device_location_uplevelled
                    WHERE cuebiq_id IN {cuebiq_id_tuple}
                    AND country_code = 'US'
                    AND processing_date BETWEEN {dayi_Q} AND {end_window_Q}
                    """
    max_retries = 5
    retry_count = 0
    query_succeeded = False
    while retry_count < max_retries:
        try:
            specific_cuebiq_id_data = sql_engine.read_sql(specific_cuebiq_id_query)
            query_succeeded = True
            current_time = datetime.now(local_timezone)
            formatted_time = current_time.strftime("%H:%M:%S")
            print(formatted_time, "Finished Query")
            break

        except Exception as e:
            # Handle transient failures such as a 502 Bad Gateway by retrying
            print(f"An error occurred: {e}")
            retry_count += 1
            time.sleep(10)  # Adjust the delay time as needed

    if not query_succeeded:
        # Without fresh results the trips must not be marked as pulled (otherwise results left
        # over from an earlier window would be used); they stay flagged False so that a later
        # run or the clean-up cell can retry them.
        failure_text = (
            f"\nQuery failed after {max_retries} attempts for {dayi_Q} to {end_window_Q}; "
            f"{len(selected_trips_df)} trips were left unprocessed.\n"
        )
        print(failure_text)
        with open(Log_filename, 'a') as file:
            file.write(failure_text)
        continue

    ############  Retain from the query results only the pings that are relevant for the trips in this group   ########
    npings = 0
    columns_to_keep = ['cuebiq_id', 'event_timestamp', 'lat', 'lng']
    for index, row in selected_trips_df.iterrows():
        cuebiq_id = row['cuebiq_id']
        Trip_number = row['Trip_number']
        timestamp_start_t = row['timestamp_start_t']
        timestamp_end_t= row['timestamp_end_t']

        # Pings of this device from eight hours before the trip start to eight hours after its end
        this_trip_pings =  specific_cuebiq_id_data[(specific_cuebiq_id_data['cuebiq_id'] == cuebiq_id) &
                                            (specific_cuebiq_id_data['event_timestamp'] >= timestamp_start_t - eight_hours) &
                                          (specific_cuebiq_id_data['event_timestamp'] <= timestamp_end_t + eight_hours)]
        this_trip_pings = this_trip_pings[columns_to_keep]

        # Append to the shared ping file; the header is written only when the file is created
        this_trip_pings.to_csv(Pings_V3_Before_After_filename, mode='a', index=False, header=not os.path.exists(Pings_V3_Before_After_filename))

        # Flag the trip as pulled in both the working frame and the frame that is saved
        Remaining_ID_For_V3_Queries_df.loc[(Remaining_ID_For_V3_Queries_df['cuebiq_id'] == cuebiq_id) & (Remaining_ID_For_V3_Queries_df['Trip_number'] == Trip_number), 'V3_pings_pulled'] = True
        ID_For_V3_Queries_df.loc[(ID_For_V3_Queries_df['cuebiq_id'] == cuebiq_id) & (ID_For_V3_Queries_df['Trip_number'] == Trip_number), 'V3_pings_pulled'] = True
        npings = npings + len(this_trip_pings)

    # After looping over all trips for this query, save the updated ID_For_V3_Queries_df
    print(npings, " pings were found in this query")
    ID_For_V3_Queries_df.to_csv(ID_For_V3_Queries_filename, index=False)

    count_false_pings_pulled = (ID_For_V3_Queries_df['V3_pings_pulled'] == False).sum()
    count_true_pings_pulled = (ID_For_V3_Queries_df['V3_pings_pulled'] == True).sum()

    end_time = time.time()
    elapsed_time = round((end_time - start_time)/60,1)
    current_time = datetime.now(local_timezone)
    formatted_time = current_time.strftime("%H:%M:%S")
    text_to_write = (
        f"\nCompleted at {formatted_time},   {elapsed_time} minutes to process\n"
        f"# nIDs_per_pull: {nIDs_per_pull}  n_max_days: {n_max_days} n_max_day_x_ids: {n_max_day_x_ids}\n"
        f"#  IDs_per_pull: {len(unique_cuebiq_ids)}         days: {istep},      day_x_ids: {istep * len(unique_cuebiq_ids)}  , trips: {len(selected_trips_df)} pings: {npings}\n"
        f"Covered dates from {dayi_Q}, to, {end_window_Q}\n{count_true_pings_pulled} trips have been processed. {count_false_pings_pulled} trips remain.\n \n"
        )
    # The completion entry is written only for windows that were actually queried
    print(text_to_write)
    with open(Log_filename, 'a') as file:
        file.write(text_to_write)

print("done")

### Finish up the few remaining rows that were not pulled in the first approach
Some rows were missed between iterations

In [None]:
#### With found start date
# Clean-up pass: pull V3 pings for the trips still flagged V3_pings_pulled == False.
# The remaining trips are sorted by start time and split into groups wherever two consecutive
# starts are at least day_gap days apart; one SQL query is issued per group.
if os.path.exists(ID_For_V3_Queries_filename):
    ID_For_V3_Queries_df = pd.read_csv(ID_For_V3_Queries_filename)
    columns_to_retain = ['cuebiq_id', 'Trip_number', 'timestamp_start_t', 'timestamp_end_t', 'V3_pings_pulled']
    ID_For_V3_Queries_df = ID_For_V3_Queries_df[columns_to_retain]

else:  # The tracking file should exist by the time this cell runs
    print("something is seriously wrong")
    sys.exit()

# filter to get the remaining trips to pull
remaining_trips_df = ID_For_V3_Queries_df[ID_For_V3_Queries_df['V3_pings_pulled']==False]
remaining_trips_df = remaining_trips_df.sort_values(by='timestamp_start_t').reset_index(drop=True)

day_gap = 10
remaining_trips_df['timestamp_start_t_dt'] = pd.to_datetime(remaining_trips_df['timestamp_start_t'], unit='s')
# Whole days between consecutive trip starts (0 for the first trip)
remaining_trips_df['days_diff'] = remaining_trips_df['timestamp_start_t_dt'].diff().dt.days.fillna(0)
# A new group starts at every gap of at least day_gap days
remaining_trips_df['group_num'] = (remaining_trips_df['days_diff'] >= day_gap).cumsum() + 1

eight_hours = 8*60*60

# groupby only yields non-empty groups, so each selected_trips_df has at least one trip
for group_num, selected_trips_df in remaining_trips_df.groupby('group_num'):
    start_time = time.time()

    # Create the days for use in the processing date query
    current_date = selected_trips_df['timestamp_start_t_dt'].min()
    end_date = selected_trips_df['timestamp_start_t_dt'].max()
    istep = (end_date - current_date) + timedelta(days=2)  # a Timedelta, not a day count

    # Query from one day before the first trip start to four days after the last trip start
    start_date = current_date - timedelta(days=1)
    end_date_q = current_date + istep + timedelta(days=2)
    dayi_Q = start_date.strftime('%Y%m%d')
    end_window_Q = end_date_q.strftime('%Y%m%d')

    # Convert id list to a form that can be used in a SQL query
    unique_cuebiq_ids = selected_trips_df['cuebiq_id'].unique()
    cuebiq_id_df = pd.DataFrame({'cuebiq_id': unique_cuebiq_ids})
    cuebiq_id_list = cuebiq_id_df['cuebiq_id'].tolist()

    current_time = datetime.now(local_timezone)
    formatted_time = current_time.strftime("%H:%M:%S")
    # Write output for this query
    text_to_write = (
        f"\n\nStarting Group Number {group_num} at {formatted_time} \nids: from {dayi_Q} to {end_window_Q}\n"
        f"# IDs_per_pull: {len(unique_cuebiq_ids)}         days: {istep},      day_x_ids: {istep * len(unique_cuebiq_ids)}  ,   trips: {len(selected_trips_df)}"
    )

    print(text_to_write)
    with open(Log_filename, 'a') as file:
        file.write(text_to_write)

    ###########  Start Query for this set of ID's and dates
    # A single id uses "=", because a one-element Python tuple renders as "(x,)", which is not valid SQL
    if len(cuebiq_id_list) == 1:
        cuebiq_id_value = cuebiq_id_list[0]

        specific_cuebiq_id_query = f"""
                    SELECT *
                    FROM paas_cda_pe_v3.device_location_uplevelled
                    WHERE cuebiq_id = {cuebiq_id_value}
                    AND country_code = 'US'
                    AND processing_date BETWEEN {dayi_Q} AND {end_window_Q}
                    """
    else:
        cuebiq_id_tuple = tuple(cuebiq_id_list)

        specific_cuebiq_id_query = f"""
                    SELECT *
                    FROM paas_cda_pe_v3.device_location_uplevelled
                    WHERE cuebiq_id IN {cuebiq_id_tuple}
                    AND country_code = 'US'
                    AND processing_date BETWEEN {dayi_Q} AND {end_window_Q}
                    """

    max_retries = 5
    retry_count = 0
    query_succeeded = False
    while retry_count < max_retries:
        try:
            specific_cuebiq_id_data = sql_engine.read_sql(specific_cuebiq_id_query)
            query_succeeded = True
            current_time = datetime.now(local_timezone)
            formatted_time = current_time.strftime("%H:%M:%S")
            print(formatted_time, "Finished Query")
            break

        except Exception as e:
            # Handle transient failures such as a 502 Bad Gateway by retrying
            print(f"An error occurred: {e}")
            retry_count += 1
            time.sleep(10)  # Adjust the delay time as needed

    if not query_succeeded:
        # Without fresh results the trips must not be marked as pulled (otherwise results left
        # over from an earlier group would be used); they stay flagged False for a later run.
        failure_text = (
            f"\nQuery failed after {max_retries} attempts for group {group_num} ({dayi_Q} to {end_window_Q}); "
            f"{len(selected_trips_df)} trips were left unprocessed.\n"
        )
        print(failure_text)
        with open(Log_filename, 'a') as file:
            file.write(failure_text)
        continue

    ############  Retain from the query results only the pings that are relevant for the trips in this group   ########
    npings = 0
    columns_to_keep = ['cuebiq_id', 'event_timestamp', 'lat', 'lng']
    for index, row in selected_trips_df.iterrows():
        cuebiq_id = row['cuebiq_id']
        Trip_number = row['Trip_number']
        timestamp_start_t = row['timestamp_start_t']
        timestamp_end_t= row['timestamp_end_t']

        # Pings of this device from eight hours before the trip start to eight hours after its end
        this_trip_pings =  specific_cuebiq_id_data[(specific_cuebiq_id_data['cuebiq_id'] == cuebiq_id) &
                                            (specific_cuebiq_id_data['event_timestamp'] >= timestamp_start_t - eight_hours) &
                                          (specific_cuebiq_id_data['event_timestamp'] <= timestamp_end_t + eight_hours)]
        this_trip_pings = this_trip_pings[columns_to_keep]

        # Append to the shared ping file; the header is written only when the file is created
        this_trip_pings.to_csv(Pings_V3_Before_After_filename, mode='a', index=False, header=not os.path.exists(Pings_V3_Before_After_filename))

        ID_For_V3_Queries_df.loc[(ID_For_V3_Queries_df['cuebiq_id'] == cuebiq_id) & (ID_For_V3_Queries_df['Trip_number'] == Trip_number), 'V3_pings_pulled'] = True
        npings = npings + len(this_trip_pings)

    # After looping over all trips for this query, save the updated ID_For_V3_Queries_df,
    # retaining only the wanted columns to avoid accumulating index columns in the file
    columns_to_retain = ['cuebiq_id', 'Trip_number', 'timestamp_start_t', 'timestamp_end_t', 'V3_pings_pulled']
    ID_For_V3_Queries_df = ID_For_V3_Queries_df[columns_to_retain]
    ID_For_V3_Queries_df.to_csv(ID_For_V3_Queries_filename, index=False)

    count_false_pings_pulled = (ID_For_V3_Queries_df['V3_pings_pulled'] == False).sum()
    count_true_pings_pulled = (ID_For_V3_Queries_df['V3_pings_pulled'] == True).sum()

    end_time = time.time()
    elapsed_time = round((end_time - start_time)/60,1)
    current_time = datetime.now(local_timezone)
    formatted_time = current_time.strftime("%H:%M:%S")
    text_to_write = (
        f"\nCompleted at {formatted_time},   {elapsed_time} minutes to process\n"
        f"#  IDs_per_pull: {len(unique_cuebiq_ids)}         days: {istep},      day_x_ids: {istep * len(unique_cuebiq_ids)}  , trips: {len(selected_trips_df)} pings: {npings}\n"
        f"Covered dates from {dayi_Q}, to, {end_window_Q}\n{count_true_pings_pulled} trips have been processed. {count_false_pings_pulled} trips remain.\n \n"
        )
    print(text_to_write)
    with open(Log_filename, 'a') as file:
        file.write(text_to_write)


# Apply additional non-ML criteria that use data from the Cuebiq Device table (V3) that are not within the Gulf polygon

### First load the files

In [None]:
# Rec_Indicators_df =pd.read_csv(Rec_Indicators_Step1_filename, index=False)

# Both ping tables are reduced to the same four columns so they can be concatenated later
ping_columns_to_keep = ['cuebiq_id', 'lat', 'lng', 'event_timestamp']

# V3 (Device table) pings: only read from disk when the frame is not already in the kernel
if 'Pings_V3_Before_After_df' not in locals():
    Pings_V3_Before_After_df = pd.read_csv(Pings_V3_Before_After_filename)

# OurTable pings: same lazy-load guard
if 'AllPings_OurTable_df' not in locals():
    AllPings_OurTable_df = pd.read_csv(Pings_OurTable_Gulf_filename)

# Make the ping df's consistent
Pings_V3_Before_After_df = Pings_V3_Before_After_df.loc[:, ping_columns_to_keep]
AllPings_OurTable_df = AllPings_OurTable_df.loc[:, ping_columns_to_keep]

### Now loop through all the trips and calculate the ping-level indicators

In [None]:
# Load the trips that passed the first set of non-ML indicators
indicators_df = pd.read_csv(Rec_Indicators_Step1_filename)

# Resume support: V3 indicators are appended to V3_Indicators_filename one trip at a time,
# so any (cuebiq_id, Trip_number) already present in that file is skipped on a re-run.
if os.path.exists(V3_Indicators_filename):
    # Load the existing file
    V3_Indicators_df = pd.read_csv(V3_Indicators_filename)
    existing_ids = set(zip(V3_Indicators_df['cuebiq_id'], V3_Indicators_df['Trip_number']))

    # Membership test on the trip key; built from a plain zip instead of a row-wise
    # DataFrame.apply, which is much slower and misbehaves on an empty frame.
    already_done = pd.Series(
        [trip_key in existing_ids
         for trip_key in zip(indicators_df['cuebiq_id'], indicators_df['Trip_number'])],
        index=indicators_df.index,
        dtype=bool,
    )
    remaining_indicators_df = indicators_df[~already_done]
    print("V3 indicators have already been created for ", len(V3_Indicators_df), "trips.")
    print("There are ", len(indicators_df), " valid trips for which indicators need to be created")
    print("There are ", len(remaining_indicators_df), " trips for which V3 indicators need to be created.")

else:
    remaining_indicators_df = indicators_df
    print("V3 indicators do not exist. Need to start from scratch")

# Width (in seconds) of the before/after window examined around each trip
eight_hours = 8*60*60

# Loop through all of the IDs in the remaining indicators and compute the V3 indicators
# for every trip of that ID.
unique_cuebiq_ids = remaining_indicators_df['cuebiq_id'].unique()

for cuebiq_id in tqdm(unique_cuebiq_ids):
    # Subset each large table once per device rather than once per trip
    V3_indicators_this_id_df = remaining_indicators_df[remaining_indicators_df['cuebiq_id'] == cuebiq_id]
    V3_pings_this_id_df = Pings_V3_Before_After_df[Pings_V3_Before_After_df['cuebiq_id'] == cuebiq_id]
    OurTable_pings_this_id_df = AllPings_OurTable_df[AllPings_OurTable_df['cuebiq_id'] == cuebiq_id]

    for index, row in V3_indicators_this_id_df.iterrows():
        # Re-read the id from the row so the value written to the CSV is exactly what it was before
        cuebiq_id = row['cuebiq_id']
        Trip_number = row['Trip_number']
        timestamp_start_t = row['timestamp_start_t']
        timestamp_end_t = row['timestamp_end_t']

        # V3 pings from eight hours before the trip start to eight hours after the trip end
        pings_from_v3_df = V3_pings_this_id_df[(V3_pings_this_id_df['event_timestamp'] >= timestamp_start_t-eight_hours) &
                                                    (V3_pings_this_id_df['event_timestamp'] <= timestamp_end_t+eight_hours)]

        # NOTE(review): EliminateErrantPingsSpeed is defined elsewhere; the second argument is
        # presumably a speed cap used to drop implausible pings -- confirm units there.
        pings_from_v3_df= EliminateErrantPingsSpeed(pings_from_v3_df, 90)

        ############## Before  ##############
        onetrip_before_pings = pings_from_v3_df[
            (pings_from_v3_df['event_timestamp'] >= (timestamp_start_t - eight_hours)) &
            (pings_from_v3_df['event_timestamp'] <= timestamp_start_t)
            ]
        npings_before = len(onetrip_before_pings)

        #################### After  ##############
        onetrip_after_pings = pings_from_v3_df[
            (pings_from_v3_df['event_timestamp'] <= (timestamp_end_t + eight_hours)) &
            (pings_from_v3_df['event_timestamp'] >= timestamp_end_t)
            ]
        npings_after = len(onetrip_after_pings)

        ##### Check to see if any of the before & after pings were in one of the industrial polygons
        before_after_pings  = pd.concat([onetrip_before_pings, onetrip_after_pings], axis=0)
        industrial_TF = any_point_in_polygons(before_after_pings, Industrial_polygons)

        #################### V3 During the Trip ##############
        # Strict inequalities: pings exactly at the trip start/end are counted as before/after above
        onetrip_V3_during_pings = pings_from_v3_df[
            (pings_from_v3_df['event_timestamp'] > (timestamp_start_t)) &
            (pings_from_v3_df['event_timestamp'] < (timestamp_end_t))
            ]

        onetrip_OurTable_pings = OurTable_pings_this_id_df[
            (OurTable_pings_this_id_df['event_timestamp'] > (timestamp_start_t)) &
            (OurTable_pings_this_id_df['event_timestamp'] < (timestamp_end_t))
            ]

        onetrip_pings = pd.concat([onetrip_OurTable_pings, onetrip_V3_during_pings], axis=0)
        onetrip_pings = EliminateErrantPingsSpeed(onetrip_pings, 60)

        # Identify pings that are in the Gulf and NOT on the islands
        onetrip_pings['is_in_Gulf_waters'] = onetrip_pings.apply(is_point_in_Gulf_not_Islands, axis=1)
        onetrip_pings['prev_is_in_Gulf_waters'] = onetrip_pings['is_in_Gulf_waters'].shift(1)
        # A ping only counts as "outside" when it and the ping before it are both outside
        onetrip_pings['consecutive_outside'] = (onetrip_pings['is_in_Gulf_waters'] == False) & (onetrip_pings['prev_is_in_Gulf_waters'] == False)

        npings_during_outside_gulf = onetrip_pings['consecutive_outside'].sum()
        # NOTE(review): 'time_diff_minutes_from_previous' and 'dist_fwd' are not created in this
        # cell; presumably EliminateErrantPingsSpeed adds them -- confirm.
        minutes_during_outside_gulf = onetrip_pings.loc[onetrip_pings['consecutive_outside'], 'time_diff_minutes_from_previous'].sum()
        km_during_outside_gulf = onetrip_pings.loc[onetrip_pings['consecutive_outside'], 'dist_fwd'].sum()
        # 37.2823 converts km per minute to miles per hour; the tiny constant avoids division by zero
        avg_mph_during_outside_gulf = (km_during_outside_gulf/(0.000001 + minutes_during_outside_gulf))*37.2823
        trip_minutes = (timestamp_end_t- timestamp_start_t)/60
        pct_during_outside_gulf = minutes_during_outside_gulf/trip_minutes

        new_row = pd.DataFrame([{
            'cuebiq_id': cuebiq_id,
            'Trip_number': Trip_number,
            'timestamp_start_t': timestamp_start_t,
            'timestamp_end_t': timestamp_end_t,
            'npings_before': npings_before,
            'npings_after': npings_after,
            'npings_during_outside_gulf': npings_during_outside_gulf,
            'minutes_during_outside_gulf': minutes_during_outside_gulf,
            'pct_during_outside_gulf': pct_during_outside_gulf,
            'avg_mph_during_outside_gulf': avg_mph_during_outside_gulf,
            'industrial_TF': industrial_TF
            }])
        # Append immediately so an interrupted run can resume; the header is written only once
        new_row.to_csv(V3_Indicators_filename, mode='a', index=False, header=not os.path.exists(V3_Indicators_filename))




## Now, use the ping-level indicators to identify recreational trips

In [None]:
def Ping_Indicator_filtering(Indicators_df):
    """Apply the V3 ping-level exclusion criteria and return the surviving trips.

    The criteria are applied in sequence, printing the remaining row count after each:
      1. keep trips with at least one ping both before and after the trip,
      2. keep trips whose 'pct_during_outside_gulf' is below 0.1,
      3. keep trips whose 'industrial_TF' equals False.

    Parameters
    ----------
    Indicators_df : pandas.DataFrame
        Must contain 'Trip_duration_hrs', 'npings_before', 'npings_after',
        'pct_during_outside_gulf' and 'industrial_TF'.

    Returns
    -------
    pandas.DataFrame
        The rows of ``Indicators_df`` that pass all three criteria (original index kept).
    """
    # THE CRITERIA BELOW MAKE USE OF V3 PINGS -- THESE NEED TO BE PULLED SEPARATELY
    n_long_trips = Indicators_df['Trip_duration_hrs'].gt(10).sum()
    print(len(Indicators_df), "before ping-level filtering. There were",  n_long_trips, "trips more than 10 hours")

    # Criterion 1: pings must exist on both sides of the trip
    has_pings_both_sides = Indicators_df['npings_before'].gt(0) & Indicators_df['npings_after'].gt(0)
    with_context_df = Indicators_df[has_pings_both_sides]
    print(len(with_context_df), "after eliminating trips that don't have pings both before & after")

    # Criterion 2: limited share of trip time spent outside the Gulf polygon
    pct_threshold = 0.1
    mostly_inside = with_context_df['pct_during_outside_gulf'].lt(pct_threshold)
    inside_gulf_df = with_context_df[mostly_inside]
    print(f"{len(inside_gulf_df)} after dropping trips more than {pct_threshold*100}% of time during trip but outside the Gulf WKT. This reults in many lost trips")

    # Criterion 3: no before/after ping fell inside an industrial polygon
    not_industrial = inside_gulf_df['industrial_TF'].eq(False)
    non_industrial_df = inside_gulf_df[not_industrial]
    print(len(non_industrial_df), "after dropping trips that went into one of the identified industrial polygons")

    return non_industrial_df


In [None]:
# Attach the V3 / ping-level indicators to the step-1 indicators, filter, and save
indicators_df = pd.read_csv(Rec_Indicators_Step1_filename)
V3_Indicators_df = pd.read_csv(V3_Indicators_filename)

# Outer join on the full trip key; the '_merge' column records which side each row came from
trip_key_columns = ['cuebiq_id', 'Trip_number', 'timestamp_start_t', 'timestamp_end_t']
merged_df = indicators_df.merge(
    V3_Indicators_df,
    how='outer',  # Use 'inner' if you want only the intersection
    on=trip_key_columns,
    indicator=True,
)

merge_counts = merged_df['_merge'].value_counts()
n_left_only = merge_counts.get('left_only', 0)
n_right_only = merge_counts.get('right_only', 0)
n_both = merge_counts.get('both', 0)

# Check to make sure that V3 indicators have been found for all the valid trips in indicators_df
print(f"# Only indicators_df =: {n_left_only} , # only V3_Indicators_df = {n_right_only} , # both =  {n_both}")

if n_left_only > 0:
    print("There are selected trip from indicators_df for which V3 indicators have not been found. This should be fixed before proceeding. If you want to override this, you this if statement must be commented out.")
    sys.exit()

# Keep only the trips present in both tables
matched_rows = merged_df['_merge'] == 'both'
rec_indicators_with_V3_df = merged_df.loc[matched_rows].copy()

# Trip duration in hours from the start/end timestamps (seconds)
trip_seconds = rec_indicators_with_V3_df['timestamp_end_t'] - rec_indicators_with_V3_df['timestamp_start_t']
rec_indicators_with_V3_df['Trip_duration_hrs'] = trip_seconds/(60*60)

Rec_indicators_with_V3_df = Ping_Indicator_filtering(rec_indicators_with_V3_df)
Rec_indicators_with_V3_df.to_csv(Rec_indicators_with_V3_filename, index=False)


# Create the indicators used for identifying fully-tracked trips

In [None]:
# Trips that survived the V3 / ping-level filtering
Rec_indicators_with_V3_df = pd.read_csv(Rec_indicators_with_V3_filename)

# Trip keys of every trip that currently needs disappearance indicators
current_trip_keys = list(zip(Rec_indicators_with_V3_df['cuebiq_id'], Rec_indicators_with_V3_df['Trip_number']))

if os.path.exists(DisappearanceIndicators_filename):
    # Load the existing file
    DisappearanceIndicators_df = pd.read_csv(DisappearanceIndicators_filename)
    print("DisappearanceIndicators have already been created for ", len(DisappearanceIndicators_df), "trips.")
    existing_trips = set(zip(DisappearanceIndicators_df['cuebiq_id'], DisappearanceIndicators_df['Trip_number']))

    # Trips that still need indicators: those not yet present in the existing file
    already_done = pd.Series(
        [trip_key in existing_trips for trip_key in current_trip_keys],
        index=Rec_indicators_with_V3_df.index,
        dtype=bool,
    )
    remaining_indicators_df = Rec_indicators_with_V3_df[~already_done]

    # Drop previously computed rows whose trip no longer appears in Rec_indicators_with_V3_df
    new_trips = set(current_trip_keys)
    still_valid = pd.Series(
        [trip_key in new_trips
         for trip_key in zip(DisappearanceIndicators_df['cuebiq_id'], DisappearanceIndicators_df['Trip_number'])],
        index=DisappearanceIndicators_df.index,
        dtype=bool,
    )
    DisappearanceIndicators_df = DisappearanceIndicators_df[still_valid]
    print("DisappearanceIndicators for ", len(DisappearanceIndicators_df), "trips are retained")

    # Re-save DisappearanceIndicators_df, replacing the original one reflecting any cleaning that has occurred
    DisappearanceIndicators_df.to_csv(DisappearanceIndicators_filename, index=False)
else:
    # Nothing computed yet. Do NOT touch DisappearanceIndicators_df or the file here:
    # the variable is undefined on a fresh kernel (or stale from an earlier run), and the
    # loop below relies on the file being absent to know it must write the CSV header.
    remaining_indicators_df = Rec_indicators_with_V3_df
    print("DisappearanceIndicators for ", 0, "trips are retained")

print("DisappearanceIndicators for ", len(remaining_indicators_df), " more trips need to be created")
    
# ##############################  DEBUGGING #########################################
# row_num = 2200
# rec_indicators_with_V3_df.reset_index(drop=True, inplace=True)
# cuebiq_id = rec_indicators_with_V3_df['cuebiq_id'].iloc[row_num]
# Trip_number = rec_indicators_with_V3_df['Trip_number'].iloc[row_num]
# timestamp_start_t = rec_indicators_with_V3_df['timestamp_start_t'].iloc[row_num]
# timestamp_end_t = rec_indicators_with_V3_df['timestamp_end_t'].iloc[row_num]
# ############################  DEBUGGING #########################################


# For every trip that still needs them, compute the indicators describing the longest gap
# between consecutive pings (duration, speeds around it, distances from the trip's first and
# last ping) and append one row per trip to DisappearanceIndicators_filename.
for index, row in tqdm(remaining_indicators_df.iterrows(), total=remaining_indicators_df.shape[0], leave=True):
    # Access row data using row['column_name']
    cuebiq_id = row['cuebiq_id']
    Trip_number = row['Trip_number']
    timestamp_start_t = row['timestamp_start_t']
    timestamp_end_t = row['timestamp_end_t']
    # OurTable pings of this device between the trip start and end (inclusive)
    ThisTrip_df = AllPings_OurTable_df[(AllPings_OurTable_df['cuebiq_id'] == cuebiq_id) &
                                       (AllPings_OurTable_df['event_timestamp'] >= timestamp_start_t) &
                                       (AllPings_OurTable_df['event_timestamp'] <= timestamp_end_t)]

    # NOTE(review): EliminateErrantPingsSpeed is defined elsewhere. The time_diff_*, dist_* and
    # ping_speed_* columns used below are not created in this cell, so presumably it adds them -- confirm.
    ThisTrip_df = EliminateErrantPingsSpeed(ThisTrip_df, 60)
    # Reset to a 0..n-1 index so the labels returned by idxmax() can also be used with .iloc
    ThisTrip_df.reset_index(drop=True, inplace=True)

    Trip_duration_hrs = ThisTrip_df['time_diff_minutes_to_next'].sum()/60

    # find iBefore and iAfter, the indices of the pings before & after the longest break in the trip
    # NOTE(review): idxmax() raises on an empty frame, so a trip with no remaining pings stops the loop.
    maxdiff_bkwd = ThisTrip_df['time_diff_minutes_to_next'].max()
    ThisTrip_df['Max_bkwd_i'] = 1*(maxdiff_bkwd == ThisTrip_df['time_diff_minutes_to_next'])
    iBefore = ThisTrip_df['time_diff_minutes_to_next'].idxmax()

    maxdiff_fwd  = ThisTrip_df['time_diff_minutes_from_previous'].max()
    iAfter = ThisTrip_df['time_diff_minutes_from_previous'].idxmax()
    ThisTrip_df['Max_fwd_i'] = 1*(maxdiff_fwd == ThisTrip_df['time_diff_minutes_from_previous'])

    # Longest gap in hours, and the 'dist_bkwd' value recorded at the ping that precedes it
    max_hrs_gap = maxdiff_bkwd/60
    dist_during_gap = ThisTrip_df['dist_bkwd'].iloc[iBefore]

    # These lines were used to check the speeds before and after
    # ThisTrip_df['cum_dist'] = ThisTrip_df['dist_fwd'].cumsum()
    # ThisTrip_df['cum_time'] = ThisTrip_df['time_diff_minutes_from_previous'].cumsum()
    # speedbefore1 = ((ThisTrip_df['cum_dist'].iloc[iBefore] - ThisTrip_df['cum_dist'].iloc[iBefore-1]) / 
    #                (ThisTrip_df['cum_time'].iloc[iBefore] - ThisTrip_df['cum_time'].iloc[iBefore-1]))*37.2823

    # speedafter1 = ((ThisTrip_df['cum_dist'].iloc[iAfter+1] - ThisTrip_df['cum_dist'].iloc[iAfter]) / 
    #                (ThisTrip_df['cum_time'].iloc[iAfter+1] - ThisTrip_df['cum_time'].iloc[iAfter]))*37.2823

    # speedduring1 = ((ThisTrip_df['cum_dist'].iloc[iAfter] - ThisTrip_df['cum_dist'].iloc[iBefore]) / 
    #                (ThisTrip_df['cum_time'].iloc[iAfter] - ThisTrip_df['cum_time'].iloc[iBefore]))*37.2823

    # Calculate the speeds before, during and after the longest break in the series of pings
    # 37.2823 is the km-per-minute to miles-per-hour factor (60 * 0.621371)
    # NOTE(review): which ping_speed_* column/row corresponds to "before", "after" and "during"
    # depends on how the helper defines fwd/bkwd -- verify against EliminateErrantPingsSpeed.
    speedbefore = (ThisTrip_df['ping_speed_fwd'].iloc[iBefore])*37.2823
    speedafter = (ThisTrip_df['ping_speed_bkwd'].iloc[iAfter])*37.2823
    speedduring = (ThisTrip_df['ping_speed_fwd'].iloc[iAfter])*37.2823

    # Distance of every ping from the first ping (origin) and from the last ping (end) of the trip
    ThisTrip_df['dist_from_origin'] = ThisTrip_df.apply(
        lambda row: haversine(row['lat'], row['lng'], ThisTrip_df['lat'].iloc[0], ThisTrip_df['lng'].iloc[0]), axis=1
    )
    ThisTrip_df['dist_from_end'] = ThisTrip_df.apply(
        lambda row: haversine(row['lat'], row['lng'], ThisTrip_df['lat'].iloc[-1], ThisTrip_df['lng'].iloc[-1]), axis=1
    )

    MaxDistFromOrigin = ThisTrip_df['dist_from_origin'].max()
    MaxDistFromEnd = ThisTrip_df['dist_from_end'].max()

    # Distances from origin / end at the two pings that bracket the longest gap
    DistanceFromOriginBefore = ThisTrip_df['dist_from_origin'].loc[iBefore]
    DistanceFromOriginAfter = ThisTrip_df['dist_from_origin'].loc[iAfter]

    # DistanceFromEndBefore = haversine(ThisTrip_df['lat'].iloc[iBefore], ThisTrip_df['lng'].iloc[iBefore], ThisTrip_df['lat'].iloc[-1], ThisTrip_df['lng'].iloc[-1])
    # DistanceFromEndAfter = haversine(ThisTrip_df['lat'].iloc[iAfter], ThisTrip_df['lng'].iloc[iAfter], ThisTrip_df['lat'].iloc[-1], ThisTrip_df['lng'].iloc[-1])
    DistanceFromEndBefore = ThisTrip_df['dist_from_end'].loc[iBefore]
    DistanceFromEndAfter = ThisTrip_df['dist_from_end'].loc[iAfter]

    # DistanceFromCoastBefore = distance_to_coast_lat_lon(ThisTrip_df['lat'].iloc[iBefore], ThisTrip_df['lng'].iloc[iBefore])
    # DistanceFromCoastAfter = distance_to_coast_lat_lon(ThisTrip_df['lat'].iloc[iBefore], ThisTrip_df['lng'].iloc[iAfter])

    # Gap-endpoint distances expressed as a fraction of the trip's maximum distance
    BeforeOverMaxEndOfTrip = DistanceFromEndBefore/MaxDistFromEnd
    AfterOverMaxEndOfTrip = DistanceFromEndAfter/MaxDistFromEnd
    BeforeOverMaxOrigin = DistanceFromOriginBefore/MaxDistFromOrigin
    AfterOverMaxOrigin = DistanceFromOriginAfter/MaxDistFromOrigin

    # Flag pings that lie farther out than the gap endpoints: distance greater than the larger
    # of the two gap-endpoint distances from the origin, minus a 0.5 tolerance
    dist_adjustment = 0.5
    max_distance_gap_pts = max(DistanceFromOriginBefore, DistanceFromOriginAfter) - dist_adjustment
    condition_origin = ThisTrip_df['dist_from_origin'] > max_distance_gap_pts
    # NOTE(review): the end-based condition reuses the origin-based threshold; only the
    # origin-based results are written to the output below.
    condition_end = ThisTrip_df['dist_from_end'] > max_distance_gap_pts

    # Exclude the rows before or after the gap
    condition_origin.iloc[iBefore] = False
    condition_origin.iloc[iAfter] = False
    condition_end.iloc[iBefore] = False
    condition_end.iloc[iAfter] = False

    # Count the flagged pings and sum the 'time_diff_minutes_from_previous' values attached to them
    pings_further_out_origin = condition_origin.sum()
    pings_further_out_end = condition_end.sum()
    min_further_out_origin = ThisTrip_df.loc[condition_origin, 'time_diff_minutes_from_previous'].sum()
    min_further_out_end = ThisTrip_df.loc[condition_end, 'time_diff_minutes_from_previous'].sum()

    data = {
            'cuebiq_id': cuebiq_id,
            'Trip_number': Trip_number,
            'Trip_duration_hrs': Trip_duration_hrs,
            'max_hrs_gap': max_hrs_gap, 
            'dist_during_gap': dist_during_gap,
            'speedbefore': speedbefore,
            'speedafter': speedafter,
            'speedduring': speedduring,
            'DistanceFromOriginBefore': DistanceFromOriginBefore,
            'DistanceFromOriginAfter': DistanceFromOriginAfter,
            'DistanceFromEndBefore': DistanceFromEndBefore,
            'DistanceFromEndAfter': DistanceFromEndAfter,
            'pings_further_out':pings_further_out_origin,
            # 'pings_further_out_end':pings_further_out_end,
            'min_further_out': min_further_out_origin,
            # 'min_further_out_end':min_further_out_end,
            'BeforeOverMaxEndOfTrip': BeforeOverMaxEndOfTrip, 
            'AfterOverMaxEndOfTrip': AfterOverMaxEndOfTrip, 
            'BeforeOverMaxOrigin': BeforeOverMaxOrigin, 
            'AfterOverMaxOrigin': AfterOverMaxOrigin
            }
    # NOTE: this rebinds DisappearanceIndicators_df to a single-row frame on every iteration;
    # the accumulated results live only in the CSV file, which is appended to here
    # (header written only when the file does not exist yet).
    DisappearanceIndicators_df = pd.DataFrame(data, index=[0])
    DisappearanceIndicators_df.to_csv(DisappearanceIndicators_filename, mode='a', header=not os.path.exists(DisappearanceIndicators_filename), index=False)


## Now apply non-ML criteria to exclude trips that are not fully tracked, 
then calculate a weight that can be used to infer representativeness of trips that do not disappear

### Custom Moving Average function that expands window if there are not enough observations

In [None]:
# FUNCTION that calculates modified moving average ensuring at least min_rows are included in each average
def custom_moving_average(df, column, time_column, initial_time_window, min_rows, window_step=0.1):
    """Centered moving average of ``column`` over ``time_column`` with an adaptive window.

    For each row, the rows whose ``time_column`` value lies within
    ``+/- window`` of that row's value are averaged. The window starts at
    ``initial_time_window`` and grows by ``window_step`` until at least
    ``min_rows`` rows are selected, or until the window exceeds the full range of
    ``time_column`` (in which case whatever rows are selected are averaged).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    column : str
        Name of the column to average.
    time_column : str
        Name of the numeric column that defines the window.
    initial_time_window : float
        Starting half-width of the window, in ``time_column`` units.
    min_rows : int
        Minimum number of rows wanted in each average.
    window_step : float, optional
        Amount added to the half-width on each expansion (default 0.1).

    Returns
    -------
    list
        One average per row of ``df``, in row order. An entry is NaN when no row
        falls inside the final window (for example when the time value is NaN).

    Raises
    ------
    ValueError
        If ``window_step`` is not positive (the window could never grow).
    """
    if window_step <= 0:
        raise ValueError("window_step must be positive")

    times = df[time_column]
    values = df[column]

    # The full time range does not change inside the loops, so compute it once.
    time_span = times.max() - times.min()
    if pd.isna(time_span):
        # Empty or all-NaN time column: without this, "window > NaN" is never true
        # and the expansion loop below would never terminate.
        time_span = 0

    avg_values = []
    for i in range(len(df)):
        current_time = times.iloc[i]
        window_size = initial_time_window
        while True:
            mask = (times >= (current_time - window_size)) & (times <= (current_time + window_size))
            selected_rows = values[mask]
            if len(selected_rows) >= min_rows or window_size > time_span:
                avg_values.append(selected_rows.mean())
                break
            window_size += window_step  # Increment the window size
    return avg_values


In [None]:
# Classify each trip as fully tracked ("interrupted") or not, then compute the smoothed
# share of fully tracked trips by trip duration and save everything back to the same file.
DisappearanceAnalysis_df = pd.read_csv(DisappearanceIndicators_filename)

# The criteria columns are written back to the same file, so a re-run finds them already there
if 'crit1' in DisappearanceAnalysis_df.columns:
    recreate = input("The disappearance criterion have already been created. Do you want to start all over 'crit1' exists. Do you want to recreate the columns? (yes/no): ").strip().lower()
    if recreate != 'yes':
        print("Exiting without recreating the columns.")
        sys.exit("Exiting as per user's decision.")

# Criterion #1: Any gap < min_hrs_for_disappearance is an interruption
min_hrs_for_disappearance = 0.5
DisappearanceAnalysis_df['crit1'] = 1*(DisappearanceAnalysis_df['max_hrs_gap'] < min_hrs_for_disappearance)

# Criterion #2: Check to see if there is time spent further away from both the origin and the end point
# NOTE(review): 'min_further_out' is a sum of 'time_diff_minutes_from_previous' values, yet it is
# compared with minute_threshhold/60 (= 0.5). If that column is in minutes this mixes minutes
# and hours. Left unchanged because it alters the classification -- confirm the intended units.
ping_threshhold = 3
minute_threshhold = 30
DisappearanceAnalysis_df['crit2'] = 1*(DisappearanceAnalysis_df['pings_further_out']>=ping_threshhold)*(DisappearanceAnalysis_df['min_further_out']>=minute_threshhold/60)

# Criterion #3: Check to see if the time spent at the end appears to be a stopped based on speed before and after the gap
speed_threshhold3 = 5
hr_threshhold3 = 1
DisappearanceAnalysis_df['crit3'] = 1*(np.maximum(DisappearanceAnalysis_df['speedbefore'],DisappearanceAnalysis_df['speedafter'])<speed_threshhold3) \
                                    *(DisappearanceAnalysis_df['max_hrs_gap']<hr_threshhold3)

# Criterion #4: Check to see if the time spent at the end appears to be a stopped based on speed before, after and during the gap 
speed_threshhold4 = 5
hr_threshhold4 = 2
DisappearanceAnalysis_df['crit4'] = 1*(np.maximum.reduce([
                                        DisappearanceAnalysis_df['speedbefore'],
                                        DisappearanceAnalysis_df['speedafter'],
                                        DisappearanceAnalysis_df['speedduring']
                                    ])<speed_threshhold4) \
                                            *(DisappearanceAnalysis_df['max_hrs_gap']<hr_threshhold4)

# If any of the 4 interruption criteria are satisified -- classify as an interruption
DisappearanceAnalysis_df['Interruption_01'] = np.maximum.reduce([
    DisappearanceAnalysis_df['crit1'],
    DisappearanceAnalysis_df['crit2'],
    DisappearanceAnalysis_df['crit3'],
    DisappearanceAnalysis_df['crit4']
])

########################################################################################################
# Find the average % of trips of a given trip length that have interruptions. 
# The variable Avg_Interruption_01 should then be used as a weight in the travel cost model:
#
#         weight = 1/Avg_Interruption_01
#
########################################################################################################
import pandas as pd
import numpy as np

# Sort the DataFrame based on Trip_duration_hrs
DisappearanceAnalysis_df = DisappearanceAnalysis_df.sort_values(by='Trip_duration_hrs').reset_index(drop=True)

# Moving-average settings. They must be defined here: previously they were only set in a
# later cell, so this cell raised NameError when the notebook was run top to bottom.
initial_time_window = 1
min_rows = 100

# Compute the custom moving average
DisappearanceAnalysis_df['Avg_Interruption_01'] = custom_moving_average(DisappearanceAnalysis_df, 'Interruption_01', 'Trip_duration_hrs', initial_time_window, min_rows)
DisappearanceAnalysis_df.to_csv(DisappearanceIndicators_filename, index=False)

In [None]:
# Reload the saved disappearance indicators and the V3-filtered trip indicators
DisappearanceIndicators_df = pd.read_csv(DisappearanceIndicators_filename)
indicators_df = pd.read_csv(Rec_indicators_with_V3_filename)

# The two row counts, printed so they can be compared by eye
print(len(DisappearanceIndicators_df), len(indicators_df))

# Moving-average settings (also used in the title of the plot further down)
initial_time_window = 1
min_rows = 100

DisappearanceIndicators_df['Avg_Interruption_01'] = custom_moving_average(DisappearanceIndicators_df, 'Interruption_01', 'Trip_duration_hrs', initial_time_window, min_rows)
DisappearanceIndicators_df.to_csv(DisappearanceIndicators_filename, index=False)

# Report from the frame loaded in this cell rather than a variable left over from another cell
print("From the total of ", len(DisappearanceIndicators_df), "identified recreational trips,", DisappearanceIndicators_df['Interruption_01'].sum(), "trips do not disappear, so are valid for travel cost model")

# Join the disappearance indicators to the trip indicators on the trip key; columns present
# in both tables receive the '_disappear' / '_indicator' suffixes
merged_df = pd.merge(
    DisappearanceIndicators_df,
    indicators_df,
    on=['cuebiq_id', 'Trip_number'],
    how='outer',  # Use 'inner' if you want only the intersection
    suffixes=('_disappear', '_indicator')
)

# Drop duplicate columns if any were created during the merge
merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
merged_df.to_csv(Rec_Indicators_Final_All_Exclusions_And_Disappearance_filename, index = False)


###  Graph the average interruption rate and the density of # of trips by duration

In [None]:
# Plot the smoothed interruption rate against trip duration (left axis) together with the
# density of trips by duration (right axis).
# Depends on DisappearanceAnalysis_df, initial_time_window and min_rows from earlier cells.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plotting
fig, ax1 = plt.subplots(figsize=(10, 6))

# Line plot for Avg_Interruption_01 vs. Trip_duration_hrs
ax1.plot(DisappearanceAnalysis_df['Trip_duration_hrs'], DisappearanceAnalysis_df['Avg_Interruption_01'], linestyle='-', color='b')
ax1.set_xlabel('Trip Duration (hrs)')
ax1.set_ylabel('Average Interruption (01)', color='b')
ax1.tick_params(axis='y', labelcolor='b')

# Create a secondary y-axis for the KDE plot
ax2 = ax1.twinx()
sns.kdeplot(DisappearanceAnalysis_df['Trip_duration_hrs'], ax=ax2, color='r', linestyle='--')
ax2.set_ylabel('Density of trips', color='r')
ax2.tick_params(axis='y', labelcolor='r')

# Add title
# The title records the moving-average settings used to produce Avg_Interruption_01
GraphTitle = f'Average Interruption vs. Trip Duration:\nTime Window = {initial_time_window} & Minimum Rows = {min_rows}' 
plt.title(GraphTitle)

# Show plot
# plt.title / plt.grid act on pyplot's current axes, so statement order matters here
plt.grid(True)
plt.show()

# Add new variables that track all of the stops and trawls within a given trip

In [None]:
# Reload the final disappearance indicators
DisappearanceIndicators_df = pd.read_csv(DisappearanceIndicators_filename)

# In initial coding, a trip was called "interrupted" if it was fully tracked, meaning the interruption was for a short time.
# Interruption_01 == 1 therefore selects the trips that are kept.
fully_tracked_mask = DisappearanceIndicators_df['Interruption_01'].eq(1)
CompleteRecTrips_df = DisappearanceIndicators_df.loc[fully_tracked_mask]
print("len(CompleteRecTrips_df)", len(CompleteRecTrips_df))

# Stops and trawls computed so far
stop_trawl_df = pd.read_csv(Stop_Trawls_Indicators_filename)
print("len(stop_trawl_df)", len(stop_trawl_df))


Remove rows from the stop and trawl files that were dropped in the duplicate cleaning process (8-22-2024)

In [None]:
# Create an up-to-date list of the complete trips
complete_trips_set = set(zip(CompleteRecTrips_df['cuebiq_id'], CompleteRecTrips_df['Trip_number']))


def _keep_only_complete_trips(indicators_filename, complete_trips):
    """Rewrite one indicators CSV so it holds only rows belonging to complete trips.

    Parameters
    ----------
    indicators_filename : str
        CSV file with 'cuebiq_id' and 'Trip_number' columns; it is overwritten in place.
    complete_trips : set
        Set of (cuebiq_id, Trip_number) tuples to keep.

    Returns
    -------
    pandas.DataFrame
        The filtered rows (a copy), as written back to the file.
    """
    indicators = pd.read_csv(indicators_filename)
    # Membership test on the trip key; an explicit boolean Series keeps row filtering
    # well defined even when the file has no rows.
    is_complete = pd.Series(
        [trip_key in complete_trips
         for trip_key in zip(indicators['cuebiq_id'], indicators['Trip_number'])],
        index=indicators.index,
        dtype=bool,
    )
    kept = indicators[is_complete].copy()
    kept.to_csv(indicators_filename, index=False)
    return kept


# Apply the same cleaning to the combined stop/trawl file, the trawls file and the stops file
stop_trawl_df = _keep_only_complete_trips(Stop_Trawls_Indicators_filename, complete_trips_set)
trawls_df = _keep_only_complete_trips(Trawls_Indicators_filename, complete_trips_set)
stops_df = _keep_only_complete_trips(Stops_Indicators_filename, complete_trips_set)

print(len(CompleteRecTrips_df), len(stop_trawl_df), len(trawls_df), len(stops_df))

In [None]:
class OperationCancelled(Exception):
    """Raised to stop this cell when the user answers 'no' at the prompt below."""


# Use only the trips that do not disappear
CompleteRecTrips_df = DisappearanceIndicators_df[DisappearanceIndicators_df['Interruption_01'] == 1]
CompleteRecTrips_df = CompleteRecTrips_df.reset_index(drop=True)

# When Debugging is True the run is limited to one hard-coded trip and the
# CSV outputs are not written (every write is guarded by NotDebugging).
Debugging = True
NotDebugging = not Debugging

# Focus on a single trip
if Debugging:
    CompleteRecTrips_df=CompleteRecTrips_df[(CompleteRecTrips_df['cuebiq_id']==1931836223) & (CompleteRecTrips_df['Trip_number']==1)]
    # CompleteRecTrips_df=CompleteRecTrips_df.head(1)


if NotDebugging:
    # Check for the existence of the Stop_Trawls_Indicators already. If they already exist there are various options
    if os.path.exists(Stop_Trawls_Indicators_filename):
        stop_trawl_df = pd.read_csv(Stop_Trawls_Indicators_filename)
        inputtext = (f"There are {len(stop_trawl_df)} rows already processed and "
                     f"{len(CompleteRecTrips_df)} trips that need to be processed. \n"
                     "Type    yes  to backup and start from scratch \n"
                     "Type    no   to terminate \n"
                     f"Type         anything else or enter to eliminate the {len(stop_trawl_df)} rows already processed and proceed.")
        confirm = input(inputtext)
        # Check the user's input. The three branches follow the prompt text:
        # 'no' stops, 'yes' backs up the existing files, anything else resumes.
        answer = confirm.strip().lower()
        if answer == 'no':
            raise OperationCancelled('Exited because I did not want to recreate the stops and trawls indicators files.')
        elif answer == 'yes':
            # Move the existing outputs aside so the loop starts new files.
            for file_label, indicators_filename in (
                ("Stop_Trawls", Stop_Trawls_Indicators_filename),
                ("Trawls", Trawls_Indicators_filename),
                ("Stops", Stops_Indicators_filename),
            ):
                if os.path.exists(indicators_filename):
                    backup_filename = indicators_filename + ".backup"
                    os.rename(indicators_filename, backup_filename)
                    print(f"{file_label} file has been backed up")
        else:
            # Eliminate the rows from CompleteRecTrips_df that have already been processed.
            # Only the unique trip keys are merged, so no stop/trawl count columns are
            # added to CompleteRecTrips_df and no trip can be duplicated by the merge.
            processed_keys_df = stop_trawl_df[['cuebiq_id', 'Trip_number']].drop_duplicates()
            merged = pd.merge(CompleteRecTrips_df, processed_keys_df, on=['cuebiq_id', 'Trip_number'], how='left', indicator=True)
            CompleteRecTrips_df = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
            CompleteRecTrips_df = CompleteRecTrips_df.reset_index(drop=True)

# The pings table is only read from disk if it is not already in the namespace.
if 'AllPings_OurTable_df' not in locals():
    print("loading AllPings_OurTable_df")
    AllPings_OurTable_df =pd.read_csv(Pings_OurTable_Gulf_filename)

    
##  LOOP OVER ALL COMPLETE TRIPS AND CREATE INDICATORS OF STOPS AND TRAWLS
# For each trip the pings are selected and cleaned, a 5-minute look-ahead and
# look-behind speed is interpolated at every ping, and consecutive pings below
# the stop / trawl speed thresholds are grouped into stops and trawls.
# Unless Debugging is on, one row per stop, one row per trawl and one summary
# row per trip are appended to the three indicator CSV files.

# Factor applied to the interpolated and per-ping speeds before they are
# compared with the mph thresholds below.
KM_PER_MIN_TO_MPH = 37.2823

# Define the column names
stopcolumns = ['cuebiq_id', 'Trip_number', 'stop_num', 'stop_duration',
               'stop_avg_lat', 'stop_avg_lng',
               'stop_max_lat', 'stop_max_lng',
               'stop_avg_dist_from_origin','stop_max_dist_from_origin', 'Max_distance_traveled_origin_t']

trawlcolumns = ['cuebiq_id', 'Trip_number',
                'trawl_num', 'trawl_duration', 'cumulative_dist_trawl',
                'trawl_start_lat', 'trawl_start_lng',
                'trawl_end_lat', 'trawl_end_lng',
                'trawl_mid_lat', 'trawl_mid_lng',
                'trawl_max_lat', 'trawl_max_lng',
                'trawl_avg_speed', 'trawl_max_speed',
                'trawl_avg_dist_from_origin', 'trawl_max_dist_from_origin',
                'Max_distance_traveled_origin_t']

stop_trawl_columns = ['cuebiq_id', 'Trip_number', 'stop_num', 'trawl_num']

for index, row in tqdm(CompleteRecTrips_df.iterrows(), total=CompleteRecTrips_df.shape[0]):

    cuebiq_id = row['cuebiq_id']
    Trip_number = row['Trip_number']
    ThisTripIndicators = Rec_Indicators_df[(Rec_Indicators_df['cuebiq_id'] == cuebiq_id) & (Rec_Indicators_df['Trip_number'] == Trip_number)]
    timestamp_start_t = ThisTripIndicators['timestamp_start_t'].iloc[0]
    timestamp_end_t = ThisTripIndicators['timestamp_end_t'].iloc[0]
    Max_distance_traveled_origin_t = ThisTripIndicators['Max_distance_traveled_origin_t'].iloc[0]

    ThisTripPings_df = AllPings_OurTable_df[(AllPings_OurTable_df['cuebiq_id'] == cuebiq_id) &
                                         (AllPings_OurTable_df['event_timestamp'] >= timestamp_start_t) &
                                         (AllPings_OurTable_df['event_timestamp'] <= timestamp_end_t)]

    ThisTripPings_df = EliminateErrantPingsSpeed(ThisTripPings_df, 60)
    # reset_index returns a new frame, so the column assignment below is never
    # applied to a temporary copy (the chained fillna(inplace=True) could be).
    ThisTripPings_df = ThisTripPings_df.reset_index(drop=True)
    ThisTripPings_df['dist_fwd'] = ThisTripPings_df['dist_fwd'].fillna(0)

    origin_lat = ThisTripPings_df['lat'].iloc[0]
    origin_lng = ThisTripPings_df['lng'].iloc[0]

    ThisTripPings_df = ThisTripPings_df.drop(columns=['classification_type', 'row_index'])

    # cumulative time and distance
    ThisTripPings_df['cumulative_time_minutes'] = ThisTripPings_df['time_diff_minutes_from_previous'].cumsum()
    ThisTripPings_df['cumulative_distance'] = ThisTripPings_df['dist_fwd'].cumsum()

    Trip_Duration_t = ThisTripPings_df.cumulative_time_minutes.max()

    # Per-trip results. They are initialised for every trip so that a trip of
    # 60 minutes or less reports zero stops/trawls instead of the previous
    # trip's counts.
    stop_records = []
    trawl_records = []
    istop = 0
    itrawl = 0

#-----------------------------------------------------------------------------------------------
    # Speed calculation using the linear interpolation algorithm
    if Trip_Duration_t > 60:

        # Initialize Time_step and data arrays. The cumulative distance curve is
        # padded with one flat point before the first and after the last ping so
        # that looking Time_step minutes outside the trip adds no distance.
        Time_step = 5
        x = ThisTripPings_df.cumulative_time_minutes.values
        xx = np.append(-(Time_step + 1), x)
        xx = np.append(xx, xx[-1] + (Time_step + 1))
        y = ThisTripPings_df.cumulative_distance.values
        yy = np.append(y[0], y)
        yy = np.append(yy, yy[-1])

        # Create the interpolation function
        f_totdist = interpolate.interp1d(xx, yy, fill_value="extrapolate")

        # speed_next_5min - speed in the next 5 minutes
        # speed_prev_5min - speed in the previous 5 minutes
        ThisTripPings_df['speed_next_5min'] = (f_totdist(x + Time_step) - f_totdist(x)) / Time_step * KM_PER_MIN_TO_MPH
        ThisTripPings_df['speed_prev_5min'] = (f_totdist(x) - f_totdist(x - Time_step)) / Time_step * KM_PER_MIN_TO_MPH

        mph_stop = 1
        mph_trawl = 5
        stop_speed = mph_stop
        trawl_speed = mph_trawl

        # Each flag is built as a standalone Series, edited, and only then stored
        # in the frame; assigning through df[col].iloc[...] is a chained
        # assignment that pandas may apply to a temporary copy.
        stop_ahead = 1 * (ThisTripPings_df['speed_next_5min'] <= stop_speed)
        stop_ahead.iloc[-1] = 0   # nothing lies ahead of the last ping
        stop_behind = 1 * (ThisTripPings_df['speed_prev_5min'] <= stop_speed)
        stop_behind.iloc[0] = 0   # nothing lies behind the first ping
        ThisTripPings_df['StopAhead'] = stop_ahead
        ThisTripPings_df['StopBehind'] = stop_behind
        ThisTripPings_df['InStop'] = ThisTripPings_df[['StopAhead', 'StopBehind']].max(axis=1)

        # Trawling will be considered to be a period during which maximum speed was above stop speed and all consecutive pings are below the trawl_speed
        trawl_ahead = 1 * (ThisTripPings_df['speed_next_5min'] <= trawl_speed)
        trawl_ahead.iloc[-1] = 0
        trawl_behind = 1 * (ThisTripPings_df['speed_prev_5min'] <= trawl_speed)
        trawl_behind.iloc[0] = 0
        ThisTripPings_df['TrawlAhead'] = trawl_ahead
        ThisTripPings_df['TrawlBehind'] = trawl_behind
        ThisTripPings_df['InTrawl'] = ThisTripPings_df[['TrawlAhead', 'TrawlBehind']].max(axis=1)

        ThisTripPings_df['dist_from_origin'] = ThisTripPings_df.apply(
            lambda ping: haversine(ping['lat'], ping['lng'], origin_lat, origin_lng), axis=1
        )
        ThisTripPings_df['ping_speed_fwd'] = ThisTripPings_df['ping_speed_fwd'] * KM_PER_MIN_TO_MPH
        ThisTripPings_df['ping_speed_bkwd'] = ThisTripPings_df['ping_speed_bkwd'] * KM_PER_MIN_TO_MPH

        # Running totals for the stop currently being accumulated. All of them
        # are set here so the first ping of a trip can already be part of a stop.
        npings_in_stop = 0
        cumulative_time_stop = 0
        cum_lat_stop = 0
        cum_lng_stop = 0
        stop_max_dist_from_origin = 0
        stop_max_lat = None
        stop_max_lng = None

        # Running totals for the trawl currently being accumulated.
        npings_in_trawl = 0
        cumulative_time_trawl = 0
        cumulative_dist_trawl = 0
        trawl_max_speed = 0
        trawl_start_lat = None
        trawl_start_lng = None
        trawl_max_dist_from_origin = 0
        trawl_max_lat = None
        trawl_max_lng = None

        n_pings = len(ThisTripPings_df)
        in_stop_flags = ThisTripPings_df['InStop'].to_numpy()
        in_trawl_flags = ThisTripPings_df['InTrawl'].to_numpy()

        # Iterate over the pings of this trip. The loop variables are distinct
        # from the outer loop's index/row so the trip being processed is not shadowed.
        for ping_pos, (_, ping) in enumerate(ThisTripPings_df.iterrows()):
            is_last_ping = ping_pos == n_pings - 1

            ######################################################################################
            # code for stop variables
            if in_stop_flags[ping_pos] == 1:
                npings_in_stop += 1
                cumulative_time_stop += ping['time_diff_minutes_from_previous']
                cum_lat_stop += ping['lat']
                cum_lng_stop += ping['lng']
                # The first ping of a stop always seeds the farthest-point record,
                # so it is defined even when the stop sits exactly on the origin.
                if npings_in_stop == 1 or ping['dist_from_origin'] > stop_max_dist_from_origin:
                    stop_max_dist_from_origin = ping['dist_from_origin']
                    stop_max_lat = ping['lat']
                    stop_max_lng = ping['lng']

                # A stop is closed when the next ping is not in a stop or when
                # the trip ends (same rule as the trawl code below).
                if is_last_ping or in_stop_flags[ping_pos + 1] == 0:
                    istop += 1
                    stop_avg_lat = cum_lat_stop/npings_in_stop
                    stop_avg_lng = cum_lng_stop/npings_in_stop

                    stop_avg_dist_from_origin = haversine(stop_avg_lat, stop_avg_lng, origin_lat, origin_lng)
                    stop_records.append({
                        'cuebiq_id': cuebiq_id, 'Trip_number': Trip_number,
                        'stop_num': istop, 'stop_duration': cumulative_time_stop,
                        'stop_avg_lat': stop_avg_lat, 'stop_avg_lng': stop_avg_lng,
                        # position of the ping farthest from the origin, not the average position
                        'stop_max_lat': stop_max_lat, 'stop_max_lng': stop_max_lng,
                        'stop_avg_dist_from_origin': stop_avg_dist_from_origin,
                        'stop_max_dist_from_origin': stop_max_dist_from_origin,
                        'Max_distance_traveled_origin_t': Max_distance_traveled_origin_t,
                    })
            else:
                npings_in_stop = 0
                cumulative_time_stop = 0
                cum_lat_stop = 0
                cum_lng_stop = 0
                stop_max_dist_from_origin = 0
                stop_max_lat = None
                stop_max_lng = None

            ######################################################################################
            # code for Trawl variables
            if in_trawl_flags[ping_pos] == 1:
                npings_in_trawl += 1
                cumulative_time_trawl += ping['time_diff_minutes_from_previous']
                cumulative_dist_trawl += ping['dist_fwd']

                if npings_in_trawl == 1:
                    trawl_start_lat = ping['lat']
                    trawl_start_lng = ping['lng']
                trawl_max_speed = np.maximum(trawl_max_speed, ping['speed_prev_5min'])
                if npings_in_trawl == 1 or ping['dist_from_origin'] > trawl_max_dist_from_origin:
                    trawl_max_dist_from_origin = ping['dist_from_origin']
                    trawl_max_lat = ping['lat']
                    trawl_max_lng = ping['lng']

                if is_last_ping or in_trawl_flags[ping_pos + 1] == 0:
                    # Save this trawl only if it is not entirely a stop
                    if trawl_max_speed > stop_speed:
                        # NOTE(review): this average is cumulative dist_fwd per minute and is
                        # not multiplied by 37.2823 like trawl_max_speed — confirm intended units.
                        trawl_avg_speed = cumulative_dist_trawl/cumulative_time_trawl
                        trawl_end_lat = ping['lat']
                        trawl_end_lng = ping['lng']

                        trawl_mid_lat = (trawl_end_lat + trawl_start_lat)/2
                        trawl_mid_lng = (trawl_end_lng + trawl_start_lng)/2

                        itrawl += 1

                        trawl_avg_dist_from_origin = haversine(trawl_mid_lat, trawl_mid_lng, origin_lat, origin_lng)

                        trawl_records.append({
                            'cuebiq_id': cuebiq_id, 'Trip_number': Trip_number,
                            'trawl_num': itrawl, 'trawl_duration': cumulative_time_trawl,
                            'cumulative_dist_trawl': cumulative_dist_trawl,
                            'trawl_start_lat': trawl_start_lat, 'trawl_start_lng': trawl_start_lng,
                            'trawl_end_lat': trawl_end_lat, 'trawl_end_lng': trawl_end_lng,
                            'trawl_mid_lat': trawl_mid_lat, 'trawl_mid_lng': trawl_mid_lng,
                            'trawl_max_lat': trawl_max_lat, 'trawl_max_lng': trawl_max_lng,
                            'trawl_avg_speed': trawl_avg_speed, 'trawl_max_speed': trawl_max_speed,
                            'trawl_avg_dist_from_origin': trawl_avg_dist_from_origin,
                            'trawl_max_dist_from_origin': trawl_max_dist_from_origin,
                            'Max_distance_traveled_origin_t': Max_distance_traveled_origin_t,
                        })
            else:
                npings_in_trawl = 0
                cumulative_time_trawl = 0
                cumulative_dist_trawl = 0
                trawl_max_speed = 0
                trawl_start_lat = None
                trawl_start_lng = None
                trawl_max_dist_from_origin = 0
                trawl_max_lat = None
                trawl_max_lng = None

    # Build each per-trip table once from the collected records instead of
    # concatenating one-row frames inside the ping loop.
    stops_df = pd.DataFrame(stop_records, columns=stopcolumns)
    trawls_df = pd.DataFrame(trawl_records, columns=trawlcolumns)

    # Write to CSV files as long as not debugging
    if NotDebugging:
        # A trip without stops (or trawls) still gets one all-zero row.
        if len(stops_df) ==0:
            values = [cuebiq_id, Trip_number, 0, 0, 0, 0, 0,0, 0, 0, Max_distance_traveled_origin_t]
            stops_df = pd.DataFrame([values], columns=stopcolumns)
        stops_df.to_csv(Stops_Indicators_filename, mode='a', header=not os.path.exists(Stops_Indicators_filename), index=False)

        if len(trawls_df) ==0:
            values = [cuebiq_id, Trip_number, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, Max_distance_traveled_origin_t]
            trawls_df = pd.DataFrame([values], columns=trawlcolumns)
        trawls_df.to_csv(Trawls_Indicators_filename, mode='a', header=not os.path.exists(Trawls_Indicators_filename), index=False)

        values = [cuebiq_id, Trip_number, istop, itrawl]
        stop_trawl_df = pd.DataFrame([values], columns=stop_trawl_columns)
        stop_trawl_df.to_csv(Stop_Trawls_Indicators_filename, mode='a', header=not os.path.exists(Stop_Trawls_Indicators_filename), index=False)


print("Finished")

# Functions that are used to inspect a single trip

In [None]:
def selectpings(random_trip_df):
    """Collect the Gulf pings and the V3 pings that belong to one trip.

    Parameters
    ----------
    random_trip_df : pandas.DataFrame
        Frame whose first row supplies 'cuebiq_id', 'timestamp_start_t',
        'timestamp_end_t' and 'Trip_number' for the trip.

    Returns
    -------
    tuple
        When the trip has fewer than 10 Gulf pings: ``([], pings_Gulf_df)``.
        Otherwise ``(pings_from_v3_df, pings_Gulf_df, ping_V3_during_trip)``:
        the V3 pings from 8 hours before the start to 8 hours after the end,
        the Gulf pings inside the trip, and the V3 pings inside the trip that
        have no Gulf ping with the same event_timestamp, lat and lng.
        NOTE(review): the two branches return tuples of different length;
        this is kept as-is because callers are not visible here.

    Side effects
    ------------
    Sets the globals timestamp_start_t, timestamp_end_t, trip_t and
    cuebiq_id_t, and reads the V3 pings CSV of the ID group containing the device.
    """
    # Identify the cuebiq_id, and times for this random trip
    global timestamp_start_t, timestamp_end_t, trip_t, cuebiq_id_t
    cuebiq_id_t = random_trip_df['cuebiq_id'].iloc[0]
    timestamp_start_t = random_trip_df['timestamp_start_t'].iloc[0]
    timestamp_end_t = random_trip_df['timestamp_end_t'].iloc[0]
    trip_t = random_trip_df['Trip_number'].iloc[0]

    ########################################################################################
    ###### SELECT THE RELEVANT DATA FROM THE GULF PINGS DATA FRAME ########################
    pings_Gulf_df = AllPings_OurTable_df[
        (AllPings_OurTable_df['cuebiq_id'] == cuebiq_id_t) &
        (AllPings_OurTable_df['event_timestamp'] >= (timestamp_start_t)) &
        (AllPings_OurTable_df['event_timestamp'] <= (timestamp_end_t))
    ]
    print("Selecting pings for ID",cuebiq_id_t, "trip", trip_t, "The Gulf DF had ", len(pings_Gulf_df) , "pings" )

    if len(pings_Gulf_df) < 10:
        print("The Gulf DF had ", len(pings_Gulf_df) , "pings")
        pings_from_v3_df = []
        return pings_from_v3_df, pings_Gulf_df

    # Locate and read the V3 pings file of the ID group that contains this device.
    This_Group_df = ID_Groups_df[(ID_Groups_df['Min_In_Group'] <= cuebiq_id_t) & (ID_Groups_df['Max_In_Group'] >= cuebiq_id_t)]
    Group_num = This_Group_df['Cubeq_ID_Group'].iloc[0]
    filename = f"V3_pings_{Group_num}.csv"
    Pings_V3_Group_filename = os.path.join(V3_Pings_Groups_directory, filename)
    Pings_V3_Group_df = pd.read_csv(Pings_V3_Group_filename)

    eight_hours = 8*60*60
    # All relevant V3 pings. The filtered slices are replaced by new frames
    # rather than modified in place, which avoids SettingWithCopyWarning.
    pings_from_v3_df = Pings_V3_Group_df[
        (Pings_V3_Group_df['cuebiq_id'] == cuebiq_id_t) &
        (Pings_V3_Group_df['event_timestamp'] >= (timestamp_start_t-eight_hours)) &
        (Pings_V3_Group_df['event_timestamp'] <= (timestamp_end_t+eight_hours))
    ]
    pings_from_v3_df = pings_from_v3_df.drop_duplicates().sort_values(by='event_timestamp')
    pings_from_v3_df = EliminateErrantPings(pings_from_v3_df)

    # V3 pings in the middle of the trip
    ping_V3_during_trip = pings_from_v3_df[
        (pings_from_v3_df['event_timestamp'] >= (timestamp_start_t)) &
        (pings_from_v3_df['event_timestamp'] <= (timestamp_end_t))
    ]
    ping_V3_during_trip = ping_V3_during_trip.drop_duplicates().sort_values(by='event_timestamp')
    ping_V3_during_trip = EliminateErrantPings(ping_V3_during_trip)

    # Drop from ping_V3_during_trip any pings with the same event_timestamp, lat, and lng as in pings_Gulf_df.
    # Only the three key columns of the Gulf pings are merged, so the V3 columns keep
    # their own names instead of being suffixed with _x / _y.
    match_columns = ['event_timestamp', 'lat', 'lng']
    gulf_keys_df = pings_Gulf_df[match_columns].drop_duplicates()
    merged_df = pd.merge(ping_V3_during_trip, gulf_keys_df, on=match_columns, how='left', indicator=True)

    # Keep the rows that are only in ping_V3_during_trip
    ping_V3_during_trip = merged_df[merged_df['_merge'] == 'left_only'].drop(columns='_merge')

    return pings_from_v3_df, pings_Gulf_df, ping_V3_during_trip
    

In [None]:
def organizepings(pings_from_v3_df, pings_Gulf_df, ping_V3_during_trip):
    """Label the pings around one trip and combine them into a single frame.

    Parameters
    ----------
    pings_from_v3_df : pandas.DataFrame
        V3 pings covering the trip and the hours around it.
    pings_Gulf_df : pandas.DataFrame
        Gulf pings recorded during the trip.
    ping_V3_during_trip : pandas.DataFrame
        Accepted for call compatibility; not read by this function.

    Returns
    -------
    pandas.DataFrame
        Columns 'event_timestamp', 'lat', 'lng', 'Hours_from_start' and
        'during_trip', sorted by event_timestamp. 'during_trip' is a
        two-letter code: first letter B (up to 8 hours before the start),
        D (Gulf pings during the trip), A (up to 8 hours after the end) or
        I (V3 pings strictly inside the trip that are outside the Gulf polygon
        or returned by is_point_on_islands); second letter S for the first
        ping of the B/D/A segment, E for its last, otherwise the first letter
        repeated.

    Side effects
    ------------
    Reads the globals timestamp_start_t and timestamp_end_t and sets the
    globals onetrip_before_pings, onetrip_after_pings and onetrip_gulf_pings.
    """
    global onetrip_before_pings, onetrip_after_pings, onetrip_gulf_pings
    # Select the pings 8hours before and 8 hours after
    eight_hours = 8*60*60
    output_columns = ['event_timestamp', 'lat', 'lng', 'Hours_from_start', 'during_trip']

    def label_segment(segment_df, prefix, max_speed=None, mark_ends=True):
        """Sort one group of pings, optionally clean it, and add hours/label columns."""
        if len(segment_df) == 0:
            return pd.DataFrame(columns=output_columns)
        labelled = segment_df.sort_values(by='event_timestamp')
        if max_speed is not None:
            labelled = EliminateErrantPingsSpeed(labelled, max_speed)
        # Work on a private copy so the new columns are never written onto a
        # slice of the caller's frame.
        labelled = labelled.copy()
        labelled['Hours_from_start'] = round((labelled['event_timestamp']-timestamp_start_t)/(60*60),1)
        labelled['during_trip'] = prefix * 2
        if mark_ends:
            labelled.loc[labelled.index[0], 'during_trip'] = prefix + "S"
            labelled.loc[labelled.index[-1], 'during_trip'] = prefix + "E"
        return labelled

    ################# During Trip from Gulf Pings  #################
    onetrip_gulf_pings = label_segment(pings_Gulf_df, "D", max_speed=60)

    ############## Before
    onetrip_before_pings = label_segment(
        pings_from_v3_df[
            (pings_from_v3_df['event_timestamp'] >= (timestamp_start_t - eight_hours)) &
            (pings_from_v3_df['event_timestamp'] <= timestamp_start_t)
        ],
        "B", max_speed=90)

    #################### After
    onetrip_after_pings = label_segment(
        pings_from_v3_df[
            (pings_from_v3_df['event_timestamp'] <= (timestamp_end_t + eight_hours)) &
            (pings_from_v3_df['event_timestamp'] >= timestamp_end_t)
        ],
        "A", max_speed=90)

    #################### V3 During the Trip
    onetrip_V3_during_pings = pings_from_v3_df[
        (pings_from_v3_df['event_timestamp'] > (timestamp_start_t)) &
        (pings_from_v3_df['event_timestamp'] < (timestamp_end_t))
    ]

    # Narrow to the points that don't overlap with our Gulf WKT excluding islands.
    # The step is skipped when there are no such pings: a row-wise apply() on an
    # empty frame returns a DataFrame, which cannot be stored as a single column.
    if len(onetrip_V3_during_pings) > 0:
        polygons = [wkt.loads(Gulf_wkt)]
        candidates_df = onetrip_V3_during_pings.copy()
        candidates_df['is_in_Gulf'] = candidates_df.apply(
            lambda ping: is_inside_any_polygon(ping['lat'], ping['lng'], polygons), axis=1)
        outside_gulf = candidates_df[candidates_df['is_in_Gulf'] == False]
        inside_gulf = candidates_df[candidates_df['is_in_Gulf'] == True]

        on_islands = is_point_on_islands(inside_gulf)
        onetrip_V3_during_pings = pd.concat([outside_gulf, on_islands]).drop(columns=['is_in_Gulf'])

    # These pings get the plain "II" label; no start/end markers are assigned.
    onetrip_V3_during_pings = label_segment(onetrip_V3_during_pings, "I", mark_ends=False)

    ########################
    # Concatenate the Four DataFrames
    onetrip_df = pd.concat([
                            onetrip_V3_during_pings[output_columns],
                            onetrip_before_pings[output_columns],
                            onetrip_gulf_pings[output_columns],
                            onetrip_after_pings[output_columns]
                           ])
    # delete empty rows and fill in blanks
    onetrip_df = onetrip_df[onetrip_df['event_timestamp'].notna() & (onetrip_df['event_timestamp'] != '')]
    onetrip_df = onetrip_df.dropna()

    # Sort the DataFrame by 'event_timestamp'
    onetrip_df = onetrip_df.sort_values(by='event_timestamp')

    # Clean up to eliminate bad pings
    onetrip_df = EliminateErrantPingsSpeed(onetrip_df, 60)

    # Recomputed (unrounded) for every row after the merge of the four groups.
    onetrip_df['Hours_from_start'] = (onetrip_df['event_timestamp']-timestamp_start_t)/(60*60)
    # Reset index
    onetrip_df.reset_index(drop=True, inplace=True)
    return onetrip_df

In [None]:
import folium
from shapely import wkt

# def makethemap():
def makethemap(onetrip_df, width, height):
    """Draw one trip on a folium satellite map.

    Parameters
    ----------
    onetrip_df : pandas.DataFrame
        Pings with 'lat', 'lng', 'Hours_from_start' and the two-letter
        'during_trip' code. It is passed through pingspeed(), and the result
        is expected to carry 'Avg_ping_speed'.
    width, height
        Passed to folium.Map unchanged.

    Returns
    -------
    folium.Map
        Map fitted to the bounding box of the pings, with the pings as circle
        markers, lines between consecutive pings of the same group, the
        Alabama island and industrial polygons, and the platform / reef points.

    Notes
    -----
    Reads the globals AlabamaIsland, Industrial_polygons, platforms, LA_AR,
    TX_AR, AL_AR, ping_V3_during_trip, timestamp_start_t, cuebiq_id_t, trip_t
    and the three onetrip_* ping frames, and adds an 'Hours_from_start' column
    to the global ping_V3_during_trip.
    """
    # Calculate the bounding box
    min_lat = onetrip_df['lat'].min()
    max_lat = onetrip_df['lat'].max()
    min_lng = onetrip_df['lng'].min()
    max_lng = onetrip_df['lng'].max()

    # Create a map with bounds and satellite tiles
    m = folium.Map(location=[(min_lat + max_lat) / 2, (min_lng + max_lng) / 2], zoom_start=10, tiles='https://mt1.google.com/vt/lyrs=s&x={x}&y={y}&z={z}', attr='Google Satellite', width=width, height=height)
    m.fit_bounds([[min_lat, min_lng], [max_lat, max_lng]])

    # Create feature groups
    circle_markers = folium.FeatureGroup(name='Circle Markers')
    poly_lines = folium.FeatureGroup(name='PolyLines')

    # Keep track of previous coordinates
    prev_coords = None

    AlabamaIsland_polygon = wkt.loads(AlabamaIsland)

    # Add the AlabamaIsland polygon to the map as a GeoJson layer
    folium.GeoJson(data=AlabamaIsland_polygon.__geo_interface__, name='Alabama Island').add_to(m)

    # Add the industrial polygons to the folium map
    for polygon in Industrial_polygons:
        folium.GeoJson(data=polygon.__geo_interface__, name='Industrial Polygon').add_to(m)

    # Add the per-ping speed used for the marker colors and popups
    onetrip_df=pingspeed(onetrip_df)

    # Line color for two consecutive pings of the same group, keyed by the
    # first and last letter of the 'during_trip' code.
    line_colors = {'II': 'black', 'DD': 'pink', 'BB': 'blue', 'AA': 'red'}

    # Add CircleMarker and lines for each ping
    for index, row in onetrip_df.iterrows():
        trip_code = row['during_trip']
        radius = 5
        if trip_code[0] == 'D':
            if row['Avg_ping_speed'] < 0.02682238:  # 1 mph
                color = 'green'
            elif row['Avg_ping_speed'] <= 0.134111898:  # 5 mph
                color = 'orange'
            elif row['Avg_ping_speed'] <= 3*0.134111898:  # 15 mph
                color = 'pink'
            else:
                color = 'red'
        elif trip_code[0] == 'B':
            color = 'blue'
        elif trip_code[0] == 'I':
            color = 'black'
        else:
            color = 'red'

        # Larger radius and a distinct border for the first and last ping of the trip.
        # The whole code is compared: a single character can never equal 'DE'.
        if trip_code == 'DS':
            radius = 7
            border_color = 'darkblue'
        elif trip_code == 'DE':
            radius = 7
            border_color = 'maroon'
        else:
            border_color = color

        # Add CircleMarker to circle_markers feature group
        popup_text = f"{round(row['Hours_from_start'],1)}hrs/{round(row['Avg_ping_speed'] * 37.2823, 1)}mph"
        folium.CircleMarker(
            location=[row['lat'], row['lng']],
            radius=radius,  # Adjust the radius as needed
            color=border_color,
            fill=True,
            fill_color=color,  # Assign color based on ping_speed
            fill_opacity=0.6,
            popup=popup_text
        ).add_to(circle_markers)

        # Draw a line from the previous ping, but only when there is one.
        line_color = line_colors.get(trip_code[0] + trip_code[-1])
        if line_color is not None and prev_coords is not None:
            folium.PolyLine(locations=[prev_coords, [row['lat'], row['lng']]], color=line_color).add_to(poly_lines)

        # Update prev_coords
        prev_coords = [row['lat'], row['lng']]

    # Reference points, one outline color per source table.
    for reference_points, outline_color in ((platforms, 'yellow'), (LA_AR, 'blue'), (TX_AR, 'green'), (AL_AR, 'grey')):
        for index, row in reference_points.iterrows():
            lat, lng = row['lat'], row['lng']
            folium.CircleMarker(location=[lat, lng], radius=3, color=outline_color, fill=True, fill_color='grey').add_to(m)

    # Add feature groups to the map
    circle_markers.add_to(m)
    poly_lines.add_to(m)

    # V3 pings during the trip that are not among the Gulf pings, in purple
    ping_V3_during_trip['Hours_from_start'] = (ping_V3_during_trip['event_timestamp']-timestamp_start_t)/(60*60)
    for idx, row in ping_V3_during_trip.iterrows():
        popup_text = f"{round(row['Hours_from_start'],1)}hrs/{round(row['Avg_ping_speed'] * 37.2823, 1)}mph"
        folium.CircleMarker(
            location=[row['lat'], row['lng']],
            radius=5,  # Adjust the radius as needed
            color='purple',
            fill=True,
            fill_color='purple',
            popup=popup_text
        ).add_to(m)

    print("idi", cuebiq_id_t, "trip_t", trip_t, "Pings: Before", len(onetrip_before_pings), "After", len(onetrip_after_pings), "During ", len(onetrip_gulf_pings))
    # Display the map
    return m


In [None]:
def organize_summary(summary_indicators_df):
    """Attach the reviewer's rating to a trip summary and append it to the ratings file.

    The global ``rec_trip_rating`` is always stored in a 'RecTripRating'
    column of ``summary_indicators_df``. When the rating is non-empty, the
    global ``comments`` is stored in a 'comments' column, the summary is joined
    column-wise with the global ``random_trip_df``, and the result is appended
    to ``RecTripRating_filename`` (the header is written only when the file
    does not exist yet). Returns None.
    """
    summary_indicators_df['RecTripRating'] = rec_trip_rating

    # Nothing is saved for an empty rating.
    if len(rec_trip_rating) == 0:
        return

    summary_indicators_df['comments'] = comments

    rated_trip_df = pd.concat([summary_indicators_df, random_trip_df], axis=1)
    write_header = not os.path.exists(RecTripRating_filename)
    rated_trip_df.to_csv(RecTripRating_filename, mode='a', header=write_header, index=False)


In [None]:
def summary_indicators():
    """Build a compact table of headline indicators for the trip under review.

    Takes no arguments; reads the notebook-level globals ``random_trip_df``
    and ``onetrip_gulf_pings``.

    Returns:
        pandas.DataFrame: one column per indicator, assigned in the order
        below.  The two values taken from ``onetrip_gulf_pings`` are scalars
        (column maxima) broadcast to every row.
    """
    # 1 km/min = 60 km/h = 37.2823 mph.
    # NOTE(review): assumes the speed columns are in km per minute, as the
    # ``*_km_per_min_t`` column names suggest -- confirm.
    km_per_min_to_mph = 37.2823

    trip = random_trip_df
    pings = onetrip_gulf_pings

    table = pd.DataFrame()

    # Trip identity and predicted-class probability.
    table['cuebiq_id'] = trip['cuebiq_id']
    table['Trip_number'] = trip['Trip_number']
    table['Prob_371'] = trip['Probability_371']

    # Durations divided by 60 (presumably minutes -> hours; verify units).
    table['Trip__hrs'] = trip['Trip_Duration_t'] / 60
    longest_gap = pings['time_diff_minutes_from_previous'].max()
    table['max_time_gap'] = longest_gap / 60

    # Sum of the three slowest non-zero speed bins.
    # NOTE(review): unlike pct_time_stopped_2 (computed for the AIS table), this
    # leaves out pdf_EQ_0_km_per_min_t and includes the 0.025-0.050 bin -- confirm
    # the difference is intended.
    table['stopped_pct'] = (
        trip['pdf_GT_0_LE_0_010_km_per_min_t']
        + trip['pdf_GT_0_010_LE_0_025_km_per_min_t']
        + trip['pdf_GT_0_025_LE_0_050_km_per_min_t']
    )

    # "Outside gulf" measures and distances, copied through under shorter names.
    table['island_min'] = trip['minutes_during_outside_gulf']
    table['island_pct'] = trip['pct_during_outside_gulf']
    table['Max_distance_traveled_origin_t'] = trip['Max_distance_traveled_origin_t']
    table['Total_distance'] = trip['Total_distance_traveled_t']
    table['island_speed'] = trip['avg_mph_during_outside_gulf']

    # Maximum speed from the indicator table and, independently, from the pings.
    table['maxspeed_mph'] = trip['maxspeed_t'] * km_per_min_to_mph
    fastest_ping = pings['ping_speed_fwd'].max()
    table['maxspeed_2'] = fastest_ping * km_per_min_to_mph

    # Final distance from origin relative to the maximum distance from origin.
    table['final_over_max'] = (
        trip['Distance_from_origin_t'] / trip['Max_distance_traveled_origin_t']
    )

    return table

# Create summary stats of raw indicators to compare Spectus and AIS data

In [None]:
# Load the indicator table that the summary statistics below are computed from.
# Only Rec_indicators_with_V3_filename is summarised: an earlier read of
# DisappearanceIndicators_filename into the same variable was overwritten before
# it was ever used, so it has been dropped (one less file read and dependency).
# NOTE(review): to summarise the final data set instead, read
# DisappearanceIndicators_filename here.
AllIndicators_df = pd.read_csv(Rec_indicators_with_V3_filename)

print("len(AllIndicators_df)", len(AllIndicators_df))

In [None]:
import pandas as pd

# Indicator columns to summarise; AllIndicators_df must contain every one of them.
# The order of this list is the row order of the summary table.
variables = [
    # Trip-level timing and distance columns
    'trips_per_day', 'break_duration_t', 'Weekday_trips', 'Trip_Duration_t',
    'Total_distance_traveled_t', 'Distance_from_origin_t',
    'first_distance_from_coast_t', 'last_distance_from_coast_t',
    'Max_distance_traveled_origin_t', 'Weekend_trip_t',
    # Stop columns
    'time_stopped_t', 'pct_time_stopped_2', 'number_of_stops_t',
    'longest_stop_t', 'shortest_stop_t',
    # Trawl columns
    'time_trawling_t', 'pct_time_trawling_2', 'number_of_trawl_t',
    'longest_trawl_t', 'shortest_trawl_t', 'distance_trawling_t',
    # Move columns
    'time_moving_t', 'pct_time_moving_2', 'number_of_moves_t',
    'longest_move_t', 'shortest_move_t', 'distance_moving_t',
    # Remaining columns
    'dist_from_origin_during_stops', 'Trip_pings_mov_traw_per_time_t',
    'move_efficiency_t', 'move_speed_t', 'maxspeed_t',
    'max_distance_from_coast_t',
    'WSPD', 'GST', 'WVHT', 'ATMP',
]

# Select the columns once, then take each column-wise statistic from that selection.
spectus_features = AllIndicators_df[variables]
means, medians, stds, mins, maxes = (
    spectus_features.mean(),
    spectus_features.median(),
    spectus_features.std(),
    spectus_features.min(),
    spectus_features.max(),
)

# One row per variable, one column per statistic.
stat_labels = ['Mean', 'Median', 'Standard Deviation', 'Min', 'Max']
summary_df = pd.DataFrame(dict(zip(stat_labels, (means, medians, stds, mins, maxes))))

# Extra 'Trips' row: the number of trips goes in the first column, the rest stay empty.
summary_df.loc['Trips'] = [len(AllIndicators_df)] + [None] * (len(stat_labels) - 1)

# Show the first few rows as a quick check.
print(summary_df.head(3))


### Now create the same table for the AIS data

In [None]:
AIS_indicators_df = pd.read_csv(AIS_indicators_file_path)

# Rebuild the pct_time_* variables from the speed-distribution (pdf_*) bins.
# These variables were incorrect in the raw Spectus data. They do not appear to
# be incorrect in the AIS data, but they are recomputed here for the sake of
# consistency.
#
# Each bin is listed exactly once. The previous expression added
# pdf_GT_0_6_LE_0_7_km_per_min_t twice, which double-counted that bin in
# pct_time_moving_2.
# NOTE(review): check whether the Spectus-side computation of pct_time_moving_2
# has the same duplicate, so both tables stay comparable.
# NOTE(review): no bin covering 0.1-0.2 km/min appears in any of the three
# groups -- confirm whether such a column exists and where it belongs.
ais_stopped_bin_cols = [
    'pdf_EQ_0_km_per_min_t',
    'pdf_GT_0_LE_0_010_km_per_min_t',
    'pdf_GT_0_010_LE_0_025_km_per_min_t',
]
ais_trawling_bin_cols = [
    'pdf_GT_0_025_LE_0_050_km_per_min_t',
    'pdf_GT_0_05_LE_0_0.75_km_per_min_t',
    'pdf_GT_0_07.5_LE_0_1_km_per_min_t',
]
ais_moving_bin_cols = [
    'pdf_GT_0_2_LE_0_3_km_per_min_t',
    'pdf_GT_0_3_LE_0_4_km_per_min_t',
    'pdf_GT_0_4_LE_0_5_km_per_min_t',
    'pdf_GT_0_5_LE_0_6_km_per_min_t',
    'pdf_GT_0_6_LE_0_7_km_per_min_t',
    'pdf_GT_0_7_LE_0_8_km_per_min_t',
    'pdf_GT_0_8_LE_0_9_km_per_min_t',
    'pdf_GT_0_9_LE_1_0_km_per_min_t',
    'pdf_GT_1_km_per_min_t',
]

# skipna=False keeps the behaviour of chained `+`: a missing bin value makes the
# row's total NaN instead of being silently treated as zero.
AIS_indicators_df['pct_time_stopped_2'] = AIS_indicators_df[ais_stopped_bin_cols].sum(axis=1, skipna=False)
AIS_indicators_df['pct_time_trawling_2'] = AIS_indicators_df[ais_trawling_bin_cols].sum(axis=1, skipna=False)
AIS_indicators_df['pct_time_moving_2'] = AIS_indicators_df[ais_moving_bin_cols].sum(axis=1, skipna=False)

In [None]:
# Indicator columns to summarise; AIS_indicators_df must contain every one of them.
# Same list, in the same order, as used for the Spectus summary table.
variables = [
    # Trip-level timing and distance columns
    'trips_per_day', 'break_duration_t', 'Weekday_trips', 'Trip_Duration_t',
    'Total_distance_traveled_t', 'Distance_from_origin_t',
    'first_distance_from_coast_t', 'last_distance_from_coast_t',
    'Max_distance_traveled_origin_t', 'Weekend_trip_t',
    # Stop columns
    'time_stopped_t', 'pct_time_stopped_2', 'number_of_stops_t',
    'longest_stop_t', 'shortest_stop_t',
    # Trawl columns
    'time_trawling_t', 'pct_time_trawling_2', 'number_of_trawl_t',
    'longest_trawl_t', 'shortest_trawl_t', 'distance_trawling_t',
    # Move columns
    'time_moving_t', 'pct_time_moving_2', 'number_of_moves_t',
    'longest_move_t', 'shortest_move_t', 'distance_moving_t',
    # Remaining columns
    'dist_from_origin_during_stops', 'Trip_pings_mov_traw_per_time_t',
    'move_efficiency_t', 'move_speed_t', 'maxspeed_t',
    'max_distance_from_coast_t',
    'WSPD', 'GST', 'WVHT', 'ATMP',
]

# Column-wise statistics over the selected AIS indicator columns.
ais_features = AIS_indicators_df[variables]
means = ais_features.mean()
stds = ais_features.std()
medians = ais_features.median()
mins = ais_features.min()
maxes = ais_features.max()

# One row per variable; the AIS_ prefix keeps these columns distinct from the
# Spectus statistics when the two tables are combined.
ais_stat_columns = {
    'AIS_Mean': means,
    'AIS_Median': medians,
    'AIS_Standard Deviation': stds,
    'AIS_Mins': mins,
    'AIS_Maxes': maxes,
}
AIS_summary_df = pd.DataFrame(ais_stat_columns)

# Extra 'Trips' row: the number of AIS trips goes in the first column, the rest stay empty.
AIS_summary_df.loc['Trips'] = [len(AIS_indicators_df)] + [None] * (len(ais_stat_columns) - 1)

# Show the first couple of rows as a quick check.
print(AIS_summary_df.head(2))


### Save the summary stats to a csv file

In [None]:
# Put the Spectus and AIS statistics side by side, matching rows on the index
# (the variable names plus the 'Trips' row).  With no `how` given this is an
# inner join, so only index labels present in both tables are kept.
merged_summary_df = pd.merge(
    summary_df,
    AIS_summary_df,
    left_index=True,
    right_index=True,
)

# Write the combined table, keeping the index (variable names) as the first column.
FeaturesSummaryTable_filename = os.path.join(Results_directory, 'FeaturesSummaryTable.csv')
merged_summary_df.to_csv(FeaturesSummaryTable_filename, index=True)

# DEBUGGING 

In [None]:
# Reload both saved tables from disk so their rows can be compared below.
disappear_df = pd.read_csv(filepath_or_buffer=DisappearanceIndicators_filename)
indicators_df = pd.read_csv(filepath_or_buffer=Rec_indicators_with_V3_filename)


In [None]:
# Display the first three rows of indicators_df as a quick look at its columns.
indicators_df.head(3)

In [None]:
# Outer-join the two tables on the trip key so that a trip present in only one
# of them still gets a row (with NaN in the other table's columns).
trip_key = ['cuebiq_id', 'Trip_number']
merged = disappear_df.merge(indicators_df, how='outer', on=trip_key)


In [None]:
# Print a label line, then the row counts of the two inputs and of the outer merge.
print("len(disappear_df), len(indicators_df), len(merged)")
row_counts = (len(disappear_df), len(indicators_df), len(merged))
print(*row_counts)
