# This notebook carries out the station-non-station analysis, identifying recreational fishing trips that appear to have launched from a location where creel data are gathered. 

## Key inputs: 
* **rec_indicators_with_V3** - features used for classification
* **DisappearanceIndicators.csv** - indicators of whether a trip is fully tracked

## Key outputs:
* **Station_NonStationAnalysis_full.csv** -- includes an indicator of whether the trip stopped at a station

# 0. Preliminaries

## Load libraries and functions

In [None]:
# # Install modules
!pip install tqdm
!pip install statsmodels

# # Suppress all warnings
# import warnings
# warnings.filterwarnings("ignore")

import pandas as pd
import os
import sys
import time
import csv
from datetime import datetime, timedelta
from datetime import datetime
import pytz  
import warnings
import numpy as np
import math
local_timezone = pytz.timezone('US/Central')

import csv
import datetime
import datetime as dt
import geopandas as gpd
import heapq
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pytz
import requests
import seaborn as sns
import time
import math
import statsmodels
import inspect
import folium

from scipy import interpolate
from shapely import wkt
from shapely.geometry import Polygon
from zipfile import ZipFile
from shapely.geometry import Point  # Import the Point class from shapely.geometry
from datetime import datetime
from tqdm import tqdm

class OperationCancelled(Exception):
    """Custom exception type carrying no extra state.

    It is not raised anywhere in the visible part of this notebook;
    presumably it is used to abort a long-running step -- confirm against
    the cells that raise it.
    """

local_timezone = pytz.timezone('US/Central')

In [None]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# pandas warnings (e.g. FutureWarning, SettingWithCopyWarning) raised by later cells.
warnings.filterwarnings('ignore')

## Set directories to be used

In [None]:
#################################################################
# Each section below expands a '~'-based path to an absolute path and prints
# whether that directory exists on this machine.
# Directory for this output
OurTable_V3_directory = '~/RecFishing/Analysis with Our Tables and V3/Data Files'
# Expand the tilde to the user's home directory
OurTable_V3_directory = os.path.expanduser(OurTable_V3_directory)
# Check to make sure the directory exists
DirExist = os.path.exists(OurTable_V3_directory)
print(OurTable_V3_directory, "exists = " ,DirExist)


#################################################################
# Directory for output from the first draw
Batch01_directory = '~/RecFishing/Analysis with Our Tables and V3/Data Files/Batch01'
# Expand the tilde to the user's home directory
Batch01_directory = os.path.expanduser(Batch01_directory)
# Check to make sure the directory exists
DirExist = os.path.exists(Batch01_directory)
print(Batch01_directory, "exists = " ,DirExist)



#################################################################
# Directory for Groups of V3 Pings
V3_Pings_Groups_directory = '~/RecFishing/Analysis with Our Tables and V3/Data Files/V3_Ping_Groups'
V3_Pings_Groups_directory = os.path.expanduser(V3_Pings_Groups_directory)
print(V3_Pings_Groups_directory, "exists = " ,os.path.exists(V3_Pings_Groups_directory))

#################################################################
# Directory with some core data for analysis
CoreData_Directory = '~/RecFishing/CoreData'
CoreData_Directory = os.path.expanduser(CoreData_Directory)
print(CoreData_Directory, "exists = ", os.path.exists(CoreData_Directory))

#################################################################
# Directory with the original material (DataflowStudioJobs)
Original_directory = '~/RecFishing/DataflowStudioJobs'
Original_directory = os.path.expanduser(Original_directory)
DirExist = os.path.exists(Original_directory)
print(Original_directory, "exists = ", DirExist)


#################################################################
# Directory with previously processed material (FinalCode - Rec Fishing Identification)
Previously_Processed_directory = '~/RecFishing/DataflowStudioJobs/FinalCode - Rec Fishing Identification'
Previously_Processed_directory = os.path.expanduser(Previously_Processed_directory)
DirExist = os.path.exists(Previously_Processed_directory)
print(Previously_Processed_directory, "exists = ", DirExist)

#################################################################
# Directory with Travel Cost files
Travel_Cost_directory = '~/RecFishing/Travel Costs with Dedicated Table/CSV Files'
# Expand the tilde to the user's home directory
Travel_Cost_directory = os.path.expanduser(Travel_Cost_directory)
DirExist = os.path.exists(Travel_Cost_directory)
print(Travel_Cost_directory, "exists = ", DirExist)

#################################################################
# Directory with Weather data and related files
Weather_Data_directory = '~/RecFishing/uploaded_files/Weather Data'
# Expand the tilde to the user's home directory
Weather_Data_directory = os.path.expanduser(Weather_Data_directory)
# DirExist must be recomputed for THIS directory; otherwise the print below
# reports the result of whichever directory was checked before it.
DirExist = os.path.exists(Weather_Data_directory)
print(Weather_Data_directory, "exists = ", DirExist)


#################################################################
# Directory with other Uploaded data 
# NOTE: no existence check is printed for this directory or for the three
# results directories below.
Uploaded_Data_directory = '~/RecFishing/uploaded_files'
# Expand the tilde to the user's home directory
Uploaded_Data_directory = os.path.expanduser(Uploaded_Data_directory)

#################################################################
# Results and Analysis
Results_directory = '~/RecFishing/Analysis with Our Tables and V3/Results'
Results_directory = os.path.expanduser(Results_directory)

# Figures live in a sub-folder of the results directory
Figures_directory = '~/RecFishing/Analysis with Our Tables and V3/Results/Figures'
Figures_directory = os.path.expanduser(Figures_directory)

# Maps live in a sub-folder of the figures directory
TrajectoryMaps_directory = '~/RecFishing/Analysis with Our Tables and V3/Results/Figures/Maps'
TrajectoryMaps_directory = os.path.expanduser(TrajectoryMaps_directory)


####################################################################################
####################  AIS Directory #################################################
AIS_Directory = '~/RecFishing/AIS Files/Data'
AIS_Directory = os.path.expanduser(AIS_Directory)
DirExist = os.path.exists(AIS_Directory)
print(AIS_Directory, "exists = ", DirExist)

# ID_list_RandomSample from ScheduledExecution5.pkl


## Set input and output files to be used

In [None]:
def check_file_existence(file_path):
    """Print a notice when ``file_path`` is missing; stay silent otherwise.

    Args:
        file_path: path handed to ``os.path.exists``.

    Returns:
        None. The only effect is the printed message.
    """
    if os.path.exists(file_path):
        return
    print(f"{file_path} Does NOT Exist")


######################################################################################################################
# Every *_filename below is built with os.path.join from one of the directories
# defined earlier, then passed to check_file_existence, which prints a notice
# when the file is missing.
#########################  Log File  ################
Log_filename  =  os.path.join(OurTable_V3_directory, 'Log.txt')

######################################################################################################################
########################## Complete list of randomized IDs - without Bernoulli sampling 740k #########################
# PKL_File_With_Random_IDs_Filename  =  os.path.join(Original_directory, 'cuebiq_id_list_wo_sampling_740k.pkl')
PKL_File_With_Random_IDs_Filename  =  os.path.join(CoreData_Directory, 'cuebiq_id_list_wo_sampling_740k.pkl')
check_file_existence(PKL_File_With_Random_IDs_Filename)

# Data gathered and used prior to the NOAA Webinar in February 2024
IDs_Used_in_NOAA_Webinar_filename = os.path.join(CoreData_Directory, 'IDs_From_Random_Draw_Prior_to_NOAA_Webinar.csv')
check_file_existence(IDs_Used_in_NOAA_Webinar_filename)
Ping_Used_in_NOAA_Webinar_filename = os.path.join(CoreData_Directory, 'Pings_From_Random_Draw_Prior_to_NOAA_Webinar.csv')
check_file_existence(Ping_Used_in_NOAA_Webinar_filename)

# List of IDs that have been processed for Indicators
AlreadyFullyProcessedIDs_Filename  =  os.path.join(OurTable_V3_directory, 'RandomlyChosenCuebiq_ids.List_of_Processed_ids.csv')
check_file_existence(AlreadyFullyProcessedIDs_Filename)

######################################################################################################################
#########################  ID Checklist with columns for ID, Pings, Indicators Created (TF) & Trips  ################
IDs_Pulled_from_Dedicated_Table_filename  =  os.path.join(OurTable_V3_directory, 'IDs_Pulled_From_Dedicated_Table.csv')
check_file_existence(IDs_Pulled_from_Dedicated_Table_filename)

ID_For_V3_Queries_filename  =  os.path.join(OurTable_V3_directory, 'IDs_from_V3.csv')
check_file_existence(ID_For_V3_Queries_filename)

RecTripRating_filename =  os.path.join(OurTable_V3_directory, 'RecTripRating.csv')
check_file_existence(RecTripRating_filename)

# This file contains information about the rows of Pings_V3_temp_filename that can be used to avoid loading the entire file into a data frame
V3_Pings_Index_filename =  os.path.join(OurTable_V3_directory, 'V3_Pings_File_Index.csv')
check_file_existence(V3_Pings_Index_filename)

ID_Groups_filename = os.path.join(OurTable_V3_directory,'Cuebiq_ID_Groups.csv')
check_file_existence(ID_Groups_filename)

######################################################
# Pings in the OurTable for a single large draw of IDs TEMPORARY FILE
Pings_OurTable_temp_filename = os.path.join(OurTable_V3_directory,'Pings_OurTable_temp.csv')
check_file_existence(Pings_OurTable_temp_filename)

# Pings from V3 corresponding with the IDs found in the OurTable 
Pings_V3_temp_filename = os.path.join(OurTable_V3_directory,'Pings_V3_temp.csv')
check_file_existence(Pings_V3_temp_filename)

# Set output file names
Indicators_IDs_checked_filename = os.path.join(OurTable_V3_directory, 'IDs_Checked_Indicators_OurTable.csv')
check_file_existence(Indicators_IDs_checked_filename)

cuebiq_id_list_and_count_filename= os.path.join(OurTable_V3_directory,'cuebiq_id_list_and_count.csv')
check_file_existence(cuebiq_id_list_and_count_filename)

# List of IDs and dates for V3 query Pings in the OurTable for a single large draw of IDs TEMPORARY FILE
OurTable_IDs_and_Dates_filename = os.path.join(OurTable_V3_directory,'OurTable_IDs_and_Dates.csv')
check_file_existence(OurTable_IDs_and_Dates_filename)

##########################################################################################
############################### PINGS FILES   ##############################################
Pings_OurTable_Gulf_filename= os.path.join(OurTable_V3_directory,'Pings_OurTable_Gulf_ALL.csv')
check_file_existence(Pings_OurTable_Gulf_filename)

Combined_Pings_OurTable_Gulf_filename= os.path.join(OurTable_V3_directory,'Combined_Pings_OurTable_Gulf_ALL.csv')
check_file_existence(Combined_Pings_OurTable_Gulf_filename)

Pings_V3_Before_After_filename= os.path.join(OurTable_V3_directory,'Pings_V3_Before_After.csv')
check_file_existence(Pings_V3_Before_After_filename)

Combined_Pings_V3_Before_After_filename= os.path.join(OurTable_V3_directory,'Combined_Pings_V3_Before_After.csv')
check_file_existence(Combined_Pings_V3_Before_After_filename)

Pings_OurTable_Gulf_MT19_filename= os.path.join(OurTable_V3_directory,'Pings_OurTable_Gulf_MT19.csv')
check_file_existence(Pings_OurTable_Gulf_MT19_filename)

Pings_OurTable_Coast_filename= os.path.join(OurTable_V3_directory,'Pings_OurTable_Coast.csv')
check_file_existence(Pings_OurTable_Coast_filename)

Pings_OurTable_Outside_our_wkts_filename= os.path.join(OurTable_V3_directory,'Pings_OurTable_Outside_our_wkts.csv')
check_file_existence(Pings_OurTable_Outside_our_wkts_filename)

##########################################################################################
############################### INDICATORS  ##############################################
Indicators_filename = os.path.join(OurTable_V3_directory,'Indicators_OurTable.csv')
check_file_existence(Indicators_filename)

# cuebiq_id_count_filename= os.path.join(EEZ_V3_directory,'cuebiq_id_count_distribution_EEZ_V3.csv')
Indicators_Classified_filename = os.path.join(OurTable_V3_directory,'Indicators_OurTable.Predictions.csv')
check_file_existence(Indicators_Classified_filename)

Rec_Indicators_filename = os.path.join(OurTable_V3_directory,'Rec_Indicators_OurTable.csv')
check_file_existence(Rec_Indicators_filename)

# Step-1 version of the recreational indicators file.
Rec_Indicators_Step1_filename = os.path.join(OurTable_V3_directory,'Rec_Indicators_OurTable.Step1.csv')
# Check the Step1 file itself (not Rec_Indicators_filename, which has its own check).
check_file_existence(Rec_Indicators_Step1_filename)

# Remaining indicator, results, weather, map and AIS file names. Each is joined
# onto one of the directories defined earlier and checked with
# check_file_existence, which prints a notice when the file is missing.
V3_Indicators_filename =  os.path.join(OurTable_V3_directory,'V3_indicators.csv')
check_file_existence(V3_Indicators_filename)

# Listed as a key input of this notebook (features used for classification)
Rec_indicators_with_V3_filename = os.path.join(OurTable_V3_directory,'rec_indicators_with_V3.csv')
check_file_existence(Rec_indicators_with_V3_filename)

Indicators_with_V3_indicators_filename= os.path.join(OurTable_V3_directory,'Indicators_with_V3_indicators_indicators.csv')
check_file_existence(Indicators_with_V3_indicators_filename)

# Rec_Indicators_Selected_filename = os.path.join(OurTable_V3_directory,'Rec_Indicators_OurTable_Selected.csv')
Rec_Indicators_Selected_filename = os.path.join(OurTable_V3_directory,'Rec_Indicators_OurTable_Selected_May2024.csv')
check_file_existence(Rec_Indicators_Selected_filename)

Rec_Indicators_Final_All_Exclusions_And_Disappearance_filename = os.path.join(OurTable_V3_directory,'Rec_Indicators_Final_All_Exclusions_And_Disappearance.csv')
check_file_existence(Rec_Indicators_Final_All_Exclusions_And_Disappearance_filename)

# Sorted_Results_file_path = os.path.join(OurTable_V3_directory,'Indicators_EEZ_and_V3.Predictions.sorted.csv')
# RecFishing_Results_file_path =  os.path.join(OurTable_V3_directory,'RecFishingBoat Predictions.sorted.csv')

# Listed as a key input of this notebook (indicators of whether a trip is fully tracked)
DisappearanceIndicators_filename = os.path.join(OurTable_V3_directory,'DisappearanceIndicators.csv')
check_file_existence(DisappearanceIndicators_filename)

DisappearanceAnalysis_filename = os.path.join(OurTable_V3_directory,'DisappearanceAnalysis.csv')
check_file_existence(DisappearanceAnalysis_filename)

Stops_Indicators_filename = os.path.join(OurTable_V3_directory,'Stops_Indicators.csv')
check_file_existence(Stops_Indicators_filename)

Trawls_Indicators_filename = os.path.join(OurTable_V3_directory,'Trawls_Indicators.csv')
check_file_existence(Trawls_Indicators_filename)

Stop_Trawls_Indicators_filename = os.path.join(OurTable_V3_directory,'Stop_Trawls_Indicators.csv')
check_file_existence(Stop_Trawls_Indicators_filename)

Combined_Stops_Indicators_filename = os.path.join(OurTable_V3_directory,'Combined_Stops_Indicators.csv')
check_file_existence(Combined_Stops_Indicators_filename)

Combined_Trawls_Indicators_filename = os.path.join(OurTable_V3_directory,'Combined_Trawls_Indicators.csv')
check_file_existence(Combined_Trawls_Indicators_filename)

Combined_Stop_Trawls_Indicators_filename = os.path.join(OurTable_V3_directory,'Combined_Stop_Trawls_Indicators.csv')
check_file_existence(Combined_Stop_Trawls_Indicators_filename)

##########################################################################################
############################### Files that COMBINE BATCH01 and newer data ################
Combined_indicators_with_disappearance_filename = os.path.join(OurTable_V3_directory,'Combined_indicators_with_disappearance.csv')
check_file_existence(Combined_indicators_with_disappearance_filename)

#################################################################
####################   Results and Analysis #################################################
Station_NonStationAnalysis_filename  = os.path.join(Results_directory,'Station_NonStationAnalysis.csv')
check_file_existence(Station_NonStationAnalysis_filename)

# Listed as the key output of this notebook, so a "Does NOT Exist" notice is
# expected here before the notebook has been run for the first time.
Station_NonStationAnalysis_full_filename  = os.path.join(Results_directory,'Station_NonStationAnalysis_full.csv')
check_file_existence(Station_NonStationAnalysis_full_filename)


##########################################################################################
############################### WEATHER data files ####################################
Buoys_file_path  = os.path.join(Weather_Data_directory,'Buoys.csv')
check_file_existence(Buoys_file_path)

Weather_file_path  = os.path.join(Weather_Data_directory,'DailyWeatherData.csv')
check_file_existence(Weather_file_path)

##########################################################################################
############################### SUPPLEMENTARY MAP DATA  ############################
Industrial_polygons_filename  = os.path.join(Uploaded_Data_directory,'Polygons Around Industrial Sites.wkt')
check_file_existence(Industrial_polygons_filename)


station_points_filename = os.path.join(Uploaded_Data_directory,'LA_TX_Union_Station_WGS84.csv')
check_file_existence(station_points_filename)

MRIP_station_points_filename =os.path.join(Uploaded_Data_directory,'MRIP_stations.csv')
check_file_existence(MRIP_station_points_filename)

##########################################################################################
############################### AIS Files INCLUDING CLASSIFIER ############################
# NOTE: "Classfier" is misspelled in the variable name; it is left unchanged
# because other cells may reference it.
RF_Classfier_filename = os.path.join(AIS_Directory, 'rf_model_AIS_2019.pkl')
check_file_existence(RF_Classfier_filename)

RF_Importance_Factors_filename = os.path.join(AIS_Directory, 'rf_classifier_importance_factors.csv')
check_file_existence(RF_Importance_Factors_filename)

AIS_Predictions_filename = os.path.join(AIS_Directory, 'RandomForest_Predictions2019AISData.csv')
check_file_existence(AIS_Predictions_filename)


### Dedicated Table Names for reference
# Dedicated table with all Pings within the Gulf WKT for 1/1/2019 - 4/22/2022
#  Table Name:  dedicated.ScheduledExecution5.DeviceTable   
#  Code used for call:  RecFishing/DataflowStudioJobs/ScheduledEx5-updated.ipynb

# Dedicated table with all Pings within the Gulf WKT AND Origin for 1/1/2019 - 4/22/2022
#  Table Name:  dedicated.ScheduledExecution5_parallel_origin.DeviceTable
#  Code used for call:  RecFishing/DataflowStudioJobs/ScheduledEx5-origin.ipynb

##########################################################################################
############################### Results & Analysis ############################


## Date and Distance Functions 

In [None]:
def get_dates_sequence(
    date_start,
    date_end,
    date_format
):
    """List every day from ``date_start`` to ``date_end``, both included.

    Args:
        date_start: first day, written in ``date_format``.
        date_end: last day, written in ``date_format``.
        date_format: ``strptime``/``strftime`` pattern used to parse the two
            bounds and to format the result.

    Returns:
        List of date strings in ``date_format``, one per day in ascending
        order; empty when ``date_end`` is before ``date_start``.
    """
    final_day = datetime.strptime(date_end, date_format)
    first_day = datetime.strptime(date_start, date_format)

    # +1 day so that the end date itself is part of the sequence.
    day_count = (final_day - first_day + timedelta(days=1)).days

    sequence = []
    for offset in range(0, day_count):
        current_day = first_day + timedelta(days=offset)
        sequence.append(current_day.strftime(date_format))
    return sequence

In [None]:
# strptime/strftime pattern (YYYYMMDD) in which the two dates below are written
date_format = "%Y%m%d"

# Date bounds; they match the 1/1/2019 - 4/22/2022 span quoted for the
# dedicated tables elsewhere in this notebook.
first_date = "20190101"
last_date_to_compute = "20220422"

In [None]:
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """Great-circle distance between two points using the haversine formula.

    Args:
        lat1, lon1: coordinates of the first point (scalars or array-likes).
        lat2, lon2: coordinates of the second point (same shape as the first).
        to_radians: when True (default) the four inputs are converted from
            degrees to radians first; pass False if they already are radians.
        earth_radius: sphere radius; the result is in the same unit
            (the default 6371 is the Earth's mean radius in km).

    Returns:
        Distance(s) along the sphere, in the unit of ``earth_radius``.
    """
    if to_radians:
        lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])

    half_dlat = (lat2 - lat1) / 2.0
    half_dlon = (lon2 - lon1) / 2.0

    # Haversine term: squared half-chord length between the two points.
    chord_sq = np.sin(half_dlat)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon)**2

    return earth_radius * 2 * np.arcsin(np.sqrt(chord_sq))

In [None]:
def calculate_speed(df):
    """Add distance, time and speed from the previous ping, in place.

    The frame is sorted by 'event_timestamp' in place and gains three columns:

    * 'dist_fwd'       -- haversine distance (km) from the previous row.
    * 'time_fwd'       -- 'event_timestamp' difference from the previous row.
    * 'ping_speed_fwd' -- 60 * dist_fwd / time_fwd; km per minute, assuming
      'event_timestamp' is in seconds (TODO confirm). The first row, which
      has no predecessor, is set to 0.0.

    Args:
        df: frame with 'event_timestamp', 'lat' and 'lng' columns.

    Returns:
        None; ``df`` is modified in place.
    """
    # Sort the DataFrame based on 'event_timestamp'
    df.sort_values(by='event_timestamp', inplace=True)

    # Calculate distances and speeds
    df['dist_fwd'] = haversine(df['lat'], df['lng'], df['lat'].shift(1), df['lng'].shift(1))
    df['time_fwd'] = df['event_timestamp'] - df['event_timestamp'].shift(1)
    # The small constant avoids division by zero for pings sharing a timestamp.
    df['ping_speed_fwd'] = 60 * abs(df['dist_fwd'] / (0.000001 + df['time_fwd']))

    # Write through a single positional indexer on the frame itself: the chained
    # form df[col].iloc[0] = ... may assign to a temporary copy (and always does
    # under pandas copy-on-write). Skipped for an empty frame, which has no row 0.
    if len(df) > 0:
        df.iloc[0, df.columns.get_loc('ping_speed_fwd')] = 0.0  # KM/minute


## H3 Functions

#### Default resolution for all H3 analysis

In [None]:
# Resolution 10 has an average edge length of 75.86 meters, meaning the distance between two opposite vertices is roughly 151.5 meters
# This value is what the station-loading cells pass as `resolution` to the H3 helper functions.
resolution = 10

#### Function to get the h3 cell for a complete data frame

In [None]:
!pip install h3
from shapely.geometry import Point
import h3

# Function to get H3 cells for a DataFrame with 'lat' / 'lng' columns
def get_h3_cells_for_dataframe(dataframe, resolution):
    """Return the set of H3 cells containing the points of ``dataframe``.

    Args:
        dataframe: frame with 'lat' and 'lng' columns; each row is one point.
        resolution: H3 resolution passed to ``h3.latlng_to_cell``.

    Returns:
        Set of H3 cell indexes, one entry per distinct cell hit by any row.
    """
    # Iterate the two columns directly: iterrows() builds a Series per row and
    # the shapely Point only handed the same two numbers back. float() keeps
    # the numeric coercion that Point applied to its inputs.
    return {
        h3.latlng_to_cell(float(lat), float(lng), resolution)
        for lat, lng in zip(dataframe['lat'], dataframe['lng'])
    }

#### Function to get H3 neighbors

In [None]:
def get_h3_cells_and_neighbors(dataframe, resolution):
    """Return the H3 cells of all points plus the ring of cells around each.

    Args:
        dataframe: frame with 'lat' and 'lng' columns; each row is one point.
        resolution: H3 resolution passed to ``h3.latlng_to_cell``.

    Returns:
        Set holding every point's cell and all cells returned by
        ``h3.grid_disk(cell, 1)`` for it (same resolution as the cell).
    """
    # Resolve each row to its cell first; rows that fall in the same cell then
    # share one grid_disk call instead of repeating it. float() keeps the
    # numeric coercion previously done by the shapely Point round-trip.
    base_cells = {
        h3.latlng_to_cell(float(lat), float(lng), resolution)
        for lat, lng in zip(dataframe['lat'], dataframe['lng'])
    }

    h3_cells_and_neighbors = set(base_cells)
    for h3_cell in base_cells:
        # Neighbors are at the same resolution as the original cell.
        h3_cells_and_neighbors.update(h3.grid_disk(h3_cell, 1))

    return h3_cells_and_neighbors


### Function that compares columns in two data frames

In [None]:
def CompareColumsInTwoDataFrames(df1, df2):
    """Print the columns unique to each data frame and the columns they share.

    The report labels each frame with the name of the caller's variable that
    holds it. When a frame is not bound to a variable in the caller's frame
    (for example when an expression such as ``df.copy()`` is passed), the
    labels fall back to "df1" / "df2".

    Args:
        df1: first data frame.
        df2: second data frame.

    Returns:
        None; the comparison is printed.
    """
    # Look the argument names up in the caller's frame.
    frame = inspect.currentframe()
    caller_frame = frame.f_back if frame is not None else None
    try:
        caller_vars = inspect.getargvalues(caller_frame).locals if caller_frame is not None else {}
        df1_name = next((name for name, value in caller_vars.items() if value is df1), "df1")
        df2_name = next((name for name, value in caller_vars.items() if value is df2), "df2")
    finally:
        # Drop the frame references so they cannot keep the caller's locals alive.
        del frame, caller_frame

    cols_1 = set(df1.columns)
    cols_2 = set(df2.columns)

    # Columns of df1 that df2 lacks
    columns_only_in_1 = sorted(cols_1 - cols_2)

    # Columns of df2 that df1 lacks
    columns_only_in_2 = sorted(cols_2 - cols_1)

    common_in_both = sorted(cols_1 & cols_2)

    if columns_only_in_1:
        print(" ")
        print("Columns in ", df1_name, "that aren't in", df2_name,":")
        print(columns_only_in_1)
    else:
        print("There are no columns in ", df1_name, "that aren't in", df2_name)
    print(" ")

    if columns_only_in_2:
        print(" ")
        print("Columns in ", df2_name, "that aren't in", df1_name,":")
        print(columns_only_in_2)
    else:
        print("There are no columns in ", df2_name, "that aren't in", df1_name)

    print(" ")
    print("Columns in both", df1_name, "and", df2_name, ":", common_in_both)

# CompareColumsInTwoDataFrames(Batch01_ind1_df, indicators_df)
# indicators_df.head(3)

### Function that gets H3 neighbors, and the neighbors of the neighbors -- a double ring around the original points

In [None]:
def get_h3_cells_and_neighbors_two_levels(dataframe, resolution):
    """Return the H3 cells of all points plus two rings of surrounding cells.

    Args:
        dataframe: frame with 'lat' and 'lng' columns; each row is one point.
        resolution: H3 resolution passed to ``h3.latlng_to_cell``.

    Returns:
        Set holding every point's cell, the cells of ``h3.grid_disk(cell, 1)``
        and, for each of those, the cells of ``h3.grid_disk(neighbor, 1)``.
    """
    # Resolve each row to its cell first so rows sharing a cell are expanded
    # once. float() keeps the numeric coercion previously done by the shapely
    # Point round-trip.
    base_cells = {
        h3.latlng_to_cell(float(lat), float(lng), resolution)
        for lat, lng in zip(dataframe['lat'], dataframe['lng'])
    }

    h3_cells_and_neighbors = set(base_cells)
    for h3_cell in base_cells:
        # First ring: the cell's neighbors at the current resolution
        first_ring = h3.grid_disk(h3_cell, 1)
        h3_cells_and_neighbors.update(first_ring)

        # Second ring: neighbors of neighbors at the same resolution
        # NOTE(review): this looks equivalent to h3.grid_disk(h3_cell, 2) --
        # confirm before simplifying.
        for neighbor in first_ring:
            h3_cells_and_neighbors.update(h3.grid_disk(neighbor, 1))

    return h3_cells_and_neighbors


## Errant pings code

In [None]:
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """Great-circle distance (haversine formula), in the unit of ``earth_radius``.

    NOTE(review): this repeats the ``haversine`` defined in the
    "Date and Distance Functions" cell; whichever cell runs last wins.
    Inputs are degrees unless ``to_radians`` is False.
    """
    if to_radians:
        lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])

    a = np.sin((lat2-lat1)/2.0)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin((lon2-lon1)/2.0)**2

    return earth_radius * 2 * np.arcsin(np.sqrt(a))


def _recompute_ping_speeds(pings_df):
    """Refresh, in place, all time / distance / speed columns of ``pings_df``.

    ``pings_df`` must already be sorted by 'event_timestamp'. Speeds are
    distance (km) per minute; 'event_timestamp' differences are divided by 60,
    i.e. the timestamps are treated as seconds.
    """
    timestamps = pings_df['event_timestamp']
    minutes_from_previous = (timestamps.diff() / 60.0).fillna(0)
    # diff(-1) is "this minus next", hence negative; the last row gets 99999.
    minutes_to_next = (timestamps.diff(-1) / 60.0).fillna(99999)
    pings_df['time_diff_minutes_from_previous'] = minutes_from_previous
    pings_df['time_diff_minutes_to_next'] = minutes_to_next

    previous_ping = pings_df[['lat', 'lng']].shift(1)
    next_ping = pings_df[['lat', 'lng']].shift(-1)

    # "fwd" = speed from the previous ping to this one (0 for the first row).
    pings_df['dist_fwd'] = haversine(pings_df['lat'], pings_df['lng'], previous_ping['lat'], previous_ping['lng'])
    pings_df['ping_speed_fwd'] = (pings_df['dist_fwd'] / (0.00001 + minutes_from_previous)).abs().fillna(0)

    # "bkwd" = speed from this ping to the next one (0 for the last row).
    # The absolute time gap is used so the small constant always guards the
    # division and the value matches the next row's forward speed.
    pings_df['dist_bkwd'] = haversine(pings_df['lat'], pings_df['lng'], next_ping['lat'], next_ping['lng'])
    pings_df['ping_speed_bkwd'] = (pings_df['dist_bkwd'] / (0.00001 + minutes_to_next.abs())).abs().fillna(0)

    pings_df['Avg_ping_speed'] = (pings_df['ping_speed_fwd'] + pings_df['ping_speed_bkwd']) / 2


def EliminateErrantPingsSpeed(pings_df, mph_limit):
    """Drop pings that imply an impossible speed relative to their neighbors.

    The pings are sorted by time and de-duplicated. Then, while more than two
    pings remain and the largest 'Avg_ping_speed' (mean of the speed from the
    previous ping and the speed to the next ping) exceeds the limit, the ping
    with the largest value is removed and the speeds are recomputed from the
    pings that remain.

    Args:
        pings_df: frame with 'event_timestamp' (seconds), 'lat' and 'lng'
            columns. It is not modified. Rows without a timestamp are
            discarded because they can be neither ordered nor given a speed.
        mph_limit: maximum plausible speed, in miles per hour.

    Returns:
        New frame, sorted by 'event_timestamp' with a fresh 0..n-1 index, with
        the columns 'time_diff_minutes_from_previous',
        'time_diff_minutes_to_next', 'dist_fwd', 'ping_speed_fwd', 'dist_bkwd',
        'ping_speed_bkwd', 'Avg_ping_speed' (speeds in km/minute) and
        'row_index' (position of the row before any ping was removed).
    """
    # 1 mph = 1.609344 km/h = 0.0268224 km/minute
    km_per_min_limit = mph_limit*(0.0268224)

    # Work on a private, positionally indexed copy so that labels returned by
    # idxmax() are valid row positions and the caller's frame is left untouched.
    cleaned = pings_df[pings_df['event_timestamp'].notna()]
    cleaned = (
        cleaned.sort_values(by='event_timestamp')
        .drop_duplicates()
        .reset_index(drop=True)
    )

    _recompute_ping_speeds(cleaned)
    cleaned['row_index'] = np.arange(len(cleaned))

    while len(cleaned) > 2 and cleaned['Avg_ping_speed'].max() > km_per_min_limit:
        worst_row = cleaned['Avg_ping_speed'].idxmax()
        cleaned = cleaned.drop(index=worst_row).reset_index(drop=True)
        # Recompute from the surviving pings: the neighbors of the removed row
        # (including a new first or last row) must no longer carry the speed
        # measured against the errant ping.
        _recompute_ping_speeds(cleaned)

    return cleaned

### Merge Trip DFs
Merges two data frames based on cuebiq_id and Trip_number. If the list of IDs is not identical, this returns an empty data frame.

In [None]:
def Merge_Trip_dfs(df1, df2):
    """Inner-join two trip-level frames on 'cuebiq_id' and 'Trip_number'.

    Args:
        df1: left frame; its version of any shared column is the one kept.
        df2: right frame; columns it shares with ``df1`` (other than the two
            join keys) are dropped before merging.

    Returns:
        The merged frame, or an empty ``pd.DataFrame()`` (after printing a
        message) when the two frames do not contain the same set of
        'cuebiq_id' values.
    """
    join_keys = ['cuebiq_id', 'Trip_number']

    # Both frames must cover exactly the same IDs
    ids_match = set(df1['cuebiq_id']) == set(df2['cuebiq_id'])
    if not ids_match:
        print("The two data frames do not have the same values of cuebiq_id")
        return pd.DataFrame()

    # Non-key columns present in both frames; keep only df1's copy
    shared_cols = []
    for col in df1.columns:
        if col in df2.columns and col not in join_keys:
            shared_cols.append(col)

    right_side = df2.drop(columns=shared_cols)
    return df1.merge(right_side, on=join_keys, how='inner')

# Load Auxiliary Files -- polygons, stations, etc.

### Load stations (MRIP & TPWD) and find H3 cells and neighbors to those cells

In [None]:
# Load the station / port location files and build, for each one, the set of
# H3 cells (plus the surrounding ring of cells) that contain its points.
# NOTE(review): the paths below are relative to the notebook's working
# directory ('../../uploaded_files'); Uploaded_Data_directory, defined earlier,
# is not used here -- confirm both point at the same folder.

# Texas -- Data provided by Mark Fisher <Mark.Fisher@tpwd.texas.gov> on 6/3/2022
TX_station_points_df = pd.read_csv('../../uploaded_files/TPWD Stations.csv', encoding='latin-1')
TX_station_points_df.rename(columns={'y': 'lat', 'x': 'lng'}, inplace=True)
TX_station_points_df.dropna(subset=['lat'], inplace=True)  # Drop empty rows
TX_station_points_h3_cells = get_h3_cells_and_neighbors(TX_station_points_df, resolution)

# MRIP sites (the file covers LA, AL & MS; only AL and MS subsets are built here) -- downloaded from the NOAA Site Register
MRIP_station_points_df = pd.read_csv('../../uploaded_files/MRIP-sites-LA-AL-MS.csv')
MRIP_station_points_df.rename(columns={'SITE_LAT': 'lat', 'SITE_LONG': 'lng'}, inplace=True)

# STATE_CODE 1 and 28 are presumably the FIPS codes for Alabama and Mississippi -- TODO confirm
AL_MRIP_station_points_df = MRIP_station_points_df[MRIP_station_points_df['STATE_CODE'] == 1].copy()
MS_MRIP_station_points_df = MRIP_station_points_df[MRIP_station_points_df['STATE_CODE'] == 28].copy()

AL_MRIP_station_points_h3_cells= get_h3_cells_and_neighbors(AL_MRIP_station_points_df, resolution)
MS_MRIP_station_points_h3_cells= get_h3_cells_and_neighbors(MS_MRIP_station_points_df, resolution)

# Florida MRIP sites (Escambia and Santa Rosa counties); only rows whose STATUS is "Active" are kept
FL_MRIP_station_points_df = pd.read_csv('../../uploaded_files/FL Sites -- MRIP Site Registry Escambia and Santa Rosa Counties.csv')
FL_MRIP_station_points_df.rename(columns={'SITE_LAT': 'lat', 'SITE_LONG': 'lng'}, inplace=True)
FL_MRIP_station_points_df = FL_MRIP_station_points_df[FL_MRIP_station_points_df['STATUS'] == "Active"].copy()
FL_MRIP_station_points_h3_cells = get_h3_cells_and_neighbors(FL_MRIP_station_points_df, resolution)

# LA Creel Stations -- personal communication from Nicole Smith (WLF) <nsmith@wlf.la.gov> on 12/13/23
LA_CREEL_station_points_df = pd.read_csv('../../uploaded_files/LA creel sites-1.csv')
LA_CREEL_station_points_df.rename(columns={'Latitude': 'lat', 'Longitude': 'lng'}, inplace=True)
# Keep active sites only, then drop rows whose latitude is not positive
# (this also removes rows with a missing latitude, since NaN > 0 is False)
LA_CREEL_station_points_df = LA_CREEL_station_points_df[LA_CREEL_station_points_df['Active_Flg'] == 1].copy()
LA_CREEL_station_points_df = LA_CREEL_station_points_df[LA_CREEL_station_points_df['lat'] >0].copy()
LA_CREEL_station_points_h3_cells= get_h3_cells_and_neighbors(LA_CREEL_station_points_df, resolution)

# All Ports in GOM Gathered from Marine Traffic Website
MarineTrafficPorts_df = pd.read_csv('../../uploaded_files/MarineTrafficPorts.csv')
MarineTrafficPorts_df.rename(columns={'lon': 'lng'}, inplace=True)

# One subset per port category, selected by the MarineTrafficPortType_num code
LargePorts_Marine_traffic_df = MarineTrafficPorts_df[MarineTrafficPorts_df['MarineTrafficPortType_num'] == 3].copy()
Medium_Anchorage_Marine_traffic_df = MarineTrafficPorts_df[MarineTrafficPorts_df['MarineTrafficPortType_num'] == 4].copy()
Medium_Marina_Marine_traffic_df = MarineTrafficPorts_df[MarineTrafficPorts_df['MarineTrafficPortType_num'] == 5].copy()
Medium_Port_Marine_traffic_df = MarineTrafficPorts_df[MarineTrafficPorts_df['MarineTrafficPortType_num'] == 7].copy()
Small_Marina_Marine_traffic_df = MarineTrafficPorts_df[MarineTrafficPorts_df['MarineTrafficPortType_num'] == 9].copy()
Small_Port_Marine_traffic_df = MarineTrafficPorts_df[MarineTrafficPorts_df['MarineTrafficPortType_num'] == 10].copy()

# H3 cells (with neighbors) for each port category
LargePorts_Marine_traffic_h3_cells= get_h3_cells_and_neighbors(LargePorts_Marine_traffic_df, resolution)
Medium_Anchorage_Marine_traffic_h3_cells= get_h3_cells_and_neighbors(Medium_Anchorage_Marine_traffic_df, resolution)
Medium_Marina_Marine_traffic_h3_cells= get_h3_cells_and_neighbors(Medium_Marina_Marine_traffic_df, resolution)
Medium_Port_Marine_traffic_h3_cells= get_h3_cells_and_neighbors(Medium_Port_Marine_traffic_df, resolution)
Small_Marina_Marine_traffic_h3_cells= get_h3_cells_and_neighbors(Small_Marina_Marine_traffic_df, resolution)
Small_Port_Marine_traffic_h3_cells= get_h3_cells_and_neighbors(Small_Port_Marine_traffic_df, resolution)

In [None]:
# The maps of stations
# No column rename is applied to this file, so it must already provide the
# 'lat' and 'lng' columns that the H3 helper functions read.
state_station_points_df = pd.read_csv('../../uploaded_files/LA_TX_Union_Station_WGS84.csv')

# MRIP_station_points_df = pd.read_csv('../uploaded_files/MRIP_stations.csv')
# NOTE(review): this re-reads the same MRIP file as the previous cell and
# rebinds MRIP_station_points_df.
MRIP_station_points_df = pd.read_csv('../../uploaded_files/MRIP-sites-LA-AL-MS.csv')

MRIP_station_points_df.rename(columns={'SITE_LAT': 'lat', 'SITE_LONG': 'lng'}, inplace=True)

# Create sets of H3 cells and their neighbors for each dataframe
# ("_alone" = only the cells containing the points, without the neighbor ring)
state_station_h3_cells = get_h3_cells_and_neighbors(state_station_points_df, resolution)
state_station_h3_cells_alone = get_h3_cells_for_dataframe(state_station_points_df, resolution)

mrip_station_h3_cells = get_h3_cells_and_neighbors(MRIP_station_points_df, resolution)
mrip_station_h3_cells_alone = get_h3_cells_for_dataframe(MRIP_station_points_df, resolution)

# Combine all the h3 cells into a single set (set union)
combined_h3_cells = state_station_h3_cells | mrip_station_h3_cells
combined_h3_cells_alone = state_station_h3_cells_alone | mrip_station_h3_cells_alone


### State polygons for identifying state of starting point

In [None]:
from shapely.geometry import Point, Polygon
from shapely.wkt import loads

## WKT strings from <script src="https://gist.github.com/JoshuaCarroll/49630cbeeb254a49986e939a26672e9c.js"></script>

# # Test on a point on a rectangular state
# wkt_string = "POLYGON((-109.0448 37.0004,-102.0424 36.9949,-102.0534 41.0006,-109.0489 40.9996,-109.0448 37.0004,-109.0448 37.0004))"
# colorado_polygon = loads(wkt_string)
# min_lat,min_lng,max_lat,max_lng=colorado_polygon.bounds

# avg_lat = .5*(min_lat+max_lat)
# avg_lng =  .5*(min_lng+max_lng)
# avg_point = Point(avg_lat,avg_lng)

# is_within_polygon = avg_point.within(colorado_polygon)
# print(avg_point,"is_within_COLORADO",is_within_polygon)


# Define the WKT string for the TX polygon.
# Coordinates are "lng lat" pairs. The ring is written as adjacent string
# literals (joined by Python at compile time) so that no physical line break
# can ever fall inside the quoted string; a raw newline inside a "..." literal
# is a SyntaxError.
TX_wkt_string = (
    "POLYGON(("
    "-106.5715 31.8659,-106.5042 31.7504,-106.3092 31.6242,-106.2103 31.4638,"
    "-106.0181 31.3912,-105.7874 31.1846,-105.5663 31.0012,-105.4015 30.8456,"
    "-105.0032 30.6462,-104.8521 30.3847,-104.7437 30.2591,-104.6915 30.0738,"
    "-104.6777 29.9169,-104.5679 29.7644,-104.5280 29.6475,-104.4044 29.5603,"
    "-104.2067 29.4719,-104.1559 29.3834,-103.9774 29.2948,-103.9128 29.2804,"
    "-103.8208 29.2481,-103.5640 29.1378,-103.4692 29.0682,-103.3154 29.0105,"
    "-103.1616 28.9601,-103.0957 29.0177,-103.0298 29.1330,-102.8677 29.2157,"
    "-102.8979 29.2565,-102.8375 29.3570,-102.8004 29.4898,-102.7002 29.6881,"
    "-102.5134 29.7691,-102.3843 29.7596,-102.3047 29.8788,-102.1509 29.7834,"
    "-101.7004 29.7572,-101.4917 29.7644,-101.2939 29.6308,-101.2582 29.5269,"
    "-101.0056 29.3642,-100.9204 29.3056,-100.7707 29.1642,-100.7007 29.0946,"
    "-100.6306 28.9012,-100.4974 28.6593,-100.3601 28.4675,-100.2969 28.2778,"
    "-100.1733 28.1882,-100.0195 28.0526,-99.9344 27.9435,-99.8438 27.7638,"
    "-99.7119 27.6641,-99.4812 27.4839,-99.5375 27.3059,-99.4290 27.1948,"
    "-99.4455 27.0175,-99.3164 26.8829,-99.2065 26.6867,-99.0967 26.4116,"
    "-98.8138 26.3574,-98.6668 26.2257,-98.5474 26.2343,-98.3276 26.1357,"
    "-98.1697 26.0457,-97.9143 26.0518,-97.6643 26.0050,-97.4020 25.8419,"
    "-97.3526 25.9074,-97.0148 25.9679,-97.0697 26.1789,-97.2249 26.8253,"
    "-97.0752 27.4230,-96.6096 28.0599,-95.9285 28.4228,-95.3036 28.7568,"
    "-94.7296 29.0742,-94.3355 29.3810,-93.8205 29.6021,-93.9317 29.8013,"
    "-93.8136 29.9157,-93.7230 30.0489,-93.6996 30.1214,-93.7216 30.2021,"
    "-93.7038 30.2792,-93.7628 30.3278,-93.7587 30.3835,-93.7010 30.4380,"
    "-93.7024 30.5079,-93.7299 30.5362,-93.6694 30.6296,-93.6090 30.7466,"
    "-93.5527 30.8114,-93.5747 30.8834,-93.5307 30.9376,-93.5074 31.0318,"
    "-93.5266 31.0812,-93.5335 31.1787,-93.5980 31.1670,-93.6832 31.3055,"
    "-93.6708 31.3830,-93.6887 31.4369,-93.7202 31.5107,-93.8315 31.5820,"
    "-93.8123 31.6440,-93.8232 31.7188,-93.8342 31.7936,-93.8782 31.8309,"
    "-93.9221 31.8869,-93.9661 31.9335,-94.0430 32.0081,-94.0430 33.4681,"
    "-94.0430 33.5414,-94.1528 33.5689,-94.1968 33.5872,-94.2627 33.5872,"
    "-94.3176 33.5689,-94.3945 33.5597,-94.4275 33.5780,-94.4275 33.6055,"
    "-94.4495 33.6421,-94.4879 33.6329,-94.5236 33.6421,-94.6637 33.6695,"
    "-94.7461 33.7061,-94.8999 33.7791,-95.0757 33.8818,-95.1526 33.9251,"
    "-95.2254 33.9604,-95.2858 33.8750,-95.5399 33.8841,-95.7568 33.8887,"
    "-95.8420 33.8408,-96.0274 33.8556,-96.3528 33.6901,-96.6179 33.8442,"
    "-96.5836 33.8898,-96.6673 33.8955,-96.7538 33.8179,-96.8335 33.8613,"
    "-96.8774 33.8613,-96.9159 33.9388,-97.0917 33.7392,-97.1645 33.7449,"
    "-97.2180 33.8978,-97.3746 33.8225,-97.4611 33.8305,-97.4460 33.8761,"
    "-97.6945 33.9798,-97.8648 33.8476,-97.9651 33.8978,-98.0983 34.0299,"
    "-98.1752 34.1141,-98.3743 34.1425,-98.4773 34.0640,-98.5529 34.1209,"
    "-98.7520 34.1232,-98.9539 34.2095,-99.0637 34.2073,-99.1832 34.2141,"
    "-99.2505 34.3593,-99.3823 34.4613,-99.4318 34.3774,-99.5718 34.4160,"
    "-99.6158 34.3706,-99.8094 34.4726,-99.9934 34.5631,-100.0017 36.4975,"
    "-103.0408 36.5008,-103.0655 32.0011,-106.6168 32.0023,-106.5715 31.8659"
    "))"
)
TX_polygon = loads(TX_wkt_string)

# Define the WKT string for the LA polygon
LA_wkt_string = "POLYGON((-94.0430 33.0225,-93.0048 33.0179,-91.1646 33.0087,-91.2209 32.9269,-91.1220 32.8773,-91.1481 32.8358,-91.1412 32.7642,-91.1536 32.6382,-91.1069 32.5804,-91.0080 32.6093,-91.0904 32.4588,-91.0355 32.4379,-91.0286 32.3742,-90.9064 32.3150,-90.9723 32.2616,-91.0464 32.1942,-91.0739 32.1198,-91.0464 32.0593,-91.1014 31.9918,-91.1865 31.9498,-91.3101 31.8262,-91.3527 31.7947,-91.3925 31.6230,-91.5134 31.6218,-91.4310 31.5668,-91.5161 31.5130,-91.5244 31.3701,-91.5477 31.2598,-91.6425 31.2692,-91.6603 31.2328,-91.5848 31.1917,-91.6287 31.1047,-91.5614 31.0318,-91.6397 30.9988,-89.7336 31.0012,-89.8517 30.6686,-89.7858 30.5386,-89.6347 30.3148,-89.5688 30.1807,-89.4960 30.1582,-89.1843 30.2140,-89.0373 30.1463,-88.8354 30.0905,-88.7421 29.8383,-88.8712 29.5758,-88.9371 29.1833,-89.0359 28.9649,-89.2282 28.8832,-89.4754 28.9048,-89.7418 29.1210,-90.1126 28.9529,-90.6619 28.9120,-91.0355 28.9553,-91.3211 29.1210,-91.9061 29.2864,-92.7452 29.4360,-93.8177 29.6009,-93.8631 29.6749,-93.8933 29.7370,-93.9304 29.7930,-93.9276 29.8216,-93.8370 29.8883,-93.7985 29.9811,-93.7601 30.0144,-93.7106 30.0691,-93.7354 30.0929,-93.6996 30.1166,-93.7271 30.1997,-93.7106 30.2899,-93.7656 30.3350,-93.7601 30.3871,-93.6914 30.4416,-93.7106 30.5102,-93.7463 30.5433,-93.7106 30.5954,-93.6914 30.5906,-93.6859 30.6545,-93.6365 30.6781,-93.6200 30.7513,-93.5925 30.7890,-93.5513 30.8150,-93.5623 30.8645,-93.5788 30.8881,-93.5541 30.9187,-93.5294 30.9423,-93.5760 31.0082,-93.5101 31.0318,-93.5596 31.0906,-93.5321 31.1211,-93.5349 31.1799,-93.5953 31.1658,-93.6282 31.2292,-93.6118 31.2668,-93.6859 31.3044,-93.6694 31.3888,-93.7051 31.4240,-93.6859 31.4427,-93.7573 31.4755,-93.7189 31.5083,-93.8040 31.5411,-93.8425 31.6113,-93.8205 31.6581,-93.7985 31.7071,-93.8480 31.8029,-93.9029 31.8892,-93.9606 31.9149,-94.0430 32.0081,-94.0430 32.7041,-94.0430 33.0225,-94.0430 33.0225))"
LA_polygon = loads(LA_wkt_string)

# Define the WKT string for the MS polygon
MS_wkt_string = "POLYGON((-90.3049 35.0041,-88.1955 35.0075,-88.0994 34.8882,-88.1241 34.7044,-88.2573 33.6661,-88.4756 31.8939,-88.4180 30.8657,-88.3850 30.1594,-88.8327 30.0905,-89.1870 30.2104,-89.4919 30.1570,-89.5757 30.1796,-89.6457 30.3326,-89.7748 30.5232,-89.8531 30.6663,-89.7377 30.9988,-91.6287 30.9988,-91.5601 31.0341,-91.6273 31.1106,-91.5916 31.1658,-91.6589 31.2304,-91.6452 31.2656,-91.5436 31.2609,-91.5271 31.3724,-91.5161 31.4099,-91.5120 31.5071,-91.4502 31.5692,-91.5147 31.6230,-91.3966 31.6253,-91.3513 31.7936,-91.2744 31.8589,-91.1673 31.9755,-91.0767 32.0267,-91.0767 32.1198,-91.0437 32.1942,-91.0107 32.2221,-90.9132 32.3150,-91.0313 32.3742,-91.0217 32.4263,-91.0986 32.4634,-91.0080 32.6070,-91.1096 32.5746,-91.1536 32.6394,-91.1426 32.7226,-91.1426 32.7873,-91.1536 32.8519,-91.1206 32.8796,-91.2195 32.9257,-91.2085 32.9995,-91.2016 33.0444,-91.2016 33.1192,-91.1041 33.1835,-91.1536 33.3397,-91.1646 33.4223,-91.2291 33.4337,-91.2524 33.5414,-91.1838 33.6135,-91.2524 33.6878,-91.1261 33.6969,-91.1426 33.7883,-91.0437 33.7700,-91.0327 33.8339,-91.0657 33.8795,-91.0876 33.9434,-90.9998 33.9889,-90.9229 34.0253,-90.9009 34.0891,-90.9668 34.1345,-90.9119 34.1709,-90.8501 34.1345,-90.9338 34.2277,-90.8267 34.2833,-90.6921 34.3434,-90.6509 34.3774,-90.6152 34.3978,-90.5589 34.4432,-90.5740 34.5179,-90.5823 34.5880,-90.5356 34.7506,-90.5136 34.7913,-90.4532 34.8780,-90.3543 34.8476,-90.2911 34.8702,-90.3062 35.0041,-90.3049 35.0041))"
MS_polygon = loads(MS_wkt_string)

# Define the WKT string for the AL polygon
AL_wkt_string = "POLYGON((-88.1955 35.0041,-85.6068 34.9918,-85.1756 32.8404,-84.8927 32.2593,-85.0342 32.1535,-85.1358 31.7947,-85.0438 31.5200,-85.0836 31.3384,-85.1070 31.2093,-84.9944 31.0023,-87.6009 30.9953,-87.5926 30.9423,-87.6256 30.8539,-87.4072 30.6745,-87.3688 30.4404,-87.5240 30.1463,-88.3864 30.1546,-88.4743 31.8939,-88.1021 34.8938,-88.1721 34.9479,-88.1461 34.9107,-88.1955 35.0041))"
AL_polygon = loads(AL_wkt_string)

# Define the WKT string for the FL polygon
FL_wkt_string = "POLYGON((-87.6050 30.9988,-86.5613 30.9964,-85.5313 31.0035,-85.1193 31.0012,-85.0012 31.0023,-84.9847 30.9364,-84.9367 30.8845,-84.9271 30.8409,-84.9257 30.7902,-84.9147 30.7489,-84.8611 30.6993,-84.4272 30.6911,-83.5991 30.6509,-82.5595 30.5895,-82.2134 30.5682,-82.2134 30.5315,-82.1997 30.3883,-82.1544 30.3598,-82.0638 30.3598,-82.0226 30.4877,-82.0473 30.6308,-82.0514 30.6757,-82.0377 30.7111,-82.0514 30.7371,-82.0102 30.7678,-82.0322 30.7914,-81.9717 30.7997,-81.9608 30.8244,-81.8893 30.8056,-81.8372 30.7914,-81.7960 30.7796,-81.6696 30.7536,-81.6051 30.7289,-81.5666 30.7324,-81.5295 30.7229,-81.4856 30.7253,-81.4609 30.7111,-81.4169 30.7088,-81.2274 30.7064,-81.2357 30.4345,-81.1725 30.3160,-81.0379 29.7763,-80.5861 28.8603,-80.3650 28.4771,-80.3815 28.1882,-79.9255 27.1789,-79.8198 26.8425,-79.9118 26.1394,-79.9997 25.5115,-80.3815 24.8802,-80.8704 24.5384,-81.9250 24.3959,-82.2066 24.4496,-82.3137 24.5484,-82.1997 24.6982,-81.3977 25.2112,-81.4622 25.6019,-81.9456 25.9235,-82.2876 26.3439,-82.5307 26.9098,-82.8342 27.3315,-83.0182 27.7565,-83.0017 28.0574,-82.8548 28.6098,-83.0264 28.9697,-83.2050 29.0478,-83.5318 29.4157,-83.9767 29.9133,-84.1072 29.8930,-84.4409 29.6940,-85.0465 29.4551,-85.3610 29.4946,-85.5807 29.7262,-86.1946 30.1594,-86.8510 30.2175,-87.5171 30.1499,-87.4429 30.3006,-87.3750 30.4256,-87.3743 30.4830,-87.3907 30.5658,-87.4004 30.6344,-87.4141 30.6763,-87.5253 30.7702,-87.6256 30.8527,-87.5912 30.9470,-87.5912 30.9682,-87.6050 30.9964,-87.6050 30.9988))"
FL_polygon = loads(FL_wkt_string)

# Bounding box of each state polygon.
# shapely's `bounds` is (minx, miny, maxx, maxy); because the WKT strings are
# written as "lng lat", that unpacks as (min_lng, min_lat, max_lng, max_lat).
(TX_min_lng, TX_min_lat, TX_max_lng, TX_max_lat) = TX_polygon.bounds
(LA_min_lng, LA_min_lat, LA_max_lng, LA_max_lat) = LA_polygon.bounds
(MS_min_lng, MS_min_lat, MS_max_lng, MS_max_lat) = MS_polygon.bounds
(AL_min_lng, AL_min_lat, AL_max_lng, AL_max_lat) = AL_polygon.bounds
(FL_min_lng, FL_min_lat, FL_max_lng, FL_max_lat) = FL_polygon.bounds

# Sanity check: test the centre of the Texas bounding box against every state.
avg_lat = (TX_min_lat + TX_max_lat) / 2
avg_lng = (TX_min_lng + TX_max_lng) / 2
avg_point = Point(avg_lng, avg_lat)  # shapely points are (x, y) = (lng, lat)

TX_within_polygon = avg_point.within(TX_polygon)
LA_within_polygon = avg_point.within(LA_polygon)
MS_within_polygon = avg_point.within(MS_polygon)
AL_within_polygon = avg_point.within(AL_polygon)
FL_within_polygon = avg_point.within(FL_polygon)

# One-hot state flags: only the first state that contains the point (checked in
# TX, LA, MS, AL, FL order) is set to 1; all others stay 0.
state_hits = [
    TX_within_polygon,
    LA_within_polygon,
    MS_within_polygon,
    AL_within_polygon,
    FL_within_polygon,
]
first_hit = next((pos for pos, hit in enumerate(state_hits) if hit), -1)
TX, LA, MS, AL, FL = (1 if pos == first_hit else 0 for pos in range(5))

for state_label, hit in zip(("TX", "LA", "MS", "AL", "FL"), state_hits):
    print(avg_point, f"is_within_{state_label}", hit)

print(TX, LA, MS, AL, FL)
print(avg_lat, avg_lng)

# 1. Prepare for analysis

## Load csv files into data frames

In [None]:
# Trip-level inputs: disappearance indicators and the classification features
disappear_df = pd.read_csv(DisappearanceIndicators_filename)
indicators_df = pd.read_csv(Rec_indicators_with_V3_filename)

# Join the two trip tables on (device, trip number). The outer join keeps trips
# that appear in only one of the two files.
rec_indicators_df = disappear_df.merge(
    indicators_df,
    how='outer',
    on=['cuebiq_id', 'Trip_number'],
)

# Ping-level inputs: the V3 pings from before/after each trip and the Gulf pings
V3_Pings_df = pd.read_csv(Combined_Pings_V3_Before_After_filename)
Pings_OurTable_Gulf_df = pd.read_csv(Combined_Pings_OurTable_Gulf_filename)

# 2. Maps of stops and trajectories before and after a trip

#### Load indicators and device-table data. Add a unique identifier for each trip

In [None]:
# Order trips by device and trip number, then renumber the rows from 0 so that
# the new column below is a 1-based running trip identifier.
rec_indicators_df = (
    rec_indicators_df
    .sort_values(by=['cuebiq_id', 'Trip_number'])
    .reset_index(drop=True)
)
rec_indicators_df['Trip_number_full_list'] = 1 + rec_indicators_df.index

### Map the stops for a single trip

In [None]:
# Map one trip: the pings during the trip, the device's pings in the 8 hours
# before and after it, the "stopped" pings, and the station H3 cells.
index = 2000  # positional row of the trip in the sorted rec_indicators_df

# Load the stations
station_points_df = pd.read_csv(station_points_filename)
MRIP_station_points_df = pd.read_csv(MRIP_station_points_filename)

# Grab the row for the trip to look at
trip_row = rec_indicators_df.iloc[index]
cuebiqid_value = trip_row['cuebiq_id']
trip_start_epochtime = trip_row['timestamp_start_t']
trip_end_epochtime = trip_row['timestamp_end_t']

# Conversion factors
Eight_hours = 8*60*60            # seconds; timestamps are compared in epoch seconds
km_per_min_To_mph = 37.2823
mph_To_km_per_min = 1/37.2823    # 1 mph in km/min; used as the "stopped" threshold


def _add_dots(fmap, points_df, lat_col, lng_col, color, radius, fill_opacity):
    """Add one filled CircleMarker to `fmap` for every row of `points_df`."""
    for lat, lng in zip(points_df[lat_col], points_df[lng_col]):
        folium.CircleMarker(
            location=[lat, lng],
            radius=radius,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=fill_opacity,
        ).add_to(fmap)


def _add_polygons(fmap, outlines, color, fill_opacity):
    """Add one filled Polygon to `fmap` for every vertex list in `outlines`."""
    for outline in outlines:
        folium.Polygon(
            locations=outline,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=fill_opacity,
        ).add_to(fmap)


########### Pings from 8 hours before to 8 hours after the trip ###########
Device_df = V3_Pings_df[(V3_Pings_df['cuebiq_id'] == cuebiqid_value) &
                        (V3_Pings_df['event_timestamp'] >= trip_start_epochtime-Eight_hours) &
                        (V3_Pings_df['event_timestamp'] <= trip_end_epochtime+Eight_hours)
                        ]
Device_df = Device_df.sort_values(by='event_timestamp')
# Eliminate inaccurate pings (helper defined elsewhere; 90 is its speed cutoff)
Device_df = EliminateErrantPingsSpeed(Device_df, 90)
# Drop pings that occurred strictly during the trip
Device_df = Device_df[~((Device_df['event_timestamp'] > trip_start_epochtime) &
                        (Device_df['event_timestamp'] < trip_end_epochtime))]

########### Pings during the trip ###########
Gulf_Pings_df = Pings_OurTable_Gulf_df[(Pings_OurTable_Gulf_df['cuebiq_id'] == cuebiqid_value) &
                                      (Pings_OurTable_Gulf_df['event_timestamp'] >= trip_start_epochtime) &
                                      (Pings_OurTable_Gulf_df['event_timestamp'] <= trip_end_epochtime)]
Gulf_Pings_df = EliminateErrantPingsSpeed(Gulf_Pings_df, 60)
DuringTripLine = Gulf_Pings_df[['lat', 'lng']].values.tolist()

########### Before / after trajectories and their stops ###########
# Every name used by the drawing code below gets an "empty" default, so a trip
# with 0 or 1 pings on one side neither raises NameError nor silently reuses
# values left over from a previous run of this cell.
# .copy() makes each frame independent before calculate_speed touches it.
before_df = Device_df[Device_df['event_timestamp'] <= trip_start_epochtime].copy()
BeforeTripLine = []
stops_before_df = before_df.iloc[0:0]
print("before_df", len(before_df))
if len(before_df) > 1:
    BeforeTripLine = before_df[['lat', 'lng']].values.tolist()
    # NOTE(review): calculate_speed is defined elsewhere; its return value is
    # unused and 'ping_speed_fwd' is read right after, so it presumably adds
    # that column in place -- confirm.
    calculate_speed(before_df)
    stops_before_df = before_df[before_df['ping_speed_fwd'] < mph_To_km_per_min].copy()

after_df = Device_df[Device_df['event_timestamp'] >= trip_end_epochtime].copy()
AfterTripLine = []
stops_after_df = after_df.iloc[0:0]
if len(after_df) > 1:
    AfterTripLine = after_df[['lat', 'lng']].values.tolist()
    calculate_speed(after_df)
    stops_after_df = after_df[after_df['ping_speed_fwd'] < mph_To_km_per_min].copy()

print(index, cuebiqid_value, "before:", len(before_df), "after: ", len(after_df))

###  SATELLITE MAP CODE ###
avlat = Device_df['lat'].mean()
avlng = Device_df['lng'].mean()
map_center = [avlat, avlng]

# Centre on the first after-trip ping when there is one; otherwise fall back to
# the mean before/after position, then to the first during-trip ping.
if len(after_df) > 0:
    map_location = [after_df['lat'].iloc[0], after_df['lng'].iloc[0]]
elif len(Device_df) > 0:
    map_location = map_center
elif DuringTripLine:
    map_location = DuringTripLine[0]
else:
    raise ValueError(f"No pings available to map for trip at index {index}")

# Create a Folium map with a satellite view
my_map = folium.Map(location=map_location, zoom_start=13, tiles='https://mt1.google.com/vt/lyrs=s&x={x}&y={y}&z={z}', attr='Google Satellite')

# TX & Louisiana station points (green) and MRIP station points (yellow)
_add_dots(my_map, station_points_df, 'lat', 'lng', 'green', 4, 1)
_add_dots(my_map, MRIP_station_points_df, 'SITE_LAT', 'SITE_LONG', 'yellow', 4, 1)

# Stopped pings before (blue) and after (red) the trip
_add_dots(my_map, stops_before_df, 'lat', 'lng', 'blue', 5, 0.7)
_add_dots(my_map, stops_after_df, 'lat', 'lng', 'red', 5, 0.7)

# Trajectory during the trip
if DuringTripLine:
    folium.PolyLine(
        locations=DuringTripLine,
        color='pink',
        weight=2,
        opacity=1,
    ).add_to(my_map)

# Trajectory before the trip, with a marker on its first point
if BeforeTripLine:
    folium.PolyLine(
        locations=BeforeTripLine,
        color='blue',
        weight=2,
        opacity=1,
        popup='Before Trip'
    ).add_to(my_map)

    # Rounded to one decimal, matching the after-trip label
    time_before = round((trip_start_epochtime - Device_df['event_timestamp'].iloc[0])/3600, 1)
    folium.Marker(
        location=BeforeTripLine[0],
        icon=folium.Icon(color='blue'),
        popup=f'First point {time_before} hours before trip'
    ).add_to(my_map)

# Trajectory after the trip, with a marker on its last point
if AfterTripLine:
    folium.PolyLine(
        locations=AfterTripLine,
        color='red',
        weight=2,
        opacity=1,
        popup='After Trip'
    ).add_to(my_map)

    timeafter = round((Device_df['event_timestamp'].iloc[-1] - trip_end_epochtime)/3600, 1)
    folium.Marker(
        location=AfterTripLine[-1],
        icon=folium.Icon(color='red'),
        popup=f'Last point {timeafter} hours after trip '
    ).add_to(my_map)

# Station H3 cells: cell plus neighbours in yellow, the station cell alone in purple
_add_polygons(
    my_map,
    [[(lat, lng) for lat, lng in h3.h3_to_geo_boundary(h3_cell)] for h3_cell in combined_h3_cells],
    'yellow',
    0.4,
)
_add_polygons(
    my_map,
    [[(lat, lng) for lat, lng in h3.h3_to_geo_boundary(h3_cell)] for h3_cell in combined_h3_cells_alone],
    'purple',
    0.2,
)
# # END OF MAP CODE ###

# To save the map as well as displaying it inline:
# my_map.save(os.path.join('../RecMapsBeforeAndAfter', f'TESTRecMapOfPathsBeforeAndAfter_{index}.html'))
my_map
  

## Loop over multiple trips to generate maps
#### This includes perturbations of the ping locations so that the maps can be exported

In [None]:
# Build and save one perturbed before/during/after map per trip in a range of
# rec_indicators_df rows.

# Load the stations
station_points_df = pd.read_csv(station_points_filename)
MRIP_station_points_df = pd.read_csv(MRIP_station_points_filename)

# ---- Settings that do not change from trip to trip ----
# Conversion factors
Eight_hours = 8*60*60            # seconds; timestamps are compared in epoch seconds
km_per_min_To_mph = 37.2823
mph_To_km_per_min = 1/37.2823    # 1 mph in km/min; used as the "stopped" threshold

# Perturb latitude and longitude by up to +/- 1/4 of the width of an H3 cell
# Side to side distance of an H3 cell in longitude: 0.001425451
# Top to bottom distance of an H3 cell in latitude: 0.001355713
lat_perturbation = 0.001355713/4
lng_perturbation = 0.001425451/4

# The station cell outlines are identical on every map, so look them up once
station_cell_outlines = [
    [(lat, lng) for lat, lng in h3.h3_to_geo_boundary(h3_cell)]
    for h3_cell in combined_h3_cells
]
station_cell_outlines_alone = [
    [(lat, lng) for lat, lng in h3.h3_to_geo_boundary(h3_cell)]
    for h3_cell in combined_h3_cells_alone
]


def _add_dots(fmap, points_df, lat_col, lng_col, color, radius, fill_opacity):
    """Add one filled CircleMarker to `fmap` for every row of `points_df`."""
    for lat, lng in zip(points_df[lat_col], points_df[lng_col]):
        folium.CircleMarker(
            location=[lat, lng],
            radius=radius,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=fill_opacity,
        ).add_to(fmap)


def _add_polygons(fmap, outlines, color, fill_opacity):
    """Add one filled Polygon to `fmap` for every vertex list in `outlines`."""
    for outline in outlines:
        folium.Polygon(
            locations=outline,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=fill_opacity,
        ).add_to(fmap)


for indicator_index in range(803, 850):

    # Grab the row for the trip to look at
    trip_row = rec_indicators_df.iloc[indicator_index]
    cuebiqid_value = trip_row['cuebiq_id']
    trip_start_epochtime = trip_row['timestamp_start_t']
    trip_end_epochtime = trip_row['timestamp_end_t']
    prob = trip_row['max_prob']
    vt = trip_row['Predicted_Class']

    ######### Pings during the trip
    Gulf_Pings_df = Pings_OurTable_Gulf_df[(Pings_OurTable_Gulf_df['cuebiq_id'] == cuebiqid_value) &
                                          (Pings_OurTable_Gulf_df['event_timestamp'] >= trip_start_epochtime) &
                                          (Pings_OurTable_Gulf_df['event_timestamp'] <= trip_end_epochtime)]
    # .copy(): the coordinates are modified in place below, which must never
    # touch (or warn about touching) the source table.
    Gulf_Pings_df = EliminateErrantPingsSpeed(Gulf_Pings_df, 60).copy()

    ########### Pings within 8 hours before the start or after the end of the trip
    Device_df = V3_Pings_df[(V3_Pings_df['cuebiq_id'] == cuebiqid_value)]
    Device_df = Device_df[(Device_df['event_timestamp'] >= (trip_start_epochtime-Eight_hours))]
    Device_df = Device_df[(Device_df['event_timestamp'] <= (trip_end_epochtime+Eight_hours))]
    # Drop pings that occurred strictly during the trip
    Device_df = Device_df[~((Device_df['event_timestamp'] > trip_start_epochtime) &
                            (Device_df['event_timestamp'] < trip_end_epochtime))]
    Device_df = Device_df.sort_values(by='event_timestamp').copy()

    #################################### Perturbations ###########################################
    # Draw order (device lat, device lng, trip lat, trip lng) is kept fixed so a
    # seeded generator reproduces the same maps.
    Device_df['lat'] += np.random.uniform(-lat_perturbation, lat_perturbation, size=len(Device_df))
    Device_df['lng'] += np.random.uniform(-lng_perturbation, lng_perturbation, size=len(Device_df))
    Gulf_Pings_df['lat'] += np.random.uniform(-lat_perturbation, lat_perturbation, size=len(Gulf_Pings_df))
    Gulf_Pings_df['lng'] += np.random.uniform(-lng_perturbation, lng_perturbation, size=len(Gulf_Pings_df))

    DuringTripLine = Gulf_Pings_df[['lat', 'lng']].values.tolist()

    # Before and after strings of points, eliminating errant pings
    before_df = EliminateErrantPingsSpeed(
        Device_df[Device_df['event_timestamp'] <= trip_start_epochtime], 90
    ).copy()
    after_df = EliminateErrantPingsSpeed(
        Device_df[Device_df['event_timestamp'] >= trip_end_epochtime], 90
    ).copy()

    print(indicator_index, cuebiqid_value, "before:", len(before_df), "after: ", len(after_df))

    # Create a map only if there are enough pings to make it interesting
    if len(before_df) <= 10 or len(after_df) <= 1:
        continue

    # Both sides have more than one ping from here on, so the trajectories and
    # stop tables below are always freshly computed for this trip.
    BeforeTripLine = before_df[['lat', 'lng']].values.tolist()
    # NOTE(review): calculate_speed is defined elsewhere; its return value is
    # unused and 'ping_speed_fwd' is read right after, so it presumably adds
    # that column in place -- confirm.
    calculate_speed(before_df)
    stops_before_df = before_df[before_df['ping_speed_fwd'] < mph_To_km_per_min].copy()

    AfterTripLine = after_df[['lat', 'lng']].values.tolist()
    calculate_speed(after_df)
    stops_after_df = after_df[after_df['ping_speed_fwd'] < mph_To_km_per_min].copy()

    ###  SATELLITE MAP CODE ###
    # Centre the map on the mean position of the before/after pings
    avlat = Device_df['lat'].mean()
    avlng = Device_df['lng'].mean()
    map_center = [avlat, avlng]

    # Create a Folium map with a satellite view
    my_map = folium.Map(location=[avlat, avlng], zoom_start=13, tiles='https://mt1.google.com/vt/lyrs=s&x={x}&y={y}&z={z}', attr='Google Satellite')

    # TX & Louisiana station points (green) and MRIP station points (yellow)
    _add_dots(my_map, station_points_df, 'lat', 'lng', 'green', 4, 1)
    _add_dots(my_map, MRIP_station_points_df, 'SITE_LAT', 'SITE_LONG', 'yellow', 4, 1)

    # Stopped pings before the trip
    _add_dots(my_map, stops_before_df, 'lat', 'lng', 'blue', 5, 0.7)

    # Trajectory during the trip
    if DuringTripLine:
        folium.PolyLine(
            locations=DuringTripLine,
            color='pink',
            weight=2,
            opacity=1,
        ).add_to(my_map)

    # Stopped pings after the trip
    _add_dots(my_map, stops_after_df, 'lat', 'lng', 'red', 5, 0.7)

    # Trajectory before the trip, with a marker on its first point
    folium.PolyLine(
        locations=BeforeTripLine,
        color='blue',
        weight=2,
        opacity=1,
        popup='Before Trip'
    ).add_to(my_map)
    time_before = round((trip_start_epochtime - Device_df['event_timestamp'].iloc[0])/3600, 1)
    folium.Marker(
        location=BeforeTripLine[0],
        icon=folium.Icon(color='blue'),
        popup=f'{time_before} hours before trip. VT {vt} with P = {round(prob,2)}'
    ).add_to(my_map)

    # Trajectory after the trip, with a marker on its last point
    folium.PolyLine(
        locations=AfterTripLine,
        color='red',
        weight=2,
        opacity=1,
        popup='After Trip'
    ).add_to(my_map)
    timeafter = round((Device_df['event_timestamp'].iloc[-1] - trip_end_epochtime)/3600, 1)
    folium.Marker(
        location=AfterTripLine[-1],
        icon=folium.Icon(color='red'),
        popup=f'{timeafter} hours after trip. VT {vt} with P = {round(prob,2)}'
    ).add_to(my_map)

    # Station H3 cells: cell plus neighbours in yellow, the station cell alone in purple
    _add_polygons(my_map, station_cell_outlines, 'yellow', 0.4)
    _add_polygons(my_map, station_cell_outlines_alone, 'purple', 0.2)
    # # END OF MAP CODE ###

    # Save the map
    html_filename = os.path.join(TrajectoryMaps_directory, f'MapOfPathsBeforeAndAfter_{indicator_index}.html')
    my_map.save(html_filename)

# Display the most recently created map
my_map

### Zip the maps to a single file

In [None]:
import zipfile
import os

# NOTE(review): this changes the notebook's working directory for every cell
# run afterwards (relative paths such as '../../uploaded_files/...' will then
# resolve from here). It is kept because later cells may rely on it.
os.chdir(TrajectoryMaps_directory)

# Prefix for the files to include in the zip archive
# file_prefix = 'SatRecMapOfPathsBeforeAndAfter_'
file_prefix = 'MapOfPathsBeforeAndAfter_'

# Name of the zip file
zip_filename = 'PerturbedMapsBeforeAndAfter.zip'

# Files that match the prefix, sorted so the archive order is reproducible.
# The archive itself does not match the prefix, so it is never zipped into itself.
files_to_zip = sorted(
    filename
    for filename in os.listdir(TrajectoryMaps_directory)
    if filename.startswith(file_prefix)
)

# Write the archive by explicit path (not relative to the cwd) and compress it:
# the default for ZipFile is ZIP_STORED, i.e. no compression at all.
zip_path = os.path.join(TrajectoryMaps_directory, zip_filename)
with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zip_file:
    for file in files_to_zip:
        zip_file.write(os.path.join(TrajectoryMaps_directory, file), arcname=file)

print(f'Successfully created {zip_filename} containing {len(files_to_zip)} files.')


# 3. Identify where vessel has "stopped"

In [None]:
# For every trip in the indicators file: gather the device's pings in the eight hours before
# and after the trip, keep the pings where the device is moving at one mph or less ("stops"),
# and record whether any stop falls in an H3 cell belonging to a survey station or port list.
# One row per trip is appended to Station_NonStationAnalysis_filename, so an interrupted run
# can resume after the last row that was written.
indicators_df = pd.read_csv(Combined_indicators_with_disappearance_filename)

# Variables used for all trips
Eight_hours = 8 * 60 * 60   # window length in seconds (presumably event_timestamp is epoch seconds -- confirm)
One_mph = 0.0268224   # Speeds are calculated in km/min. 1 mph = 0.0268224 km/min

# Check whether the output file already exists; if so, resume after the last row written
if os.path.isfile(Station_NonStationAnalysis_filename):
    df = pd.read_csv(Station_NonStationAnalysis_filename)
    FirstWrite = False
    # A file that holds only a header has no 'num' values yet, so start from the top
    start_row = int(df['num'].max()) + 1 if len(df) > 0 else 1
else:
    FirstWrite = True
    start_row = 1

# Run through all rows of the Rec_indicators
end_row = len(indicators_df)

# 'irow' is the 1-based position of the current trip in indicators_df. It is written to the
# 'num' column and is what the resume logic above reads back.
irow = start_row
print("irow", irow)

# Time zone used for the (optional) progress time stamp
central_timezone = pytz.timezone('US/Central')

# iloc[start_row-1:end_row] holds end_row - start_row + 1 rows
rows_remaining = max(end_row - start_row + 1, 0)

for index, row in tqdm(indicators_df.iloc[start_row-1:end_row].iterrows(), total=rows_remaining):
    id_value = row['cuebiq_id']
    Trip_number = row['Trip_number']
    Trip_Start = row['timestamp_start_t']
    Trip_End = row['timestamp_end_t']
    Interruption_01 = row['Interruption_01']
    Avg_Interruption_01 = row['Avg_Interruption_01']

    # Get pings before trip
    this_trip_before_df = V3_Pings_df[
        (V3_Pings_df['cuebiq_id'] == id_value) &
        (V3_Pings_df['event_timestamp'] >= (Trip_Start-Eight_hours)) &
        (V3_Pings_df['event_timestamp'] <= Trip_Start) ]
    this_trip_before_df = EliminateErrantPingsSpeed(this_trip_before_df, 90)

    # Get pings after trip
    this_trip_after_df = V3_Pings_df[
        (V3_Pings_df['cuebiq_id'] == id_value) &
        (V3_Pings_df['event_timestamp'] <= (Trip_End+Eight_hours)) &
        (V3_Pings_df['event_timestamp'] >= Trip_End)     ]
    this_trip_after_df = EliminateErrantPingsSpeed(this_trip_after_df, 90)

    # Concatenate before and after
    before_and_after_df = pd.concat([this_trip_before_df, this_trip_after_df], ignore_index=True)
    nPings = len(before_and_after_df)

    # No before or after pings were found. This should never happen. The trip is skipped
    # (no row is written), but irow must still advance: otherwise every later 'num' lags the
    # true row position and a resumed run would restart too early and append duplicate rows.
    if nPings == 0:
        print("Something is wrong because no before or after pings were found")
        irow = irow + 1
        continue

    ##################################################################################
    # Find out what state we're in and save those indicators
    avg_lng = before_and_after_df['lng'].mean()
    avg_lat = before_and_after_df['lat'].mean()
    avg_point = Point(avg_lng,avg_lat)
    #
    TX_within_polygon = avg_point.within(TX_polygon)
    LA_within_polygon = avg_point.within(LA_polygon)
    MS_within_polygon = avg_point.within(MS_polygon)
    AL_within_polygon = avg_point.within(AL_polygon)
    FL_within_polygon = avg_point.within(FL_polygon)

    # At most one state flag is set; the first matching polygon wins
    TX = LA = MS = AL = FL = 0
    if TX_within_polygon:
        TX = 1
    elif LA_within_polygon:
        LA = 1
    elif MS_within_polygon:
        MS = 1
    elif AL_within_polygon:
        AL = 1
    elif FL_within_polygon:
        FL = 1

    ##################################################################################
    # Restrict the results to only points where the device is going less than one mph
    before_and_after_df = before_and_after_df[(before_and_after_df['Avg_ping_speed'] <= One_mph)]
    nStops = len(before_and_after_df)

    # Get h3 cells of all points where the device is "stopped"
    before_and_after_h3_cells = get_h3_cells_for_dataframe(before_and_after_df, resolution)

    # Build the set of stopped cells once and reuse it for every intersection
    stopped_cells = set(before_and_after_h3_cells)

    # Find all intersections
    LA_CREEL_station_points_intersection = list(stopped_cells.intersection(LA_CREEL_station_points_h3_cells))
    FL_MRIP_station_points_intersection = list(stopped_cells.intersection(FL_MRIP_station_points_h3_cells))
    AL_MRIP_station_points_intersection = list(stopped_cells.intersection(AL_MRIP_station_points_h3_cells))
    MS_MRIP_station_points_intersection = list(stopped_cells.intersection(MS_MRIP_station_points_h3_cells))
    TX_station_points_intersection = list(stopped_cells.intersection(TX_station_points_h3_cells))
    LargePorts_Marine_traffic_intersection = list(stopped_cells.intersection(LargePorts_Marine_traffic_h3_cells))
    Medium_Anchorage_Marine_traffic_intersection = list(stopped_cells.intersection(Medium_Anchorage_Marine_traffic_h3_cells))
    Medium_Port_Marine_traffic_intersection = list(stopped_cells.intersection(Medium_Port_Marine_traffic_h3_cells))
    Small_Marina_Marine_traffic_intersection = list(stopped_cells.intersection(Small_Marina_Marine_traffic_h3_cells))
    Small_Port_Marine_traffic_intersection = list(stopped_cells.intersection(Small_Port_Marine_traffic_h3_cells))

    # One output row for this trip; each *_0_1 column is 1 when at least one stop
    # falls in a cell of the corresponding list
    intersection_results_df = pd.DataFrame({
        'num': [irow],
        'cuebiq_id': [id_value],
        'Trip_number': [Trip_number],
        'Interruption_01': [Interruption_01],
        'Avg_Interruption_01': [Avg_Interruption_01],
        'nPings': [nPings],
        'nStops': [nStops],
        'TX': [TX],
        'LA': [LA],
        'MS': [MS],
        'AL': [AL],
        'FL': [FL],
        'LA_CREEL_station_points_0_1': [1*(len(LA_CREEL_station_points_intersection)>0)],
        'FL_MRIP_station_points_0_1': [1*(len(FL_MRIP_station_points_intersection)>0)],
        'AL_MRIP_station_points_0_1': [1*(len(AL_MRIP_station_points_intersection)>0)],
        'MS_MRIP_station_points_0_1': [1*(len(MS_MRIP_station_points_intersection)>0)],
        'TX_station_points_0_1': [1*(len(TX_station_points_intersection)>0)],
        'LargePorts_Marine_traffic_0_1': [1*(len(LargePorts_Marine_traffic_intersection)>0)],
        'Medium_Anchorage_Marine_traffic_0_1': [1*(len(Medium_Anchorage_Marine_traffic_intersection)>0)],
        'Medium_Port_Marine_traffic_0_1': [1*(len(Medium_Port_Marine_traffic_intersection)>0)],
        'Small_Marina_Marine_traffic_0_1': [1*(len(Small_Marina_Marine_traffic_intersection)>0)],
        'Small_Port_Marine_traffic_0_1': [1*(len(Small_Port_Marine_traffic_intersection)>0)]
    })

    # Append to the output file; the header is written only when the file does not exist yet
    intersection_results_df.to_csv(Station_NonStationAnalysis_filename, mode='a', header=not os.path.exists(Station_NonStationAnalysis_filename), index=False)

    # Central-time stamp, used only by the optional progress print below
    current_time_central = datetime.now(pytz.utc).astimezone(central_timezone)
    formatted_time = current_time_central.strftime("%H:%M:%S")
    # print(formatted_time, "Row", irow, "In FL", FL)
    irow = irow + 1

    
    

### Combine the indicators file with the data frame that identifies whether the vessel has stopped near a station

In [None]:
# Load both indicator tables and report how many rows each one holds
Combined_indicators_with_disappearance__df, DisappearanceIndicators_df = (
    pd.read_csv(csv_path)
    for csv_path in (Combined_indicators_with_disappearance_filename, DisappearanceIndicators_filename)
)
print(*(len(frame) for frame in (Combined_indicators_with_disappearance__df, DisappearanceIndicators_df)))

In [None]:
# Column names of each table, as sets
combined_columns = set(Combined_indicators_with_disappearance__df.columns)
disappearance_columns = set(DisappearanceIndicators_df.columns)

# Shared columns, and the columns that appear in only one of the two tables
common_columns = combined_columns & disappearance_columns
only_in_combined = combined_columns.difference(disappearance_columns)
only_in_disappearance = disappearance_columns.difference(combined_columns)

# Report the three groups, separated by a spacer line
print(f"Common columns: {common_columns}")
print(" ")
print(f"Columns only in Combined_indicators_with_disappearance__df: {only_in_combined}")
print(" ")
print(f"Columns only in DisappearanceIndicators_df: {only_in_disappearance}")


In [None]:
# Attach the trip indicators to the per-trip station-stop results and save the combined table.

# Load the files
intersection_results_df = pd.read_csv(Station_NonStationAnalysis_filename)
indicators_df = pd.read_csv(Combined_indicators_with_disappearance_filename)

# Merge in the stops. Columns present in both tables keep the stop-results copy under the
# original name; the indicators copy is renamed with a '_y' suffix.
merge_keys = ['cuebiq_id', 'Trip_number']
indicators_and_stops_df = intersection_results_df.merge(indicators_df, how='left',
                                              on=merge_keys, suffixes=('', '_y'))

# Drop exactly the '_y' duplicates created by the merge. Matching on the overlapping column
# names (rather than on any name that merely contains '_y') keeps genuine indicator columns
# whose names happen to include that substring.
shared_columns = set(intersection_results_df.columns) - set(merge_keys)
duplicate_columns = [f'{column}_y' for column in indicators_df.columns
                     if column in shared_columns and f'{column}_y' in indicators_and_stops_df.columns]
indicators_and_stops_df = indicators_and_stops_df.drop(columns=duplicate_columns)

# Save to a complete data frame
indicators_and_stops_df.to_csv(Station_NonStationAnalysis_full_filename, index = False)


# 4. Analysis of station and non-station trips

## A. Summary of station and non-station trips by state: launch_sites.csv 

In [None]:
# The launch-site table is the station / non-station analysis file (formerly 'launch_sites.csv')
LaunchSites_filename = Station_NonStationAnalysis_filename

# Output files for the summary tables, all written to the results directory
(count_filename,
 percent_filename,
 summary_percent_filename,
 DiffOfMeansResults_filename,
 DiffOfMeansResults_2_filename) = (
    os.path.join(Results_directory, csv_name)
    for csv_name in (
        'SiteCountByState_nStops.csv',
        'PercentMonitoredByState_nStops.csv',
        'SummaryPercentMonitoredByState_nStops.csv',
        'StationVsNon_comparison_of_means.csv',
        'StationVsNon_comparison_of_means_3-1-25.csv',
    )
)


In [None]:
# pd.set_option('display.max_columns', None)
# ###### DEBUGGING ########
# ###### DEBUGGING ########
# ###### DEBUGGING ########
# print(LaunchSites_filename)
# LaunchSites_filename_df = pd.read_csv(LaunchSites_filename)
# print(" len LaunchSites_filename_df ", len(LaunchSites_filename_df))
# Station_NonStationAnalysis_full_filename_df = pd.read_csv(Station_NonStationAnalysis_full_filename)
# print("len Station_NonStationAnalysis_full_filename_df", len(Station_NonStationAnalysis_full_filename_df))
# print(len(Station_NonStationAnalysis_full_filename_df) - Station_NonStationAnalysis_full_filename_df['Interruption_01'].sum())
# print(len(LaunchSites_filename_df))

# filter1 = Station_NonStationAnalysis_full_filename_df[Station_NonStationAnalysis_full_filename_df['nStops']>=10]
# filter2 = filter1[filter1['Interruption_01'] == 1]
# print(len(filter1))
# print(filter1['Interruption_01'].sum())

# # print(LaunchSites_filename_df.columns)
# # for col in Station_NonStationAnalysis_full_filename_df.columns:
# #     print(col)

# print(LaunchSites_filename_df['TX_station_points_0_1'].sum(),Station_NonStationAnalysis_full_filename_df['TX_station_points_0_1'].sum())

#### Load the indicators file and create a new variable, StopAtStation

In [None]:
# Reload the merged indicators-and-stops table
indicators_and_stops_df = pd.read_csv(Station_NonStationAnalysis_full_filename)

# The five station indicator columns (one per survey programme / state)
AllStationVariablesColumns = [
    f'{station_prefix}_station_points_0_1'
    for station_prefix in ('LA_CREEL', 'FL_MRIP', 'AL_MRIP', 'MS_MRIP', 'TX')
]

# StopAtStation is 1 when the trip stopped at one or more stations of any kind, else 0
station_hits = indicators_and_stops_df[AllStationVariablesColumns].sum(axis=1)
indicators_and_stops_df['StopAtStation'] = station_hits.gt(0).astype(int)

# Write the table back with the new column
indicators_and_stops_df.to_csv(Station_NonStationAnalysis_full_filename, index = False)


### ADD THE NEW SPEED VARIABLES #####
# Note: the merged frame below is kept in memory only; it is not written back to disk here.
Combined_indicators_complete_with_New_speeds_filename = os.path.join(
    OurTable_V3_directory, 'Combined_indicators_complete_with_New_speeds_All.csv')
newspeed_df = pd.read_csv(Combined_indicators_complete_with_New_speeds_filename)
indicators_and_stops_df = Merge_Trip_dfs(newspeed_df, indicators_and_stops_df)

In [None]:
from scipy.stats import ttest_ind

# Sweep the minimum number of stops (nStops = 0..30) required for a trip to be analyzed and,
# for each threshold, tabulate by state how many analyzed trips stopped at each kind of site
# (counts and shares). This is done twice per threshold: for all trips (iOnlyCompleteTrips = 0)
# and for trips with Interruption_01 >= 1 only (iOnlyCompleteTrips = 1).
# For nStops == 10 a difference-of-means t-test between station and non-station trips is also
# written, one row per state and variable.
#
# The per-threshold tables are collected in all_results_dfs / all_normalized_dfs /
# all_PercentStation_dfs and then concatenated; no dynamically named variables are created.

# Lists of variables
station_list = ['LA_CREEL_station_points_0_1','FL_MRIP_station_points_0_1','AL_MRIP_station_points_0_1','MS_MRIP_station_points_0_1','TX_station_points_0_1']
states_to_sum = ['TX', 'LA', 'MS', 'AL', 'FL']
sites_to_sum = ['LA_CREEL_station_points_0_1', 'FL_MRIP_station_points_0_1',
                'AL_MRIP_station_points_0_1', 'MS_MRIP_station_points_0_1',
                'TX_station_points_0_1', 'LargePorts_Marine_traffic_0_1',
                'Medium_Anchorage_Marine_traffic_0_1', 'Medium_Port_Marine_traffic_0_1',
                'Small_Marina_Marine_traffic_0_1', 'Small_Port_Marine_traffic_0_1']

# Marine-traffic rows are removed from the percentage tables; only station rows are kept
rows_to_drop = [
    'LargePorts_Marine_traffic_0_1',
    'Medium_Anchorage_Marine_traffic_0_1',
    'Medium_Port_Marine_traffic_0_1',
    'Small_Marina_Marine_traffic_0_1',
    'Small_Port_Marine_traffic_0_1'
]
# Station rows that are added together to form the 'AllStations' row
stations_to_sum = [
    'LA_CREEL_station_points_0_1',
    'FL_MRIP_station_points_0_1',
    'AL_MRIP_station_points_0_1',
    'MS_MRIP_station_points_0_1',
    'TX_station_points_0_1'
]
# Variables and state groups used in the difference-of-means comparison
variables_to_compare = ['moving_mph_new','max_mph_new', 'max_distance_from_coast_mi_t', 'Total_distance_traveled_mi_t', 'Trip_Duration_t', 'move_efficiency_t', 'Weekend_trip_t']
statestocheck = ['AllStates', 'TX', 'LA', 'MS', 'AL', 'FL']
column_to_move = 'SiteName'

# Initialize lists for the results
all_results_dfs = []
all_normalized_dfs = []
all_PercentStation_dfs = []

# Delete the csv file that stores the difference-of-means results (rows are appended below)
if os.path.exists(DiffOfMeansResults_2_filename):
    os.remove(DiffOfMeansResults_2_filename)

# Load and clean the trip table once; nothing in it depends on the loop variables
LaunchSites_df = pd.read_csv(Station_NonStationAnalysis_full_filename)
LaunchSites_df.fillna(0, inplace=True)   # Replace missing values with zeros
LaunchSites_df[states_to_sum] = LaunchSites_df[states_to_sum].astype(int)
LaunchSites_df[sites_to_sum] = LaunchSites_df[sites_to_sum].astype(int)

for nStops in range(31):
    print("nStops:", nStops)

    # Indicator of trips with fewer than nStops stops (bookkeeping only)
    LaunchSites_df['InsufficientStops'] = (LaunchSites_df['nStops'] < nStops).astype(int)
    MissingRows = LaunchSites_df['InsufficientStops'].sum()

    for iOnlyCompleteTrips in [0, 1]:
        # Trips analyzed for this threshold: strictly more than nStops stops and, when
        # iOnlyCompleteTrips is 1, Interruption_01 of at least 1. Copy so that columns can
        # be added below without touching LaunchSites_df.
        LaunchSites_df_counted = LaunchSites_df[(LaunchSites_df['nStops'] > nStops) &
                                                (LaunchSites_df['Interruption_01'] >= iOnlyCompleteTrips) ].copy()
        GrandTotal = len(LaunchSites_df_counted)
        # print(nStops, GrandTotal, MissingRows, GrandTotal+MissingRows)

        ########################################
        # Counts: rows are sites, columns are states
        result_df = pd.DataFrame(index=sites_to_sum, columns=states_to_sum)

        # Each cell is the number of analyzed trips in that state that stopped at that site
        # (dot product of the two 0/1 columns)
        for state_column in states_to_sum:
            for site_column in sites_to_sum:
                result_df.at[site_column, 'SiteName'] = site_column
                result_df.at[site_column, state_column] = np.dot(LaunchSites_df_counted[state_column], LaunchSites_df_counted[site_column])

        # Add a final row with the number of analyzed trips in each state
        result_df.loc['StateTotal'] = LaunchSites_df_counted[states_to_sum].sum()

        # Add a final column with the number of analyzed trips at each site over all states
        result_df['AllStates'] = LaunchSites_df_counted[sites_to_sum].sum()
        result_df.at['StateTotal','AllStates'] = float(GrandTotal)

        ########################################
        # Percentages: every row divided by the 'StateTotal' row
        site_name_column = result_df[column_to_move]
        normalized_df = result_df.div(result_df.loc['StateTotal'])
        # Add the 'SiteName' column back to the normalized DataFrame
        normalized_df[column_to_move] = site_name_column

        # Move the 'SiteName' column to the first position
        columns = list(normalized_df.columns)
        columns.insert(0, columns.pop(columns.index(column_to_move)))
        normalized_df = normalized_df[columns]

        # Keep the raw totals (not the ratios) in the 'StateTotal' row
        normalized_df.loc['StateTotal'] = result_df.loc['StateTotal']
        normalized_df.at['StateTotal','AllStates'] = float(GrandTotal)

        # Drop the marine-traffic rows
        normalized_df = normalized_df.drop(rows_to_drop)

        # Create the new "AllStations" row by summing the station rows
        normalized_df.loc['AllStations'] = normalized_df.loc[stations_to_sum].sum()
        normalized_df.loc['AllStations','SiteName'] = "Total"

        # Add columns that record the value of nStops and the trip filter
        result_df['nStops'] = nStops
        normalized_df['nStops'] = nStops
        result_df['Interruption_01'] = iOnlyCompleteTrips
        normalized_df['Interruption_01'] = iOnlyCompleteTrips

        # One-row summary: the 'AllStations' row plus the number of trips analyzed
        PercentStation_df = normalized_df.loc[['AllStations']].copy()
        PercentStation_df['TripsAnalyzed'] = GrandTotal

        # Move 'SiteName' to the first position of the counts table as well
        columns = list(result_df.columns)
        columns.insert(0, columns.pop(columns.index(column_to_move)))
        result_df = result_df[columns]

        # Append the tables for this threshold / filter to the lists
        all_results_dfs.append(result_df)
        all_normalized_dfs.append(normalized_df)
        all_PercentStation_dfs.append(PercentStation_df)

        ###########################################################################
        ##### Variable comparisons between station and non-station trips
        ###########################################################################
        # Only implement this for nStops == 10
        if nStops == 10:
            # Pseudo-state flag so that "all states" can be filtered like a real state
            LaunchSites_df_counted['AllStates'] = 1

            # Bring in new speeds once; the merge does not depend on the state being checked
            merged_counted_df = LaunchSites_df_counted.merge(newspeed_df, on=['cuebiq_id', 'Trip_number'], how='left')
            merged_counted_df = merged_counted_df.drop_duplicates()

            # Distances converted with the factor 0.621371 (km to miles, assuming the *_t columns are in km -- confirm)
            merged_counted_df['Total_distance_traveled_mi_t'] = merged_counted_df['Total_distance_traveled_t']*0.621371
            merged_counted_df['max_distance_from_coast_mi_t'] = merged_counted_df['max_distance_from_coast_t']*0.621371

            for statename in statestocheck:
                # Filter only the rows for this state
                LaunchSites_df_counted2 = merged_counted_df[merged_counted_df[statename] == 1]

                # Split into station and non-station trips
                station_df = LaunchSites_df_counted2[(LaunchSites_df_counted2['StopAtStation'] == 1)]
                nonstation_df = LaunchSites_df_counted2[(LaunchSites_df_counted2['StopAtStation'] == 0)]

                # Loop through each variable
                for variable in variables_to_compare:
                    # Keep only finite, non-missing values of this variable in each group
                    var_station_df = station_df[(~np.isinf(station_df[variable])) & (~np.isnan(station_df[variable]))]
                    var_nonstation_df = nonstation_df[(~np.isinf(nonstation_df[variable])) & (~np.isnan(nonstation_df[variable]))]

                    # Calculate stats for this variable (two-sample t-test, non-station vs station)
                    station_var_mean = var_station_df[variable].mean()
                    nonstation_var_mean = var_nonstation_df[variable].mean()
                    t_stat, p_value = ttest_ind(var_nonstation_df[variable],var_station_df[variable])

                    # Create a dictionary for the row to append
                    row_data = {
                        "State": statename,
                        "variable": variable,
                        "nStops": nStops,
                        "iOnlyCompleteTrips": iOnlyCompleteTrips,
                        "station_var_mean": station_var_mean,
                        "nonstation_var_mean": nonstation_var_mean,
                        "t_stat": t_stat,
                        "p_value": p_value,
                        "total_var_count": len(var_station_df) + len(var_nonstation_df),
                        "LaunchSites_count": len(LaunchSites_df_counted2)
                    }

                    # Convert to DataFrame
                    row_df = pd.DataFrame([row_data])

                    # Append to CSV (include header only if the file doesn't exist)
                    file_exists = os.path.isfile(DiffOfMeansResults_2_filename)
                    row_df.to_csv(DiffOfMeansResults_2_filename, mode='a', header=not file_exists, index=False)

            ###########################################################################
            ####### End of difference-of-means comparison
            ###########################################################################


# Stack the per-threshold tables and save them
AllResults_df = pd.concat(all_results_dfs, axis=0)
AllResults_df.to_csv(count_filename, index=False)

AllNormalized_df = pd.concat(all_normalized_dfs, axis=0)
AllNormalized_df.to_csv(percent_filename, index=False)

All_PercentStation_df = pd.concat(all_PercentStation_dfs, axis=0)
All_PercentStation_df.to_csv(summary_percent_filename, index=False)


print("Done")

In [None]:
# # # cc_df = Merge_Trip_dfs(newspeed_df, LaunchSites_df_counted)
# # # LaunchSites_df_counted['Total_distance_traveled_mi_t']
# # for col in LaunchSites_df_counted2.columns:
# #     print(col)
# station_var_mean

In [None]:
# AllResults_df[AllResults_df['nStops'] == 10].head(10)

## Create counts of all the trips that are used in the analysis.

In [None]:
# Load the classified indicators and report the row count before de-duplication
Complete_Indicators_df = pd.read_csv(Indicators_Classified_filename)
print("with dups", len(Complete_Indicators_df))

# Keep one row per (cuebiq_id, Trip_number): the first one after sorting on those keys
Complete_Indicators_df = (
    Complete_Indicators_df
    .sort_values(by=['cuebiq_id', 'Trip_number'])
    .drop_duplicates(subset=['cuebiq_id', 'Trip_number'], keep='first')
)
print("without dups", len(Complete_Indicators_df))

# Number of remaining trips whose Predicted_Class is 371
count_371 = int((Complete_Indicators_df['Predicted_Class'] == 371).sum())

# Print the result
print(f"Number of rows with Predicted_Class == 371: {count_371}")


### Create graph of station as a function of nStops

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Plot, for each state and for all states together, the 'AllStations' share of analyzed trips
# as a function of the number of stops required for inclusion. One figure is saved for all
# trips (Interruption_01 == 0 in the summary file) and one for fully-tracked trips
# (Interruption_01 == 1).

# Load the summary once; it does not change between the two figures
summary_percent_df = pd.read_csv(summary_percent_filename)

# Left y-axis shown as whole percentages
percent_formatter = FuncFormatter(lambda x, _: '{:.0%}'.format(x))

linewidth = 0.8
marksize = 3
# (column, line style, marker, colour) for each state line. FL is not plotted.
state_line_styles = [
    ('TX', '-',  'o', '#1b9e77'),   # green
    ('LA', '--', 's', '#d95f02'),   # orange
    ('MS', ':',  '^', '#7570b3'),   # purple
    ('AL', '-.', 'D', '#e7298a'),   # pink
]

# Note that when Interruption_01 = 0, this includes all trips
for iNotDisappear in [0, 1]:
    subset_df = summary_percent_df[summary_percent_df['Interruption_01'] == iNotDisappear]
    n_stops_data = subset_df['nStops']
    trips_analyzed_data = subset_df['TripsAnalyzed']

    # Start every pass on a fresh figure so the two plots can never be drawn on top of
    # each other, whichever matplotlib backend is active
    fig, ax = plt.subplots()

    for state_column, line_style, marker, color in state_line_styles:
        ax.plot(n_stops_data, subset_df[state_column], label=state_column, linestyle=line_style,
                linewidth=linewidth, marker=marker, markersize=marksize, color=color)
    ax.plot(n_stops_data, subset_df['AllStates'], label='All States', linestyle='-', linewidth=3.0, color='black')

    # Add a legend to the plot
    ax.legend(loc='upper right')

    # Set labels for the axes
    ax.set_xlabel('Stops required for inclusion')
    ax.set_ylabel('Station trips as a percent of all trips analyzed')

    # Set the y-axis range and format it as a percentage
    ax.set_ylim(0.5, 1.0)
    ax.yaxis.set_major_formatter(percent_formatter)

    # Title text (currently not drawn) and output file for this pass
    if iNotDisappear > 0:
        titletext = "Percentage of all trips that stop near a station\n(Fully-tracked trips)"
        fig_title = os.path.join(Figures_directory, 'AnalysisOfTripsNearStations_CompleteTrips.jpg')
    else:
        titletext = "Percentage of all trips that stop near a station\n(All Trips)"
        fig_title = os.path.join(Figures_directory, 'AnalysisOfTripsNearStations_AllTrips.jpg')
    # ax.set_title(titletext)

    # Adjust layout to make room for the buffer
    fig.tight_layout(pad=0.5)

    # Save, show, and release the figure
    fig.savefig(fig_title, bbox_inches='tight', pad_inches=0, facecolor='white')
    plt.show()
    plt.close(fig)
    print(trips_analyzed_data)

In [None]:
import folium
import h3
import numpy as np

# Decile levels (10th through 90th percentile), kept at module level as before
percentiles = np.arange(10, 100, 10)


def MakeH3Map(TripsByH3_df, ColName, ThisMapName):
    """Draw one polygon per H3 cell, coloured by the decile of ``ColName``, and save the map.

    Parameters:
        TripsByH3_df: DataFrame with an 'h3_cell' column and the column named by ``ColName``.
        ColName: name of the numeric column that drives the colour of each cell.
        ThisMapName: file name of the HTML map written to ``Figures_directory``.

    Returns:
        The folium map, centred on [27.0, -90.0].
    """
    def _cell_style(fill_color, fill_opacity):
        # Bind this cell's colour and opacity now, so that every layer keeps its own values
        def _style(feature):
            return {
                'fillColor': fill_color,
                'color': 'black',
                'weight': 1,
                'fillOpacity': fill_opacity
            }
        return _style

    m = folium.Map(location=[27.0, -90.0], zoom_start=6)

    # Break points for the colour scale: the 10th..90th percentiles of the column
    decile_levels = np.arange(10, 100, 10)
    percentile_values = np.percentile(TripsByH3_df[ColName], decile_levels)

    for _, cell_row in TripsByH3_df.iterrows():
        cell_value = cell_row[ColName]

        # Cells with a value of zero get an outline only; all others are filled by decile
        if cell_value == 0:
            fill_color, fill_opacity = None, 0
        else:
            fill_color, fill_opacity = get_color(cell_value, percentile_values), 0.8

        # Boundary of the H3 cell as a GeoJSON polygon
        cell_feature = {
            "type": "Feature",
            "geometry": {
                "type": "Polygon",
                "coordinates": [h3.h3_to_geo_boundary(cell_row['h3_cell'], geo_json=True)]
            }
        }

        # Add the cell to the map, with a popup showing its value
        cell_layer = folium.GeoJson(cell_feature, style_function=_cell_style(fill_color, fill_opacity))
        cell_layer.add_child(folium.Popup(f"{ColName}: {cell_value}"))
        cell_layer.add_to(m)

    # Save the map to an HTML file in the figures directory
    m.save(os.path.join(Figures_directory, ThisMapName))

    return m

# Build the three trip-count maps in order: all trips, station trips, non-station trips
m1, m2, m3 = (
    MakeH3Map(trips_by_cell, 'trip_count', html_name)
    for trips_by_cell, html_name in (
        (TripsByH3, "MapOfAllTrips.html"),
        (TripsByH3_station, "MapOfStationTrips.html"),
        (TripsByH3_nonstation, "MapOfNonStationTrips.html"),
    )
)

# Display the non-station map as the cell output
m3

In [None]:
# Display the station-trips map (m2, built by MakeH3Map with "MapOfStationTrips.html")
m2

In [None]:
def MakeH3MapTwoColors(TripsByH3_df, ColName, ThisMapName):
    """Draw each H3 cell in green or red by the sign of ``ColName``; save and return the map.

    A cell is green when its ``ColName`` value is greater than 0 and red in
    every other case.

    Parameters
    ----------
    TripsByH3_df : DataFrame
        Must contain an 'h3_cell' column and the column named by ``ColName``.
    ColName : str
        Column whose sign picks the colour and whose value fills the popup.
    ThisMapName : str
        File name of the HTML map, written inside ``Figures_directory``.

    Returns
    -------
    The folium map, so a notebook cell can display it.
    """
    two_color_map = folium.Map(location=[27.0, -90.0], zoom_start=6)

    for _, cell_row in TripsByH3_df.iterrows():
        cell_value = cell_row[ColName]

        # Strictly positive -> green; anything else -> red
        if cell_value > 0:
            cell_color = 'green'
        else:
            cell_color = 'red'

        # Hexagon outline of the cell as a GeoJSON polygon ring
        boundary = h3.h3_to_geo_boundary(cell_row['h3_cell'], geo_json=True)
        cell_feature = {
            "type": "Feature",
            "geometry": {"type": "Polygon", "coordinates": [boundary]},
        }

        # Default argument freezes this iteration's colour inside the lambda
        layer = folium.GeoJson(
            cell_feature,
            style_function=lambda feature, cell_color=cell_color: {
                'fillColor': cell_color,
                'color': 'black',
                'weight': 1,
                'fillOpacity': 0.8,
            },
        )
        layer.add_child(folium.Popup(f"{ColName}: {cell_value}"))
        layer.add_to(two_color_map)

    # Write the map to an HTML file in the figures directory
    two_color_map.save(os.path.join(Figures_directory, ThisMapName))

    return two_color_map


### Map comparing destinations of non-station and station trips

In [None]:
# Share of each group's trips that falls in each H3 cell (a fraction between 0 and 1)
TripsByH3_nonstation['percent'] = TripsByH3_nonstation['trip_count'] / TripsByH3_nonstation['trip_count'].sum()
TripsByH3_station['percent'] = TripsByH3_station['trip_count'] / TripsByH3_station['trip_count'].sum()

# Outer merge keeps cells that appear in only one of the two groups
merged_df = pd.merge(TripsByH3_nonstation, TripsByH3_station, on='h3_cell', how='outer', suffixes=('_nonstation', '_station'))

# Cells missing from one group come back as NaN after the outer merge; treat them as 0
merged_df.fillna(0, inplace=True)

# Positive: the cell holds a larger share of non-station trips than of station trips
merged_df['percent_diff'] = merged_df['percent_nonstation'] - merged_df['percent_station']

# Keep only the cell id and the difference in shares
H3_nonstation_less_station = merged_df[['h3_cell', 'percent_diff']]

mcomparison = MakeH3Map(H3_nonstation_less_station, 'percent_diff', "ComparionsMap.html")

# Minimum absolute difference in share (0.01 = one percentage point) for a cell
# to appear on the "big difference" map
per_min = 0.01

# Take an explicit copy: the next statement assigns a column, and assigning on a
# filtered selection is what triggers pandas' SettingWithCopyWarning.
H3_nonstation_less_station_big = H3_nonstation_less_station[
    H3_nonstation_less_station['percent_diff'].abs() > per_min
].copy()

# Express the difference in percentage points, one decimal, for the map popups
H3_nonstation_less_station_big['percent_diff'] = (H3_nonstation_less_station_big['percent_diff'] * 100).round(1)
mcomparison2 = MakeH3MapTwoColors(H3_nonstation_less_station_big, 'percent_diff', "ComparionsMapBigDiff.html")

# Show the big-difference map as this cell's output
mcomparison2

In [None]:
# Debug aid: dump the per-cell share differences to 'Debug.csv'. The path is
# relative, so the file is written to the current working directory rather
# than to one of the notebook's configured output directories.
H3_nonstation_less_station.to_csv('Debug.csv')