# Predictive analysis of naval incidents in the USA, 2002 - 2015: <br>
## Annex 3.3. Preprocess Weather River

> Author: [Oscar Anton](https://www.linkedin.com/in/oscanton/) <br>
> Date: 2024 <br>
> License: [CC BY-NC-ND 4.0 DEED](https://creativecommons.org/licenses/by-nc-nd/4.0/) <br>
> Version: 0.9 <br>

# 0. Loadings

### Libraries

In [None]:
# General data management
import numpy as np
import pandas as pd

# File management
import os
import gzip

# Visualization
import plotly.graph_objects as go
import plotly.express as px

### General Variables

In [None]:
# Main data folders: raw inputs are read from the import folder, processed
# feather files are written to / read from the export folder
import_data_folder = 'RawDataWeatherRiver'
export_data_folder = 'DataWeatherRiver'

# Toggle for exporting data to external files; when False, the export cells
# load the previously exported feather files instead of writing them
file_export_enabled = False
# Toggle for calculations that take a long time; when False, the cached
# results are read from export_data_folder instead of being recomputed
protracted_calculation_enabled = False

# 1. Data Acquisition

## 1.1. Decompression and data concatenation

In [None]:
if protracted_calculation_enabled :
    # Get the list of compressed files in the folder. Sorted so that the row
    # order of the combined frame does not depend on the arbitrary order
    # returned by os.listdir
    files = sorted(file for file in os.listdir(import_data_folder) if file.endswith('.csv.gz'))

    # Fail early with a clear message instead of an obscure error further down
    if not files:
        raise FileNotFoundError(f'No .csv.gz files found in {import_data_folder}')

    # Per-file frames are collected here and concatenated once at the end;
    # concatenating inside the loop re-copies the accumulated data every time
    station_frames = []

    # Iterate over the files and process each one
    for file in files:
        file_path = os.path.join(import_data_folder, file)

        # Read the compressed CSV file. The handle has its own name so it does
        # not shadow the loop variable.
        # NOTE(review): read_csv consumes the first row as a header here;
        # confirm the raw files really start with a header row.
        with gzip.open(file_path, 'rt') as csv_handle:
            df_temp = pd.read_csv(csv_handle)

        # Select and rename the specific columns
        df_temp = df_temp.iloc[:, :4]  # Select the first 4 columns
        df_temp.columns = ['STATION', 'DATE', 'ELEMENT', 'DATAVALUE']

        # Filter the DataFrame to include only desired elements; copy so the
        # column assignment below writes to an independent frame, not a slice
        df_temp = df_temp[df_temp['ELEMENT'].isin(['PRCP', 'TMAX', 'TMIN', 'AWND'])].copy()

        # Convert the 'DATE' column to a datetime format
        df_temp['DATE'] = pd.to_datetime(df_temp['DATE'], format='%Y%m%d')

        # Pivot the DataFrame to convert it from long to wide format
        df_temp = df_temp.pivot(index=['STATION', 'DATE'], columns='ELEMENT', values='DATAVALUE').reset_index()

        # Drop the 'ELEMENT' label that pivot leaves on the column axis
        df_temp = df_temp.rename_axis(columns=None)

        station_frames.append(df_temp)

    # Concatenate all files into the final DataFrame in a single pass
    land_stations_comb_1 = pd.concat(station_frames, ignore_index=True)

    # Column names to lowercase
    land_stations_comb_1.columns = land_stations_comb_1.columns.str.lower()
    print(f'land_stations_comb_1 {land_stations_comb_1.shape} created')
else:
    # Skip the long calculation and reuse the previously exported result
    land_stations_comb_1 = pd.read_feather(export_data_folder + '/' + 'land_stations_comb_1.feather')
    print(f'land_stations_comb_1 {land_stations_comb_1.shape} imported from {export_data_folder}')

## 1.2. Export dataframe

In [None]:
# Export the combined frame to a feather file, or, when exporting is disabled,
# replace it with the previously exported file
if file_export_enabled :
    land_stations_comb_1.to_feather(export_data_folder + '/' + 'land_stations_comb_1.feather')
    print(f'land_stations_comb_1 {land_stations_comb_1.shape} exported to {export_data_folder}')
else:
    land_stations_comb_1 = pd.read_feather(export_data_folder + '/' + 'land_stations_comb_1.feather')
    # The data is read FROM the export folder (the message used to say "to")
    print(f'land_stations_comb_1 {land_stations_comb_1.shape} imported from {export_data_folder}')

# 2. Coordinates

## 2.1. Load Station coords 

In [None]:
# Read the fixed-width station catalogue without a header row, taking three
# fields of 11, 9 and 10 characters: station id, latitude and longitude
stations_file = import_data_folder + '/' + 'ghcnd_stations.txt'
ghcnd_stations = pd.read_fwf(
    stations_file,
    header=None,
    widths=[11, 9, 10],
    names=["STATION", "LATITUDE", "LONGITUDE"],
)

# Normalise the column names to lowercase
ghcnd_stations.columns = [column_name.lower() for column_name in ghcnd_stations.columns]

# Report the shape of the loaded table
print(f'ghcnd_stations {ghcnd_stations.shape} loaded')

## 2.2. Coords to Stations

### Data boundaries

In [None]:
# Attach coordinates to the observations; the right join keeps every station
# listed in ghcnd_stations
land_stations_comb_2 = pd.merge(land_stations_comb_1, ghcnd_stations, how='right', on='station')

# Keep rows holding at least one non-missing value among tmax, tmin and prcp
# (thresh=1), i.e. drop rows where all three are missing
land_stations_comb_2 = land_stations_comb_2.dropna(subset=['tmax', 'tmin', 'prcp'], thresh=1)

# Restrict to the Mississippi area bounding box (bounds are inclusive)
in_longitude_range = land_stations_comb_2['longitude'].between(-100, -81.5)
in_latitude_range = land_stations_comb_2['latitude'].between(31, 49)
land_stations_comb_2 = land_stations_comb_2[in_longitude_range & in_latitude_range]

# Export the result, or replace it with the previously exported file
# NOTE: reset_index() stores the old row labels as an extra 'index' column
comb_2_path = export_data_folder + '/' + 'land_stations_comb_2.feather'
if file_export_enabled :
    land_stations_comb_2.reset_index().to_feather(comb_2_path)
    print(f'land_stations_comb_2 {land_stations_comb_2.shape} exported to {export_data_folder}')
else:
    land_stations_comb_2 = pd.read_feather(comb_2_path)
    print(f'land_stations_comb_2 {land_stations_comb_2.shape} imported from {export_data_folder}')

### Screening: keep the 33% of rows with the fewest NAs

In [None]:
# Count the missing values of every row and order the rows from the most
# complete to the least complete
null_counts = land_stations_comb_2.isnull().sum(axis=1)
rows_by_completeness = null_counts.sort_values().index
rows_sorted = land_stations_comb_2.loc[rows_by_completeness]

# Keep the leading 33% of the sorted rows
percentage_rows = round(0.33 * len(rows_sorted))
land_stations_comb_3 = rows_sorted.head(percentage_rows)

# Export the result, or replace it with the previously exported file
comb_3_path = export_data_folder + '/' + 'land_stations_comb_3.feather'
if file_export_enabled :
    land_stations_comb_3.reset_index().to_feather(comb_3_path)
    print(f'land_stations_comb_3 {land_stations_comb_3.shape} exported to {export_data_folder}')
else:
    land_stations_comb_3 = pd.read_feather(comb_3_path)
    print(f'land_stations_comb_3 {land_stations_comb_3.shape} imported from {export_data_folder}')

### Load Weather river data

In [None]:
# Reload the screened observations from the exported feather file
weather_river_path = export_data_folder + '/' + 'land_stations_comb_3.feather'
land_stations_comb_3 = pd.read_feather(weather_river_path)

# Keep only the calendar date, dropping the time of day
observation_timestamps = pd.to_datetime(land_stations_comb_3['date'])
land_stations_comb_3['date'] = observation_timestamps.dt.date

# Variable check (shown as the cell output)
land_stations_comb_3['date'].head()

# 3. Join activity_id

## 3.1. Load Incidents in Rivers

In [None]:
# Load the incidents dataframe
Events = pd.read_feather('DataCasualtyAndPollution' + '/' + 'Events.feather')

# Variable selection: river incidents only, with the columns needed to match
# weather observations. A single .loc selection plus copy() yields an
# independent frame, so the 'date' assignment below does not write to a slice
# of Events (which can trigger SettingWithCopyWarning)
is_river = Events['watertype'] == 'river'
EventsRiver = Events.loc[is_river, ['activity_id', 'date', 'longitude', 'latitude']].copy()

# Keep only the calendar date, dropping the time of day
EventsRiver['date'] = pd.to_datetime(EventsRiver['date']).dt.date

# Drop fully duplicated rows (the same activity_id may still appear more than
# once if its date or coordinates differ)
EventsRiver = EventsRiver.drop_duplicates()

# Data shape check
print(f'EventsRiver {EventsRiver.shape} created')

## 3.2. Nearest weather observation to each river incident

In [None]:
# Function to calculate nearest weather observation
def near_observation(incident, events=None, stations=None):
    """Return the weather observation recorded nearest to a river incident.

    Parameters
    ----------
    incident
        Value of ``activity_id`` identifying the incident. If it matches
        several rows of ``events``, the first one is used.
    events : pandas.DataFrame, optional
        Incidents with columns ``activity_id``, ``date``, ``longitude`` and
        ``latitude``. Defaults to the notebook-level ``EventsRiver``.
    stations : pandas.DataFrame, optional
        Weather observations with columns ``date``, ``latitude`` and
        ``longitude``. Defaults to the notebook-level ``land_stations_comb_3``.

    Returns
    -------
    pandas.DataFrame
        At most one row of ``stations`` (original row label kept) observed on
        the incident's date at minimum distance, with two extra columns:
        ``station_dist`` and ``activity_id``. The frame is empty when no
        observation exists for that date. On ties the first row wins.

    Raises
    ------
    IndexError
        If ``incident`` is not present in ``events``.
    """
    # Fall back to the notebook-level frames so existing one-argument calls
    # keep working unchanged
    if events is None:
        events = EventsRiver
    if stations is None:
        stations = land_stations_comb_3

    # Select data corresponding to this activity_id (first matching row)
    coord_incident = events[events['activity_id'] == incident].iloc[0]

    # Select all weather observations for this day; copy so the new column is
    # written to an independent frame instead of a slice of `stations`
    coord_station = stations[stations['date'] == coord_incident['date']].copy()

    # Approximate distances: straight-line distance computed directly on the
    # latitude/longitude values, so it is expressed in coordinate units
    coord_station['station_dist'] = np.sqrt((coord_station['latitude'] - coord_incident['latitude'])**2 +
                                            (coord_station['longitude'] - coord_incident['longitude'])**2)

    # Keep the first observation located at minimum distance. With no
    # observations for the day, min() is NaN and the selection stays empty
    is_nearest = coord_station['station_dist'] == coord_station['station_dist'].min()
    min_distance_row = coord_station[is_nearest].head(1).copy()

    # Add activity_id to weather data
    min_distance_row['activity_id'] = incident
    return min_distance_row

# Build WeatherRiver: reuse the exported result unless the long calculation is
# enabled, in which case one nearest observation is looked up per incident
if not protracted_calculation_enabled :
    WeatherRiver = pd.read_feather(export_data_folder + '/' + 'WeatherRiver.feather')
    print(f'WeatherRiver {WeatherRiver.shape} imported from {export_data_folder}')
else:
    # One frame per entry of EventsRiver['activity_id'], stacked into a single
    # dataframe (an activity_id that repeats is looked up once per occurrence)
    nearest_observations = [near_observation(incident) for incident in EventsRiver['activity_id']]
    WeatherRiver = pd.concat(nearest_observations)
    print(f'WeatherRiver {WeatherRiver.shape} created')



In [None]:
# Export WeatherRiver to a feather file, or, when exporting is disabled,
# replace it with the previously exported file
if file_export_enabled :
    # Only a freshly computed frame carries non-default row labels that must be
    # moved into a column before writing. A frame that was itself loaded from
    # feather already contains those columns: calling reset_index() on it again
    # would add one more index column per run and eventually raise
    # "cannot insert level_0, already exists", so it is written back unchanged.
    if protracted_calculation_enabled :
        weather_river_export = WeatherRiver.reset_index()
    else:
        weather_river_export = WeatherRiver
    weather_river_export.to_feather(export_data_folder + '/' + 'WeatherRiver.feather')
    print(f'WeatherRiver {WeatherRiver.shape} exported to {export_data_folder}')
else:
    WeatherRiver = pd.read_feather(export_data_folder + '/' + 'WeatherRiver.feather')
    print(f'WeatherRiver {WeatherRiver.shape} imported from {export_data_folder}')

# 4. Data check: Map

## 4.1. Dataframe structure

In [None]:
# Show the first observations (rendered as the cell output) to check the
# structure of the dataframe
WeatherRiver.head()

## 4.2. Map visualization

In [None]:
# Hover label of every point: station identifier and distance to the incident
hover_text = WeatherRiver.apply(
    lambda row:f"station:{row['station']}<br>station_dist: {row['station_dist']}",
    axis=1,
)

# Marker appearance: colour encodes the station distance on a logarithmic scale
marker_style = {
    'size': 5,
    'color': np.log1p(WeatherRiver['station_dist']),
    'colorscale': px.colors.sequential.Viridis,
    'opacity': 0.5,
}

# One marker per WeatherRiver observation
weather_points = go.Scattermapbox(
    lat=WeatherRiver['latitude'],
    lon=WeatherRiver['longitude'],
    mode='markers',
    marker=marker_style,
    text=hover_text,
)

# Create the figure and add the points to it
fig = go.Figure()
fig.add_trace(weather_points)

# Map design: no margins, OpenStreetMap tiles, fixed centre and zoom level
fig.update_layout(
    margin=dict(l=0, t=0, b=0, r=0),
    mapbox=dict(
        style="open-street-map",
        center=dict(lon=-112, lat=48),
        zoom=2,
    ),
)

# Show map
fig.show()

<hr style="border: 1px solid #2fa4e7;">