# Predictive analysis of naval incidents in the USA, 2002 - 2015: <br>
## Annex 4.1. Data Explore: VesselBalancedSample

> Author: [Oscar Anton](https://www.linkedin.com/in/oscanton/) <br>
> Date: 2024 <br>
> License: [CC BY-NC-ND 4.0 DEED](https://creativecommons.org/licenses/by-nc-nd/4.0/) <br>
> Version: 0.9 <br>

# 0. Loadings

### Libraries

In [None]:
# Data general management

import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Automatic Exploratory Data Analysis (EDA) report
from ydata_profiling import ProfileReport

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

### General variables

In [None]:
# Main data folder: relative path of the directory that holds the
# merged-activity feather file read by this notebook
merged_activity_folder = '../3.DataPreprocess/DataMergedActivity'

# Toggle for exporting data to an external file: the HTML report at the end of
# the notebook is only written when this is True
file_export_enabled = False

### Load base dataframe

In [None]:
# Read the balanced vessel sample from its feather file in the data folder
VesselBalancedSample = pd.read_feather('/'.join([merged_activity_folder, 'VesselBalancedSample.feather']))

# Report the loaded shape and preview the first rows
print(f'VesselBalancedSample {VesselBalancedSample.shape} loaded')
VesselBalancedSample.head()

# 1. Vessel features

## 1.1. vessel_class

### Frequency

In [None]:
# One row per vessel, then the number of vessels in each class (largest first)
unique_vessels = VesselBalancedSample.drop_duplicates(subset='vessel_id', keep='first')
class_counts = unique_vessels.groupby('vessel_class').size().reset_index(name='frequency')
filtered_df = class_counts.sort_values(by='frequency', ascending=False)

# Horizontal bar chart of the class counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=filtered_df, x='frequency', y='vessel_class', palette='viridis', ax=ax)

# Title and axis labels (the class names on the y axis need no label)
ax.set_title('Vessel type')
ax.set_xlabel('Number of vessels')
ax.set_ylabel(None)
plt.show()

## 1.2. build_year

### Frequency

In [None]:
# Keep build years from 1800 to 2015 (both inclusive), one row per vessel
valid_year = VesselBalancedSample['build_year'].between(1800, 2015)
filtered_df = VesselBalancedSample[valid_year].drop_duplicates(subset='vessel_id', keep='first')

# Histogram with bin edges at every year from 1800 to 2015
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(filtered_df['build_year'], bins=range(1800, 2016),
             edgecolor='None', color='#00bfc4', alpha=0.9, ax=ax)

# Title, axis labels and grid
ax.set_title('Distribution of build year')
ax.set_xlabel('Build Year')
ax.set_ylabel('Number of vessels')
ax.grid(True)
plt.show()

## 1.3. gross_tonnage

### Density

In [None]:
# Keep vessels with a gross tonnage from 1 to 250000 (both inclusive)
tonnage_in_range = VesselBalancedSample['gross_ton'].between(1, 250000)

# Labels for facet wrap: one label per tonnage interval built by pd.cut below
# (pd.cut intervals include their right edge)
labels = ["1-100", "100-1000", "1000-50000", "50000-250000"]

# One row per vessel, tagged with its tonnage range. assign() returns a new
# frame, so the filtered selection is never modified in place.
filtered_df = (VesselBalancedSample
               [tonnage_in_range]
               .drop_duplicates(subset='vessel_id', keep='first')
               .assign(gross_ton_range=lambda df: pd.cut(df['gross_ton'],
                                                         bins=[0, 100, 1000, 50000, 250000],
                                                         labels=labels)))

# Plot: one density curve per tonnage range, each facet with its own axes
sns.set(style="whitegrid")
g = sns.FacetGrid(filtered_df, col="gross_ton_range", col_wrap=2, height=4, sharey=False, sharex=False)
# fill=True replaces the deprecated shade=True argument of kdeplot
g.map(sns.kdeplot, "gross_ton", fill=True)
g.set_axis_labels("Gross Ton", "Density")
g.set_titles("Range {col_name}")

# Customize plot
plt.suptitle('Vessel Gross Tonnage (Density)')
plt.tight_layout()
plt.show()

### Ranking

In [None]:
# Columns shown in the ranking table
ranking_columns = ['vessel_id', 'imo_number', 'vessel_name', 'build_year', 'gross_ton', 'length']

# Largest gross tonnage first, fully identical rows collapsed, top 10 kept
by_tonnage = VesselBalancedSample[ranking_columns].sort_values(by='gross_ton', ascending=False)
filtered_df = by_tonnage.drop_duplicates().head(10)

# Table output
print(filtered_df)

## 1.4. Vessel length

### Density

In [None]:
# Bin edges shared by the row filter and the facet ranges, so every vessel that
# passes the filter falls inside one of the facets
length_bins = [0, 250, 1000]

# Labels for facet wrap: one label per length interval
labels = ["1-250", "250-1000"]

# Keep vessels with a length from 1 up to the last bin edge (both inclusive).
# A larger upper bound would admit vessels that match no bin; those get no
# range label and are silently left out of both facets.
# NOTE(review): if longer vessels were meant to be plotted, widen the last bin
# and its label instead of the filter.
length_in_range = VesselBalancedSample['length'].between(1, length_bins[-1])

# One row per vessel, tagged with its length range. assign() returns a new
# frame, so the filtered selection is never modified in place.
filtered_df = (VesselBalancedSample
               [length_in_range]
               .drop_duplicates(subset='vessel_id', keep='first')
               .assign(length_range=lambda df: pd.cut(df['length'], bins=length_bins, labels=labels)))

# Plot: one density curve per length range, stacked in a single column
g = sns.FacetGrid(filtered_df, col="length_range", col_wrap=1, height=3, aspect=7/3, sharey=False, sharex=False)
# fill=True replaces the deprecated shade=True argument of kdeplot
g.map(sns.kdeplot, "length", fill=True)

# Customize plot
g.set_axis_labels("Length", "Density")
g.set_titles("Range {col_name}")

plt.suptitle('Vessel Lengths (Density)')
plt.tight_layout()
plt.show()

### Ranking

In [None]:
# Columns shown in the ranking table
ranking_columns = ['vessel_id', 'imo_number', 'vessel_name', 'build_year', 'gross_ton', 'length']

# Longest vessels first, fully identical rows collapsed, top 10 kept
by_length = VesselBalancedSample[ranking_columns].sort_values(by='length', ascending=False)
filtered_df = by_length.drop_duplicates().head(10)

# Table output
print(filtered_df)

## 1.5. Flag

### Frequency (All)

In [None]:
# One row per vessel, then the number of vessels under each flag;
# only the 10 most frequent flags are kept
unique_vessels = VesselBalancedSample.drop_duplicates(subset='vessel_id', keep='first')
flag_counts = unique_vessels.groupby('flag_abbr').size().reset_index(name='frequency')
filtered_df = flag_counts.sort_values(by='frequency', ascending=False).head(10)

# Vertical bar chart of the flag counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=filtered_df, x='flag_abbr', y='frequency', palette='viridis', ax=ax)

# Title and axis labels
ax.set_title('Flag (All)')
ax.set_xlabel('Country')
ax.set_ylabel('Number of vessels')
plt.show()



### Frequency (Foreign)

In [None]:
# Drop rows flagged "US", keep one row per vessel, then count the vessels
# under each remaining flag; only the 10 most frequent flags are kept
is_foreign = VesselBalancedSample['flag_abbr'] != "US"
foreign_vessels = VesselBalancedSample[is_foreign].drop_duplicates(subset='vessel_id', keep='first')
flag_counts = foreign_vessels.groupby('flag_abbr').size().reset_index(name='frequency')
filtered_df = flag_counts.sort_values(by='frequency', ascending=False).head(10)

# Vertical bar chart of the flag counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=filtered_df, x='flag_abbr', y='frequency', palette='viridis', ax=ax)

# Title and axis labels
ax.set_title('Flag (Foreign)')
ax.set_xlabel('Country')
ax.set_ylabel('Number of vessels')
plt.show()

## 1.6. Classification Societies

In [None]:
# Drop rows with an "UNSPECIFIED" society, keep one row per vessel, then count
# the vessels of each society; only the 10 most frequent societies are kept
has_society = VesselBalancedSample['classification_society'] != "UNSPECIFIED"
classed_vessels = VesselBalancedSample[has_society].drop_duplicates(subset='vessel_id', keep='first')
society_counts = classed_vessels.groupby('classification_society').size().reset_index(name='frequency')
filtered_df = society_counts.sort_values(by='frequency', ascending=False).head(10)

# Horizontal bar chart of the society counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=filtered_df, x='frequency', y='classification_society', palette='viridis', ax=ax)

# Percentage labels centred inside each bar.
# The share is relative to the 10 societies shown, not to the whole sample.
filtered_df = filtered_df.assign(
    percentage=filtered_df['frequency'] / filtered_df['frequency'].sum() * 100)
for bar_pos, row in enumerate(filtered_df.itertuples(index=False)):
    ax.text(row.frequency / 2, bar_pos, f'{row.percentage:.1f}%', va='center', ha='center', color='white')

# Title and axis labels (the society names on the y axis need no label)
ax.set_title('Classification Societies')
ax.set_xlabel('Number of vessels')
ax.set_ylabel(None)
plt.show()

## 1.7. SOLAS Membership

In [None]:
# One row per vessel, then the number of vessels per SOLAS description
# (smallest group first)
unique_vessels = VesselBalancedSample.drop_duplicates(subset='vessel_id', keep='first')
solas_counts = unique_vessels.groupby('solas_desc').size().reset_index(name='frequency')
filtered_df = solas_counts.sort_values(by='frequency', ascending=True)

# Vertical bar chart of the SOLAS counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=filtered_df, x='solas_desc', y='frequency', palette='viridis', ax=ax)

# Percentage labels centred inside each bar (share of all counted vessels)
filtered_df = filtered_df.assign(
    percentage=filtered_df['frequency'] / filtered_df['frequency'].sum() * 100)
for bar_pos, row in enumerate(filtered_df.itertuples(index=False)):
    ax.text(bar_pos, row.frequency / 2, f'{row.percentage:.1f}%', va='center', ha='center', color='lightgrey')

# Title and axis labels (the SOLAS descriptions on the x axis need no label)
ax.set_title('SOLAS Membership')
ax.set_xlabel(None)
ax.set_ylabel('Number of vessels')
plt.show()

# 2. Incidents

## 2.1. event_type

### Frequency

In [None]:
# Drop "No event" rows, keep the first remaining row of each vessel, then
# count the vessels per event type (most frequent first)
has_event = VesselBalancedSample['event_type'] != "No event"
vessels_with_event = VesselBalancedSample[has_event].drop_duplicates(subset='vessel_id', keep='first')
event_counts = vessels_with_event.groupby('event_type').size().reset_index(name='frequency')
filtered_df = event_counts.sort_values(by='frequency', ascending=False)

# Horizontal bar chart of the event counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=filtered_df, x='frequency', y='event_type', palette='viridis', ax=ax)

# Title and axis labels
ax.set_title('Incidents')
ax.set_xlabel('Number of vessels')
ax.set_ylabel('Event type')
plt.show()

## 2.2. damage_status

### Frequency

In [None]:
# One row per vessel, then the number of vessels per damage status
# (most frequent first)
unique_vessels = VesselBalancedSample.drop_duplicates(subset='vessel_id', keep='first')
damage_counts = unique_vessels.groupby('damage_status').size().reset_index(name='frequency')
filtered_df = damage_counts.sort_values(by='frequency', ascending=False)

# Horizontal bar chart of the damage-status counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=filtered_df, x='frequency', y='damage_status', palette='viridis', ax=ax)

# Title and axis labels
ax.set_title('Damage Status')
ax.set_xlabel('Number of vessels')
ax.set_ylabel('Consequence')
plt.show()

# 3. Incident Involvement

## 3.1. Incident Involvement / Vessel features

In [None]:
# Flag every row of the base dataframe: True when its event type is anything
# other than "No event". Later cells read this 'involved' column.
VesselBalancedSample['involved'] = VesselBalancedSample['event_type'] != "No event"

# Keep rows under the tonnage and length cut-offs and built after 1950,
# restricted to the flag plus the three numeric features
small_tonnage = VesselBalancedSample['gross_ton'] < 400
short_length = VesselBalancedSample['length'] < 250
recent_build = VesselBalancedSample['build_year'] > 1950
filtered_df = VesselBalancedSample.loc[small_tonnage & short_length & recent_build,
                                       ['involved', 'gross_ton', 'length', 'build_year']]

# Long format: one (involved, variable, value) row per feature value
filtered_df = filtered_df.melt(id_vars='involved', var_name='variable', value_name='value')

# One facet per feature: boxplot by involvement with a point plot drawn on top
outlier_style = dict(markerfacecolor='darkgrey', markeredgecolor='none', markersize=4)
g = sns.FacetGrid(filtered_df, col='variable', sharey=False, col_wrap=3)
g.map_dataframe(sns.boxplot, x='involved', y='value', palette='viridis', flierprops=outlier_style)
g.map_dataframe(sns.pointplot, x='involved', y='value', color='darkgrey', markers='.')

# Facet titles and axis labels
g.set_titles("Boxplot of {col_name}")
g.set_axis_labels("Involved", "Value")

plt.show()

### Incident involvement / build_year

In [None]:
# For build years 1800-2015 (both inclusive): rows involved in an incident,
# rows not involved and total rows per year, years with most rows first
valid_year = VesselBalancedSample['build_year'].between(1800, 2015)
involved_by_year = VesselBalancedSample[valid_year].groupby('build_year')['involved']
filtered_df = (involved_by_year
               .agg(involved='sum', not_involved=lambda flags: len(flags) - sum(flags), total='count')
               .sort_values(by='total', ascending=False)
               .reset_index())

# Stacked bars: involved at the bottom, not involved on top of them
fig, ax = plt.subplots(figsize=(10, 6))
inv_bars = ax.bar(filtered_df['build_year'], filtered_df['involved'],
                  color='orangered', edgecolor='none')
notinv_bars = ax.bar(filtered_df['build_year'], filtered_df['not_involved'],
                     bottom=filtered_df['involved'],
                     color='#00bfc4', edgecolor='none')

# Title, axis labels and legend
ax.set_title('Involvement in incidents according to build year')
ax.set_xlabel('build year')
ax.set_ylabel('Number of vessels')
ax.legend((inv_bars[0], notinv_bars[0]), ('Involved', 'Not Involved'), loc='upper left')
plt.show()

## 3.2. Incident involvement / vessel_class

In [None]:
# Per vessel class: rows involved in an incident, rows not involved and total
# rows. Sorted ascending so the class with most rows ends up at the top of the
# horizontal chart.
filtered_df = (VesselBalancedSample
               .groupby('vessel_class')['involved']
               .agg(involved='sum', not_involved=lambda x: len(x) - sum(x), total='count')
               .sort_values(by='total', ascending=True)
               .reset_index())

# Plot bars in stack manner: involved first, not involved to their right
inv_bars = plt.barh(filtered_df['vessel_class'], filtered_df['involved'],
                    color='orangered', edgecolor='none')
notinv_bars = plt.barh(filtered_df['vessel_class'], filtered_df['not_involved'],
                       left=filtered_df['involved'],
                       color='#00bfc4', edgecolor='none')

# Customize plot (the title names the grouping variable of this chart)
plt.title('Involvement in incidents according to vessel class')
plt.xlabel('Number of vessels')
plt.ylabel('Vessel Class')
plt.legend((inv_bars[0], notinv_bars[0]), ('Involved', 'Not Involved'), loc='lower right')
plt.show()

## 3.3. Incident involvement / Flag

### All Flags

In [None]:
# Per flag: rows involved in an incident, rows not involved and total rows;
# only the 10 flags with most rows are kept
involved_by_flag = VesselBalancedSample.groupby('flag_abbr')['involved']
flag_summary = involved_by_flag.agg(involved='sum',
                                    not_involved=lambda flags: len(flags) - sum(flags),
                                    total='count')
filtered_df = flag_summary.sort_values(by='total', ascending=False).reset_index().head(10)

# Stacked bars on the current axes: involved at the bottom, not involved on top
ax = plt.gca()
inv_bars = ax.bar(filtered_df['flag_abbr'], filtered_df['involved'],
                  color='orangered', edgecolor='none')
notinv_bars = ax.bar(filtered_df['flag_abbr'], filtered_df['not_involved'],
                     bottom=filtered_df['involved'],
                     color='#00bfc4', edgecolor='none')

# Title, axis labels and legend
ax.set_title('Involvement in incidents according to flag')
ax.set_xlabel('Country')
ax.set_ylabel('Number of vessels')
ax.legend((inv_bars[0], notinv_bars[0]), ('Involved', 'Not Involved'))
plt.show()

### Foreign Flags

In [None]:
# Rows not flagged "US", summarised per flag: rows involved in an incident,
# rows not involved and total rows; only the 10 flags with most rows are kept
is_foreign = VesselBalancedSample['flag_abbr'] != "US"
involved_by_flag = VesselBalancedSample[is_foreign].groupby('flag_abbr')['involved']
flag_summary = involved_by_flag.agg(involved='sum',
                                    not_involved=lambda flags: len(flags) - sum(flags),
                                    total='count')
filtered_df = flag_summary.sort_values(by='total', ascending=False).reset_index().head(10)

# Stacked bars on the current axes: involved at the bottom, not involved on top
ax = plt.gca()
inv_bars = ax.bar(filtered_df['flag_abbr'], filtered_df['involved'],
                  color='orangered', edgecolor='none')
notinv_bars = ax.bar(filtered_df['flag_abbr'], filtered_df['not_involved'],
                     bottom=filtered_df['involved'],
                     color='#00bfc4', edgecolor='none')

# Title, axis labels and legend
ax.set_title('Involvement in incidents according to flag (foreign)')
ax.set_xlabel('Country')
ax.set_ylabel('Number of vessels')
ax.legend((inv_bars[0], notinv_bars[0]), ('Involved', 'Not Involved'))
plt.show()

## 3.4. Incident involvement / classification_society

### All vessels

In [None]:
# Rows with a specified society, summarised per society: rows involved in an
# incident, rows not involved and total rows; the 7 societies with most rows
has_society = VesselBalancedSample['classification_society'] != "UNSPECIFIED"
involved_by_society = VesselBalancedSample[has_society].groupby('classification_society')['involved']
society_summary = involved_by_society.agg(involved='sum',
                                          not_involved=lambda flags: len(flags) - sum(flags),
                                          total='count')
filtered_df = society_summary.sort_values(by='total', ascending=False).reset_index().head(7)

# Stacked horizontal bars: involved first, not involved to their right
fig, ax = plt.subplots()
inv_bars = ax.barh(filtered_df['classification_society'], filtered_df['involved'],
                   color='orangered', edgecolor='none')
notinv_bars = ax.barh(filtered_df['classification_society'], filtered_df['not_involved'],
                      left=filtered_df['involved'],
                      color='#00bfc4', edgecolor='none')

# Percentage of each society's rows, written at the centre of every segment
# (involved segments first, then not-involved segments)
for bars, counts in ((inv_bars, filtered_df['involved']),
                     (notinv_bars, filtered_df['not_involved'])):
    shares = (counts / filtered_df['total']) * 100
    for bar, share in zip(bars, shares):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_y() + bar.get_height() / 2,
                '{:.1f}%'.format(share),
                va='center', ha='center', color='white', size='8')

# Title, axis labels and legend
ax.set_title('Involvement in incidents according to Classification Society (All Vessels)')
ax.set_xlabel('Number of vessels')
ax.set_ylabel('Classification Society')
ax.legend((inv_bars[0], notinv_bars[0]), ('Involved', 'Not Involved'))
plt.show()

### gross_ton > 50000 GT

In [None]:
# Rows with a specified society and a gross tonnage of at least 50000,
# summarised per society: rows involved in an incident, rows not involved and
# total rows; the 7 societies with most rows are kept.
# NOTE(review): the filter is inclusive (>= 50000) while the plot title reads
# "> 50000" - confirm which boundary is intended.
has_society = VesselBalancedSample['classification_society'] != "UNSPECIFIED"
is_large = VesselBalancedSample['gross_ton'] >= 50000
involved_by_society = VesselBalancedSample[has_society & is_large].groupby('classification_society')['involved']
society_summary = involved_by_society.agg(involved='sum',
                                          not_involved=lambda flags: len(flags) - sum(flags),
                                          total='count')
filtered_df = society_summary.sort_values(by='total', ascending=False).reset_index().head(7)

# Stacked horizontal bars: involved first, not involved to their right
fig, ax = plt.subplots()
inv_bars = ax.barh(filtered_df['classification_society'], filtered_df['involved'],
                   color='orangered', edgecolor='none')
notinv_bars = ax.barh(filtered_df['classification_society'], filtered_df['not_involved'],
                      left=filtered_df['involved'],
                      color='#00bfc4', edgecolor='none')

# Percentage of each society's rows, written at the centre of every segment
# (involved segments first, then not-involved segments)
for bars, counts in ((inv_bars, filtered_df['involved']),
                     (notinv_bars, filtered_df['not_involved'])):
    shares = (counts / filtered_df['total']) * 100
    for bar, share in zip(bars, shares):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_y() + bar.get_height() / 2,
                '{:.1f}%'.format(share),
                va='center', ha='center', color='white', size='8')

# Title, axis labels and legend
ax.set_title('Involvement in incidents according to Classification Society (Gross ton > 50000)')
ax.set_xlabel('Number of vessels')
ax.set_ylabel('Classification Society')
ax.legend((inv_bars[0], notinv_bars[0]), ('Involved', 'Not Involved'))
plt.show()

## 3.5. Incident involvement / SOLAS Membership

In [None]:
# Per SOLAS description: rows involved in an incident, rows not involved and
# total rows, groups with most rows first
involved_by_solas = VesselBalancedSample.groupby('solas_desc')['involved']
solas_summary = involved_by_solas.agg(involved='sum',
                                      not_involved=lambda flags: len(flags) - sum(flags),
                                      total='count')
filtered_df = solas_summary.sort_values(by='total', ascending=False).reset_index()

# Stacked bars on the current axes: involved at the bottom, not involved on top
ax = plt.gca()
inv_bars = ax.bar(filtered_df['solas_desc'], filtered_df['involved'],
                  color='orangered', edgecolor='none')
notinv_bars = ax.bar(filtered_df['solas_desc'], filtered_df['not_involved'],
                     bottom=filtered_df['involved'],
                     color='#00bfc4', edgecolor='none')

# Title, axis labels and legend (the SOLAS descriptions need no x label)
ax.set_title('Involvement in incidents according to SOLAS membership')
ax.set_xlabel(None)
ax.set_ylabel('Number of vessels')
ax.legend((inv_bars[0], notinv_bars[0]), ('Involved', 'Not Involved'))
plt.show()

## 3.6. Incident involvement / damage_status

In [None]:
# Per damage status: rows involved in an incident, rows not involved and total
# rows, groups with most rows first
involved_by_damage = VesselBalancedSample.groupby('damage_status')['involved']
damage_summary = involved_by_damage.agg(involved='sum',
                                        not_involved=lambda flags: len(flags) - sum(flags),
                                        total='count')
filtered_df = damage_summary.sort_values(by='total', ascending=False).reset_index()

# Stacked horizontal bars on the current axes: involved first, not involved
# to their right
ax = plt.gca()
inv_bars = ax.barh(filtered_df['damage_status'], filtered_df['involved'],
                   color='orangered', edgecolor='none')
notinv_bars = ax.barh(filtered_df['damage_status'], filtered_df['not_involved'],
                      left=filtered_df['involved'],
                      color='#00bfc4', edgecolor='none')

# Title, axis labels and legend
ax.set_title('Involvement in incidents according to damage')
ax.set_xlabel('Number of vessels')
ax.set_ylabel('Condition')
ax.legend((inv_bars[0], notinv_bars[0]), ('Involved', 'Not Involved'))
plt.show()

# 4. Correlations

## 4.1. Correlation Matrix

In [None]:
# Remove the identifier columns before computing correlations.
# NOTE: from here on the base dataframe is overwritten; the cells below this
# one work on the reduced, numerically encoded version.
VesselBalancedSample = VesselBalancedSample.drop(columns=['vessel_id', 'imo_number', 'vessel_name'])

# Replace each categorical variable with its integer category codes
for column in ['vessel_class', 'flag_abbr', 'classification_society',
               'solas_desc', 'event_type', 'damage_status']:
    VesselBalancedSample[column] = VesselBalancedSample[column].astype('category').cat.codes

# Coerce every column to numeric; values that cannot be parsed become NaN
VesselBalancedSample = VesselBalancedSample.apply(pd.to_numeric, errors='coerce')

# Annotated heatmap of the pairwise correlation matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(VesselBalancedSample.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation matrix')
plt.show()

# 5. Extra: Automatic EDA report

In [None]:
# Build the ydata_profiling report for the dataframe in its current state
profile = ProfileReport(VesselBalancedSample, title='VesselBalancedSample: EDA')

# Write the HTML report only when exporting is enabled; otherwise just print
# a message
if not file_export_enabled:
    print('EDA report already exported')
else:
    profile.to_file("Exported Reports/VesselBalancedSample_EDA.html")

<hr style="border: 1px solid #2fa4e7;">