In [1]:
# Fishing Detection: Initial Data Exploration

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import folium
from folium.plugins import HeatMap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Set up plotting
plt.style.use('seaborn-whitegrid')
sns.set(font_scale=1.2)
plt.rcParams['figure.figsize'] = [12, 8]

# 1. Load Data
print("Loading data...")
df = pd.read_csv('../data/raw/drifting_longlines.csv')

# 2. Basic Exploration
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())

print("\nData types:")
print(df.dtypes)

print("\nMissing values:")
print(df.isnull().sum())

print("\nSummary statistics:")
print(df.describe())

# 3. Process Data
print("\nPreprocessing data...")
# Convert timestamp to datetime
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
# Extract time components
df['hour'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek
df['month'] = df['datetime'].dt.month

# Handle fishing label
print("\nFishing label distribution (original):")
print(df['is_fishing'].value_counts())

# Convert fishing label to binary (removing -1 values)
df = df[df['is_fishing'] >= 0]  # Remove no-data points
df['is_fishing_binary'] = (df['is_fishing'] > 0).astype(int)

print("\nFishing label distribution (binary):")
print(df['is_fishing_binary'].value_counts())
print(f"Fishing percentage: {df['is_fishing_binary'].mean()*100:.2f}%")

# 4. Exploratory Visualizations
print("\nCreating visualizations...")

# Plot distributions of features
features = ['speed', 'distance_from_shore', 'distance_from_port']
fig, axes = plt.subplots(len(features), 1, figsize=(12, 5*len(features)))

for i, feature in enumerate(features):
    sns.histplot(data=df, x=feature, hue='is_fishing_binary', bins=50, ax=axes[i])
    axes[i].set_title(f'Distribution of {feature} by fishing activity')
    axes[i].set_ylabel('Count')

plt.tight_layout()
plt.savefig('fishing_feature_distributions.png')
plt.close()

# Visualize fishing activities on a map
print("\nGenerating map visualization...")
# Sample data if too large
sample_df = df.sample(min(10000, len(df)))

# Create a map centered on the mean coordinates
map_center = [sample_df['lat'].mean(), sample_df['lon'].mean()]
m = folium.Map(location=map_center, zoom_start=6)

# Add fishing points in red, non-fishing in blue
for idx, row in sample_df.iterrows():
    color = 'red' if row['is_fishing_binary'] == 1 else 'blue'
    folium.CircleMarker(
        location=[row['lat'], row['lon']],
        radius=3,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7
    ).add_to(m)

# Save the map
m.save('fishing_map.html')
print("Map saved to fishing_map.html")

# 5. Speed vs. Course Analysis
print("\nAnalyzing speed vs. course patterns...")
plt.figure(figsize=(10, 8))
sns.scatterplot(data=sample_df, x='speed', y='course', hue='is_fishing_binary', alpha=0.6)
plt.title('Speed vs. Course by Fishing Activity')
plt.savefig('speed_vs_course.png')
plt.close()

# 6. Time of Day Analysis
print("\nAnalyzing fishing by time of day...")
hourly_fishing = df.groupby('hour')['is_fishing_binary'].mean()
plt.figure(figsize=(10, 6))
hourly_fishing.plot(kind='bar')
plt.title('Fishing Activity by Hour of Day')
plt.xlabel('Hour')
plt.ylabel('Proportion Fishing')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('fishing_by_hour.png')
plt.close()

# 7. Baseline Model (using just raw features)
print("\nTraining baseline model...")
features = ['speed', 'course', 'distance_from_shore', 'distance_from_port', 
           'hour', 'day_of_week', 'month', 'lat', 'lon']
X = df[features]
y = df['is_fishing_binary']

# Split the data (being careful about temporal aspects)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a simple Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("\nBaseline model performance:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature importance
importance = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature importance:")
print(importance)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=importance)
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

print("\nExploration complete!")

  plt.style.use('seaborn-whitegrid')


Loading data...


FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/drifting_longlines.csv'