Part 1: Data Fetching from USGS
This script fetches earthquake data (both tsunami and non-tsunami events) from the USGS API and saves it as a CSV file.

In [None]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime

# Column order of the DataFrame produced by fetch_usgs_mixed_data. Passing it
# explicitly keeps the columns present even when no events are returned.
_USGS_COLUMNS = [
    'magnitude', 'cdi', 'mmi', 'sig', 'nst', 'dmin', 'gap',
    'depth', 'latitude', 'longitude', 'Year', 'Month', 'tsunami',
]


def _parse_usgs_feature(feature):
    """Flatten one GeoJSON feature into a row dict keyed by the CSV column names."""
    props = feature.get('properties') or {}
    coords = (feature.get('geometry') or {}).get('coordinates') or []
    # Coordinates are [longitude, latitude, depth]; pad so that a missing or
    # short list yields None values instead of an IndexError.
    longitude, latitude, depth = (list(coords) + [None, None, None])[:3]

    # `time` is in epoch milliseconds. Convert it in UTC so Year/Month do not
    # depend on the timezone of the machine running the script, and compare
    # against None so that a timestamp of 0 is still treated as valid.
    timestamp = props.get('time')
    if timestamp is not None:
        event_time = pd.Timestamp(timestamp, unit='ms', tz='UTC')
        year, month = event_time.year, event_time.month
    else:
        year, month = np.nan, np.nan

    return {
        'magnitude': props.get('mag'),
        'cdi': props.get('cdi'),
        'mmi': props.get('mmi'),
        'sig': props.get('sig'),
        'nst': props.get('nst'),
        'dmin': props.get('dmin'),
        'gap': props.get('gap'),
        'depth': depth,
        'latitude': latitude,
        'longitude': longitude,
        'Year': year,
        'Month': month,
        # Target column: a null flag counts as "no tsunami" (0).
        'tsunami': int(props.get('tsunami') or 0),
    }


def fetch_usgs_mixed_data(min_magnitude=6.0, start_year=2000):
    """
    Fetch mixed data (positive/tsunami and negative/no-tsunami samples) from USGS.

    Uses the GeoJSON interface to retrieve detailed fields like cdi, mmi, sig,
    and flattens every returned event into one row.

    Args:
        min_magnitude: Lower magnitude bound sent as ``minmagnitude``.
        start_year: Events are requested from January 1st of this year onward.

    Returns:
        A pandas DataFrame with the columns magnitude, cdi, mmi, sig, nst,
        dmin, gap, depth, latitude, longitude, Year, Month and tsunami
        (Year/Month are taken from the event time in UTC), or None when the
        HTTP request or the JSON decoding fails.
    """
    # USGS API endpoint
    url = "https://earthquake.usgs.gov/fdsnws/event/1/query"

    # Parameters: fetch all earthquakes >= min_magnitude from start_year onward.
    # This ensures a mix of large earthquakes that did and did not cause tsunamis.
    params = {
        "format": "geojson",
        "starttime": f"{start_year}-01-01",
        "minmagnitude": min_magnitude,
        "orderby": "time",
        "limit": 20000,  # Fetch a sufficient amount of data
    }

    print(f"Fetching all earthquakes > {min_magnitude} from {start_year} to present from USGS...")
    print("This includes both 'tsunami-generating' and 'non-tsunami' events...")

    # Only the network round trip and the JSON decoding are guarded here, so
    # that programming errors further down are not reported as fetch failures.
    try:
        response = requests.get(url, params=params, timeout=60)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Data fetch failed: {e}")
        return None

    features = data.get('features') or []
    print(f"Successfully retrieved {len(features)} earthquake records.")
    if len(features) >= params["limit"]:
        # A result set as large as the requested limit may have been cut off.
        print("Warning: result count reached the request limit; data may be truncated.")

    # Parse GeoJSON data to match the desired CSV format. A malformed record
    # is skipped rather than discarding the whole download.
    parsed_data = []
    skipped = 0
    for feature in features:
        try:
            parsed_data.append(_parse_usgs_feature(feature))
        except (AttributeError, TypeError, ValueError):
            skipped += 1
    if skipped:
        print(f"Warning: skipped {skipped} malformed records.")

    return pd.DataFrame(parsed_data, columns=_USGS_COLUMNS)

# ==========================================
# Execution
# ==========================================
if __name__ == "__main__":
    # Step 1: download the earthquake records
    df_mixed = fetch_usgs_mixed_data(min_magnitude=6.0)

    if df_mixed is not None:
        # Step 2: keep the target columns, in the order they go into the CSV
        target_columns = [
            'magnitude', 'cdi', 'mmi', 'sig', 'nst', 'dmin', 'gap',
            'depth', 'latitude', 'longitude', 'Year', 'Month', 'tsunami',
        ]

        # Selecting by label raises KeyError if any column is absent
        df_final = df_mixed.loc[:, target_columns]

        # Step 3: report how many rows fall in each target class
        rule = "=" * 30
        class_labels = {0: 'No Tsunami (Negative)', 1: 'Tsunami (Positive)'}
        class_counts = df_final['tsunami'].value_counts()
        print("\n" + rule)
        print("Dataset Statistics (Class Distribution)")
        print(rule)
        print(class_counts.rename(class_labels))

        # Step 4: write the CSV without the index column
        filename = "usgs_earthquake_tsunami_mixed.csv"
        df_final.to_csv(filename, index=False)
        print(f"\nFile saved as: {filename}")

Part 2: Data Cleaning (Filtering & Filling)
This script reads the raw data generated in Part 1, filters it for the years 2013-2025, fills missing values with 0, and saves the cleaned version.

In [None]:
import pandas as pd
import os

# 1. File names: raw input produced by Part 1 and the cleaned output
input_file = 'usgs_earthquake_tsunami_mixed.csv'
output_file = 'usgs_earthquake_tsunami_mixed_2013_2025_filled.csv'

# 2. Only process when the Part 1 output is present
if os.path.exists(input_file):
    # 3. Read the raw CSV
    print("Loading data...")
    df = pd.read_csv(input_file)
    print(f"Original row count: {len(df)}")

    # 4. Keep rows whose Year lies in 2013..2025 (both ends included)
    year_in_range = df['Year'].between(2013, 2025)
    df_filtered = df.loc[year_in_range].copy()
    print(f"Row count after 2013-2025 filtering: {len(df_filtered)}")

    # 5. Replace every missing value (NaN) with 0
    df_filled = df_filtered.fillna(0)
    print("Filled all missing values (NaN) with 0.")

    # 6. Write the cleaned data without the index column
    df_filled.to_csv(output_file, index=False)

    banner = "=" * 40
    print("\n" + banner)
    print(f"✅ Success! File generated: {output_file}")
    print(banner)
else:
    print(f"❌ Error: Input file '{input_file}' not found.")
    print("Please ensure the file from Part 1 exists in the directory.")

Part 3: Data Deduplication (Removing Shadow Records)
This script takes the cleaned data from Part 2, identifies and removes extremely similar "shadow records" (same year and month, identical magnitude, latitude/longitude within 1° and depth within 10 km of each other), and saves the final deduplicated dataset.

In [None]:
import pandas as pd
import numpy as np

def process_duplicates(coord_tolerance=1.0, depth_tolerance=10.0):
    """
    Remove extremely similar "shadow records" from the cleaned dataset.

    Two rows are treated as the same event when they share Year, Month and
    the exact magnitude, and their latitude, longitude and depth each differ
    by less than the given tolerances. Longitude differences are measured the
    short way around the globe, so events on either side of the +/-180 degree
    meridian are compared correctly.

    For every similar pair the row that appears later in the file is dropped.
    A row is dropped when it matches any earlier row, including an earlier row
    that is itself dropped.

    Reads 'usgs_earthquake_tsunami_mixed_2013_2025_filled.csv' and writes two
    files: 'extremely_similar_records.csv' (every row involved in a similar
    pair, for inspection) and
    'usgs_earthquake_tsunami_mixed_2013_2025_deduplicated.csv' (the final
    data). Prints an error and returns if the input file is missing.

    Args:
        coord_tolerance: Latitude/longitude difference, in degrees, below
            which two rows count as close.
        depth_tolerance: Depth difference, in km, below which two rows count
            as close.

    Returns:
        None.
    """
    input_file = 'usgs_earthquake_tsunami_mixed_2013_2025_filled.csv'
    output_dups = 'extremely_similar_records.csv'
    output_clean = 'usgs_earthquake_tsunami_mixed_2013_2025_deduplicated.csv'

    print(f"Loading file: {input_file} ...")
    try:
        df = pd.read_csv(input_file)
    except FileNotFoundError:
        print("Error: Input file not found.")
        return

    # Add temporary ID for tracking
    df['temp_id'] = df.index

    # Self-join to find potential duplicates
    # Criteria: Same Year, Month, and Exact Magnitude
    merged = pd.merge(df, df, on=['Year', 'Month', 'magnitude'], suffixes=('_1', '_2'))

    # Filter out self-matches and duplicate pairs (keep index_1 < index_2 only)
    merged = merged[merged['temp_id_1'] < merged['temp_id_2']]

    lat_diff = (merged['latitude_1'] - merged['latitude_2']).abs()
    lon_diff = (merged['longitude_1'] - merged['longitude_2']).abs()
    # Longitude wraps around at +/-180 degrees: 179.9 and -179.9 are 0.2
    # degrees apart, not 359.8, so take the shorter of the two directions.
    lon_diff = np.minimum(lon_diff, 360.0 - lon_diff)
    depth_diff = (merged['depth_1'] - merged['depth_2']).abs()

    # Apply strict physical thresholds for "Extremely Similar"
    duplicates = merged[
        (lat_diff < coord_tolerance) &
        (lon_diff < coord_tolerance) &
        (depth_diff < depth_tolerance)
    ]

    print(f"Detected {len(duplicates)} pairs of extremely similar data.")

    # 1. Save these potential duplicates for inspection
    ids_involved = set(duplicates['temp_id_1']).union(set(duplicates['temp_id_2']))
    df_dups = df[df['temp_id'].isin(ids_involved)].copy()
    df_dups = df_dups.sort_values(by=['Year', 'Month', 'magnitude'])
    df_dups.drop(columns=['temp_id'], inplace=True)
    df_dups.to_csv(output_dups, index=False)
    print(f"-> Saved suspected duplicates to: {output_dups} ({len(df_dups)} rows)")

    # 2. Generate final clean data
    # Strategy: For every pair (A, B), keep A (temp_id_1) and drop B (temp_id_2)
    ids_to_drop = set(duplicates['temp_id_2'])
    df_clean = df[~df['temp_id'].isin(ids_to_drop)].copy()
    df_clean.drop(columns=['temp_id'], inplace=True)

    print(f"-> Original row count: {len(df)}")
    print(f"-> Deduplicated row count: {len(df_clean)}")
    print(f"-> Removed {len(ids_to_drop)} redundant records.")

    df_clean.to_csv(output_clean, index=False)
    print(f"-> Saved final deduplicated data to: {output_clean}")


if __name__ == "__main__":
    process_duplicates()