In [293]:
import pandas as pd
import numpy as np
import joblib

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [294]:
# Location of the zipped CSV. Kept in one named constant so the notebook can be
# pointed at another copy of the data by editing a single line.
DATA_PATH = "/content/archive.zip"

# compression="zip" is what pandas infers from the ".zip" extension anyway; spelling it
# out documents that the archive itself is read, not an extracted CSV.
df = pd.read_csv(DATA_PATH, compression="zip")

In [295]:
# First look at the data: overall dimensions, then the leading five rows.
print("Dataset Shape:", df.shape)
print(df.head(n=5))

Dataset Shape: (782, 13)
   magnitude  cdi  mmi  sig  nst   dmin   gap    depth  latitude  longitude  \
0        7.0    8    7  768  117  0.509  17.0   14.000   -9.7963    159.596   
1        6.9    4    4  735   99  2.229  34.0   25.000   -4.9559    100.738   
2        7.0    3    3  755  147  3.125  18.0  579.000  -20.0508   -178.346   
3        7.3    5    5  833  149  1.865  21.0   37.000  -19.2918   -172.129   
4        6.6    0    2  670  131  4.998  27.0  624.464  -25.5948    178.278   

   Year  Month  tsunami  
0  2022     11        1  
1  2022     11        0  
2  2022     11        1  
3  2022     11        1  
4  2022     11        1  


In [296]:
# Column names as a plain Python list, in file order.
print(list(df.columns))

['magnitude', 'cdi', 'mmi', 'sig', 'nst', 'dmin', 'gap', 'depth', 'latitude', 'longitude', 'Year', 'Month', 'tsunami']


In [297]:
def mag_to_severity(mag):
    """Map an earthquake magnitude to a coarse severity label.

    Parameters
    ----------
    mag : number or numeric string, or a missing value
        Magnitude to classify. Anything `pd.isna` treats as missing is passed
        through as NaN; everything else is converted with `float()`.

    Returns
    -------
    str or float
        "Low" below 4.0, "Moderate" below 5.0, "Strong" below 6.0, otherwise
        "Severe". Returns `np.nan` when `mag` is missing.
    """
    if pd.isna(mag):
        return np.nan

    value = float(mag)
    # Exclusive upper bounds in ascending order; the first one the value falls under wins.
    for upper_bound, label in ((4.0, "Low"), (5.0, "Moderate"), (6.0, "Strong")):
        if value < upper_bound:
            return label
    return "Severe"

# Label every row with the severity bucket of its magnitude.
df["severity"] = df["magnitude"].map(mag_to_severity)

In [298]:
# Keep only the rows where both the magnitude and the tsunami flag are present.
df = df[df[["magnitude", "tsunami"]].notna().all(axis=1)]

In [299]:
# Predictor columns for both models, restricted to the ones actually present in `df`
# so that an absent column is skipped rather than raising a KeyError later.
features = [
    name
    for name in (
        "magnitude", "cdi", "mmi", "sig", "nst", "dmin",
        "gap", "depth", "latitude", "longitude", "Year", "Month",
    )
    if name in df.columns
]

In [300]:
# Design matrix plus the two targets: the severity label and the integer tsunami flag.
X = df.loc[:, features]
y_sev = df.loc[:, "severity"]
y_tsu = df.loc[:, "tsunami"].astype(int)

In [301]:
# Numeric branch: fill gaps with the column median, then standardise.
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

In [302]:
# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# Categories not seen during fit are ignored at transform time instead of raising.
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [303]:
# Route columns by dtype: float64/int64 columns go to the numeric branch,
# object columns go to the categorical branch.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, X.select_dtypes(include=["float64", "int64"]).columns),
    ("cat", cat_transformer, X.select_dtypes(include=["object"]).columns),
])

In [304]:
# 80/20 hold-out split; features and both targets are split together so rows stay aligned.
(
    X_train, X_test,
    y_sev_train, y_sev_test,
    y_tsu_train, y_tsu_test,
) = train_test_split(X, y_sev, y_tsu, test_size=0.2, random_state=42)

In [305]:
# Severity classifier: preprocessing followed by a 200-tree random forest (fixed seed).
sev_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=200, random_state=42)),
])

In [306]:
# Fit on the training rows and predict the held-out rows in one chained call
# (`fit` returns the fitted pipeline itself).
sev_pred = sev_model.fit(X_train, y_sev_train).predict(X_test)

In [307]:
# Show which predictors are in play and how the two targets are distributed.
print("Feature columns used:", features)
print("Severity distribution:\n", y_sev.value_counts())
print("Tsunami distribution:\n", y_tsu.value_counts())

# A classifier trained on a single-class target can only ever predict that class, and
# its accuracy is trivially 100%. Say so explicitly instead of training on it silently.
if y_sev.nunique() < 2:
    print(
        "WARNING: severity has fewer than 2 classes; "
        "the severity model can only predict a constant label."
    )

Feature columns used: ['magnitude', 'cdi', 'mmi', 'sig', 'nst', 'dmin', 'gap', 'depth', 'latitude', 'longitude', 'Year', 'Month']
Severity distribution:
 severity
Severe    782
Name: count, dtype: int64
Tsunami distribution:
 tsunami
0    478
1    304
Name: count, dtype: int64


In [308]:
# Numeric preprocessing: median imputation followed by standard scaling
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

In [309]:
# All listed features go through the single numeric branch; any other column is dropped.
preprocessor = ColumnTransformer(
    [("num", numeric_transformer, features)],
    remainder="drop",
)


In [310]:
# 80/20 hold-out split, stratified on the binary tsunami flag so that both parts
# keep the same class proportions.
(
    X_train, X_test,
    y_sev_train, y_sev_test,
    y_tsu_train, y_tsu_test,
) = train_test_split(X, y_sev, y_tsu, test_size=0.20, random_state=42, stratify=y_tsu)

In [311]:
# Severity model: preprocessing + random forest (200 trees, fixed seed, all CPU cores)
sev_pipeline = Pipeline([
    ("preproc", preprocessor),
    ("clf", RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),
])

In [312]:
# Fit the severity pipeline on the training part of the current split.
print("\nTraining severity model...")
sev_pipeline.fit(X=X_train, y=y_sev_train)


Training severity model...


In [313]:
all_severity_labels = ["Low", "Moderate", "Strong", "Severe"]

# Score the pipeline that is actually saved below, on the CURRENT test split.
# Before this line was added, `sev_pred` still held predictions made by the earlier
# `sev_model` on the rows of the first (unstratified) split, so the predictions were
# not row-aligned with `y_sev_test` and `sev_pipeline` itself was never evaluated.
sev_pred = sev_pipeline.predict(X_test)

# Passing the full label list keeps the matrix and report at four classes even when
# some of them do not occur in the test rows.
print("Confusion Matrix:")
print(confusion_matrix(y_sev_test, sev_pred, labels=all_severity_labels))

print("\nClassification Report:")
print(classification_report(
    y_sev_test,
    sev_pred,
    labels=all_severity_labels,
    zero_division=0
))

Confusion Matrix:
[[  0   0   0   0]
 [  0   0   0   0]
 [  0   0   0   0]
 [  0   0   0 157]]

Classification Report:
              precision    recall  f1-score   support

         Low       0.00      0.00      0.00         0
    Moderate       0.00      0.00      0.00         0
      Strong       0.00      0.00      0.00         0
      Severe       1.00      1.00      1.00       157

    accuracy                           1.00       157
   macro avg       0.25      0.25      0.25       157
weighted avg       1.00      1.00      1.00       157



In [314]:
# Persist the fitted severity pipeline (preprocessing + classifier) to disk.
joblib.dump(value=sev_pipeline, filename="severity_model_kaggle.joblib")
print("Saved severity_model_kaggle.joblib")

Saved severity_model_kaggle.joblib


In [315]:
# Tsunami model (Logistic Regression)
# `preprocessor` is already a step of the fitted `sev_pipeline`, and a Pipeline fits its
# steps in place. Use an unfitted clone here so that training this model cannot refit
# the transformer object that lives inside the severity pipeline.
tsu_pipeline = Pipeline(steps=[
    ("preproc", clone(preprocessor)),
    ("clf", LogisticRegression(max_iter=2000, solver="liblinear"))
])

In [316]:
# Train the tsunami classifier, then predict the held-out rows
# (`fit` returns the fitted pipeline, so the two calls can be chained).
print("\nTraining tsunami model...")
y_tsu_pred = tsu_pipeline.fit(X_train, y_tsu_train).predict(X_test)

# Report accuracy, per-class precision/recall/F1 and the raw confusion counts.
print("\nTsunami evaluation")
print("Accuracy:", accuracy_score(y_tsu_test, y_tsu_pred))
print(classification_report(y_tsu_test, y_tsu_pred))
print("Confusion matrix:\n", confusion_matrix(y_tsu_test, y_tsu_pred))


Training tsunami model...

Tsunami evaluation
Accuracy: 0.8598726114649682
              precision    recall  f1-score   support

           0       0.93      0.83      0.88        96
           1       0.77      0.90      0.83        61

    accuracy                           0.86       157
   macro avg       0.85      0.87      0.86       157
weighted avg       0.87      0.86      0.86       157

Confusion matrix:
 [[80 16]
 [ 6 55]]


In [317]:
# Persist the fitted tsunami pipeline (preprocessing + classifier) to disk.
joblib.dump(value=tsu_pipeline, filename="tsunami_model_kaggle.joblib")
print("Saved tsunami_model_kaggle.joblib")

Saved tsunami_model_kaggle.joblib


In [318]:
def predict_single(sample: dict):
    """Run both trained pipelines on one event described by a plain dict.

    Parameters
    ----------
    sample : dict
        Maps feature names to values. Only keys listed in the module-level
        `features` are read; any feature missing from `sample` is passed to the
        pipelines as NaN. Other keys are ignored.

        Example::

            sample = {
                "magnitude": 6.8, "cdi": 5, "mmi": 5, "sig": 700,
                "nst": 120, "dmin": 1.2, "gap": 25, "depth": 15,
                "latitude": -9.7, "longitude": 159.6, "Year": 2022, "Month": 11
            }

    Returns
    -------
    dict
        "severity": label predicted by `sev_pipeline`;
        "tsunami": integer class predicted by `tsu_pipeline`;
        "tsunami_prob": second column of `tsu_pipeline.predict_proba` as a float,
        or None when the pipeline exposes no `predict_proba`.
    """
    # One-row frame with a column per model feature, in the order of `features`.
    single_row = pd.DataFrame(
        [{name: sample.get(name, np.nan) for name in features}]
    )

    severity_label = sev_pipeline.predict(single_row)[0]
    tsunami_label = int(tsu_pipeline.predict(single_row)[0])

    if hasattr(tsu_pipeline, "predict_proba"):
        tsunami_probability = float(tsu_pipeline.predict_proba(single_row)[0, 1])
    else:
        tsunami_probability = None

    return {
        "severity": severity_label,
        "tsunami": tsunami_label,
        "tsunami_prob": tsunami_probability,
    }

In [319]:
# Example usage: start from the per-feature medians of the dataset,
# then override magnitude and depth with hand-picked values.
example = {name: float(X[name].median()) for name in features}
example.update({"magnitude": 6.8, "depth": 15.0})

print("\nSample prediction for example input:")
print(predict_single(example))


Sample prediction for example input:
{'severity': 'Severe', 'tsunami': 0, 'tsunami_prob': 0.41671895814857063}
