In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
import joblib

In [3]:
df = pd.read_csv('/content/drive/MyDrive/notebook_data/datasets/merged-labeled-cleaned_sw.tsv',sep='\t') # assuming column called ['text']
df.head()

Unnamed: 0,text,label,cleaned
0,#EartthquakeReport #TsunamiReport for M7.2 #Ea...,earthquake,eartthquakereport tsunamireport for m72 earthq...
1,Tsunami warning lifted after earthquake off Al...,earthquake,tsunami warning lifted after earthquake off al...
2,"First Temblor map (AFAIK) on bluesky! Today, a...",earthquake,first temblor map afaik on bluesky today a mag...
3,\U0001f9ea\n\nA M7.2 earthquake occurred offsh...,earthquake,test tube a m72 earthquake occurred offshore a...
4,Earthquake waves from the M7.2 earthquake in A...,earthquake,earthquake waves from the m72 earthquake in al...


In [4]:
df.dropna(inplace=True)

In [4]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
other,70610
earthquake,5654
wildfire,5611
hurricane,4746
flood,3312
tornado,765


In [26]:
import xgboost as xgb
import lightgbm as lgb

In [44]:
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import NearMiss
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE

In [45]:
# train test split
X = df['cleaned']
y = df['label']

from sklearn.preprocessing import LabelEncoder

# Encode string labels into integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Converts labels to numeric form
print(f"Labels before conversion: {np.unique(y)}")
print(f"Labels after conversion: {np.unique(y_encoded)}")

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=24)

print(f"Training with {len(X_train)} samples; Testing with {len(X_test)} samples")

Labels before conversion: ['earthquake' 'flood' 'hurricane' 'other' 'tornado' 'wildfire']
Labels after conversion: [0 1 2 3 4 5]
Training with 72558 samples; Testing with 18140 samples


In [50]:
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=10000, min_df=2, max_df=0.85)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

smote = SMOTE(sampling_strategy={4: 2649}, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

print("Class distribution before SMOTE:", dict(zip(*np.unique(y_train, return_counts=True))))
print("Class distribution after SMOTE:", dict(zip(*np.unique(y_train_smote, return_counts=True))))

Class distribution before SMOTE: {np.int64(0): np.int64(4523), np.int64(1): np.int64(2649), np.int64(2): np.int64(3797), np.int64(3): np.int64(56488), np.int64(4): np.int64(612), np.int64(5): np.int64(4489)}
Class distribution after SMOTE: {np.int64(0): np.int64(4523), np.int64(1): np.int64(2649), np.int64(2): np.int64(3797), np.int64(3): np.int64(56488), np.int64(4): np.int64(2649), np.int64(5): np.int64(4489)}


In [51]:
nm = NearMiss(sampling_strategy='majority')
X_train_resampled, y_train_resampled = nm.fit_resample(X_train_smote, y_train_smote)

In [53]:
print("Class distribution after NearMiss:", dict(zip(*np.unique(y_train_resampled, return_counts=True))))
print("Class distribution before NearMiss:", dict(zip(*np.unique(y_train, return_counts=True))))

Class distribution after NearMiss: {np.int64(0): np.int64(4523), np.int64(1): np.int64(2649), np.int64(2): np.int64(3797), np.int64(3): np.int64(2649), np.int64(4): np.int64(2649), np.int64(5): np.int64(4489)}
Class distribution before NearMiss: {np.int64(0): np.int64(4523), np.int64(1): np.int64(2649), np.int64(2): np.int64(3797), np.int64(3): np.int64(56488), np.int64(4): np.int64(612), np.int64(5): np.int64(4489)}


In [54]:
xgb_gpu = xgb.XGBClassifier(objective='multi:softmax', num_class=len(set(y)), tree_method="gpu_hist")
lgbm = lgb.LGBMClassifier(objective='multiclass', num_class=len(set(y)))

meta_model = LogisticRegression(max_iter=1000)

stacking_clf = StackingClassifier(
    estimators=[('xgb', xgb_gpu), ('lgbm', lgbm)],
    final_estimator=meta_model
)

In [55]:
param_dists = {
    'lgbm__num_leaves': [18, 25, 31, 50, 100],
    'lgbm__n_estimators': [100, 125, 150, 200],
    'lgbm__learning_rate': [0.05, 0.08, 0.1],
    'xgb__max_depth': [3, 5, 7],
    'xgb__n_estimators': [100, 150, 200],
    'xgb__learning_rate': [0.05, 0.1, 0.2],
    'final_estimator__C': [0.1, 1, 10]
}

In [56]:
# set up the randomized search
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(
    estimator=stacking_clf,
    param_distributions=param_dists,
    n_iter=20,
    cv=kfolds,
    verbose=2,
    scoring='f1_macro',
    n_jobs=-1,
    random_state=42,
    error_score='raise'
)

In [57]:
random_search.fit(X_train_resampled, y_train_resampled)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


KeyboardInterrupt: 

In [None]:
print(f"Best parameters: {random_search.best_params_}")
print(f"Best score: {random_search.best_score_}")

In [None]:
best_estimator = random_search.best_estimator_
y_pred = best_estimator.predict(X_test)
# convert them back to the labels we can understand
y_pred_text = label_encoder.inverse_transform(y_pred)
y_test_text = label_encoder.inverse_transform(y_test)
print(classification_report(y_test_text, y_pred_text), confusion_matrix(y_test_text, y_pred_text))

In [None]:
cv_results = random_search.cv_results_

# Print mean training and validation scores for each parameter set
for mean_train, mean_val, params in zip(cv_results["mean_train_score"], cv_results["mean_test_score"], cv_results["params"]):
    print(f"Params: {params}")
    print(f"Train Score: {mean_train:.4f} | Validation Score: {mean_val:.4f}\n")

print("Best Validation Score:", random_search.best_score_)

In [None]:
# wrap everything in a. pipeline
pipeline = Pipeline([
    ('tfidf', tfidf),
    ('clf', best_estimator)
])

In [None]:
# export model
joblib.dump((label_encoder, pipeline), '/content/drive/MyDrive/notebook_data/stacking_model_v1.pkl')

['/content/drive/MyDrive/notebook_data/lgbm_model_v5.pkl']