In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# path and the model export path used in this notebook both live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import joblib

In [None]:
# Load the merged, labeled and cleaned tweets from the tab-separated file.
# The rest of the notebook relies on the 'text', 'label' and 'cleaned' columns.
df = pd.read_csv(
    '/content/drive/MyDrive/notebook_data/datasets/merged-labeled-cleaned.tsv',
    sep='\t',
)
df.head()

Unnamed: 0,text,label,cleaned
0,#EartthquakeReport #TsunamiReport for M7.2 #Ea...,earthquake,eartthquakereport tsunamireport m72 earthquake...
1,Tsunami warning lifted after earthquake off Al...,earthquake,tsunami warning lifted earthquake alaska coast...
2,"First Temblor map (AFAIK) on bluesky! Today, a...",earthquake,temblor map afaik bluesky today magnitude72 ea...
3,\U0001f9ea\n\nA M7.2 earthquake occurred offsh...,earthquake,test tube m72 earthquake occurred offshore ala...
4,Earthquake waves from the M7.2 earthquake in A...,earthquake,earthquake waves m72 earthquake alaska visible...


In [None]:
# Discard every row that has a missing value in any column, then show how
# many examples remain for each label.
df = df.dropna(axis=0, how='any')
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
other,67773
wildfire,5486
earthquake,5380
hurricane,4693
flood,3286
tornado,732


In [None]:
# Keep only rows whose original tweet ('text') splits on whitespace into at
# least 5 words; shorter tweets are removed.
df = df.loc[df['text'].str.split().str.len().ge(5)]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, HalvingGridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

#df.head()

In [None]:
# Build the model inputs/targets, encode the labels, and create a hold-out split.
from sklearn.preprocessing import LabelEncoder

# Features are the preprocessed tweets; targets are the string class labels.
X = df['cleaned']
y = df['label']

# Fit the encoder on the string labels, then map them to integer codes.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
print(np.unique(y))

# 80/20 split, stratified on the encoded label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)
print(f"Training with {len(X_train)} samples; Testing with {len(X_test)} samples")
print(np.unique(y_train))

['earthquake' 'flood' 'hurricane' 'other' 'tornado' 'wildfire']
Training with 69880 samples; Testing with 17470 samples
[0 1 2 3 4 5]


In [None]:
# Two-step pipeline shared by all candidate models: TF-IDF features followed
# by a classifier. The 'clf' step is left as None here and is filled in by the
# 'clf' entry of the parameter grid during the search.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', None),
    ]
)

In [None]:
# Search space. Keys use the '<step>__<param>' form so they address the
# 'tfidf' and 'clf' steps of the pipeline.
tfidf_params = dict(
    tfidf__max_df=[0.85, 0.9, 0.95],
    tfidf__min_df=[0.001, 2, 3, 5],
    tfidf__ngram_range=[(1, 1), (1, 2)],
)

# LightGBM candidates; the vectorizer options above are merged in so both
# steps are tuned jointly.
# Options tried earlier and currently disabled:
#   'clf__class_weight': ['balanced']
#   'clf__force_col_wise': [True]
#   'clf__min_child_samples': [15]
#   'clf__min_child_weight': [0.01]
#   'clf__reg_alpha': [1.0]
lgbm_param_grid = dict(
    clf=[LGBMClassifier()],
    clf__learning_rate=[0.2, 0.15, 0.1, 0.08, 0.05, 0.03],
    clf__n_estimators=[100, 125, 150],
    clf__num_leaves=[18, 20, 25],
    **tfidf_params,
)

In [None]:
# set up the grid search
# Successive halving scores all candidates on a small subsample of the
# training rows first, then keeps about 1/factor of them for each following,
# larger round.
param_grid = [lgbm_param_grid]
# Stratified, shuffled 3-fold CV with a fixed seed.
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = HalvingGridSearchCV(
    pipeline,
    param_grid,
    cv=kfolds,
    factor=3,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1,
    error_score="raise",  # fail loudly instead of silently scoring a broken candidate
    # Seed the per-round subsampling of training rows. Without it, every run
    # draws different subsamples, so different candidates can survive and the
    # reported best parameters are not reproducible even though the folds and
    # the train/test split are seeded.
    random_state=42,
)

In [None]:
# fit to the data
# Runs the configured search on the training split only (X_train / y_train).
grid_search.fit(X_train, y_train)

n_iterations: 7
n_required_iterations: 7
n_possible_iterations: 7
min_resources_: 95
max_resources_: 69880
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1296
n_resources: 95
Fitting 3 folds for each of 1296 candidates, totalling 3888 fits
----------
iter: 1
n_candidates: 432
n_resources: 285
Fitting 3 folds for each of 432 candidates, totalling 1296 fits
----------
iter: 2
n_candidates: 144
n_resources: 855
Fitting 3 folds for each of 144 candidates, totalling 432 fits
----------
iter: 3
n_candidates: 48
n_resources: 2565
Fitting 3 folds for each of 48 candidates, totalling 144 fits
----------
iter: 4
n_candidates: 16
n_resources: 7695
Fitting 3 folds for each of 16 candidates, totalling 48 fits
----------
iter: 5
n_candidates: 6
n_resources: 23085
Fitting 3 folds for each of 6 candidates, totalling 18 fits
----------
iter: 6
n_candidates: 2
n_resources: 69255
Fitting 3 folds for each of 2 candidates, totalling 6 fits




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.768401 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 192139
[LightGBM] [Info] Number of data points in the train set: 69880, number of used features: 2487
[LightGBM] [Info] Start training from score -2.787235
[LightGBM] [Info] Start training from score -3.280176
[LightGBM] [Info] Start training from score -2.923958
[LightGBM] [Info] Start training from score -0.253767
[LightGBM] [Info] Start training from score -4.781215
[LightGBM] [Info] Start training from score -2.767678


In [1]:
# Report the winning parameter combination and its cross-validated score.
print(
    f"Best parameters: {grid_search.best_params_}",
    f"Best score: {grid_search.best_score_}",
    sep="\n",
)

NameError: name 'grid_search' is not defined

In [None]:
# Evaluate the best pipeline found by the search on the held-out test split.
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)
# convert them back to the labels we can understand
y_pred_text = label_encoder.inverse_transform(y_pred)
y_test_text = label_encoder.inverse_transform(y_test)

print(classification_report(y_test_text, y_pred_text))
# Printed separately: passing both objects to a single print() puts a
# separator space in front of the matrix and misaligns its first row.
# `labels` pins the row/column order to label_encoder.classes_
# (rows = true class, columns = predicted class).
print(confusion_matrix(y_test_text, y_pred_text, labels=label_encoder.classes_))



              precision    recall  f1-score   support

  earthquake       0.77      0.99      0.87      1076
       flood       0.64      0.92      0.75       657
   hurricane       0.73      0.96      0.83       939
       other       0.98      0.84      0.91     13555
     tornado       0.63      0.88      0.73       146
    wildfire       0.46      0.86      0.60      1097

    accuracy                           0.86     17470
   macro avg       0.70      0.91      0.78     17470
weighted avg       0.91      0.86      0.88     17470
 [[ 1070     1     0     4     0     1]
 [    1   606    12    34     3     1]
 [    0    10   906    19     2     2]
 [  317   329   311 11449    70  1079]
 [    0     2     3    13   128     0]
 [    4     5     5   141     1   941]]


In [None]:
cv_results = grid_search.cv_results_

# Report the mean train and validation score of every evaluated candidate,
# in the order the search recorded them.
train_means = cv_results["mean_train_score"]
val_means = cv_results["mean_test_score"]
for idx, params in enumerate(cv_results["params"]):
    mean_train = train_means[idx]
    mean_val = val_means[idx]
    print(f"Params: {params}")
    print(f"Train Score: {mean_train:.4f} | Validation Score: {mean_val:.4f}\n")

print("Best Validation Score:", grid_search.best_score_)

Params: {'clf': LGBMClassifier(), 'clf__class_weight': 'balanced', 'clf__learning_rate': 0.2, 'tfidf__max_df': 0.75, 'tfidf__min_df': 0.001, 'tfidf__ngram_range': (1, 1)}
Train Score: 0.3753 | Validation Score: 0.2281

Params: {'clf': LGBMClassifier(), 'clf__class_weight': 'balanced', 'clf__learning_rate': 0.2, 'tfidf__max_df': 0.75, 'tfidf__min_df': 0.001, 'tfidf__ngram_range': (1, 2)}
Train Score: 0.3806 | Validation Score: 0.2543

Params: {'clf': LGBMClassifier(), 'clf__class_weight': 'balanced', 'clf__learning_rate': 0.2, 'tfidf__max_df': 0.75, 'tfidf__min_df': 0.001, 'tfidf__ngram_range': (1, 3)}
Train Score: 0.3777 | Validation Score: 0.2322

Params: {'clf': LGBMClassifier(), 'clf__class_weight': 'balanced', 'clf__learning_rate': 0.2, 'tfidf__max_df': 0.75, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 1)}
Train Score: 0.3733 | Validation Score: 0.2351

Params: {'clf': LGBMClassifier(), 'clf__class_weight': 'balanced', 'clf__learning_rate': 0.2, 'tfidf__max_df': 0.75, 'tfidf__min

In [None]:
# Persist the label encoder together with the fitted pipeline as one tuple,
# so whoever loads the file can map integer predictions back to label names.
joblib.dump(
    (label_encoder, best_estimator),
    '/content/drive/MyDrive/notebook_data/lgbm_model_v5_encoder.pkl',
)

['/content/drive/MyDrive/notebook_data/lgbm_model_v5.pkl']