In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
import joblib

In [4]:
df = pd.read_csv('/content/drive/MyDrive/notebook_data/datasets/merged-labeled-reduced-cleaned_sw.tsv',sep='\t') # assuming column called ['text']
df.head()

Unnamed: 0,text,label,cleaned
0,#EartthquakeReport #TsunamiReport for M7.2 #Ea...,earthquake,eartthquakereport tsunamireport for m72 earthq...
1,Tsunami warning lifted after earthquake off Al...,earthquake,tsunami warning lifted after earthquake off al...
2,"First Temblor map (AFAIK) on bluesky! Today, a...",earthquake,first temblor map afaik on bluesky today a mag...
3,\U0001f9ea\n\nA M7.2 earthquake occurred offsh...,earthquake,test tube a m72 earthquake occurred offshore a...
4,Earthquake waves from the M7.2 earthquake in A...,earthquake,earthquake waves from the m72 earthquake in al...


In [5]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
earthquake,5656
wildfire,5634
other,4981
hurricane,4751
flood,3312
tornado,766


In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, HalvingGridSearchCV, HalvingRandomSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [7]:
# Read the label encoder and the train-test splits from the other notebook

X_train, y_train, X_test, y_test = joblib.load('/content/drive/MyDrive/notebook_data/exports/train_test/train_test_split_v2.pkl')
label_encoder = joblib.load('/content/drive/MyDrive/notebook_data/exports/labelEncoder_tfidfVectorizer/label_encoder_v2.pkl')

In [8]:
print(f"Training with {X_train.shape[0]} samples; Testing with {X_test.shape[0]} samples")
print(f"Labels before conversion: {np.unique(label_encoder.inverse_transform(y_train))}")
print(f"Labels after conversion: {np.unique(y_train)}")

Training with 20080 samples; Testing with 5020 samples
Labels before conversion: ['earthquake' 'flood' 'hurricane' 'other' 'tornado' 'wildfire']
Labels after conversion: [0 1 2 3 4 5]


In [41]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LGBMClassifier(objective='multiclass', num_classes=len(np.unique(y_train)), metric='multi_logloss'))
])
param_grid = {
    'tfidf__ngram_range': [(1, 3)],
    'tfidf__max_df': [0.9],
    'tfidf__min_df': [2],
    'clf__num_leaves': [40],
    'clf__n_estimators': [80],
    'clf__min_child_samples': [15],
    'clf__learning_rate': [0.05],
    'clf__min_child_weight': [0.05],
    'clf__subsample': [0.9],
    'clf__colsample_bytree': [0.8],
    'clf__reg_alpha': [0.05],
    'clf__reg_lambda': [0.01]
}

In [44]:
# set up the grid search
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
    scoring='f1_weighted',
    factor=2,
    error_score="raise"
)

random_search = HalvingRandomSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    cv=kfolds,
    n_candidates=1024,
    n_jobs=-1,
    verbose=1,
    scoring='f1_weighted',
    factor=2,
    error_score="raise"
)

In [45]:
# fit to the data
#random_search.fit(X_train, y_train)
grid_search.fit(X_train, y_train)

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 627
max_resources_: 20080
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 36
n_resources: 627
Fitting 5 folds for each of 36 candidates, totalling 180 fits
----------
iter: 1
n_candidates: 18
n_resources: 1254
Fitting 5 folds for each of 18 candidates, totalling 90 fits
----------
iter: 2
n_candidates: 9
n_resources: 2508
Fitting 5 folds for each of 9 candidates, totalling 45 fits
----------
iter: 3
n_candidates: 5
n_resources: 5016
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 4
n_candidates: 3
n_resources: 10032
Fitting 5 folds for each of 3 candidates, totalling 15 fits
----------
iter: 5
n_candidates: 2
n_resources: 20064
Fitting 5 folds for each of 2 candidates, totalling 10 fits




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.924144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 197339
[LightGBM] [Info] Number of data points in the train set: 20080, number of used features: 8612
[LightGBM] [Info] Start training from score -1.490107
[LightGBM] [Info] Start training from score -2.025542
[LightGBM] [Info] Start training from score -1.664460
[LightGBM] [Info] Start training from score -1.617187
[LightGBM] [Info] Start training from score -3.489115
[LightGBM] [Info] Start training from score -1.494093


In [47]:
#print(f"Best parameters: {random_search.best_params_}")
#print(f"Best score: {random_search.best_score_}")

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Best parameters: {'clf__colsample_bytree': 0.8, 'clf__learning_rate': 0.05, 'clf__min_child_samples': 15, 'clf__min_child_weight': 0.05, 'clf__n_estimators': 80, 'clf__num_leaves': 40, 'clf__reg_alpha': 0.05, 'clf__reg_lambda': 0.01, 'clf__subsample': 0.9, 'tfidf__max_df': 0.9, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}
Best score: 0.913590873762233


In [48]:
#best_estimator = random_search.best_estimator_
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)
# convert them back to the labels we can understand
y_pred_text = label_encoder.inverse_transform(y_pred)
y_test_text = label_encoder.inverse_transform(y_test)
print(classification_report(y_test_text, y_pred_text), confusion_matrix(y_test_text, y_pred_text))



              precision    recall  f1-score   support

  earthquake       0.97      0.99      0.98      1131
       flood       0.93      0.93      0.93       663
   hurricane       0.95      0.95      0.95       950
       other       0.78      0.78      0.78       996
     tornado       0.93      0.82      0.87       153
    wildfire       0.88      0.87      0.88      1127

    accuracy                           0.90      5020
   macro avg       0.91      0.89      0.90      5020
weighted avg       0.90      0.90      0.90      5020
 [[1121    0    1    9    0    0]
 [   1  614   13   30    4    1]
 [   0   14  906   24    3    3]
 [  32   28   28  778    1  129]
 [   0    1    0   26  125    1]
 [   4    5    5  126    1  986]]


In [50]:
#cv_results = random_search.cv_results_
cv_results = grid_search.cv_results_

# Print mean training and validation scores for each parameter set
for mean_train, mean_val, params in zip(cv_results["mean_train_score"], cv_results["mean_test_score"], cv_results["params"]):
    print(f"Params: {params}")
    print(f"Train Score: {mean_train:.4f} | Validation Score: {mean_val:.4f}\n")

print("Best Validation Score:", grid_search.best_score_)

Params: {'clf__colsample_bytree': 0.8, 'clf__learning_rate': 0.05, 'clf__min_child_samples': 15, 'clf__min_child_weight': 0.05, 'clf__n_estimators': 70, 'clf__num_leaves': 36, 'clf__reg_alpha': 0.05, 'clf__reg_lambda': 0.001, 'clf__subsample': 0.9, 'tfidf__max_df': 0.9, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}
Train Score: 0.9849 | Validation Score: 0.7835

Params: {'clf__colsample_bytree': 0.8, 'clf__learning_rate': 0.05, 'clf__min_child_samples': 15, 'clf__min_child_weight': 0.05, 'clf__n_estimators': 70, 'clf__num_leaves': 36, 'clf__reg_alpha': 0.05, 'clf__reg_lambda': 0.01, 'clf__subsample': 0.9, 'tfidf__max_df': 0.9, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}
Train Score: 0.9845 | Validation Score: 0.7843

Params: {'clf__colsample_bytree': 0.8, 'clf__learning_rate': 0.05, 'clf__min_child_samples': 15, 'clf__min_child_weight': 0.05, 'clf__n_estimators': 70, 'clf__num_leaves': 36, 'clf__reg_alpha': 0.1, 'clf__reg_lambda': 0.001, 'clf__subsample': 0.9, 'tfidf__max_df'

In [51]:
# export model
joblib.dump((best_estimator, label_encoder), '/content/drive/MyDrive/notebook_data/exports/models/lgbm/lgbm_model_encoder_v2.pkl')

['/content/drive/MyDrive/notebook_data/exports/models/lgbm/lgbm_model_encoder_v2.pkl']