In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import joblib

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, HalvingGridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
# Read the label encoder and the train-test splits from the other notebook

X_train, y_train, X_test, y_test = joblib.load('/content/drive/MyDrive/notebook_data/exports/train_test/train_test_split_features_v2.pkl')
label_encoder = joblib.load('/content/drive/MyDrive/notebook_data/exports/labelEncoder_tfidfVectorizer/label_encoder_v4.pkl')

In [5]:
print(f"Training with {X_train.shape[0]} samples; Testing with {X_test.shape[0]} samples")
print(f"Labels before conversion: {np.unique(label_encoder.inverse_transform(y_train))}")
print(f"Labels after conversion: {np.unique(y_train)}")

Training with 20048 samples; Testing with 5013 samples
Labels before conversion: ['earthquake' 'flood' 'hurricane' 'other' 'tornado' 'wildfire']
Labels after conversion: [0 1 2 3 4 5]


In [6]:
X_train.head()

Unnamed: 0,cleaned,has_location,sentiment
21669,i do nt know how i held a job before i did nt ...,0,0
9094,1156 j1 hd now playing ayabie natsumonogatari ...,1,0
18799,climate change parching the american west even...,0,2
16642,hollywood or washington,1,2
9156,unless its your own business nobody should be ...,0,0


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(), 'cleaned'),  # Apply TF-IDF to 'cleaned' column
        ('num', 'passthrough', ['has_location', 'sentiment'])  # Passthrough numerical features
    ])

# Create the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LGBMClassifier(objective='multiclass', num_classes=len(np.unique(y_train))))
])


In [8]:
# Specifying the categorical features for the LightGBM classifier
categoricals = ['has_location','sentiment']
indexes_of_categories = [X_train.columns.get_loc(col) for col in categoricals]

In [9]:
param_grid = {
    'preprocessor__tfidf__ngram_range': [(1,3)],
    'preprocessor__tfidf__max_df': [0.9],
    'preprocessor__tfidf__min_df': [4],
    'preprocessor__tfidf__stop_words': [None],
    'clf__num_leaves': [15, 20, 60, 70, 80],
    'clf__n_estimators': [130],
    'clf__learning_rate': [0.05],
    'clf__max_depth': [8],
    'clf__min_child_samples': [50, 70]
}

In [10]:
# set up the grid search
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
    scoring='f1_weighted',
    factor=2,
    error_score="raise"
)

In [11]:
# fit to the data
grid_search.fit(X_train, y_train, clf__categorical_feature=indexes_of_categories)
#grid_search.fit(X_train, y_train)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 2506
max_resources_: 20048
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 10
n_resources: 2506
Fitting 3 folds for each of 10 candidates, totalling 30 fits
----------
iter: 1
n_candidates: 5
n_resources: 5012
Fitting 3 folds for each of 5 candidates, totalling 15 fits
----------
iter: 2
n_candidates: 3
n_resources: 10024
Fitting 3 folds for each of 3 candidates, totalling 9 fits
----------
iter: 3
n_candidates: 2
n_resources: 20048
Fitting 3 folds for each of 2 candidates, totalling 6 fits




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.296055 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 139307
[LightGBM] [Info] Number of data points in the train set: 20048, number of used features: 2428
[LightGBM] [Info] Start training from score -1.488954
[LightGBM] [Info] Start training from score -2.023947
[LightGBM] [Info] Start training from score -1.663918
[LightGBM] [Info] Start training from score -1.617350
[LightGBM] [Info] Start training from score -3.489152
[LightGBM] [Info] Start training from score -1.496499


In [12]:
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Best parameters: {'clf__learning_rate': 0.05, 'clf__max_depth': 8, 'clf__min_child_samples': 50, 'clf__n_estimators': 130, 'clf__num_leaves': 60, 'preprocessor__tfidf__max_df': 0.9, 'preprocessor__tfidf__min_df': 4, 'preprocessor__tfidf__ngram_range': (1, 3), 'preprocessor__tfidf__stop_words': None}
Best score: 0.9031892132853865


In [13]:
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)
# convert them back to the labels we can understand
y_pred_text = label_encoder.inverse_transform(y_pred)
y_test_text = label_encoder.inverse_transform(y_test)
print(classification_report(y_test_text, y_pred_text), confusion_matrix(y_test_text, y_pred_text))



              precision    recall  f1-score   support

  earthquake       0.97      0.99      0.98      1131
       flood       0.92      0.93      0.93       663
   hurricane       0.95      0.95      0.95       949
       other       0.76      0.79      0.78       995
     tornado       0.87      0.79      0.83       153
    wildfire       0.90      0.86      0.88      1122

    accuracy                           0.90      5013
   macro avg       0.90      0.88      0.89      5013
weighted avg       0.90      0.90      0.90      5013
 [[1115    0    1   14    1    0]
 [   1  619   14   24    5    0]
 [   0   11  899   30    4    5]
 [  31   36   25  789    7  107]
 [   0    2    2   28  121    0]
 [   1    5    5  149    1  961]]


In [14]:
cv_results = grid_search.cv_results_

# Print mean training and validation scores for each parameter set
for mean_train, mean_val, params in zip(cv_results["mean_train_score"], cv_results["mean_test_score"], cv_results["params"]):
    print(f"Params: {params}")
    print(f"Train Score: {mean_train:.4f} | Validation Score: {mean_val:.4f}\n")

print("Best Validation Score:", grid_search.best_score_)

Params: {'clf__learning_rate': 0.05, 'clf__max_depth': 8, 'clf__min_child_samples': 50, 'clf__n_estimators': 130, 'clf__num_leaves': 15, 'preprocessor__tfidf__max_df': 0.9, 'preprocessor__tfidf__min_df': 4, 'preprocessor__tfidf__ngram_range': (1, 3), 'preprocessor__tfidf__stop_words': None}
Train Score: 0.9466 | Validation Score: 0.8153

Params: {'clf__learning_rate': 0.05, 'clf__max_depth': 8, 'clf__min_child_samples': 50, 'clf__n_estimators': 130, 'clf__num_leaves': 20, 'preprocessor__tfidf__max_df': 0.9, 'preprocessor__tfidf__min_df': 4, 'preprocessor__tfidf__ngram_range': (1, 3), 'preprocessor__tfidf__stop_words': None}
Train Score: 0.9458 | Validation Score: 0.8177

Params: {'clf__learning_rate': 0.05, 'clf__max_depth': 8, 'clf__min_child_samples': 50, 'clf__n_estimators': 130, 'clf__num_leaves': 60, 'preprocessor__tfidf__max_df': 0.9, 'preprocessor__tfidf__min_df': 4, 'preprocessor__tfidf__ngram_range': (1, 3), 'preprocessor__tfidf__stop_words': None}
Train Score: 0.9474 | Valida

In [15]:
# export model
joblib.dump((best_estimator, label_encoder), '/content/drive/MyDrive/notebook_data/exports/models/lgbm/lgbm_model_encoder_features_sw.pkl')

['/content/drive/MyDrive/notebook_data/exports/models/lgbm/lgbm_model_encoder_features_sw.pkl']

In [20]:
param_grid_2 = {
    'preprocessor__tfidf__ngram_range': [(1,3)],
    'preprocessor__tfidf__max_df': [0.9],
    'preprocessor__tfidf__min_df': [4],
    'preprocessor__tfidf__stop_words': ['english'],
    'clf__num_leaves': [15, 20, 60, 70],
    'clf__n_estimators': [100, 130, 160],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    #'clf__max_depth': [8],
    #'clf__min_child_samples': [50, 70]
}

In [21]:
# set up the grid search
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search_2 = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_2,
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
    scoring='f1_weighted',
    factor=2,
    error_score="raise"
)

In [22]:
# fit to the data
grid_search_2.fit(X_train, y_train, clf__categorical_feature=indexes_of_categories)

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 626
max_resources_: 20048
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 36
n_resources: 626
Fitting 3 folds for each of 36 candidates, totalling 108 fits
----------
iter: 1
n_candidates: 18
n_resources: 1252
Fitting 3 folds for each of 18 candidates, totalling 54 fits
----------
iter: 2
n_candidates: 9
n_resources: 2504
Fitting 3 folds for each of 9 candidates, totalling 27 fits
----------
iter: 3
n_candidates: 5
n_resources: 5008
Fitting 3 folds for each of 5 candidates, totalling 15 fits
----------
iter: 4
n_candidates: 3
n_resources: 10016
Fitting 3 folds for each of 3 candidates, totalling 9 fits
----------
iter: 5
n_candidates: 2
n_resources: 20032
Fitting 3 folds for each of 2 candidates, totalling 6 fits




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.453998 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 103031
[LightGBM] [Info] Number of data points in the train set: 20048, number of used features: 3806
[LightGBM] [Info] Start training from score -1.488954
[LightGBM] [Info] Start training from score -2.023947
[LightGBM] [Info] Start training from score -1.663918
[LightGBM] [Info] Start training from score -1.617350
[LightGBM] [Info] Start training from score -3.489152
[LightGBM] [Info] Start training from score -1.496499


In [23]:
print(f"Best parameters: {grid_search_2.best_params_}")
print(f"Best score: {grid_search_2.best_score_}")

Best parameters: {'clf__learning_rate': 0.05, 'clf__n_estimators': 100, 'clf__num_leaves': 20, 'preprocessor__tfidf__max_df': 0.9, 'preprocessor__tfidf__min_df': 4, 'preprocessor__tfidf__ngram_range': (1, 3), 'preprocessor__tfidf__stop_words': 'english'}
Best score: 0.902290430624491


In [24]:
best_estimator_2 = grid_search_2.best_estimator_
y_pred = best_estimator_2.predict(X_test)
# convert them back to the labels we can understand
y_pred_text = label_encoder.inverse_transform(y_pred)
y_test_text = label_encoder.inverse_transform(y_test)
print(classification_report(y_test_text, y_pred_text), confusion_matrix(y_test_text, y_pred_text))



              precision    recall  f1-score   support

  earthquake       0.97      0.99      0.98      1131
       flood       0.92      0.94      0.93       663
   hurricane       0.95      0.96      0.95       949
       other       0.76      0.79      0.78       995
     tornado       0.90      0.80      0.85       153
    wildfire       0.89      0.84      0.87      1122

    accuracy                           0.90      5013
   macro avg       0.90      0.89      0.89      5013
weighted avg       0.90      0.90      0.90      5013
 [[1121    0    1    9    0    0]
 [   1  622   12   26    2    0]
 [   0    9  909   23    2    6]
 [  33   39   27  787    5  104]
 [   0    4    0   25  123    1]
 [   3    3    7  161    4  944]]


In [25]:
# export model
joblib.dump((best_estimator_2, label_encoder), '/content/drive/MyDrive/notebook_data/exports/models/lgbm/lgbm_model_encoder_features_noSw.pkl')

['/content/drive/MyDrive/notebook_data/exports/models/lgbm/lgbm_model_encoder_features_noSw.pkl']