In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import joblib

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, HalvingGridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
# Read the label encoder and the train-test splits from the other notebook

X_train, y_train, X_test, y_test = joblib.load('/content/drive/MyDrive/notebook_data/exports/train_test/train_test_split_v4.pkl')
label_encoder = joblib.load('/content/drive/MyDrive/notebook_data/exports/labelEncoder_tfidfVectorizer/label_encoder_v4.pkl')

In [5]:
print(f"Training with {X_train.shape[0]} samples; Testing with {X_test.shape[0]} samples")
print(f"Labels before conversion: {np.unique(label_encoder.inverse_transform(y_train))}")
print(f"Labels after conversion: {np.unique(y_train)}")

Training with 20048 samples; Testing with 5013 samples
Labels before conversion: ['earthquake' 'flood' 'hurricane' 'other' 'tornado' 'wildfire']
Labels after conversion: [0 1 2 3 4 5]


In [6]:
X_train.head()

Unnamed: 0,cleaned
21669,i do nt know how i held a job before i did nt ...
9094,1156 j1 hd now playing ayabie natsumonogatari ...
18799,climate change parching the american west even...
16642,hollywood or washington
9156,unless its your own business nobody should be ...


In [16]:
# Create the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LGBMClassifier(objective='multiclass', num_classes=len(np.unique(y_train))))
])


In [17]:
param_grid = {
    'tfidf__ngram_range': [(1, 3)],
    'tfidf__max_df': [0.8],
    'tfidf__min_df': [2],
    'clf__num_leaves': [28],
    'clf__n_estimators': [120],
    'clf__min_child_samples': [15],
    'clf__learning_rate': [0.05],
    'clf__min_child_weight': [0.01],
    'clf__subsample': [0.8]
}

In [18]:
# set up the grid search
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
    scoring='f1_weighted',
    factor=2,
    error_score="raise"
)

In [19]:
# fit to the data
#grid_search.fit(X_train, y_train, clf__categorical_feature=indexes_of_categories)
grid_search.fit(X_train, y_train)

n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 20048
max_resources_: 20048
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 1
n_resources: 20048
Fitting 3 folds for each of 1 candidates, totalling 3 fits




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.220147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198110
[LightGBM] [Info] Number of data points in the train set: 20048, number of used features: 8737
[LightGBM] [Info] Start training from score -1.488954
[LightGBM] [Info] Start training from score -2.023947
[LightGBM] [Info] Start training from score -1.663918
[LightGBM] [Info] Start training from score -1.617350
[LightGBM] [Info] Start training from score -3.489152
[LightGBM] [Info] Start training from score -1.496499


In [20]:
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Best parameters: {'clf__learning_rate': 0.05, 'clf__min_child_samples': 15, 'clf__min_child_weight': 0.01, 'clf__n_estimators': 120, 'clf__num_leaves': 28, 'clf__subsample': 0.8, 'tfidf__max_df': 0.8, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}
Best score: 0.9121648805800323


In [21]:
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)
# convert them back to the labels we can understand
y_pred_text = label_encoder.inverse_transform(y_pred)
y_test_text = label_encoder.inverse_transform(y_test)
print(classification_report(y_test_text, y_pred_text), confusion_matrix(y_test_text, y_pred_text))



              precision    recall  f1-score   support

  earthquake       0.97      0.99      0.98      1131
       flood       0.92      0.92      0.92       663
   hurricane       0.95      0.96      0.96       949
       other       0.78      0.79      0.78       995
     tornado       0.91      0.82      0.86       153
    wildfire       0.89      0.87      0.88      1122

    accuracy                           0.90      5013
   macro avg       0.90      0.89      0.90      5013
weighted avg       0.90      0.90      0.90      5013
 [[1117    0    0   12    1    1]
 [   1  612   13   31    4    2]
 [   0    8  911   23    2    5]
 [  32   38   27  784    6  108]
 [   0    3    0   24  125    1]
 [   1    5    5  132    0  979]]


In [22]:
cv_results = grid_search.cv_results_

# Print mean training and validation scores for each parameter set
for mean_train, mean_val, params in zip(cv_results["mean_train_score"], cv_results["mean_test_score"], cv_results["params"]):
    print(f"Params: {params}")
    print(f"Train Score: {mean_train:.4f} | Validation Score: {mean_val:.4f}\n")

print("Best Validation Score:", grid_search.best_score_)

Params: {'clf__learning_rate': 0.05, 'clf__min_child_samples': 15, 'clf__min_child_weight': 0.01, 'clf__n_estimators': 120, 'clf__num_leaves': 28, 'clf__subsample': 0.8, 'tfidf__max_df': 0.8, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}
Train Score: 0.9736 | Validation Score: 0.9122

Best Validation Score: 0.9121648805800323


In [23]:
# export model
joblib.dump((best_estimator, label_encoder), '/content/drive/MyDrive/notebook_data/exports/models/lgbm/lgbm_model_encoder_nofeatures_sw.pkl')

['/content/drive/MyDrive/notebook_data/exports/models/lgbm/lgbm_model_encoder_nofeatures_sw.pkl']

In [24]:
# now find the best version with stop words removed
param_grid_2 = {
    'tfidf__ngram_range': [(1,2)],
    'tfidf__max_df': [0.95],
    'tfidf__min_df': [4],
    'tfidf__stop_words': ['english'],
    'clf__num_leaves': [18],
    'clf__n_estimators': [120],
    'clf__min_child_samples': [15],
    'clf__learning_rate': [0.05],
    'clf__min_child_weight': [0.001, 0.01, 0.1],
    'clf__subsample': [0.7, 0.8, 0.9, 1]
}

In [25]:
# set up the grid search
kfolds_2 = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search_2 = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_2,
    cv=kfolds_2,
    n_jobs=-1,
    verbose=1,
    scoring='f1_weighted',
    factor=2,
    error_score="raise"
)

In [26]:
grid_search_2.fit(X_train, y_train)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 2506
max_resources_: 20048
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 12
n_resources: 2506
Fitting 3 folds for each of 12 candidates, totalling 36 fits
----------
iter: 1
n_candidates: 6
n_resources: 5012
Fitting 3 folds for each of 6 candidates, totalling 18 fits
----------
iter: 2
n_candidates: 3
n_resources: 10024
Fitting 3 folds for each of 3 candidates, totalling 9 fits
----------
iter: 3
n_candidates: 2
n_resources: 20048
Fitting 3 folds for each of 2 candidates, totalling 6 fits




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.107145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97637
[LightGBM] [Info] Number of data points in the train set: 20048, number of used features: 4207
[LightGBM] [Info] Start training from score -1.488954
[LightGBM] [Info] Start training from score -2.023947
[LightGBM] [Info] Start training from score -1.663918
[LightGBM] [Info] Start training from score -1.617350
[LightGBM] [Info] Start training from score -3.489152
[LightGBM] [Info] Start training from score -1.496499


In [27]:
print(f"Best parameters: {grid_search_2.best_params_}")
print(f"Best score: {grid_search_2.best_score_}")

Best parameters: {'clf__learning_rate': 0.05, 'clf__min_child_samples': 15, 'clf__min_child_weight': 0.01, 'clf__n_estimators': 120, 'clf__num_leaves': 18, 'clf__subsample': 0.8, 'tfidf__max_df': 0.95, 'tfidf__min_df': 4, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}
Best score: 0.9022735300759326


In [28]:
best_estimator_2 = grid_search_2.best_estimator_
y_pred = best_estimator_2.predict(X_test)
# convert them back to the labels we can understand
y_pred_text = label_encoder.inverse_transform(y_pred)
y_test_text = label_encoder.inverse_transform(y_test)
print(classification_report(y_test_text, y_pred_text), confusion_matrix(y_test_text, y_pred_text))



              precision    recall  f1-score   support

  earthquake       0.97      0.99      0.98      1131
       flood       0.91      0.94      0.93       663
   hurricane       0.95      0.95      0.95       949
       other       0.76      0.80      0.78       995
     tornado       0.91      0.81      0.86       153
    wildfire       0.90      0.83      0.86      1122

    accuracy                           0.90      5013
   macro avg       0.90      0.89      0.89      5013
weighted avg       0.90      0.90      0.90      5013
 [[1119    0    0   11    1    0]
 [   1  624   12   23    3    0]
 [   0   10  906   24    2    7]
 [  32   41   25  796    5   96]
 [   0    5    0   24  124    0]
 [   3    6    6  173    2  932]]


In [29]:
cv_results_2 = grid_search_2.cv_results_

# Print mean training and validation scores for each parameter set
for mean_train, mean_val, params in zip(cv_results_2["mean_train_score"], cv_results_2["mean_test_score"], cv_results_2["params"]):
    print(f"Params: {params}")
    print(f"Train Score: {mean_train:.4f} | Validation Score: {mean_val:.4f}\n")

print("Best Validation Score:", grid_search_2.best_score_)

Params: {'clf__learning_rate': 0.05, 'clf__min_child_samples': 15, 'clf__min_child_weight': 0.001, 'clf__n_estimators': 120, 'clf__num_leaves': 18, 'clf__subsample': 0.7, 'tfidf__max_df': 0.95, 'tfidf__min_df': 4, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}
Train Score: 0.9837 | Validation Score: 0.8510

Params: {'clf__learning_rate': 0.05, 'clf__min_child_samples': 15, 'clf__min_child_weight': 0.001, 'clf__n_estimators': 120, 'clf__num_leaves': 18, 'clf__subsample': 0.8, 'tfidf__max_df': 0.95, 'tfidf__min_df': 4, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}
Train Score: 0.9837 | Validation Score: 0.8510

Params: {'clf__learning_rate': 0.05, 'clf__min_child_samples': 15, 'clf__min_child_weight': 0.001, 'clf__n_estimators': 120, 'clf__num_leaves': 18, 'clf__subsample': 0.9, 'tfidf__max_df': 0.95, 'tfidf__min_df': 4, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}
Train Score: 0.9837 | Validation Score: 0.8510

Params: {'clf__learning_rate

In [30]:
# export model
joblib.dump((best_estimator_2, label_encoder), '/content/drive/MyDrive/notebook_data/exports/models/lgbm/lgbm_model_encoder_nofeatures_noSw.pkl')

['/content/drive/MyDrive/notebook_data/exports/models/lgbm/lgbm_model_encoder_nofeatures_noSw.pkl']