In [1]:
# Mount Google Drive at /content/drive; the pickled inputs read below and the
# exported models written at the end all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import joblib

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, HalvingGridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
# Load the artifacts exported by the other notebook: the train/test split
# (stored as a 4-tuple in the order X_train, y_train, X_test, y_test) and the
# label encoder used to map between class names and integer ids.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
split_path = '/content/drive/MyDrive/notebook_data/exports/train_test/train_test_split_features_v2.pkl'
encoder_path = '/content/drive/MyDrive/notebook_data/exports/labelEncoder_tfidfVectorizer/label_encoder_v4.pkl'

split_bundle = joblib.load(split_path)
X_train, y_train, X_test, y_test = split_bundle
label_encoder = joblib.load(encoder_path)

In [5]:
# Sanity-check the loaded split: report the number of rows in each partition and
# show the label names next to the integer ids they are encoded as in y_train.
print(f"Training with {X_train.shape[0]} samples; Testing with {X_test.shape[0]} samples")
print(f"Labels before conversion: {np.unique(label_encoder.inverse_transform(y_train))}")
print(f"Labels after conversion: {np.unique(y_train)}")

Training with 20048 samples; Testing with 5013 samples
Labels before conversion: ['earthquake' 'flood' 'hurricane' 'other' 'tornado' 'wildfire']
Labels after conversion: [0 1 2 3 4 5]


In [6]:
# Preview the first training rows: the `cleaned` text column plus the
# `has_location` and `sentiment` feature columns.
X_train.head()

Unnamed: 0,cleaned,has_location,sentiment
21669,i do nt know how i held a job before i did nt ...,0,0
9094,1156 j1 hd now playing ayabie natsumonogatari ...,1,0
18799,climate change parching the american west even...,0,2
16642,hollywood or washington,1,2
9156,unless its your own business nobody should be ...,0,0


In [10]:
from sklearn.compose import ColumnTransformer

# Feature preprocessing: the free-text column is vectorised with TF-IDF while the
# two already-numeric columns are forwarded unchanged.
text_branch = ('tfidf', TfidfVectorizer(), 'cleaned')
numeric_branch = ('num', 'passthrough', ['has_location', 'sentiment'])
preprocessor = ColumnTransformer(transformers=[text_branch, numeric_branch])

# Multi-class XGBoost classifier; the class count comes from the distinct training labels.
# NOTE(review): enable_categorical presumably only matters when the classifier is
# handed category-typed DataFrame columns — confirm it has any effect behind this
# preprocessor.
n_classes = len(set(y_train))
classifier = XGBClassifier(
    objective='multi:softmax',
    num_class=n_classes,
    enable_categorical=True,
)

# Full model: preprocessing followed by the classifier. The step names
# ('preprocessor', 'clf') are the prefixes used by the hyper-parameter grids.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', classifier),
])


In [None]:
# Cast the two low-cardinality feature columns to pandas' categorical dtype.
# `astype` returns a new object rather than modifying the frame, so the result
# must be assigned back; a bare `.astype(...)` expression has no lasting effect.
# NOTE(review): the ColumnTransformer presumably hands the classifier a plain
# (sparse) matrix, so this dtype may never reach XGBoost — confirm it is needed.
categorical_dtypes = {"has_location": "category", "sentiment": "category"}
X_train = X_train.astype(categorical_dtypes)
X_test = X_test.astype(categorical_dtypes)

# Display the resulting dtypes so the conversion can be verified at a glance.
X_train.dtypes

In [18]:
# Search space for the first run. The TF-IDF settings are pinned to one value each
# (no stop-word list is applied); only the XGBoost hyper-parameters are varied,
# giving 3 * 3 * 2 = 18 candidates.
tfidf_fixed = {
    'preprocessor__tfidf__ngram_range': [(1, 2)],
    'preprocessor__tfidf__max_df': [0.8],
    'preprocessor__tfidf__min_df': [4],
    'preprocessor__tfidf__stop_words': [None],
}
xgb_search = {
    'clf__max_leaves': [20, 30, 40],
    'clf__n_estimators': [100, 150, 200],
    'clf__learning_rate': [0.01, 0.1],
}
# Merge in the same key order as a single literal would have.
param_grid = {**tfidf_fixed, **xgb_search}

In [19]:
# Successive-halving grid search over `param_grid`, scored by weighted F1 on
# stratified, shuffled 3-fold splits.
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=kfolds,
    scoring='f1_weighted',
    factor=2,             # each round keeps ~half the candidates and doubles the samples
    random_state=42,      # fixes the per-iteration subsampling so reruns are reproducible
    error_score="raise",  # surface a failing fit immediately instead of scoring it
    n_jobs=-1,
    verbose=1,
)

In [20]:
# Run the halving search on the training split.
# NOTE(review): an earlier attempt passed `clf__categorical_feature=indexes_of_categories`
# as a fit parameter; `indexes_of_categories` is not defined in the visible cells.
grid_search.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 1253
max_resources_: 20048
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 18
n_resources: 1253
Fitting 3 folds for each of 18 candidates, totalling 54 fits
----------
iter: 1
n_candidates: 9
n_resources: 2506
Fitting 3 folds for each of 9 candidates, totalling 27 fits
----------
iter: 2
n_candidates: 5
n_resources: 5012
Fitting 3 folds for each of 5 candidates, totalling 15 fits
----------
iter: 3
n_candidates: 3
n_resources: 10024
Fitting 3 folds for each of 3 candidates, totalling 9 fits
----------
iter: 4
n_candidates: 2
n_resources: 20048
Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [21]:
# Report the winning parameter combination and the score the search recorded for it
# (the search is scored with 'f1_weighted').
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Best parameters: {'clf__learning_rate': 0.1, 'clf__max_leaves': 20, 'clf__n_estimators': 200, 'preprocessor__tfidf__max_df': 0.8, 'preprocessor__tfidf__min_df': 4, 'preprocessor__tfidf__ngram_range': (1, 2), 'preprocessor__tfidf__stop_words': None}
Best score: 0.9080422179216016


In [22]:
# Evaluate the best pipeline found by the search on the test split.
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)

# Map integer class ids back to their readable label names.
y_test_text = label_encoder.inverse_transform(y_test)
y_pred_text = label_encoder.inverse_transform(y_pred)

# Per-class precision/recall/F1 followed by the confusion matrix.
report = classification_report(y_test_text, y_pred_text)
matrix = confusion_matrix(y_test_text, y_pred_text)
print(report, matrix)

              precision    recall  f1-score   support

  earthquake       0.97      0.99      0.98      1131
       flood       0.93      0.94      0.93       663
   hurricane       0.96      0.96      0.96       949
       other       0.78      0.79      0.79       995
     tornado       0.88      0.79      0.83       153
    wildfire       0.89      0.87      0.88      1122

    accuracy                           0.90      5013
   macro avg       0.90      0.89      0.89      5013
weighted avg       0.90      0.90      0.90      5013
 [[1120    0    1    9    1    0]
 [   1  620   10   26    6    0]
 [   0    6  907   28    3    5]
 [  31   32   25  787    6  114]
 [   0    4    0   27  121    1]
 [   2    5    6  133    1  975]]


In [23]:
# Compare mean train vs. validation scores for every entry recorded by the search.
# NOTE(review): entries presumably span all halving iterations, so the same params
# can appear more than once with different sample sizes — confirm via cv_results_.
cv_results = grid_search.cv_results_
train_scores = cv_results["mean_train_score"]
val_scores = cv_results["mean_test_score"]

for idx, params in enumerate(cv_results["params"]):
    mean_train = train_scores[idx]
    mean_val = val_scores[idx]
    print(f"Params: {params}")
    print(f"Train Score: {mean_train:.4f} | Validation Score: {mean_val:.4f}\n")

print("Best Validation Score:", grid_search.best_score_)

Params: {'clf__learning_rate': 0.01, 'clf__max_leaves': 20, 'clf__n_estimators': 100, 'preprocessor__tfidf__max_df': 0.8, 'preprocessor__tfidf__min_df': 4, 'preprocessor__tfidf__ngram_range': (1, 2), 'preprocessor__tfidf__stop_words': None}
Train Score: 0.9194 | Validation Score: 0.8177

Params: {'clf__learning_rate': 0.01, 'clf__max_leaves': 20, 'clf__n_estimators': 150, 'preprocessor__tfidf__max_df': 0.8, 'preprocessor__tfidf__min_df': 4, 'preprocessor__tfidf__ngram_range': (1, 2), 'preprocessor__tfidf__stop_words': None}
Train Score: 0.9290 | Validation Score: 0.8221

Params: {'clf__learning_rate': 0.01, 'clf__max_leaves': 20, 'clf__n_estimators': 200, 'preprocessor__tfidf__max_df': 0.8, 'preprocessor__tfidf__min_df': 4, 'preprocessor__tfidf__ngram_range': (1, 2), 'preprocessor__tfidf__stop_words': None}
Train Score: 0.9364 | Validation Score: 0.8213

Params: {'clf__learning_rate': 0.01, 'clf__max_leaves': 30, 'clf__n_estimators': 100, 'preprocessor__tfidf__max_df': 0.8, 'preprocess

In [24]:
# Persist the tuned pipeline together with the label encoder as a single pickle,
# so predictions can later be mapped back to label names.
model_path = '/content/drive/MyDrive/notebook_data/exports/models/xgb/xgb_model_encoder_features_sw.pkl'
model_bundle = (best_estimator, label_encoder)
joblib.dump(model_bundle, model_path)

['/content/drive/MyDrive/notebook_data/exports/models/xgb/xgb_model_encoder_features_sw.pkl']

In [30]:
# Search space for the second run. The TF-IDF settings are again pinned to one value
# each, this time with up-to-trigram features and the built-in English stop-word
# list; the XGBoost hyper-parameters span the same 18 combinations as before.
tfidf_fixed_2 = {
    'preprocessor__tfidf__ngram_range': [(1, 3)],
    'preprocessor__tfidf__max_df': [0.85],
    'preprocessor__tfidf__min_df': [4],
    'preprocessor__tfidf__stop_words': ['english'],
}
xgb_search_2 = {
    'clf__max_leaves': [20, 30, 40],
    'clf__n_estimators': [100, 150, 200],
    'clf__learning_rate': [0.01, 0.1],
}
# Merge in the same key order as a single literal would have.
param_grid_2 = {**tfidf_fixed_2, **xgb_search_2}

In [31]:
# Successive-halving grid search over `param_grid_2`, configured like the first
# search: weighted F1 on stratified, shuffled 3-fold splits.
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search_2 = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_2,
    cv=kfolds,
    scoring='f1_weighted',
    factor=2,             # each round keeps ~half the candidates and doubles the samples
    random_state=42,      # fixes the per-iteration subsampling so reruns are reproducible
    error_score="raise",  # surface a failing fit immediately instead of scoring it
    n_jobs=-1,
    verbose=1,
)

In [32]:
# Run the second halving search on the same training split.
grid_search_2.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 1253
max_resources_: 20048
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 18
n_resources: 1253
Fitting 3 folds for each of 18 candidates, totalling 54 fits
----------
iter: 1
n_candidates: 9
n_resources: 2506
Fitting 3 folds for each of 9 candidates, totalling 27 fits
----------
iter: 2
n_candidates: 5
n_resources: 5012
Fitting 3 folds for each of 5 candidates, totalling 15 fits
----------
iter: 3
n_candidates: 3
n_resources: 10024
Fitting 3 folds for each of 3 candidates, totalling 9 fits
----------
iter: 4
n_candidates: 2
n_resources: 20048
Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [33]:
# Report the winning parameter combination of the second search and the score the
# search recorded for it (the search is scored with 'f1_weighted').
print(f"Best parameters: {grid_search_2.best_params_}")
print(f"Best score: {grid_search_2.best_score_}")

Best parameters: {'clf__learning_rate': 0.1, 'clf__max_leaves': 40, 'clf__n_estimators': 200, 'preprocessor__tfidf__max_df': 0.85, 'preprocessor__tfidf__min_df': 4, 'preprocessor__tfidf__ngram_range': (1, 3), 'preprocessor__tfidf__stop_words': 'english'}
Best score: 0.9027674711288616


In [34]:
# Evaluate the best pipeline of the second search on the test split.
# Note: y_pred, y_pred_text and y_test_text are reassigned here.
best_estimator_2 = grid_search_2.best_estimator_
y_pred = best_estimator_2.predict(X_test)

# Map integer class ids back to their readable label names.
y_test_text = label_encoder.inverse_transform(y_test)
y_pred_text = label_encoder.inverse_transform(y_pred)

# Per-class precision/recall/F1 followed by the confusion matrix.
report_2 = classification_report(y_test_text, y_pred_text)
matrix_2 = confusion_matrix(y_test_text, y_pred_text)
print(report_2, matrix_2)

              precision    recall  f1-score   support

  earthquake       0.97      0.99      0.98      1131
       flood       0.92      0.94      0.93       663
   hurricane       0.95      0.95      0.95       949
       other       0.75      0.79      0.77       995
     tornado       0.87      0.78      0.82       153
    wildfire       0.89      0.83      0.86      1122

    accuracy                           0.90      5013
   macro avg       0.89      0.88      0.89      5013
weighted avg       0.90      0.90      0.90      5013
 [[1121    0    1    8    1    0]
 [   1  621   10   25    5    1]
 [   0    8  906   25    4    6]
 [  31   40   25  791    5  103]
 [   0    3    2   28  120    0]
 [   3    6    7  174    3  929]]


In [35]:
# Persist the second tuned pipeline together with the label encoder as a single
# pickle, so predictions can later be mapped back to label names.
model_path_2 = '/content/drive/MyDrive/notebook_data/exports/models/xgb/xgb_model_encoder_features_noSw.pkl'
model_bundle_2 = (best_estimator_2, label_encoder)
joblib.dump(model_bundle_2, model_path_2)

['/content/drive/MyDrive/notebook_data/exports/models/xgb/xgb_model_encoder_features_noSw.pkl']