In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd
import joblib

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, HalvingGridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [6]:
# Read the label encoder and the train-test splits from the other notebook

X_train, y_train, X_test, y_test = joblib.load('/content/drive/MyDrive/notebook_data/exports/train_test/train_test_split_v4.pkl')
label_encoder = joblib.load('/content/drive/MyDrive/notebook_data/exports/labelEncoder_tfidfVectorizer/label_encoder_v4.pkl')

In [7]:
print(f"Training with {X_train.shape[0]} samples; Testing with {X_test.shape[0]} samples")
print(f"Labels before conversion: {np.unique(label_encoder.inverse_transform(y_train))}")
print(f"Labels after conversion: {np.unique(y_train)}")

Training with 20048 samples; Testing with 5013 samples
Labels before conversion: ['earthquake' 'flood' 'hurricane' 'other' 'tornado' 'wildfire']
Labels after conversion: [0 1 2 3 4 5]


In [8]:
X_train.head()

Unnamed: 0,cleaned
21669,i do nt know how i held a job before i did nt ...
9094,1156 j1 hd now playing ayabie natsumonogatari ...
18799,climate change parching the american west even...
16642,hollywood or washington
9156,unless its your own business nobody should be ...


In [9]:
# Create the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', XGBClassifier(objective='multi:softmax', num_class=len(set(y_train))))
])


In [10]:
param_grid = {
    'tfidf__ngram_range': [(1,2)],
    'tfidf__max_df': [0.85],
    'tfidf__min_df': [2],
    'tfidf__stop_words': [None],
    'clf__max_leaves': [30],
    'clf__n_estimators': [150],
    'clf__learning_rate': [0.1]
}

In [11]:
# set up the grid search
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
    scoring='f1_weighted',
    factor=2,
    error_score="raise"
)

In [12]:
# fit to the data
#grid_search.fit(X_train, y_train, clf__categorical_feature=indexes_of_categories)
grid_search.fit(X_train, y_train)

n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 20048
max_resources_: 20048
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 1
n_resources: 20048
Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [13]:
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Best parameters: {'clf__learning_rate': 0.1, 'clf__max_leaves': 30, 'clf__n_estimators': 150, 'tfidf__max_df': 0.85, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': None}
Best score: 0.904299554716342


In [14]:
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)
# convert them back to the labels we can understand
y_pred_text = label_encoder.inverse_transform(y_pred)
y_test_text = label_encoder.inverse_transform(y_test)
print(classification_report(y_test_text, y_pred_text), confusion_matrix(y_test_text, y_pred_text))

              precision    recall  f1-score   support

  earthquake       0.97      0.99      0.98      1131
       flood       0.91      0.93      0.92       663
   hurricane       0.95      0.95      0.95       949
       other       0.78      0.78      0.78       995
     tornado       0.88      0.78      0.83       153
    wildfire       0.88      0.87      0.88      1122

    accuracy                           0.90      5013
   macro avg       0.90      0.88      0.89      5013
weighted avg       0.90      0.90      0.90      5013
 [[1119    0    1   10    1    0]
 [   1  616   12   28    6    0]
 [   0   10  901   28    3    7]
 [  32   37   25  774    6  121]
 [   0    6    2   25  120    0]
 [   2    6    6  131    0  977]]


In [15]:
cv_results = grid_search.cv_results_

# Print mean training and validation scores for each parameter set
for mean_train, mean_val, params in zip(cv_results["mean_train_score"], cv_results["mean_test_score"], cv_results["params"]):
    print(f"Params: {params}")
    print(f"Train Score: {mean_train:.4f} | Validation Score: {mean_val:.4f}\n")

print("Best Validation Score:", grid_search.best_score_)

Params: {'clf__learning_rate': 0.1, 'clf__max_leaves': 30, 'clf__n_estimators': 150, 'tfidf__max_df': 0.85, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': None}
Train Score: 0.9538 | Validation Score: 0.9043

Best Validation Score: 0.904299554716342


In [16]:
# export model
joblib.dump((best_estimator, label_encoder), '/content/drive/MyDrive/notebook_data/exports/models/xgb/xgb_model_encoder_nofeatures_sw.pkl')

['/content/drive/MyDrive/notebook_data/exports/models/xgb/xgb_model_encoder_nofeatures_sw.pkl']

In [22]:
param_grid_2 = {
    'tfidf__ngram_range': [(1,3)],
    'tfidf__max_df': [0.85],
    'tfidf__min_df': [4],
    'tfidf__stop_words': ['english'],
    'clf__max_leaves': [20, 30, 40],
    'clf__n_estimators': [100, 150, 200],
    'clf__learning_rate': [0.01, 0.1]
}

In [23]:
# set up the grid search
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search_2 = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_2,
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
    scoring='f1_weighted',
    factor=2,
    error_score="raise"
)

In [24]:
grid_search_2.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 1253
max_resources_: 20048
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 18
n_resources: 1253
Fitting 3 folds for each of 18 candidates, totalling 54 fits
----------
iter: 1
n_candidates: 9
n_resources: 2506
Fitting 3 folds for each of 9 candidates, totalling 27 fits
----------
iter: 2
n_candidates: 5
n_resources: 5012
Fitting 3 folds for each of 5 candidates, totalling 15 fits
----------
iter: 3
n_candidates: 3
n_resources: 10024
Fitting 3 folds for each of 3 candidates, totalling 9 fits
----------
iter: 4
n_candidates: 2
n_resources: 20048
Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [25]:
print(f"Best parameters: {grid_search_2.best_params_}")
print(f"Best score: {grid_search_2.best_score_}")

Best parameters: {'clf__learning_rate': 0.1, 'clf__max_leaves': 40, 'clf__n_estimators': 200, 'tfidf__max_df': 0.85, 'tfidf__min_df': 4, 'tfidf__ngram_range': (1, 3), 'tfidf__stop_words': 'english'}
Best score: 0.901657118866579


In [26]:
best_estimator_2 = grid_search_2.best_estimator_
y_pred = best_estimator_2.predict(X_test)
# convert them back to the labels we can understand
y_pred_text = label_encoder.inverse_transform(y_pred)
y_test_text = label_encoder.inverse_transform(y_test)
print(classification_report(y_test_text, y_pred_text), confusion_matrix(y_test_text, y_pred_text))

              precision    recall  f1-score   support

  earthquake       0.97      0.99      0.98      1131
       flood       0.92      0.94      0.93       663
   hurricane       0.95      0.95      0.95       949
       other       0.76      0.81      0.78       995
     tornado       0.88      0.80      0.84       153
    wildfire       0.90      0.83      0.86      1122

    accuracy                           0.90      5013
   macro avg       0.90      0.89      0.89      5013
weighted avg       0.90      0.90      0.90      5013
 [[1120    0    1    9    1    0]
 [   1  624    9   24    5    0]
 [   0    9  903   26    3    8]
 [  31   35   27  802    5   95]
 [   0    3    2   25  123    0]
 [   3    5    7  172    3  932]]


In [27]:
# export model
joblib.dump((best_estimator_2, label_encoder), '/content/drive/MyDrive/notebook_data/exports/models/xgb/xgb_model_encoder_features_noSw.pkl')

['/content/drive/MyDrive/notebook_data/exports/models/xgb/xgb_model_encoder_features_noSw.pkl']