In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive. Later cells
# read the dataset from, and write the exported model to, paths under this
# mount point, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import sys
sys.path.append('/content/drive/MyDrive/notebook_data')
!pip install -r /content/drive/MyDrive/notebook_data/requirements.txt
import numpy as np
import pandas as pd
import joblib

In [None]:
# Load the labelled dataset from Drive. The file is tab-separated; the cells
# below read its 'cleaned' column (model input) and 'label' column (target).
df = pd.read_csv(
    '/content/drive/MyDrive/notebook_data/datasets/merged-labeled-cleaned.tsv',
    delimiter='\t',
)
df.head()

In [None]:
# Discard every row that has a missing value in any column, then show how many
# examples remain for each class.
df = df.dropna(axis=0, how='any')
df.loc[:, 'label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
other,70718
earthquake,5656
wildfire,5634
hurricane,4751
flood,3312
tornado,766


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, HalvingGridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Model input is the 'cleaned' column (it is fed to the TF-IDF vectorizer by
# the pipeline defined below); the target is the 'label' column.
X = df['cleaned']
y = df['label']

from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the string labels, then map them to integer class ids.
# The fitted encoder is kept so predictions can be decoded back to names later.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)
print(np.unique(y))

# Hold out 20% of the rows for testing. Stratifying on the encoded labels keeps
# the class proportions the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
print("Training with {} samples; Testing with {} samples".format(len(X_train), len(X_test)))
print(np.unique(y_train))

['earthquake' 'flood' 'hurricane' 'other' 'tornado' 'wildfire']
Training with 72669 samples; Testing with 18168 samples
[0 1 2 3 4 5]


In [None]:
# Two-step pipeline shared by every model: TF-IDF features, then a classifier.
# The 'clf' step is left as None here; the parameter grid's 'clf' entry
# supplies the actual estimator during the search.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', None),
    ]
)

In [None]:
# TF-IDF hyper-parameters, kept separate so they can be merged into any
# model-specific grid.
tfidf_params = dict(
    tfidf__max_df=[0.8, 0.9],
    tfidf__min_df=[2, 3],
    tfidf__ngram_range=[(1, 1), (1, 2)],
)

# XGBoost grid: the 'clf' entry places the classifier into the pipeline's
# 'clf' step, the 'clf__*' keys tune that classifier, and the TF-IDF options
# are appended at the end.
xgb_param_grid = dict(
    clf=[XGBClassifier()],
    clf__learning_rate=[0.01, 0.05],
    clf__n_estimators=[50, 100, 150, 200, 250],
    clf__max_depth=[3, 5, 10],
)
xgb_param_grid.update(tfidf_params)

In [None]:
# set up the grid search
param_grid = [xgb_param_grid]

# Stratified, shuffled folds with a fixed seed so every candidate is compared
# on the same splits.
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # If it's going too slow, change 5 to 3 here

# The target has six classes, so the plain 'f1' scorer (which uses binary
# averaging) cannot score the candidates. 'f1_macro' averages the per-class F1
# with equal weight, so the rare classes count as much as the dominant 'other'
# class. Use 'f1_weighted' instead if support-weighted averaging is preferred.
grid_search = HalvingGridSearchCV(
    pipeline,
    param_grid,
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
    scoring='f1_macro',
    factor=3,
)

In [None]:
# Run the hyper-parameter search on the training split only; the test split is
# reserved for the final evaluation further down.
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Show the winning parameter combination and its cross-validated score.
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [None]:
# Predict on the held-out test split with the best pipeline found by the search.
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)

# Decode the integer class ids back into their string labels so the report is
# readable.
y_test_text = label_encoder.inverse_transform(y_test)
y_pred_text = label_encoder.inverse_transform(y_pred)

# Take a screenshot of the whole thing if you plan to save this model, make sure we know which model gave which results
test_report = classification_report(y_test_text, y_pred_text)
test_confusion = confusion_matrix(y_test_text, y_pred_text)
print(test_report, test_confusion)

In [None]:
cv_results = grid_search.cv_results_

# Walk through every evaluated candidate and print its mean train score next to
# its mean validation score. A large gap between the two means the model is
# overfitting, i.e. it will generalize poorly to real-world data.
# NOTE(review): the original comment says the rows are "generally sorted" so the
# last one is the one to look at - confirm against the ordering of cv_results_.
for i in range(len(cv_results["params"])):
    params = cv_results["params"][i]
    mean_train = cv_results["mean_train_score"][i]
    mean_val = cv_results["mean_test_score"][i]
    print(f"Params: {params}")
    print(f"Train Score: {mean_train:.4f} | Validation Score: {mean_val:.4f}\n")

print("Best Validation Score:", grid_search.best_score_)

In [None]:
# Save the label encoder together with the best pipeline in one file, so that
# predictions can be decoded back to class names after loading.
joblib.dump(
    value=(label_encoder, best_estimator),
    filename='/content/drive/MyDrive/notebook_data/xgb_model_v1.pkl',  # Change the name if saving again
)

['/content/drive/MyDrive/notebook_data/lgbm_model_v5.pkl']