In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import joblib

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, HalvingGridSearchCV
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
# Read the label encoder and the train-test splits from the other notebook

X_train, y_train, X_test, y_test = joblib.load('/content/drive/MyDrive/notebook_data/exports/train_test/train_test_split_features_v2.pkl')
label_encoder = joblib.load('/content/drive/MyDrive/notebook_data/exports/labelEncoder_tfidfVectorizer/label_encoder_v4.pkl')

In [5]:
print(f"Training with {X_train.shape[0]} samples; Testing with {X_test.shape[0]} samples")
print(f"Labels before conversion: {np.unique(label_encoder.inverse_transform(y_train))}")
print(f"Labels after conversion: {np.unique(y_train)}")

Training with 20048 samples; Testing with 5013 samples
Labels before conversion: ['earthquake' 'flood' 'hurricane' 'other' 'tornado' 'wildfire']
Labels after conversion: [0 1 2 3 4 5]


In [6]:
X_train.head()

Unnamed: 0,cleaned,has_location,sentiment
21669,i do nt know how i held a job before i did nt ...,0,0
9094,1156 j1 hd now playing ayabie natsumonogatari ...,1,0
18799,climate change parching the american west even...,0,2
16642,hollywood or washington,1,2
9156,unless its your own business nobody should be ...,0,0


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(), 'cleaned'),  # Apply TF-IDF to 'cleaned' column
        ('num', 'passthrough', ['has_location', 'sentiment'])  # Passthrough numerical features
    ])

# Create the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight='balanced')))
])


In [9]:
param_grid = {
    'preprocessor__tfidf__ngram_range': [(1,3)],
    'preprocessor__tfidf__max_df': [0.9],
    'preprocessor__tfidf__min_df': [2, 4],
    'preprocessor__tfidf__max_features': [7500],
    'preprocessor__tfidf__stop_words': [None, 'english'],
    'clf__estimator__C': [0.6],
    'clf__estimator__tol': [1e-3, 1e-4, 1e-5],
    'clf__estimator__max_iter': [800],
    'clf__estimator__intercept_scaling': [0.75, 10, 20]
}

In [10]:
# set up the grid search
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
    scoring='f1_weighted',
    factor=2,
    error_score="raise"
)

In [11]:
# fit to the data
grid_search.fit(X_train, y_train)

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 626
max_resources_: 20048
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 36
n_resources: 626
Fitting 3 folds for each of 36 candidates, totalling 108 fits
----------
iter: 1
n_candidates: 18
n_resources: 1252
Fitting 3 folds for each of 18 candidates, totalling 54 fits
----------
iter: 2
n_candidates: 9
n_resources: 2504
Fitting 3 folds for each of 9 candidates, totalling 27 fits
----------
iter: 3
n_candidates: 5
n_resources: 5008
Fitting 3 folds for each of 5 candidates, totalling 15 fits
----------
iter: 4
n_candidates: 3
n_resources: 10016
Fitting 3 folds for each of 3 candidates, totalling 9 fits
----------
iter: 5
n_candidates: 2
n_resources: 20032
Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [12]:
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Best parameters: {'clf__estimator__C': 0.6, 'clf__estimator__intercept_scaling': 10, 'clf__estimator__max_iter': 800, 'clf__estimator__tol': 1e-05, 'preprocessor__tfidf__max_df': 0.9, 'preprocessor__tfidf__max_features': 7500, 'preprocessor__tfidf__min_df': 4, 'preprocessor__tfidf__ngram_range': (1, 3), 'preprocessor__tfidf__stop_words': 'english'}
Best score: 0.8984957021779442


In [13]:
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)
# convert them back to the labels we can understand
y_pred_text = label_encoder.inverse_transform(y_pred)
y_test_text = label_encoder.inverse_transform(y_test)
print(classification_report(y_test_text, y_pred_text), confusion_matrix(y_test_text, y_pred_text))

              precision    recall  f1-score   support

  earthquake       0.97      0.98      0.98      1131
       flood       0.92      0.91      0.92       663
   hurricane       0.95      0.94      0.95       949
       other       0.74      0.77      0.76       995
     tornado       0.89      0.84      0.86       153
    wildfire       0.86      0.84      0.85      1122

    accuracy                           0.89      5013
   macro avg       0.89      0.88      0.89      5013
weighted avg       0.89      0.89      0.89      5013
 [[1111    0    0   16    1    3]
 [   2  605   12   37    2    5]
 [   0   16  896   24    4    9]
 [  31   28   29  771    6  130]
 [   0    2    2   19  128    2]
 [   0    4    8  168    3  939]]


In [14]:
cv_results = grid_search.cv_results_

# Print mean training and validation scores for each parameter set
for mean_train, mean_val, params in zip(cv_results["mean_train_score"], cv_results["mean_test_score"], cv_results["params"]):
    print(f"Params: {params}")
    print(f"Train Score: {mean_train:.4f} | Validation Score: {mean_val:.4f}\n")

print("Best Validation Score:", grid_search.best_score_)

Params: {'clf__estimator__C': 0.6, 'clf__estimator__intercept_scaling': 0.75, 'clf__estimator__max_iter': 800, 'clf__estimator__tol': 0.001, 'preprocessor__tfidf__max_df': 0.9, 'preprocessor__tfidf__max_features': 7500, 'preprocessor__tfidf__min_df': 2, 'preprocessor__tfidf__ngram_range': (1, 3), 'preprocessor__tfidf__stop_words': None}
Train Score: 0.9960 | Validation Score: 0.7816

Params: {'clf__estimator__C': 0.6, 'clf__estimator__intercept_scaling': 0.75, 'clf__estimator__max_iter': 800, 'clf__estimator__tol': 0.001, 'preprocessor__tfidf__max_df': 0.9, 'preprocessor__tfidf__max_features': 7500, 'preprocessor__tfidf__min_df': 2, 'preprocessor__tfidf__ngram_range': (1, 3), 'preprocessor__tfidf__stop_words': 'english'}
Train Score: 0.9912 | Validation Score: 0.7868

Params: {'clf__estimator__C': 0.6, 'clf__estimator__intercept_scaling': 0.75, 'clf__estimator__max_iter': 800, 'clf__estimator__tol': 0.001, 'preprocessor__tfidf__max_df': 0.9, 'preprocessor__tfidf__max_features': 7500, '

In [15]:
# export model
joblib.dump((best_estimator, label_encoder), '/content/drive/MyDrive/notebook_data/exports/models/linearsvc/linsvc_model_encoder_stopwords.pkl')

['/content/drive/MyDrive/notebook_data/exports/models/linearsvc/linsvc_model_encoder_stopwords.pkl']

In [16]:
param_grid = {
    'preprocessor__tfidf__ngram_range': [(1,3)],
    'preprocessor__tfidf__max_df': [0.9],
    'preprocessor__tfidf__min_df': [2, 4],
    'preprocessor__tfidf__max_features': [7500],
    'clf__estimator__C': [0.6],
    'clf__estimator__max_iter': [800],
    'clf__estimator__intercept_scaling': [0.75, 10, 20]
}

In [17]:
# set up the grid search
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search_2 = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
    scoring='f1_weighted',
    factor=2,
    error_score="raise"
)

In [18]:
# fit to the data
grid_search_2.fit(X_train, y_train)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 5012
max_resources_: 20048
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 6
n_resources: 5012
Fitting 3 folds for each of 6 candidates, totalling 18 fits
----------
iter: 1
n_candidates: 3
n_resources: 10024
Fitting 3 folds for each of 3 candidates, totalling 9 fits
----------
iter: 2
n_candidates: 2
n_resources: 20048
Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [19]:
print(f"Best parameters: {grid_search_2.best_params_}")
print(f"Best score: {grid_search_2.best_score_}")

Best parameters: {'clf__estimator__C': 0.6, 'clf__estimator__intercept_scaling': 10, 'clf__estimator__max_iter': 800, 'preprocessor__tfidf__max_df': 0.9, 'preprocessor__tfidf__max_features': 7500, 'preprocessor__tfidf__min_df': 2, 'preprocessor__tfidf__ngram_range': (1, 3)}
Best score: 0.9044841019302838


In [20]:
best_estimator_2 = grid_search_2.best_estimator_
y_pred = best_estimator_2.predict(X_test)
# convert them back to the labels we can understand
y_pred_text = label_encoder.inverse_transform(y_pred)
y_test_text = label_encoder.inverse_transform(y_test)
print(classification_report(y_test_text, y_pred_text), confusion_matrix(y_test_text, y_pred_text))

              precision    recall  f1-score   support

  earthquake       0.97      0.98      0.97      1131
       flood       0.93      0.89      0.91       663
   hurricane       0.94      0.94      0.94       949
       other       0.75      0.80      0.77       995
     tornado       0.90      0.83      0.86       153
    wildfire       0.88      0.86      0.87      1122

    accuracy                           0.89      5013
   macro avg       0.90      0.88      0.89      5013
weighted avg       0.89      0.89      0.89      5013
 [[1104    0    1   22    1    3]
 [   2  593   17   45    3    3]
 [   0   11  893   35    4    6]
 [  29   30   27  792    5  112]
 [   0    1    2   21  127    2]
 [   1    3    7  148    1  962]]


In [21]:
cv_results_2 = grid_search_2.cv_results_

# Print mean training and validation scores for each parameter set
for mean_train, mean_val, params in zip(cv_results_2["mean_train_score"], cv_results_2["mean_test_score"], cv_results_2["params"]):
    print(f"Params: {params}")
    print(f"Train Score: {mean_train:.4f} | Validation Score: {mean_val:.4f}\n")

print("Best Validation Score:", grid_search_2.best_score_)

Params: {'clf__estimator__C': 0.6, 'clf__estimator__intercept_scaling': 0.75, 'clf__estimator__max_iter': 800, 'preprocessor__tfidf__max_df': 0.9, 'preprocessor__tfidf__max_features': 7500, 'preprocessor__tfidf__min_df': 2, 'preprocessor__tfidf__ngram_range': (1, 3)}
Train Score: 0.9906 | Validation Score: 0.8831

Params: {'clf__estimator__C': 0.6, 'clf__estimator__intercept_scaling': 0.75, 'clf__estimator__max_iter': 800, 'preprocessor__tfidf__max_df': 0.9, 'preprocessor__tfidf__max_features': 7500, 'preprocessor__tfidf__min_df': 4, 'preprocessor__tfidf__ngram_range': (1, 3)}
Train Score: 0.9893 | Validation Score: 0.8813

Params: {'clf__estimator__C': 0.6, 'clf__estimator__intercept_scaling': 10, 'clf__estimator__max_iter': 800, 'preprocessor__tfidf__max_df': 0.9, 'preprocessor__tfidf__max_features': 7500, 'preprocessor__tfidf__min_df': 2, 'preprocessor__tfidf__ngram_range': (1, 3)}
Train Score: 0.9906 | Validation Score: 0.8833

Params: {'clf__estimator__C': 0.6, 'clf__estimator__in

In [22]:
# export model
joblib.dump((best_estimator_2, label_encoder), '/content/drive/MyDrive/notebook_data/exports/models/linearsvc/linsvc_model_encoder_no_stopwords.pkl')

['/content/drive/MyDrive/notebook_data/exports/models/linearsvc/linsvc_model_encoder_no_stopwords.pkl']