In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#import sys
#sys.path.append('/content/drive/MyDrive/notebook_data')
#!pip install -r /content/drive/MyDrive/notebook_data/requirements.txt
import numpy as np
import pandas as pd
#from preprocess import preprocess_dataframe, bsk_preprocessor
import joblib

In [3]:
df = pd.read_csv('/content/drive/MyDrive/notebook_data/datasets/merged-labeled-reduced-2-preprocessed.tsv',sep='\t') # assuming column called ['text']
df.head()

Unnamed: 0,text,label,cleaned
0,\U0001f6a8 Severe Thunderstorm Warning issued ...,flood,police car light severe thunderstorm warning i...
1,\U0001f6a8 Severe Thunderstorm Warning issued ...,flood,police car light severe thunderstorm warning i...
2,\U0001f6a8 Severe Thunderstorm Warning issued ...,flood,police car light severe thunderstorm warning i...
3,\U0001f6a8 Severe Thunderstorm Warning issued ...,flood,police car light severe thunderstorm warning i...
4,NEW WEATHER ADVISORY: Severe Thunderstorm Warn...,flood,new weather advisory severe thunderstorm warni...


In [4]:
df['label'].value_counts()
df = df.dropna()

In [5]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
other,9929
hurricane,4745
wildfire,4084
flood,3472
blizzard,2534
tornado,520


In [6]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split,GridSearchCV,HalvingGridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler
from sklearn.metrics import classification_report,confusion_matrix

# importing models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

In [7]:
# train test split
X = df['cleaned']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(f"Training with {len(X_train)} samples; Testing with {len(X_test)} samples")

Training with 20227 samples; Testing with 5057 samples


In [8]:
# define pipeline for models
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    #('scaler', StandardScaler(with_mean=False)),
    #('scaler', MaxAbsScaler()),
    ('clf', None) # placeholder
])

In [14]:

# define parameter grids
logreg_param_grid = {
    'clf': [LogisticRegression(max_iter=8000, class_weight='balanced')],
    'clf__C': [4.5, 4.6, 4.7, 4.8, 5],
    'clf__solver': ['newton-cg','saga','sag','lbfgs']
}

svc_param_grid = {
    'clf': [SVC(class_weight='balanced')],
    'clf__C': [4, 4.5, 5, 5.5, 6],
    'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

linsvc_param_grid = {
    'clf': [OneVsRestClassifier(LinearSVC(max_iter=8000,class_weight='balanced'))],
    'clf__estimator__C': [0.5, 0.7, 1, 3, 10],
    'clf__estimator__intercept_scaling': [0.5125, 1, 1.7, 2, 2.5]
}

nb_param_grid = {
    'clf': [MultinomialNB()],
    'clf__alpha': [1, 5, 7, 10, 12] # smoothing param
}

sgd_param_grid = {
    'clf': [SGDClassifier(max_iter=5000, class_weight='balanced')],
    'clf__loss': ['log_loss', 'hinge','squared_hinge','perceptron'],
    'clf__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1],
}

rf_param_grid = {
    'clf': [RandomForestClassifier(class_weight='balanced')],
    'clf__n_estimators': [500, 700, 1000, 2000],
    'clf__max_depth': [None],
    'clf__min_samples_split': [5],
    'clf__max_features': ['sqrt'],
    'clf__min_samples_leaf': [2]
}

In [16]:
# set up the grid search
#param_grid = [logreg_param_grid, linsvc_param_grid, svc_param_grid, nb_param_grid, sgd_param_grid, rf_param_grid]
#param_grid = [logreg_param_grid, linsvc_param_grid, svc_param_grid, nb_param_grid, sgd_param_grid]
param_grid = [linsvc_param_grid, svc_param_grid, nb_param_grid, sgd_param_grid, rf_param_grid]
grid_search = HalvingGridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1, scoring='f1_weighted', factor=2)

In [17]:
# fit to the data
grid_search.fit(X_train, y_train)

n_iterations: 7
n_required_iterations: 7
n_possible_iterations: 7
min_resources_: 316
max_resources_: 20227
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 74
n_resources: 316
Fitting 5 folds for each of 74 candidates, totalling 370 fits
----------
iter: 1
n_candidates: 37
n_resources: 632
Fitting 5 folds for each of 37 candidates, totalling 185 fits
----------
iter: 2
n_candidates: 19
n_resources: 1264
Fitting 5 folds for each of 19 candidates, totalling 95 fits
----------
iter: 3
n_candidates: 10
n_resources: 2528
Fitting 5 folds for each of 10 candidates, totalling 50 fits
----------
iter: 4
n_candidates: 5
n_resources: 5056
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 5
n_candidates: 3
n_resources: 10112
Fitting 5 folds for each of 3 candidates, totalling 15 fits
----------
iter: 6
n_candidates: 2
n_resources: 20224
Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [19]:
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Best parameters: {'clf': OneVsRestClassifier(estimator=LinearSVC(class_weight='balanced', max_iter=8000)), 'clf__estimator__C': 0.5, 'clf__estimator__intercept_scaling': 0.5125}
Best score: 0.8630108814078179


In [20]:
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

    blizzard       0.80      0.87      0.83       507
       flood       0.90      0.89      0.90       694
   hurricane       0.93      0.94      0.93       949
       other       0.86      0.82      0.84      1986
     tornado       0.85      0.90      0.87       104
    wildfire       0.81      0.84      0.83       817

    accuracy                           0.86      5057
   macro avg       0.86      0.88      0.87      5057
weighted avg       0.86      0.86      0.86      5057
 [[ 441    1    2   62    1    0]
 [   6  620    9   49    5    5]
 [   1   19  889   37    1    2]
 [ 104   43   53 1621   10  155]
 [   0    4    2    4   94    0]
 [   2    3    6  116    0  690]]


In [21]:
# export model
joblib.dump(best_estimator, '/content/drive/MyDrive/notebook_data/disaster_classification_model_v5.pkl')

['/content/drive/MyDrive/notebook_data/disaster_classification_model_v5.pkl']