In [22]:
import numpy as np
import pandas as pd
from preprocess import preprocess_dataframe, bsk_preprocessor
import joblib

In [29]:
df = pd.read_csv('./datasets/merged-reduced.tsv',sep='\t') # assuming column called ['text']

In [30]:
df.head()
df['label'].value_counts()

label
other        9999
hurricane    4751
wildfire     4087
flood        3472
blizzard     1719
tornado      1337
Name: count, dtype: int64

In [31]:
del df['author']
del df['handle']
del df['query']
del df['timestamp']
del df['id']
df.head()

Unnamed: 0,text,label
0,"If it weren’t for the snow, I’d have been rock...",other
1,"I went and had a nap, here’s the next update. ...",tornado
2,Last Christmas is superior but I also love lov...,other
3,When songs collide in your head …\n\nFrosty th...,other
4,EL PASO Nov 29 Climate: High: 59 Low: 40 Preci...,other


In [32]:
df = preprocess_dataframe(df) # cleaned text is now in df['cleaned']
df.head()

Unnamed: 0,text,label,cleaned
0,"If it weren’t for the snow, I’d have been rock...",other,snow rock hard follow want video stroking cock...
1,"I went and had a nap, here’s the next update. ...",tornado,went nap update note snowing
2,Last Christmas is superior but I also love lov...,other,christmas superior love love love snow miracle...
3,When songs collide in your head …\n\nFrosty th...,other,songs collide head frosty snowman jolly happy ...
4,EL PASO Nov 29 Climate: High: 59 Low: 40 Preci...,other,el paso nov twenty-nine climate high fifty-nin...


In [45]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split,GridSearchCV,HalvingGridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,roc_auc_score

In [46]:
# define X and y
X = df['cleaned'] # feature matrix
y = df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=df['label'])

In [47]:
from sklearn.model_selection import StratifiedKFold
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [48]:
# define the param grid for LinearSVC
param_grid_linsvc = {
    'linsvc__C': [0.1, 1, 10, 100],
    'linsvc__max_iter': [3000, 5000, 7000],
    'linsvc__tol': [1e-4, 1e-3, 1e-2]
}

In [49]:
# define the param grid for SVC

In [50]:
param_grid_svc = {
    'svc__C': [0.1, 1, 10, 100],         
    'svc__gamma': ['scale', 'auto', 0.001, 0.01],  
    'svc__kernel': ['linear', 'rbf'],     
    'svc__degree': [3, 4, 5],
    'svc__coef0': [0, 0.1, 0.5],
    'svc__tol': [1e-4, 1e-3]
}

In [51]:
# define the param grid for LogisticRegression
param_grid_lr = {
    'lr__C': [0.1, 1, 10, 100],
    'lr__max_iter': [1000, 1500, 2000],
    'lr__solver': ['newton-cg', 'lbfgs', 'saga', 'liblinear'],
    'lr__penalty': ['l2'],
    'lr__tol': [1e-4, 1e-3, 1e-2]
}

In [52]:
# define the param grid for RandomForestClassifier
param_grid_rf = {
    'rf__n_estimators': [100, 200, 500],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__bootstrap': [True, False],
    'rf__n_jobs': [-1]
}

In [68]:
# repeat, do a pipeline for svc
from sklearn.svm import SVC
pipeline_svc = Pipeline([("tfidf",TfidfVectorizer()),("svc",SVC(class_weight="balanced", probability=True))])

In [69]:
# create the grid search instance for svc
grid_search_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=kfolds, scoring="roc_auc_ovo", n_jobs=-1)

In [None]:
grid_search_svc.fit(X_train, y_train)

In [168]:
print("Best parameters found: ", grid_search_svc.best_params_)

Best parameters found:  {'svc__C': 1, 'svc__coef0': 0, 'svc__degree': 3, 'svc__gamma': 'scale', 'svc__kernel': 'rbf', 'svc__tol': 0.0001}


In [169]:
# extract the best model and print the classification report
best_model_svc = grid_search_svc.best_estimator_
svc_predict = best_model_svc.predict(X_test)

In [170]:
print(classification_report(y_test, svc_predict))

              precision    recall  f1-score   support

           0       0.96      0.78      0.86        67
           1       0.91      0.99      0.95       156

    accuracy                           0.92       223
   macro avg       0.94      0.88      0.90       223
weighted avg       0.93      0.92      0.92       223



In [181]:
# repeat, do a pipeline for logistic regression
from sklearn.linear_model import LogisticRegression
pipeline_lr = Pipeline([("tfidf",TfidfVectorizer()),("lr",LogisticRegression(class_weight="balanced"))])

In [182]:
# create the grid search instance for logistic regression
grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kfolds, scoring="roc_auc", n_jobs=-1)

In [183]:
grid_search_lr.fit(X_train, y_train)

In [184]:
print("Best parameters found: ", grid_search_lr.best_params_)

Best parameters found:  {'lr__C': 10, 'lr__max_iter': 1500, 'lr__penalty': 'l2', 'lr__solver': 'saga', 'lr__tol': 0.01}


In [185]:
# extract the best model and print the classification report
best_model_lr = grid_search_lr.best_estimator_
lr_predict = best_model_lr.predict(X_test)

In [186]:
print(classification_report(y_test, lr_predict))

              precision    recall  f1-score   support

           0       0.90      0.79      0.84        67
           1       0.91      0.96      0.94       156

    accuracy                           0.91       223
   macro avg       0.91      0.88      0.89       223
weighted avg       0.91      0.91      0.91       223



In [193]:
# repeat, do a pipeline for random forest
from sklearn.ensemble import RandomForestClassifier
pipeline_rf = Pipeline([("tfidf",TfidfVectorizer()),("rf",RandomForestClassifier(class_weight="balanced"))])

In [194]:
# create the grid search instance for random forest
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kfolds, scoring="roc_auc", n_jobs=-1)

In [195]:
grid_search_rf.fit(X_train, y_train)

In [196]:
print("Best parameters found: ", grid_search_rf.best_params_)

Best parameters found:  {'rf__bootstrap': True, 'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 10, 'rf__n_estimators': 200, 'rf__n_jobs': -1}


In [197]:
# extract the best model and print the classification report
best_model_rf = grid_search_rf.best_estimator_
rf_predict = best_model_rf.predict(X_test)

In [198]:
print(classification_report(y_test, rf_predict))

              precision    recall  f1-score   support

           0       0.75      0.91      0.82        67
           1       0.96      0.87      0.91       156

    accuracy                           0.88       223
   macro avg       0.86      0.89      0.87       223
weighted avg       0.90      0.88      0.89       223



In [200]:
grid_search_rf.best_score_

np.float64(0.9677744827723769)

In [201]:
grid_search_lr.best_score_

np.float64(0.9671569665390092)

In [203]:
grid_search_linsvc.best_score_

np.float64(0.9658967324593706)

In [204]:
grid_search_svc.best_score_

np.float64(0.9631442713703541)