In [1]:
# Mount Google Drive inside the Colab runtime so files under /content/drive
# can be read and written by the following cells.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import sys

sys.path.append('/content/drive/MyDrive/notebook_data')
#!pip install -r /content/drive/MyDrive/notebook_data/requirements.txt

import joblib
import numpy as np
import pandas as pd

from preprocess import preprocess_dataframe, preprocess_dataframe_sw

In [3]:
# Load the labeled TSV (assumed to contain a column called 'text') and discard
# the metadata columns that are not needed downstream, then preview the frame.
df = pd.read_csv(
    '/content/drive/MyDrive/notebook_data/datasets/merged-labeled-reduced.tsv',
    sep='\t',
).drop(columns=['id', 'author', 'handle', 'timestamp', 'query'])
df.head()

Unnamed: 0,text,label
0,#EartthquakeReport #TsunamiReport for M7.2 #Ea...,earthquake
1,Tsunami warning lifted after earthquake off Al...,earthquake
2,"First Temblor map (AFAIK) on bluesky! Today, a...",earthquake
3,\U0001f9ea\n\nA M7.2 earthquake occurred offsh...,earthquake
4,Earthquake waves from the M7.2 earthquake in A...,earthquake


In [4]:
# Run the project's preprocessing helper (the frame shown afterwards carries an
# extra 'cleaned' column; presumably the "_sw" variant concerns stop words --
# confirm in preprocess.py), drop every row that has a missing value in any
# column, and display the number of rows per label.
df = preprocess_dataframe_sw(df).dropna()
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
earthquake,5656
wildfire,5634
other,4981
hurricane,4751
flood,3312
tornado,766


In [5]:
# Persist the cleaned frame as a tab-separated file (row index omitted) and
# preview the first rows.
cleaned_tsv_path = '/content/drive/MyDrive/notebook_data/datasets/merged-labeled-reduced-cleaned_sw.tsv'
df.to_csv(cleaned_tsv_path, index=False, sep='\t')
df.head()

Unnamed: 0,text,label,cleaned
0,#EartthquakeReport #TsunamiReport for M7.2 #Ea...,earthquake,eartthquakereport tsunamireport for m72 earthq...
1,Tsunami warning lifted after earthquake off Al...,earthquake,tsunami warning lifted after earthquake off al...
2,"First Temblor map (AFAIK) on bluesky! Today, a...",earthquake,first temblor map afaik on bluesky today a mag...
3,\U0001f9ea\n\nA M7.2 earthquake occurred offsh...,earthquake,test tube a m72 earthquake occurred offshore a...
4,Earthquake waves from the M7.2 earthquake in A...,earthquake,earthquake waves from the m72 earthquake in al...


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Build the feature/target series: cleaned text in, string class label out.
X, y = df['cleaned'], df['label']

# Map the string labels to integer codes; the fitted encoder is kept so the
# codes can be translated back to label names later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(f"Labels before conversion: {np.unique(y)}")
print(f"Labels after conversion: {np.unique(y_encoded)}")

# 80/20 split, stratified on the encoded label so both parts keep the same
# class proportions; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=24,
    stratify=y_encoded,
)

print(f"Training with {len(X_train)} samples; Testing with {len(X_test)} samples")

# Report how many samples of each encoded class landed in each part.
unique_vals, counts = np.unique(y_train, return_counts=True)
print(f"Class distribution in training: {dict(zip(unique_vals.tolist(), counts.tolist()))}")

unique_vals_test, counts_test = np.unique(y_test, return_counts=True)
print(f"Class distribution in testing: {dict(zip(unique_vals_test.tolist(), counts_test.tolist()))}")

Labels before conversion: ['earthquake' 'flood' 'hurricane' 'other' 'tornado' 'wildfire']
Labels after conversion: [0 1 2 3 4 5]
Training with 20080 samples; Testing with 5020 samples
Class distribution in training: {0: 4525, 1: 2649, 2: 3801, 3: 3985, 4: 613, 5: 4507}
Class distribution in testing: {0: 1131, 1: 663, 2: 950, 3: 996, 4: 153, 5: 1127}


In [10]:
# Export the train/test splits and the fitted label encoder for reuse elsewhere.
# No TF-IDF vectorizer is fitted or saved in this cell, even though the
# encoder's folder is named labelEncoder_tfidfVectorizer.
# NOTE: the tuple is stored as (X_train, y_train, X_test, y_test), which differs
# from train_test_split's return order -- unpack it in this order when loading.
split_export_path = '/content/drive/MyDrive/notebook_data/exports/train_test/train_test_split_v2.pkl'
encoder_export_path = '/content/drive/MyDrive/notebook_data/exports/labelEncoder_tfidfVectorizer/label_encoder_v2.pkl'

# joblib.dump does not create missing parent folders, so make sure they exist
# first; exist_ok=True keeps this a no-op when the folders are already there.
for export_path in (split_export_path, encoder_export_path):
    os.makedirs(os.path.dirname(export_path), exist_ok=True)

joblib.dump((X_train, y_train, X_test, y_test), split_export_path)
joblib.dump(label_encoder, encoder_export_path)

['/content/drive/MyDrive/notebook_data/exports/labelEncoder_tfidfVectorizer/label_encoder_v2.pkl']