In [8]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import spacy

In [2]:
# Use the GPU when the runtime has one; otherwise fall back to the CPU so that every
# later `.to(device)` call still works on a CPU-only session.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Mount Google Drive inside the Colab VM; the dataset is read from /content/drive below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load only the two columns the notebook uses: the tweet text and its label.
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/tweets.csv",
    usecols=['text', 'target'],
)

In [6]:
# Load the spaCy pipeline 'en_core_web_sm'; `nlp` is used by the text-cleaning cell that follows.
nlp = spacy.load('en_core_web_sm')

In [7]:
# Normalise every tweet: keep only alphabetic, non-stop-word tokens and replace each one
# by its lemma, joined with single spaces.
# nlp.pipe() streams the texts through spaCy in batches and yields one Doc per input text,
# in input order, so the result matches calling nlp(x) row by row without the per-call overhead.
df['text'] = [
    ' '.join(token.lemma_ for token in doc if not token.is_stop and token.is_alpha)
    for doc in nlp.pipe(df['text'])
]

In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'],
    df['target'],
    test_size=0.2,
    random_state=42,
)

In [10]:
# The tokenizer and the classifier are both loaded from the same pretrained checkpoint.
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Move the model's parameters to the selected device before any forward pass.
model = model.to(device)

In [12]:
# Encode the training tweets as PyTorch tensors, truncated to at most 128 tokens and padded
# to a common length, then gather the matching labels into one tensor.
inputs = tokenizer(
    train_texts.tolist(),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
labels = torch.tensor(train_labels.tolist())

In [13]:
# AdamW over all model parameters with a learning rate of 1e-5.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [14]:
# from_pretrained() returns the model in evaluation mode (dropout disabled), so switch to
# training mode explicitly before fine-tuning.
model.train()

# Loop-invariant: number of encoded training examples, also shown in the progress message.
num_examples = len(inputs['input_ids'])

for epoch in range(1):
    for i in range(num_examples):
        # Slicing with i:i+1 keeps the leading batch dimension, so every optimisation
        # step is taken on a batch of exactly one example.
        input_id = inputs['input_ids'][i:i + 1].to(device)
        attention_mask = inputs['attention_mask'][i:i + 1].to(device)
        label = labels[i:i + 1].to(device)

        # Clear gradients left over from the previous step before computing new ones.
        optimizer.zero_grad()
        outputs = model(input_ids=input_id, attention_mask=attention_mask, labels=label)
        loss = outputs.loss  # the model returns a loss because `labels` was passed
        loss.backward()
        optimizer.step()

        # Report progress every 100 examples.
        if (i + 1) % 100 == 0:
            print(f'Processed {i+1} out of {num_examples}')

Processed 100 out of 9096
Processed 200 out of 9096
Processed 300 out of 9096
Processed 400 out of 9096
Processed 500 out of 9096
Processed 600 out of 9096
Processed 700 out of 9096
Processed 800 out of 9096
Processed 900 out of 9096
Processed 1000 out of 9096
Processed 1100 out of 9096
Processed 1200 out of 9096
Processed 1300 out of 9096
Processed 1400 out of 9096
Processed 1500 out of 9096
Processed 1600 out of 9096
Processed 1700 out of 9096
Processed 1800 out of 9096
Processed 1900 out of 9096
Processed 2000 out of 9096
Processed 2100 out of 9096
Processed 2200 out of 9096
Processed 2300 out of 9096
Processed 2400 out of 9096
Processed 2500 out of 9096
Processed 2600 out of 9096
Processed 2700 out of 9096
Processed 2800 out of 9096
Processed 2900 out of 9096
Processed 3000 out of 9096
Processed 3100 out of 9096
Processed 3200 out of 9096
Processed 3300 out of 9096
Processed 3400 out of 9096
Processed 3500 out of 9096
Processed 3600 out of 9096
Processed 3700 out of 9096
Processed 

In [15]:
# Encode the held-out tweets with the same tokenizer settings as the training set
# and collect their labels into a tensor.
test_x = tokenizer(
    test_texts.tolist(),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
test_y_tensor = torch.tensor(test_labels.tolist())

In [16]:
# Put the model in evaluation mode and classify the held-out tweets one at a time,
# collecting the predicted class index of each.
model.eval()
predictions = []
for ids_row, mask_row in zip(test_x['input_ids'], test_x['attention_mask']):
    # Move the row to the device, then add the batch dimension the model expects.
    batch_ids = ids_row.to(device).unsqueeze(0)
    batch_mask = mask_row.to(device).unsqueeze(0)
    with torch.no_grad():  # no gradients are needed for scoring
        logits = model(input_ids=batch_ids, attention_mask=batch_mask).logits
    predicted_class = torch.argmax(logits, dim=-1)
    predictions.append(predicted_class.item())


In [17]:
# Share of held-out tweets whose predicted class equals the true label.
accuracy = accuracy_score(y_true=test_y_tensor.tolist(), y_pred=predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9032541776605101


In [18]:
test_sentence = "There is a cyclone in Florida"
# NOTE(review): the training tweets were lemmatised and stripped of stop words before
# tokenisation, whereas this sentence is fed to the tokenizer raw; confirm that is intended.
test_input = tokenizer(test_sentence, return_tensors='pt', truncation=True, padding=True)
test_input = {k: v.to(device) for k, v in test_input.items()}

# Make the cell self-sufficient: evaluation mode regardless of which cell ran last, and no
# autograd graph for a pure inference call.
model.eval()
with torch.no_grad():
    test_output = model(**test_input)

# Index of the highest logit; a non-zero class is reported as a disaster.
test_prediction = torch.argmax(test_output.logits, dim=-1)
print(f'Test sentence: "{test_sentence}" is {"a disaster" if test_prediction.item() else "not a disaster"}')

Test sentence: "There is a cyclone in Florida" is a disaster


In [19]:
import pickle
# Serialise the whole model object to a pickle file on the mounted Drive.
# NOTE(review): a pickle of the full model object can only be loaded where the same class
# definitions are importable, and the model is pickled while it sits on `device` — confirm the
# consumer can load it (Hugging Face's save_pretrained() is the usual alternative).
with open('/content/drive/MyDrive/Colab Notebooks/mlmodel.pkl', 'wb') as f:
    pickle.dump(model, f)

In [20]:
test_sentence = "There is a cyclone in Florida"
# Reuse the spaCy pipeline already loaded as `nlp` near the top of the notebook instead of
# re-importing spaCy and loading the same model a second time.
doc = nlp(test_sentence)
# Keep only the named entities that spaCy labels 'GPE'.
locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
print("Disaster Locations:", locations)

Disaster Locations: ['Florida']


In [21]:
test_sentence = "My life is a cyclone"
# NOTE(review): the training tweets were lemmatised and stripped of stop words before
# tokenisation, whereas this sentence is fed to the tokenizer raw; confirm that is intended.
test_input = tokenizer(test_sentence, return_tensors='pt', truncation=True, padding=True)
test_input = {k: v.to(device) for k, v in test_input.items()}

# Make the cell self-sufficient: evaluation mode regardless of which cell ran last, and no
# autograd graph for a pure inference call.
model.eval()
with torch.no_grad():
    test_output = model(**test_input)

# Index of the highest logit; a non-zero class is reported as a disaster.
test_prediction = torch.argmax(test_output.logits, dim=-1)
print(f'Test sentence: "{test_sentence}" is {"a disaster" if test_prediction.item() else "not a disaster"}')

Test sentence: "My life is a cyclone" is not a disaster
