In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.metrics import accuracy_score
import pandas as pd
import torch
import spacy

In [None]:
# Use the GPU when the runtime has one; otherwise fall back to the CPU so the
# later `.to(device)` calls do not raise on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Mount Google Drive so that files under MyDrive (the dataset read below and the
# saved model written later) are reachable through the local filesystem.
# Requires the `google.colab` package.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Read only the two columns the notebook uses: the tweet text and its target
# value (used as the class label further down).
TWEETS_CSV = "/content/drive/MyDrive/ML_Project/tweets.csv"
TWEET_COLUMNS = ['text', 'target']
df = pd.read_csv(TWEETS_CSV, usecols=TWEET_COLUMNS)

In [None]:
# Load the spaCy pipeline once; `nlp` is reused by the text-normalisation cell
# and by the location-extraction cell later in the notebook.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

In [None]:
# Normalise every tweet: keep only alphabetic, non-stop-word tokens and replace
# each one with its lemma, joined back into a single space-separated string.
#
# `nlp.pipe` streams the texts through spaCy in batches, which is considerably
# faster than calling `nlp(x)` once per row via `.apply`. The parser and NER
# components are switched off for this pass because only `lemma_`, `is_stop`
# and `is_alpha` are read here; the loaded `nlp` object itself is not modified.
docs = nlp.pipe(df['text'].tolist(), batch_size=256, disable=['parser', 'ner'])
df['text'] = [
    ' '.join(token.lemma_ for token in doc if not token.is_stop and token.is_alpha)
    for doc in docs
]

In [None]:
# Tokenizer and sequence-classification model are both loaded from the same
# pretrained checkpoint name.
CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(CHECKPOINT)

In [None]:
# Move the model onto the selected device; input tensors are moved to the same
# device before each forward pass in the cells below.
model = model.to(device)

In [None]:
# Tokenise the whole corpus in a single call. Sequences are truncated to at most
# 128 tokens, padded to a common length, and returned as PyTorch tensors.
texts = df['text'].tolist()
inputs = tokenizer(
    texts,
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# One label per row, in the same row order as `inputs`.
targets = df['target'].tolist()
labels = torch.tensor(targets)

In [None]:
# AdamW over all model parameters; the learning rate is named so it is easy to tune.
LEARNING_RATE = 1e-5
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Fine-tune the classifier on the tokenised tweets.
#
# Changes relative to a plain per-example loop:
#   * `model.train()` is called explicitly. `from_pretrained` returns the model
#     in evaluation mode, so without this call dropout stays disabled during
#     fine-tuning.
#   * Examples are visited in a freshly shuffled order every epoch and grouped
#     into mini-batches, instead of one example at a time in file order.
NUM_EPOCHS = 1
BATCH_SIZE = 16
LOG_EVERY = 100   # print progress every this many batches
SHUFFLE_SEED = 42

num_examples = inputs['input_ids'].size(0)
# Dedicated generator so the shuffle is reproducible without touching the global RNG.
shuffle_rng = torch.Generator().manual_seed(SHUFFLE_SEED)

model.train()
for epoch in range(NUM_EPOCHS):
    permutation = torch.randperm(num_examples, generator=shuffle_rng)
    for step, start in enumerate(range(0, num_examples, BATCH_SIZE), start=1):
        batch_idx = permutation[start:start + BATCH_SIZE]
        input_ids = inputs['input_ids'][batch_idx].to(device)
        attention_mask = inputs['attention_mask'][batch_idx].to(device)
        batch_labels = labels[batch_idx].to(device)

        optimizer.zero_grad()
        # Passing `labels` makes the model compute the classification loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        processed = min(start + BATCH_SIZE, num_examples)
        if step % LOG_EVERY == 0 or processed == num_examples:
            print(f'Processed {processed} out of {num_examples}')

In [None]:
# Predict a class for every tokenised example.
#
# Inference is run in mini-batches under a single `torch.no_grad()` context
# rather than one forward pass per example; the resulting `predictions` list has
# one integer class id per row, in row order.
#
# NOTE(review): `inputs` is the same tensor set the model was trained on, so
# these are predictions on training data, not on a held-out set.
EVAL_BATCH_SIZE = 64

model.eval()  # disable dropout for deterministic predictions
predictions = []
num_examples = inputs['input_ids'].size(0)
with torch.no_grad():
    for start in range(0, num_examples, EVAL_BATCH_SIZE):
        end = start + EVAL_BATCH_SIZE
        input_ids = inputs['input_ids'][start:end].to(device)
        attention_mask = inputs['attention_mask'][start:end].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # argmax over the class dimension gives the predicted label per example.
        predictions.extend(torch.argmax(outputs.logits, dim=-1).tolist())

In [None]:
# Compare the predicted class ids with the true labels.
# NOTE(review): `predictions` were produced from the same `inputs` used for
# training, so this figure is training-set accuracy.
true_labels = labels.tolist()
accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9085312225153914


In [None]:
# Classify a single hand-written sentence.
test_sentence = "There is a cyclone in Florida"

# The classifier was fine-tuned on lemmatised text with stop words and
# non-alphabetic tokens removed, so the same normalisation is applied here
# before tokenising; otherwise inference sees differently shaped input than
# training did.
test_text = ' '.join(
    token.lemma_ for token in nlp(test_sentence)
    if not token.is_stop and token.is_alpha
)
test_input = tokenizer(test_text, return_tensors='pt', truncation=True, padding=True)
test_input = {k: v.to(device) for k, v in test_input.items()}

# Inference only: make sure dropout is off and skip building the autograd graph.
model.eval()
with torch.no_grad():
    test_output = model(**test_input)
test_prediction = torch.argmax(test_output.logits, dim=-1)
print(f'Test sentence: "{test_sentence}" is {"a disaster" if test_prediction.item() else "not a disaster"}')

Test sentence: "There is a cyclone in Florida" is a disaster


In [None]:
# Serialise the fine-tuned model object to Google Drive with pickle.
# NOTE(review): pickling the whole model ties the file to the installed library
# versions, and `pickle.load` must only be used on files from a trusted source.
# Consider `model.save_pretrained(...)` for a more portable artifact.
import pickle

MODEL_PICKLE_PATH = '/content/drive/MyDrive/ML_Project/mlmodel.pkl'
with open(MODEL_PICKLE_PATH, mode='wb') as pickle_file:
    pickle.dump(model, pickle_file)

In [None]:
# Extract place names from the sentence using spaCy named-entity recognition.
test_sentence = "There is a cyclone in Florida"

# Reuse the `nlp` pipeline loaded near the top of the notebook; importing spaCy
# and loading the same model a second time only costs time and memory.
doc = nlp(test_sentence)

# Keep only entities labelled 'GPE'.
locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
print("Disaster Locations:", locations)

Disaster Locations: ['Florida']


In [None]:
# Classify a figurative use of "cyclone" as a sanity check.
test_sentence = "My life is a cyclone"

# The classifier was fine-tuned on lemmatised text with stop words and
# non-alphabetic tokens removed, so the same normalisation is applied here
# before tokenising; otherwise inference sees differently shaped input than
# training did.
test_text = ' '.join(
    token.lemma_ for token in nlp(test_sentence)
    if not token.is_stop and token.is_alpha
)
test_input = tokenizer(test_text, return_tensors='pt', truncation=True, padding=True)
test_input = {k: v.to(device) for k, v in test_input.items()}

# Inference only: make sure dropout is off and skip building the autograd graph.
model.eval()
with torch.no_grad():
    test_output = model(**test_input)
test_prediction = torch.argmax(test_output.logits, dim=-1)
print(f'Test sentence: "{test_sentence}" is {"a disaster" if test_prediction.item() else "not a disaster"}')

Test sentence: "My life is a cyclone" is not a disaster
