# SETUP

In [2]:
import pandas as pd
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import AutoModel, AutoTokenizer
from transformers import EvalPrediction
import transformers
from transformers import TrainingArguments, Trainer
from transformers import PreTrainedTokenizerFast

# Import Model

In [3]:
labels = ['Ignore', 'Machine_alert', 'Human_alert', 'Damages_alert']
id2label = {0: 'Ignore', 1: 'Machine_alert', 2: 'Human_alert', 3: 'Damages_alert'}
label2id = {'Ignore': 0, 'Machine_alert': 1, 'Human_alert': 2, 'Damages_alert': 3}

In [4]:
# specify the path to the saved model
model_path = 'vinai3/bertweet-large/checkpoint-130'
mymodel = BertForSequenceClassification.from_pretrained(model_path, local_files_only=True, problem_type="multi_label_classification", num_labels=len(labels), id2label=id2label, label2id=label2id)
# tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai3/bertweet-large/checkpoint-130 were not used when initializing BertForSequenceClassification: ['roberta.encoder.layer.7.output.LayerNorm.bias', 'roberta.encoder.layer.5.attention.self.query.bias', 'roberta.encoder.layer.19.output.LayerNorm.weight', 'roberta.encoder.layer.20.attention.self.value.bias', 'roberta.encoder.layer.22.attention.self.query.weight', 'roberta.encoder.layer.6.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.3.attention.self.key.weight', 'roberta.encoder.layer.12.output.LayerNorm.weight', 'roberta.encoder.layer.23.intermediate.dense.weight', 'roberta.encoder.layer.18.attention.output.LayerNorm.weight', 'roberta.encoder.layer.23.attention.self.key.bias', 'roberta.encoder.layer.8.attention.self.key.bias', 'robe

In [5]:
tweet_label = []
tweet_text = []

def predict_tweet(x, mymodel):
    # encoding = tokenizer(x, return_tensors="pt", padding="max_length", truncation=True, max_length=256)
    encoding = tokenizer(x, return_tensors="pt")
    encoding = {k: v.to(mymodel.device) for k,v in encoding.items()}
    outputs = mymodel(**encoding)
    logits = outputs.logits
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    return labels[probs.argmax()]

df_tweets = pd.read_csv("sample_tweet_lab0123.csv")
list_tweets = df_tweets["text"].to_list()

In [6]:
import tqdm

In [7]:
for x in tqdm.tqdm(list_tweets):
    predicted_lab = predict_tweet(x, mymodel)
    tweet_label.append(predicted_lab)
    tweet_text.append(x)



100%|██████████| 291/291 [00:58<00:00,  4.98it/s]


In [8]:
df_tweets_labeled = pd.DataFrame({'tweet_text': tweet_text, 'tweet_label':  tweet_label})

In [9]:
df_tweets_labeled["tweet_label"].value_counts()

Ignore           277
Damages_alert     14
Name: tweet_label, dtype: int64

In [10]:
tweet_label_try = []
tweet_text_try = []
list_tweets_try = ["there is a earthquake happening Magnitude 3.4", "the ground is shaking", "my name is Olivier and happy to follow the Becode Programme #AI"]
for x_try in tqdm.tqdm(list_tweets_try):
    predicted_lab_try = predict_tweet(x_try, mymodel)
    tweet_label_try.append(predicted_lab_try)
    tweet_text_try.append(x_try)
df_tweets_labeled_try = pd.DataFrame({'tweet_text': tweet_text_try, 'tweet_label':  tweet_label_try})
df_tweets_labeled_try["tweet_label"].value_counts()


100%|██████████| 3/3 [00:00<00:00,  7.74it/s]


Ignore           2
Damages_alert    1
Name: tweet_label, dtype: int64

In [11]:
df_tweets_labeled_try.head()

Unnamed: 0,tweet_text,tweet_label
0,there is a earthquake happening Magnitude 3.4,Ignore
1,the ground is shaking,Damages_alert
2,my name is Olivier and happy to follow the Bec...,Ignore


# code for data engineer

In [12]:
df_tweets = pd.read_csv("sample_tweet_lab0123.csv")
list_tweets = df_tweets["text"].to_list()
model_path = 'mymodel2'


In [13]:
#code for data engineer

def predict(list_tweets, model_path) :
    labels = ['Ignore', 'Machine_alert', 'Human_alert', 'Damages_alert']
    id2label = {0: 'Ignore', 1: 'Machine_alert', 2: 'Human_alert', 3: 'Damages_alert'}
    label2id = {'Ignore': 0, 'Machine_alert': 1, 'Human_alert': 2, 'Damages_alert': 3}
    model_path = 'bert-finetuned-sem_eval-english/checkpoint-170'
    mymodel = BertForSequenceClassification.from_pretrained(model_path, local_files_only=True, problem_type="multi_label_classification", num_labels=len(labels), id2label=id2label, label2id=label2id)
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")
    tweet_label = []
    tweet_text = []

    def predict_tweet(x, mymodel):
        encoding = tokenizer(x, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        encoding = {k: v.to(mymodel.device) for k,v in encoding.items()}
        outputs = mymodel(**encoding)
        logits = outputs.logits
        sigmoid = torch.nn.Sigmoid()
        probs = sigmoid(logits.squeeze().cpu())
        return labels[probs.argmax()]
    
    for x in (list_tweets):
        predicted_lab = predict_tweet(x, mymodel)
        tweet_label.append(predicted_lab)
        tweet_text.append(x)
    df_tweets_labeled = pd.DataFrame({'tweet_text': tweet_text, 'tweet_label':  tweet_label})
    return df_tweets_labeled


In [14]:
#Code 2 for data engineers

labels = ['Ignore', 'Machine_alert', 'Human_alert', 'Damages_alert']
id2label = {0: 'Ignore', 1: 'Machine_alert', 2: 'Human_alert', 3: 'Damages_alert'}
label2id = {'Ignore': 0, 'Machine_alert': 1, 'Human_alert': 2, 'Damages_alert': 3}
model_path = 'mymodel2'
mymodel = BertForSequenceClassification.from_pretrained(model_path, local_files_only=True, problem_type="multi_label_classification", num_labels=len(labels), id2label=id2label, label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")

def predict_tweet(x, mymodel):
    encoding = tokenizer(x, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    encoding = {k: v.to(mymodel.device) for k,v in encoding.items()}
    outputs = mymodel(**encoding)
    logits = outputs.logits
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    label = labels[probs.argmax()]
    dic_tweet_label = {x:label}
    return dic_tweet_label


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at mymodel2 were not used when initializing BertForSequenceClassification: ['roberta.encoder.layer.7.output.LayerNorm.bias', 'roberta.encoder.layer.5.attention.self.query.bias', 'roberta.encoder.layer.19.output.LayerNorm.weight', 'roberta.encoder.layer.20.attention.self.value.bias', 'roberta.encoder.layer.22.attention.self.query.weight', 'roberta.encoder.layer.6.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.3.attention.self.key.weight', 'roberta.encoder.layer.12.output.LayerNorm.weight', 'roberta.encoder.layer.23.intermediate.dense.weight', 'roberta.encoder.layer.18.attention.output.LayerNorm.weight', 'roberta.encoder.layer.23.attention.self.key.bias', 'roberta.encoder.layer.8.attention.self.key.bias', 'roberta.encoder.layer.1.attentio

In [15]:
encoded_dataset.set_format("torch")

NameError: name 'encoded_dataset' is not defined

In [None]:
x = list_tweets[0]
x

'A message from the Queen to the people of  # Mexico following the resent earthquake . '

In [None]:
predict_tweet(x, mymodel)

{'A message from the Queen to the people of  # Mexico following the resent earthquake . ': 'Human_alert'}

In [None]:
predict(list_tweets, model_path)

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at mymodel2 were not used when initializing BertForSequenceClassification: ['roberta.encoder.layer.13.attention.output.LayerNorm.weight', 'roberta.encoder.layer.16.attention.self.query.bias', 'roberta.encoder.layer.19.attention.output.LayerNorm.weight', 'roberta.encoder.layer.22.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.10.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.self.value.weight', 'roberta.encoder.layer.14.output.dense.weight', 'roberta.encoder.layer.13.intermediate.dense.bias', 'roberta.encoder.layer.6.output.LayerNorm.bias', 'roberta.encoder.layer.14.output.dense.bias', 'roberta.encoder.layer.6.output.dense.bias', 'roberta.encoder.layer.12.attention.self.query.bias', 'roberta.encoder.layer.1.attention.outp

Unnamed: 0,tweet_text,tweet_label
0,A message from the Queen to the people of # M...,Human_alert
1,"As we r commemorating the 1906 quake today, we...",Human_alert
2,Xavier is crowdrising for Relief support for E...,Human_alert
3,Thinking of our friends around Kaikoura and ot...,Human_alert
4,RT @ abpnewstv : # EcuadorEarthquake : At ...,Human_alert
...,...,...
286,Ive been awake all night with these bloody aft...,Human_alert
287,Some clown was letting off fireworks last nigh...,Human_alert
288,Took a tour of Quito Ecuador after the earthqu...,Human_alert
289,RT @ thesecondascent : 7 . 4 magnitude earth...,Human_alert
