In [None]:
import re
import warnings
from collections import Counter

warnings.filterwarnings(action="ignore")

import numpy as np
import pandas as pd
import torch
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
#from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Mount Google Drive inside the Colab VM; the CSV paths used in the next cell
# live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the pre-cleaned train and test tables; index_col=0 turns the first CSV
# column into the DataFrame index instead of a data column.
# NOTE(review): the folder is spelled "Microsft LMR" in both paths — presumably
# this mirrors the real Drive folder name; confirm before "fixing" the typo.
data_doc = pd.read_csv("/content/drive/MyDrive/Microsft LMR/cleaned_data_doc.csv",index_col=0)
test_doc = pd.read_csv("/content/drive/MyDrive/Microsft LMR/cleaned_test_doc.csv", index_col=0)

In [None]:
data_doc.head()

Unnamed: 0,tweet_id,text,location,cleaned_text
1,ID_1001136696589631488,"Flash floods struck a Maryland city on Sunday,...",Maryland,flash flood strike maryland city sunday wash s...
2,ID_1001136950345109504,State of emergency declared for Maryland flood...,Maryland,state emergency declare maryland flooding
3,ID_1001137334056833024,Other parts of Maryland also saw significant d...,Baltimore Maryland,maryland significant damage sunday storm inclu...
4,ID_1001138374923579392,"Catastrophic Flooding Slams Ellicott City, Mar...",Ellicott City Maryland,catastrophic flooding slam ellicott city maryl...
5,ID_1001138377717157888,WATCH: 1 missing after flash #FLOODING devasta...,Ellicott City Maryland,watch miss flash flooding devastate ellicott c...


In [None]:
data_doc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11849 entries, 1 to 73071
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   tweet_id      11849 non-null  object
 1   text          11849 non-null  object
 2   location      11849 non-null  object
 3   cleaned_text  11849 non-null  object
dtypes: object(4)
memory usage: 462.9+ KB


In [None]:
# Spot-check the raw text, location and cleaned text by index label.
# NOTE(review): `text` and `cleaned_text` are read at label 3 but `location`
# is read at label 43, so the tuple mixes two different rows — confirm whether
# 43 is intentional or a typo for 3.
data_doc['text'][3],data_doc['location'][43],data_doc['cleaned_text'][3]

('Other parts of Maryland also saw significant damage from Sundays storms including this Baltimore city neighborhood, #Dundalk and #Catonsville. Rain totals spanned from 1 to 10 inches across Maryland:  #ECFlood',
 'EllicottCity Maryland',
 'maryland significant damage sunday storm include baltimore city neighborhood dundalk catonsville rain total span inch maryland ecflood')

In [None]:
# Build slimmed-down copies without the id and raw-text columns.
# NOTE(review): neither `train_doc` nor `Test_doc` is read by any later live
# cell visible here (only by commented-out code); the rest of the notebook
# keeps working on `data_doc` / `test_doc`.
train_doc = data_doc.drop(['tweet_id','text'], axis=1)
Test_doc = test_doc.drop(['tweet_id','text'], axis=1)
train_doc.head()

Unnamed: 0,location,cleaned_text
1,Maryland,flash flood strike maryland city sunday wash s...
2,Maryland,state emergency declare maryland flooding
3,Baltimore Maryland,maryland significant damage sunday storm inclu...
4,Ellicott City Maryland,catastrophic flooding slam ellicott city maryl...
5,Ellicott City Maryland,watch miss flash flooding devastate ellicott c...


In [None]:

# Define the annotation function
def annotate_text(row):
    """Locate every word of a row's ``location`` inside its ``cleaned_text``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row or a dict) exposing
            ``row['cleaned_text']`` and ``row['location']``. The location is
            treated as whitespace-separated words and each word is searched
            for independently, ignoring case.

    Returns:
        list[tuple[int, int, str]]: ``(start, end, 'LOC')`` character spans
        into ``cleaned_text`` (``end`` exclusive). Spans are grouped by
        location word in order of first appearance and, within one word, by
        position in the text. The list is empty when either field is not a
        string or when nothing matches.
    """
    text = row['cleaned_text']
    location = row['location']
    # A missing CSV value arrives as float NaN, which has no .split().
    if not isinstance(text, str) or not isinstance(location, str):
        return []

    annotations = []
    seen = set()
    # split() without an argument discards the empty strings that repeated or
    # leading/trailing spaces would otherwise produce; an empty search word
    # compiles to a pattern that matches (zero-width) at every word boundary.
    for loc in location.split():
        key = loc.lower()  # search is case-insensitive, so dedupe the same way
        if key in seen:
            continue  # a repeated word would only duplicate the same spans
        seen.add(key)
        # Lookarounds instead of \b: they behave identically for plain words
        # but still match words that begin or end with punctuation ("U.S.").
        pattern = re.compile(r'(?<!\w){}(?!\w)'.format(re.escape(loc)), re.IGNORECASE)
        annotations.extend(
            (match.start(), match.end(), 'LOC') for match in pattern.finditer(text)
        )
    return annotations

In [None]:
# Tag every training row with the character spans of its location words.
# (`re`, which annotate_text needs, is imported in the notebook's import cell.)
data_doc['annotations'] = data_doc.apply(annotate_text, axis=1)

In [None]:
data_doc['annotations'][5]

[(36, 44, 'LOC'), (45, 49, 'LOC'), (50, 58, 'LOC')]

In [None]:
def prepare_data(df, tokenizer):
    """Tokenize each row of ``df`` and derive one BILOU tag per token.

    Args:
        df: DataFrame with a ``cleaned_text`` column and an ``annotations``
            column holding ``(start, end, label)`` character spans
            (``end`` exclusive) into that text.
        tokenizer: Callable invoked as ``tokenizer(text,
            return_offsets_mapping=True, truncation=True,
            padding='max_length')``. Its result must expose ``input_ids`` and
            ``offset_mapping`` (one ``(start, end)`` pair per token) and
            support ``.pop``.

    Returns:
        list[tuple]: One ``(tokenized_input, labels)`` pair per row.
        ``tokenized_input`` is the tokenizer output without
        ``offset_mapping``; ``labels`` is a list of tag strings aligned with
        ``input_ids``. Each tag is ``'O'`` or ``'<prefix>-<label>'`` where the
        prefix is ``U`` (token covers the whole span), ``B`` (first token),
        ``L`` (last token) or ``I`` (in between).
    """
    data = []
    for _, row in df.iterrows():
        text = row['cleaned_text']
        annotations = row['annotations']
        tokenized_input = tokenizer(text, return_offsets_mapping=True, truncation=True, padding='max_length')
        offsets = tokenized_input['offset_mapping']
        labels = ['O'] * len(tokenized_input['input_ids'])

        for start, end, label in annotations:
            for idx, (offset_start, offset_end) in enumerate(offsets):
                # Zero-width offsets belong to tokens with no source text
                # (special/padding tokens report (0, 0)); without this guard
                # they would all be tagged whenever a span starts at char 0.
                if offset_start == offset_end:
                    continue
                # Only tokens lying entirely inside the span take part in it.
                if offset_start < start or offset_end > end:
                    continue
                opens_span = offset_start == start
                closes_span = offset_end == end
                if opens_span and closes_span:
                    labels[idx] = f'U-{label}'
                elif opens_span:
                    labels[idx] = f'B-{label}'
                elif closes_span:
                    labels[idx] = f'L-{label}'
                else:
                    labels[idx] = f'I-{label}'

        # The offsets were only needed for alignment; drop them from the
        # features that are stored alongside the labels.
        tokenized_input.pop('offset_mapping')
        data.append((tokenized_input, labels))
    return data

In [None]:
# Hub id of the checkpoint to fine-tune. The tokenizer is loaded from the same
# id that the model is later loaded from, so both come from one repository.
model_name = 'rsuwaileh/IDRISI-LMR-EN-timebased-typeless'
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Function to extract the most frequent label for stratification
#def most_frequent_label(annotations):
#    if not annotations:
#        return 'O'
#    labels = [label for _, _, label in annotations]
#    most_common_label, _ = Counter(labels).most_common(1)[0]
#    return most_common_label

#train_doc['stratify_label'] = train_doc['annotations'].apply(most_frequent_label)

In [None]:
# Hold out 20% of the annotated rows for validation; random_state pins the
# shuffle so the split is the same on every run.
train_df, val_df = train_test_split(data_doc, test_size=0.2, random_state=42)

# Prepare training and validation data
# (tokenizes every row and aligns the character-level spans to tokens)
train_data = prepare_data(train_df, tokenizer)
val_data = prepare_data(val_df, tokenizer)

In [None]:
train_data[0]

In [None]:
# Custom dataset class
class MSL_DataSet(Dataset):
    """Torch dataset over ``(tokenized_input, labels)`` pairs.

    Each item is converted to ``torch.long`` tensors on access. Positions
    whose ``attention_mask`` is 0 (padding) get the label ``IGNORE_INDEX``
    instead of a real class id, so they are skipped by a cross-entropy loss
    that uses that ignore index and by metric code that filters on it.

    Args:
        data: Sequence of ``(tokenized_input, labels)`` pairs, where
            ``tokenized_input`` exposes ``input_ids`` and ``attention_mask``
            and ``labels`` is a list of tag strings.
            Assumes ``labels`` has one entry per ``input_ids`` position —
            TODO confirm for data not built by ``prepare_data``.
        tokenizer: Stored on the instance; not read by this class.
        label_map: Mapping from tag string to integer class id.
    """

    # Default ``ignore_index`` of ``torch.nn.CrossEntropyLoss``.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, label_map):
        self.data = data
        self.tokenizer = tokenizer
        self.label_map = label_map

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for one example."""
        tokenized_input, labels = self.data[index]
        input_ids = torch.tensor(tokenized_input['input_ids'], dtype=torch.long)
        attention_mask = torch.tensor(tokenized_input['attention_mask'], dtype=torch.long)
        labels = torch.tensor([self.label_map[label] for label in labels], dtype=torch.long)
        # Padding carries no information: keeping it as class 'O' would let
        # the (usually much longer) padded tail dominate loss and metrics.
        labels[attention_mask == 0] = self.IGNORE_INDEX
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


In [None]:
# Tag inventory: a tag's position in `label_list` is its integer class id,
# and `label_map` is the reverse lookup (tag string -> class id).
label_list = ['O', 'B-LOC', 'I-LOC', 'L-LOC', 'U-LOC']
label_map = dict(zip(label_list, range(len(label_list))))

In [None]:
# Wrap the prepared (features, tags) pairs so the Trainer can index them and
# receive tensors with integer class ids.
train_dataset = MSL_DataSet(train_data, tokenizer, label_map)
val_dataset = MSL_DataSet(val_data, tokenizer, label_map)

In [None]:
# Compute metrics function
def compute_metrics(p):
    """Score token-level predictions for the Trainer.

    Args:
        p: Object exposing ``predictions`` (class scores; the arg-max is taken
            over axis 2) and ``label_ids`` (gold class ids).

    Returns:
        dict: ``accuracy`` plus support-weighted ``precision``, ``recall`` and
        ``f1``, computed over every position whose gold label is not -100.
    """
    predicted = np.argmax(p.predictions, axis=2).flatten()
    gold = p.label_ids.flatten()

    # Positions labelled -100 are excluded from scoring.
    keep = gold != -100
    predicted, gold = predicted[keep], gold[keep]

    scores = {"accuracy": metrics.accuracy_score(gold, predicted)}
    weighted_scorers = (
        ("precision", metrics.precision_score),
        ("recall", metrics.recall_score),
        ("f1", metrics.f1_score),
    )
    for name, scorer in weighted_scorers:
        scores[name] = scorer(gold, predicted, average='weighted')
    return scores


In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

print(device)

cuda


In [None]:
# Load the checkpoint with a token-classification head that has one output
# per entry of `label_list`.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))
# NOTE(review): the bare `model` below echoes the full module tree into the
# cell output; consider removing it to keep the notebook readable.
model

pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at rsuwaileh/IDRISI-LMR-EN-timebased-typeless were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024

In [None]:
# Move the model's parameters to the selected device.
# NOTE(review): as the last expression of the cell this also echoes the whole
# module repr; a trailing `;` would suppress that output.
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024

In [None]:
# Training arguments
# Checkpoints go to ./results and logs to ./logs (relative to the working dir).
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=4,  # x gradient_accumulation_steps=2 -> 8 sequences per optimizer step per device
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # NOTE(review): newer transformers releases spell this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    gradient_accumulation_steps=2,
)

# Trainer
# Ties together the model, the arguments above, both datasets and the
# token-level metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


In [None]:
print(train_data[2])

({'input_ids': [101, 2302, 8386, 1631, 1940, 15615, 2149, 170, 1377, 2620, 2640, 1170, 2737, 5559, 177, 1733, 4566, 3238, 3073, 2599, 12964, 3113, 4929, 3644, 2445, 1297, 3290, 1470, 2029, 2400, 1155, 3354, 12801, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
# Fine-tune the model with the settings configured on `trainer`.
# NOTE(review): the output recorded under this cell ends in KeyboardInterrupt,
# so the saved notebook does not show a completed training run.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
trainer.evaluate()

In [None]:
# Save the model
# Weights and tokenizer files are written to the same local directory so that
# both can later be reloaded from 'location_ner_model'.
model.save_pretrained('location_ner_model')
tokenizer.save_pretrained('location_ner_model')

In [None]:

# Function to predict locations
def get_predictions(text, model, tokenizer, labels=None):
    """Predict location mentions in ``text`` and return them as one string.

    Args:
        text: Input text. Anything that is not a non-blank string (e.g. the
            NaN pandas produces for an empty CSV field) yields ``""``.
        model: Token-classification model; called as ``model(**inputs)`` and
            expected to return an object with a ``logits`` attribute. When it
            exposes a ``device`` attribute the inputs are moved there first.
        tokenizer: Tokenizer called with ``return_tensors="pt"``; must provide
            ``convert_ids_to_tokens``. If it exposes ``all_special_tokens``,
            those tokens are never reported as locations.
        labels: Optional sequence mapping class id -> tag string. Defaults to
            the notebook-level ``label_list``.

    Returns:
        str: The decoded locations joined by single spaces, in text order.
        Spans are decoded from the B-/I-/L-/U-/O tag prefixes; ``I-``/``L-``
        tokens with no open span are dropped. Pieces prefixed with ``##``
        (WordPiece continuations, as produced by BERT-style tokenizers) are
        glued onto the preceding piece instead of being space-separated.
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    if labels is None:
        labels = label_list

    # Tokenize input text
    tokenized_input = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inputs are created on the CPU; the forward pass fails if the model's
    # weights live on another device, so follow the model.
    device = getattr(model, "device", None)
    if device is not None:
        tokenized_input = {name: tensor.to(device) for name, tensor in tokenized_input.items()}

    # Get predictions from the model
    with torch.no_grad():
        output = model(**tokenized_input)

    predictions = torch.argmax(output.logits, dim=2)

    # .tolist() works for tensors on any device, unlike .numpy().
    predicted_labels = [labels[class_id] for class_id in predictions[0].tolist()]
    tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"][0].tolist())
    special_tokens = set(getattr(tokenizer, "all_special_tokens", ()))

    locations = []
    current_location = []

    def flush():
        # Close the span that is currently being collected, if any.
        if current_location:
            locations.append(_join_wordpieces(current_location))
            current_location.clear()

    for token, label in zip(tokens, predicted_labels):
        if token in special_tokens:
            label = "O"  # markers such as [CLS]/[SEP] are never part of a location
        if label.startswith("B-"):
            flush()
            current_location.append(token)
        elif label.startswith("U-"):
            flush()  # emit any open span first so output stays in text order
            locations.append(_join_wordpieces([token]))
        elif label.startswith(("I-", "L-")):
            if current_location:
                current_location.append(token)
                if label.startswith("L-"):
                    flush()
        elif label == "O":
            flush()

    # Append any remaining location
    flush()

    return " ".join(locations)


def _join_wordpieces(pieces):
    """Join tokens into a space-separated string, merging ``##`` continuation pieces."""
    words = []
    for piece in pieces:
        is_continuation = piece.startswith("##")
        fragment = piece[2:] if is_continuation else piece
        if is_continuation and words:
            words[-1] += fragment
        else:
            words.append(fragment)
    return " ".join(words)


In [None]:
# Reload the fine-tuned weights and tokenizer from the directory written by
# the save cell. Note that this rebinds the notebook-level `tokenizer` name.
# NOTE(review): the NameError recorded under this cell means it was last run
# without the import cell having been executed in that kernel session.
trained_model = AutoModelForTokenClassification.from_pretrained('location_ner_model')
tokenizer = AutoTokenizer.from_pretrained('location_ner_model')

NameError: name 'AutoModelForTokenClassification' is not defined

In [None]:
# Run the model on every test row, one text at a time, and store the decoded
# locations in test_doc['location'] (the frame is modified in place).
test_doc['location'] = test_doc['cleaned_text'].apply(lambda x: get_predictions(x, trained_model, tokenizer))

In [None]:
# Build the submission frame by dropping the two text columns from test_doc.
submission = test_doc.drop(["text", "cleaned_text"], axis="columns")

In [None]:
# Write the submission to Drive; index=False leaves the DataFrame index out of
# the file.
# NOTE(review): the file is named "distill_baseline.csv" although the model
# loaded above is a BertForTokenClassification — confirm the name is intended.
submission.to_csv("/content/drive/MyDrive/Microsft LMR/distill_baseline.csv", index=False)

In [None]:
submission

Unnamed: 0,tweet_id,location
0,ID_1001154804658286592,Jamaica Haiti US
1,ID_1001155505459486720,Ellicott City Maryland
2,ID_1001155756371136512,Ellicott City Maryland
3,ID_1001159445194399744,Ellicott Maryland
4,ID_1001164907587538944,Ellicott City Maryland
...,...,...
2937,ID_915017703055749120,Mexico
2938,ID_915026957758328832,Mexico San Antonio
2939,ID_915253441726889984,Calgary Alberta
2940,ID_915971980859400192,Mexico Oaxaca
