<a href="https://colab.research.google.com/github/nixtasy/NLP-with-Disaster-Tweets/blob/main/Natural_Language_Processing_with_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets evaluate transformers==4.28.0
!pip install --upgrade accelerate

import os.path as osp
import re
import string

import datasets
import evaluate
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorWithPadding, pipeline, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers.pipelines.pt_utils import KeyDataset

In [14]:
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive;
# nothing in this notebook mounts it — confirm before a fresh run.
root = "/content/drive/MyDrive/nlp-getting-started"
data_df = pd.read_csv(osp.join(root, 'train.csv'))
submission_df = pd.read_csv(osp.join(root, 'test.csv'))
# Hold out 20% of the labelled data for evaluation. A fixed random_state makes the
# split (and every metric computed on it) reproducible across kernel restarts;
# stratifying on the label column keeps the class ratio the same in both parts.
train_df, test_df = train_test_split(
    data_df,
    test_size=0.2,
    random_state=42,
    stratify=data_df["target"],
)
# Convert the pandas frames to Hugging Face datasets, dropping the shuffled pandas index.
train = Dataset.from_pandas(train_df, split="train", preserve_index = False)
test = Dataset.from_pandas(test_df, split="test", preserve_index = False)

In [15]:
# Drop the id / keyword / location columns from both splits in one pass.
train, test = (
    split.remove_columns(["id", "keyword", "location"])
    for split in (train, test)
)

In [16]:
# Text cleaning: each helper takes one example dict and returns it with a cleaned 'text' field
def remove_URL(example):
    """Delete http(s):// and www. links from ``example['text']``.

    The example dict is modified in place and also returned, which is the
    shape ``Dataset.map`` expects from a per-example function.
    """
    cleaned = re.sub(r'https?://\S+|www\.\S+', '', example['text'])
    example['text'] = cleaned
    return example

def remove_emoji(example):
    """Strip emoji and pictographic symbols from ``example['text']``.

    The example dict is modified in place and returned.

    The character class lists explicit emoji/symbol ranges only. An earlier
    version used the single range U+24C2..U+1F251, which also spans CJK
    ideographs, kana, Hangul and many other scripts, so ordinary non-emoji
    text was deleted. Every range below lies inside what that version
    removed, so nothing new is stripped; only the collateral damage is gone.
    """
    emoji_pattern = re.compile(
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F1E0-\U0001F1FF'  # flags (regional indicator symbols)
        '\U0001F170-\U0001F251'  # enclosed alphanumeric / ideographic supplements
        '\U00002600-\U000027BF'  # miscellaneous symbols and dingbats
        '\U000024C2'             # circled latin capital letter M
        '\U0000FE0F'             # variation selector-16 (emoji presentation)
        ']+',
        flags=re.UNICODE)
    example['text'] = emoji_pattern.sub('', example['text'])
    return example

def remove_html(example):
    """Remove HTML tags and character entities from ``example['text']``.

    Matches ``<...>`` tags (non-greedy, within one line) and lower-case
    named, decimal (``&#NN;``) or hex (``&#xNN;``) entities. The example
    dict is updated in place and returned.
    """
    markup = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    stripped = markup.sub('', example['text'])
    example['text'] = stripped
    return example

def remove_punct(example):
    """Delete every ``string.punctuation`` character from ``example['text']``.

    Leading and trailing whitespace is trimmed afterwards. The example dict
    is updated in place and returned.
    """
    unwanted = frozenset(string.punctuation)
    kept_chars = [ch for ch in example['text'] if ch not in unwanted]
    example['text'] = ''.join(kept_chars).strip()
    return example

# Run each cleaning helper over both splits, in the same order as before:
# URLs, then emoji, then HTML, then punctuation.
for cleaner in (remove_URL, remove_emoji, remove_html, remove_punct):
    train = train.map(cleaner, num_proc=4)
    test = test.map(cleaner, num_proc=4)

# Rename the label column from "target" to "label" on both splits.
train = train.rename_column("target", "label")
test = test.rename_column("target", "label")

# Show the first five cleaned examples as the cell output.
train[:5]

Map (num_proc=4):   0%|          | 0/6090 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1523 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/6090 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1523 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/6090 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1523 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/6090 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1523 [00:00<?, ? examples/s]

{'text': ['My head exploded i swear',
  'reriellechan HE WAS THE LICH KINGS FIRST CASUALTY BLOCK ME BACK I HATE YOU',
  'Keeps askin me what this means\nNot like i got the answers\nPlus if i say the wrong thing\nThis might just turn into a disaster',
  'pantalonesfuego Yeah I grew up in the canyon above LA We had to evacuate a few times',
  'RobotRainstorm Imsort of interested in what fonts theyre using'],
 'label': [0, 1, 1, 1, 0]}

In [17]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def _tokenize_batch(batch):
    """Tokenize the "text" column of a batch, with truncation enabled."""
    return tokenizer(batch["text"], truncation=True)


tokenized_train = train.map(_tokenize_batch, batched=True)
tokenized_test = test.map(_tokenize_batch, batched=True)

# No padding is applied at tokenization time. DataCollatorWithPadding pads each
# batch dynamically to its longest sequence during collation, which is more
# efficient than padding the whole dataset to the maximum length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [18]:
# Accuracy metric object from the `evaluate` library, reused on every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy for one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class
    is the argmax over axis 1 of the predictions (presumably per-class
    logits — confirm against the Trainer).
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [19]:
# Integer class id -> label name, plus the inverse lookup derived from it so
# the two mappings cannot drift apart.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: idx for idx, name in id2label.items()}

In [22]:
# Label configuration passed to from_pretrained alongside the checkpoint name.
label_config = dict(num_labels=2, id2label=id2label, label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    **label_config,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

In [11]:
# Interactive login to the Hugging Face Hub; prompts for an access token.
# Needed because the training arguments below set push_to_hub=True.
! huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential he

In [23]:
training_args = TrainingArguments(
    # Output directory; the Hub repository loaded later in this notebook has the same name.
    output_dir="diaster_distilbert_base_uncased",
    push_to_hub=True,
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    # The 20% hold-out split is used as the per-epoch evaluation set.
    eval_dataset=tokenized_test,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

/content/diaster_distilbert_base_uncased is already a clone of https://huggingface.co/nixtasy/diaster_distilbert_base_uncased. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.392626,0.837163
2,0.421400,0.47642,0.823375
3,0.301400,0.420771,0.835194
4,0.205100,0.513949,0.827971
5,0.205100,0.847989,0.783979
6,0.142400,0.804496,0.815496
7,0.104200,0.929531,0.818779
8,0.075000,0.924056,0.814183
9,0.075000,1.006277,0.808273
10,0.061400,1.034456,0.807617


TrainOutput(global_step=3810, training_loss=0.17659055104092977, metrics={'train_runtime': 193.2326, 'train_samples_per_second': 315.164, 'train_steps_per_second': 19.717, 'total_flos': 521209020986088.0, 'train_loss': 0.17659055104092977, 'epoch': 10.0})

In [24]:
# Upload through the Trainer; the call's return value is shown as the cell output.
trainer.push_to_hub()

Upload file runs/May26_16-58-20_f4d23e4286aa/events.out.tfevents.1685120303.f4d23e4286aa.45480.4:   0%|       …

To https://huggingface.co/nixtasy/diaster_distilbert_base_uncased
   a8b0282..05ecf2b  main -> main

   a8b0282..05ecf2b  main -> main

To https://huggingface.co/nixtasy/diaster_distilbert_base_uncased
   05ecf2b..ab1f47d  main -> main

   05ecf2b..ab1f47d  main -> main



'https://huggingface.co/nixtasy/diaster_distilbert_base_uncased/commit/05ecf2b24caadbb7dde7f67307b4bb2f93970fec'

In [25]:
# Build an inference pipeline from the Hub repository (weights are downloaded,
# not taken from the in-memory `trainer`).
classifier = pipeline(
    task="sentiment-analysis",
    model="nixtasy/diaster_distilbert_base_uncased",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [26]:
# Smoke test: classify one ad-hoc string and display the predicted label and score.
classifier("Fuck off!")

[{'label': 'NEGATIVE', 'score': 0.8725696206092834}]

In [27]:
# Wrap the competition test frame as a Dataset and drop the id / keyword /
# location columns, as was done for the training data.
submission = (
    Dataset.from_pandas(submission_df, split="test", preserve_index=False)
    .remove_columns(["id", "keyword", "location"])
)

In [28]:
# Apply the same cleaning steps, in the same order, that were applied to the training data.
for cleaner in (remove_URL, remove_emoji, remove_html, remove_punct):
    submission = submission.map(cleaner, num_proc=4)

Map (num_proc=4):   0%|          | 0/3263 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3263 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3263 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3263 [00:00<?, ? examples/s]

In [32]:
# KeyDataset (imported in the top import cell) feeds only the "text" column of
# the dataset to the pipeline; list() collects the per-example results.
predications =  list(classifier(KeyDataset(submission, "text")))

In [35]:
# Translate each predicted label string back to its integer class id.
results = list(map(lambda pred: label2id[pred['label']], predications))

In [37]:
# Keep the id column of the competition test file to pair with the predictions.
ids = submission_df['id']

In [40]:
# Two-column table: the test-set id and the predicted class for that row.
data = dict(id=list(ids), target=results)
df_submissions = pd.DataFrame(data)

In [41]:
# Preview the first rows of the submission table.
df_submissions.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1


In [44]:
# Write the id/target table to CSV; index=False leaves out the DataFrame index column.
df_submissions.to_csv("final_results.csv", index=False)