In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import string
import re

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split

from wordcloud import WordCloud, STOPWORDS
from collections import Counter, defaultdict
from PIL import Image

import random
import warnings
import time
import datetime

from matplotlib.ticker import MaxNLocator
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches


import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import pytorch_lightning as pl

from google.colab import files

# English stop words from NLTK, kept as a set
stop = set(stopwords.words('english'))

# Plot styling: apply the matplotlib style sheet first, then seaborn's settings
plt.style.use('fivethirtyeight')
sns.set(font_scale=1.5)

# Let pandas show up to 250 columns / rows when displaying a frame
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 250)

# Silence all Python warnings from here on
warnings.filterwarnings('ignore')

# Seed every random number generator used in this notebook with the same value
seed_val = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# read data
# Both CSVs contain an 'Unnamed: 0' column (presumably a saved index — confirm),
# which is dropped right after loading.
train = pd.read_csv('data/train_clean.csv').drop(columns='Unnamed: 0')
test = pd.read_csv('data/test_clean.csv').drop(columns='Unnamed: 0')

In [10]:
# Show five randomly sampled rows from the training frame, then from the test frame
display(train.sample(5), test.sample(5))

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause unimag...,1
2227,3185,deluge,,The camping things I do for GISHWHES Just got ...,0
5448,7769,police,UK,DT georgegalloway RT Galloway4Mayor ÛÏThe CoL...,1
132,191,aftershock,,Aftershock back to school kick off was great I...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


Unnamed: 0,id,keyword,location,text
142,449,armageddon,1996???????????,UNIVERSAL ORDER OF ARMAGEDDON
2672,8915,snowstorm,Los Angeles,BigBangCBS wowokumthat was like ice water bliz...
2605,8682,sinkhole,"New York, New York",The sinkhole that ate Brooklyn
2515,8381,ruin,"Monroe, OH",Dont ruin a good today by thinking about a bad...
958,3187,deluge,"West Powelton, Philadelphia",Im havin previous life flashbacks of when i li...


In [11]:
# Training targets as an array, and how many of them there are
labels = train['target'].values
idx = labels.shape[0]

# The 'text' column of train followed by test, as a single array
combined = pd.concat([train, test])['text'].values

In [12]:
# Lower-casing WordPiece tokenizer for the 'bert-large-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-large-uncased',
    do_lower_case=True,
)

In [13]:
# find max length
# Longest token-id sequence (special tokens included) over all train + test texts;
# falls back to 0 when there are no texts.
max_len = max(
    (len(tokenizer.encode(text, add_special_tokens=True)) for text in combined),
    default=0,
)
print('Max sentence length: ', max_len)

Max sentence length:  55


In [14]:
# format each sample
# Every training row becomes a dict of tensors: the tokenizer outputs, padded /
# truncated to `max_length`, plus the row's target under the 'labels' key.
max_length = max_len
dataset_for_loader = []
for sample_text, sample_target in zip(train['text'], train['target']):
    features = tokenizer(
        sample_text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    features['labels'] = sample_target  # add labels
    dataset_for_loader.append(
        {name: torch.tensor(value) for name, value in features.items()}
    )

In [15]:
# split dataset
# Split the encoded samples and the source DataFrame in ONE call so that both
# are guaranteed to receive the same row partition (previously two separate
# calls only lined up because they shared the same seed and length).
# 30% of the rows go to validation.
dataset_train, dataset_val, df_train, df_val = train_test_split(
    dataset_for_loader, train, random_state=42, test_size=0.3
)

# make dataloader
# Training batches are reshuffled each epoch; validation keeps its order.
dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=128)

In [16]:
class BertForSequenceClassification_pl(pl.LightningModule):
    """Lightning module wrapping a Hugging Face ``BertForSequenceClassification``.

    Args:
        model_name: name of the Transformer model passed to ``from_pretrained``.
        num_labels: number of labels for the classification head.
        lr: learning rate given to the Adam optimizer.
    """

    def __init__(self, model_name, num_labels, lr):
        super().__init__()

        # Store the constructor arguments; `lr` is read back later through
        # `self.hparams.lr` in `configure_optimizers`.
        self.save_hyperparameters()

        # Load the BERT model with a classification head of `num_labels` outputs.
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        )

    def _batch_loss(self, batch):
        """Run the wrapped model on ``batch`` and return the loss it reports."""
        return self.bert_sc(**batch).loss

    def training_step(self, batch, batch_idx):
        """Log the batch loss as 'train_loss' and return it."""
        train_loss = self._batch_loss(batch)
        self.log('train_loss', train_loss)
        return train_loss

    def validation_step(self, batch, batch_idx):
        """Log the batch loss as 'val_loss'; nothing is returned."""
        self.log('val_loss', self._batch_loss(batch))

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with the stored learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

In [21]:
# set condition for saving model weights during training:
# keep only the single checkpoint (weights only) with the lowest 'val_loss',
# written under 'model/'.
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/',
)

# set learning method
# The `gpus=` argument is deprecated in recent PyTorch Lightning releases;
# the device is selected with `accelerator` / `devices` instead.
# For a quick CPU-only run use accelerator='cpu' and max_epochs=1.
trainer = pl.Trainer(
    accelerator='gpu',
    devices=1,
    max_epochs=10,
    callbacks=[checkpoint],
)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
# Build the Lightning module around the pretrained 'bert-large-uncased'
# checkpoint, with two output labels and a learning rate of 1e-5.
model = BertForSequenceClassification_pl(
    model_name='bert-large-uncased',
    num_labels=2,
    lr=1e-5,
)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [23]:
# Fine-tune: train on `dataloader_train`, validate on `dataloader_val`
trainer.fit(
    model,
    train_dataloaders=dataloader_train,
    val_dataloaders=dataloader_val,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                          | Params
----------------------------------------------------------
0 | bert_sc | BertForSequenceClassification | 335 M 
----------------------------------------------------------
335 M     Trainable params
0         Non-trainable params
335 M     Total params
1,340.576 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [30]:
# Inputs and ground truth for scoring the training split
y_trn = df_train['target']
text_list_trn = df_train['text'].tolist()

# Restore the best checkpoint recorded by the ModelCheckpoint callback and take
# the wrapped BERT classifier, placed on the CPU.
# (To run on the GPU use model.bert_sc.cuda() and move the encoded inputs too.)
best_model_path = checkpoint.best_model_path
model = BertForSequenceClassification_pl.load_from_checkpoint(best_model_path)
bert_sc = model.bert_sc.cpu()

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [31]:
# encoding data
# Tokenise all training texts in one call, padded to the longest sequence,
# returned as PyTorch tensors.
tokenized_trn = tokenizer(text_list_trn, padding='longest', return_tensors='pt')

# Keep every tensor on the CPU (call .cuda() on each instead for GPU inference)
encoding = {name: tensor.cpu() for name, tensor in tokenized_trn.items()}

In [32]:
# get predicted labels
# Score in mini-batches: pushing the whole split through the model in a single
# forward pass keeps the activations of every sample in memory at once.
eval_batch_size = 128
n_samples = encoding['input_ids'].shape[0]
bert_sc.eval()  # defensive: make sure dropout is disabled for inference
logit_batches = []
with torch.no_grad():
    for start in range(0, n_samples, eval_batch_size):
        batch = {k: v[start:start + eval_batch_size] for k, v in encoding.items()}
        logit_batches.append(bert_sc(**batch).logits)

# Predicted class = index of the largest logit for each sample
labels_predicted_tr = torch.cat(logit_batches).argmax(-1)

In [53]:
# calculate accuracy etc. for training data
y_pred_trn = labels_predicted_tr.tolist()
print("train confusion_matrix :\n", confusion_matrix(y_trn, y_pred_trn))
print("train classification_report :\n", classification_report(y_trn, y_pred_trn))

train confusion_matrix :
 [[2871  153]
 [ 443 1862]]
train classification_report :
               precision    recall  f1-score   support

           0       0.87      0.95      0.91      3024
           1       0.92      0.81      0.86      2305

    accuracy                           0.89      5329
   macro avg       0.90      0.88      0.88      5329
weighted avg       0.89      0.89      0.89      5329



In [34]:
# Inputs and ground truth for scoring the validation split
y_val = df_val['target']
text_list_val = df_val['text'].tolist()

# Restore the best checkpoint recorded by the ModelCheckpoint callback and take
# the wrapped BERT classifier, placed on the CPU.
# (To run on the GPU use model.bert_sc.cuda() and move the encoded inputs too.)
best_model_path = checkpoint.best_model_path
model = BertForSequenceClassification_pl.load_from_checkpoint(best_model_path)
bert_sc = model.bert_sc.cpu()

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [35]:
# encoding data
# Tokenise all validation texts in one call, padded to the longest sequence,
# returned as PyTorch tensors.
tokenized_val = tokenizer(text_list_val, padding='longest', return_tensors='pt')

# Keep every tensor on the CPU (call .cuda() on each instead for GPU inference)
encoding = {name: tensor.cpu() for name, tensor in tokenized_val.items()}

In [37]:
# get predicted labels
# Score in mini-batches: pushing the whole split through the model in a single
# forward pass keeps the activations of every sample in memory at once.
eval_batch_size = 128
n_samples = encoding['input_ids'].shape[0]
bert_sc.eval()  # defensive: make sure dropout is disabled for inference
logit_batches = []
with torch.no_grad():
    for start in range(0, n_samples, eval_batch_size):
        batch = {k: v[start:start + eval_batch_size] for k, v in encoding.items()}
        logit_batches.append(bert_sc(**batch).logits)

# Predicted class = index of the largest logit for each sample
labels_predicted_val = torch.cat(logit_batches).argmax(-1)

In [57]:
# calculate accuracy etc. for validation data
y_pred_val = labels_predicted_val.tolist()
print("validation confusion_matrix :\n", confusion_matrix(y_val, y_pred_val))
print("validation classification_report :\n", classification_report(y_val, y_pred_val))

validation confusion_matrix :
 [[1174  144]
 [ 240  726]]
validation classification_report :
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      1318
           1       0.83      0.75      0.79       966

    accuracy                           0.83      2284
   macro avg       0.83      0.82      0.83      2284
weighted avg       0.83      0.83      0.83      2284



In [40]:
# Inputs for predicting on the test data (no ground-truth column is read here)
text_list_test = test['text'].tolist()

# Restore the best checkpoint recorded by the ModelCheckpoint callback and take
# the wrapped BERT classifier, placed on the CPU.
# (To run on the GPU use model.bert_sc.cuda() and move the encoded inputs too.)
best_model_path = checkpoint.best_model_path
model = BertForSequenceClassification_pl.load_from_checkpoint(best_model_path)
bert_sc = model.bert_sc.cpu()

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [41]:
# encoding data
# Tokenise all test texts in one call, padded to the longest sequence,
# returned as PyTorch tensors.
tokenized_test = tokenizer(text_list_test, padding='longest', return_tensors='pt')

# Keep every tensor on the CPU (call .cuda() on each instead for GPU inference)
encoding = {name: tensor.cpu() for name, tensor in tokenized_test.items()}

In [42]:
# get predicted labels
# Score in mini-batches: pushing the whole split through the model in a single
# forward pass keeps the activations of every sample in memory at once.
eval_batch_size = 128
n_samples = encoding['input_ids'].shape[0]
bert_sc.eval()  # defensive: make sure dropout is disabled for inference
logit_batches = []
with torch.no_grad():
    for start in range(0, n_samples, eval_batch_size):
        batch = {k: v[start:start + eval_batch_size] for k, v in encoding.items()}
        logit_batches.append(bert_sc(**batch).logits)

# Predicted class = index of the largest logit for each sample
labels_predicted_test = torch.cat(logit_batches).argmax(-1)

In [89]:
# Attach the predicted class to each frame as a 'prediction' column
df_train = df_train.assign(prediction=labels_predicted_tr.tolist())
df_val = df_val.assign(prediction=labels_predicted_val.tolist())
test = test.assign(prediction=labels_predicted_test.tolist())

In [90]:
# Order each frame by ascending 'id'
df_train_sorted = df_train.sort_values(by='id')
df_val_sorted = df_val.sort_values(by='id')
# NOTE(review): the original comment marks this sort as "not necessary" —
# presumably `test` is already in id order; confirm.
test_sorted = test.sort_values(by='id')

In [102]:
# Write the sorted training predictions to CSV (UTF-8 with BOM, no index column)
# and hand the file to the Colab download helper.
train_csv_path = 'bert_train.csv'
df_train_sorted.to_csv(train_csv_path, index=False, encoding='utf-8-sig')
files.download(train_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [103]:
# Write the sorted validation predictions to CSV (UTF-8 with BOM, no index column)
# and hand the file to the Colab download helper.
val_csv_path = 'bert_validation.csv'
df_val_sorted.to_csv(val_csv_path, index=False, encoding='utf-8-sig')
files.download(val_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [104]:
# Write the sorted test predictions to CSV (UTF-8 with BOM, no index column)
# and hand the file to the Colab download helper.
test_csv_path = 'bert_test.csv'
test_sorted.to_csv(test_csv_path, index=False, encoding='utf-8-sig')
files.download(test_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>