In [None]:
import torch
import nltk
import re
import pandas as pd
import numpy
from transformers import BertTokenizer, BertForSequenceClassification, BertModel, BertConfig
import preprocessor

In [None]:
# Read the training and test splits from the CSV files in the working directory.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('train.csv', 'test.csv')
)

In [None]:
# Explore the raw data: preview the rows, then list the distinct `location` and `keyword` values.

In [None]:
# Preview the first five rows of the freshly loaded training data.
train.head(n=5)

In [None]:
# Distinct values found in the `location` column.
train['location'].unique()

In [None]:
# Distinct values found in the `keyword` column.
train['keyword'].unique()

In [None]:
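# Optionally drop the columns BERT won't need. Note that DataFrame.drop returns a
# new frame rather than modifying in place, so the result has to be assigned back:
# train = train.drop(['keyword', 'location'], axis=1)
# test = test.drop(['keyword', 'location'], axis=1)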

In [None]:
# clean the raw tweet text (lowercase, strip numbers, run the preprocessor module)
def preprocess_tweets(df):
    """Lowercase and clean the ``text`` column of a tweet DataFrame.

    Each tweet is lowercased, stripped of whitespace-delimited numbers and
    then passed through ``preprocessor.clean`` with the URL, EMOJI, SMILEY,
    MENTION and NUMBER options enabled. The hashtag option is deliberately
    left out of that list.

    Args:
        df: DataFrame with a ``text`` column whose values are strings.

    Returns:
        The same DataFrame object that was passed in; its ``text`` column is
        overwritten in place with the cleaned strings.
    """
    # options - leave hashtags
    preprocessor.set_options(preprocessor.OPT.URL,
                             preprocessor.OPT.EMOJI,
                             preprocessor.OPT.SMILEY,
                             preprocessor.OPT.MENTION,
                             preprocessor.OPT.NUMBER)

    # replace numbers, as preprocessor module does not work with comma-separated numbers.
    # The separator group repeats so that "1,000,000" is removed as a whole and
    # is optional so that single-digit numbers are matched as well. The leading
    # whitespace is consumed together with the number, so no double space is left.
    number_pattern = re.compile(r"(^|\s)-?\d+(?:[.,]\d+)*")

    def clean_tweet(text):
        # One pass per tweet: lowercase -> drop numbers -> library clean-up.
        without_numbers = number_pattern.sub("", text.lower())
        # apply preprocessor module to remove redundant information
        return preprocessor.clean(without_numbers)

    df['text'] = df['text'].apply(clean_tweet)
    return df

In [None]:
# Run the cleaning step over both splits.
train, test = map(preprocess_tweets, (train, test))

In [None]:
# Preview the training data after the text cleaning step.
train.head(n=5)

## Loading BERT from Hugging Face

Load the pretrained BERT tokenizer and model from Hugging Face.

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer and a sequence-classification
# model built from the same checkpoint, configured with two output labels
# (num_labels=2). Nothing is tokenized in this cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

In [None]:
# Put the model in training mode. Because this is the last expression in the
# cell, the call's return value is also rendered as the cell output.
# NOTE(review): presumably needed because from_pretrained hands the model back
# in evaluation mode - confirm against the Transformers documentation.
model.train()

In [None]:
# Save the model to a directory one level above the working directory.
# NOTE(review): no training step is visible before this point, so this looks
# like it stores the freshly loaded (not yet fine-tuned) model - confirm that
# is intended. The tokenizer is not saved here.
model.save_pretrained("../bert-data-transformers")

In [None]:
# Replace each cleaned tweet string with the output of the tokenizer's
# `tokenize` method, for both splits.
train['text'], test['text'] = (
    split['text'].apply(tokenizer.tokenize) for split in (train, test)
)

In [None]:
# Preview the training data now that `text` holds the tokenizer's token output.
train.head(n=5)

In [None]:
# Feed the already-tokenized `text` values through the tokenizer's `encode`
# method, again overwriting the column for both splits.
train['text'], test['text'] = (
    split['text'].apply(tokenizer.encode) for split in (train, test)
)

In [None]:
# Preview the training data after the encoding step.
train.head(n=5)