In [1]:
import warnings

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

pd.options.display.max_colwidth = 255

from pandas_profiling import ProfileReport
from tqdm.notebook import tqdm

import transformers
from tokenizers import BertWordPieceTokenizer

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

warnings.simplefilter("ignore")
##
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv


In [2]:
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"

BATCH_SIZE = 64
AUTO = tf.data.experimental.AUTOTUNE
TEST_SIZE = 0.1
RANDOM_STATE = 42
MAX_SEQ_LEN = 150
PRETRAINED_MODEL = 'bert-base-uncased'
N_EPOCHS = 10

In [3]:
data = pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv", 
                   header=None,
                   encoding=DATASET_ENCODING)
data.columns = DATASET_COLUMNS
data = data[['text', 'target']]
data['target'] = data['target'].map(lambda x: 1.0 if x == 4 else (0. if x == 0 else np.NaN))

In [4]:
data.head()

Unnamed: 0,text,target
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",0.0
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,0.0
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,0.0
3,my whole body feels itchy and like its on fire,0.0
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",0.0


In [5]:
data["contains_mention"] = data.text.str.contains("@")
data["contains_hashtag"] = data.text.str.contains("#")
data["contains_link"] = data.text.str.contains("http")

In [6]:

# ProfileReport(data[['target', 'contains_mention', 'contains_hashtag', 'contains_link']])

In [7]:
data['n_words'] = data['text'].map(lambda x: len(x.split(' ')))
data['n_chars'] = data['text'].map(len)

In [8]:
# ProfileReport(data[['target', 'n_words', 'n_chars']])

H:

- train as is
- drop mentions and hashtags
- replace mentions and hashtags with aux words
- replace mentions and hashtags generated by bert

# Modeling

## Helpers

### Text cleansing

In [9]:
import re

mention_regex = re.compile("\@([a-zA-Z1-9]+)", flags=re.IGNORECASE)
hashtag_regex = re.compile("\#([a-zA-Z1-9]+)", flags=re.IGNORECASE)
link_regex = re.compile("http(s?):\/\/[^\s]+", flags=re.IGNORECASE)


def remove_mentions(s: str) -> str:
    return mention_regex.sub(" ", s)

def remove_hashtags(s: str) -> str:
    return hashtag_regex.sub(" ", s)

def remove_links(s: str) -> str:
    return link_regex.sub(" ", s)

In [10]:
def clean_text(s: str) -> str:
    s = remove_mentions(s)
    s = remove_hashtags(s)
    s = remove_links(s)
    
    s = s.lower() # task specific
    
    return s

### BERT processing

In [11]:
def load_tokenizer(model_name: str) -> BertWordPieceTokenizer:
    tokenizer = transformers.BertTokenizer.from_pretrained(model_name)
    
    save_path = f'/kaggle/working/{model_name}/'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    tokenizer.save_pretrained(save_path)
    
    piece_tokenizer = BertWordPieceTokenizer(f'/kaggle/working/{model_name}/vocab.txt', lowercase=False)
    return piece_tokenizer

In [12]:
def prepare_texts(texts:pd.Series, tokenizer: BertWordPieceTokenizer, chunk_size: int=256, max_length:int=512):
    tokenizer.enable_truncation(max_length=max_length)
    tokenizer.enable_padding(max_length=max_length)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

## Data preparation

In [13]:
tokenizer = load_tokenizer(PRETRAINED_MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [14]:
clean_texts = data['text'].map(clean_text)

In [15]:
X = prepare_texts(clean_texts, tokenizer, max_length=MAX_SEQ_LEN)
y = data['target'].values

HBox(children=(FloatProgress(value=0.0, max=6250.0), HTML(value='')))




In [16]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)

In [17]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_val, y_val))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)



### Model definition

In [18]:
def classifier_model(x):
    x = tf.keras.layers.Dropout(0.35)(x)
    x = Dense(1, activation='sigmoid')(x)
    return x

In [19]:
def load_pretrained_encoder_model(model_name):
    model = transformers.TFBertModel.from_pretrained(model_name)
    
    return model

In [20]:
def build_encoder_classifier_model(encoder, classifier, loss='binary_crossentropy', max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = encoder(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    
    out = classifier(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=3e-5), loss=loss, metrics=[tf.keras.metrics.AUC(), tf.keras.metrics.BinaryAccuracy()])
    
    return model

In [21]:
encoder = load_pretrained_encoder_model(PRETRAINED_MODEL)
model = build_encoder_classifier_model(encoder, classifier_model, max_len=MAX_SEQ_LEN)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 150)]             0         
_________________________________________________________________
tf_bert_model (TFBertModel)  ((None, 150, 768), (None, 109482240 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dropout_37 (Dropout)         (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 769       
Total params: 109,483,009
Trainable params: 109,483,009
Non-trainable params: 0
_________________________________________________________________


In [22]:
train_history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    steps_per_epoch=250,
    validation_steps=75,
    epochs=N_EPOCHS
)

Train for 250 steps, validate for 75 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
model.save('./sent_classifier.model')

In [24]:
!ls

__notebook__.ipynb  bert-base-uncased  sent_classifier.model


In [25]:
!tar -czvf sent_classifier_model.tar.gz sent_classifier.model/

sent_classifier.model/
sent_classifier.model/variables/
sent_classifier.model/variables/variables.index
sent_classifier.model/variables/variables.data-00000-of-00002
sent_classifier.model/variables/variables.data-00001-of-00002
sent_classifier.model/assets/
sent_classifier.model/saved_model.pb


In [26]:
!rm -r sent_classifier.model/