In [None]:
import pandas as pd 

In [None]:
data = pd.read_csv('tesla_tweets.csv', sep='|')

In [None]:
data.head()

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
model = BertModel.from_pretrained("bert-base-uncased")

In [None]:
sample = "We are going to implement sentiment analysis with BERT!"

In [None]:
tokens = tokenizer.tokenize(sample)
print(tokens)

In [None]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

In [None]:
tokenizer.pad_token, tokenizer.pad_token_id

In [None]:
tokenizer.unk_token, tokenizer.unk_token_id

In [None]:
tokenizer.cls_token, tokenizer.cls_token_id

In [None]:
tokenizer.sep_token, tokenizer.sep_token_id

`tokenizer.encode_plus` adds `[CLS]` at the beginning and `[SEP]` at the end of the sentence.

In [None]:
# Wrap the sample in the [CLS] / [SEP] markers by hand, then tokenize the
# result and map the tokens to their vocabulary ids.
manual_sample = f'[CLS]{sample}[SEP]'
tokens = tokenizer.tokenize(manual_sample)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Show the raw string, its tokens and their ids, one per line.
for shown in (manual_sample, tokens, token_ids):
    print(shown)

In [None]:
# Encode the sample to exactly 24 positions (truncating or padding as needed),
# with special tokens and an attention mask, returned as TensorFlow tensors.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
encoding = tokenizer.encode_plus(
    sample,
    max_length=24,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='tf',
)

In [None]:
encoding 

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import tensorflow as tf
from transformers import DistilBertTokenizerFast

In [None]:
df = pd.read_csv('train.csv', sep='\t')
df.head()

In [None]:
# Drop the id columns without mutating in place; `errors='ignore'` keeps the
# cell re-runnable after the columns are already gone.
df = df.drop(columns=['PhraseId', 'SentenceId'], errors='ignore')
df.head()

In [None]:
df.tail(20)

In [None]:
def _to_three_classes(score):
    """Collapse a sentiment score: > 2 becomes 2, exactly 2 becomes 1, else 0."""
    if score > 2:
        return 2
    if score == 2:
        return 1
    return 0


# NOTE: this overwrites the column, so running the cell a second time remaps
# the already-collapsed labels.
df['Sentiment'] = df['Sentiment'].apply(_to_three_classes)
df.tail(20)

In [None]:
# Number of whitespace-separated words in each phrase (not the tokenizer's
# token count).
seqlen = df['Phrase'].apply(lambda x: len(x.split()))

In [None]:
seqlen

In [None]:
# Distribution of phrase lengths (in words). `sns.distplot` is deprecated;
# `histplot` with `kde=True` draws the equivalent histogram + density curve.
sns.set_style('darkgrid')
plt.figure(figsize=(16, 10))
sns.histplot(seqlen, kde=True)

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
SEQLEN = 30

In [None]:
def encoder(sentence):
    """Tokenize one sentence into fixed-length ids and an attention mask.

    Uses the notebook-level `tokenizer`; the sentence is truncated or padded
    to `SEQLEN` positions and special tokens are added.

    Returns:
        A `(input_ids, attention_mask)` pair taken from the tokenizer output.
    """
    options = dict(
        max_length=SEQLEN,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=False,
    )
    encoded = tokenizer.encode_plus(sentence, **options)
    return encoded['input_ids'], encoded['attention_mask']

In [None]:
Xraw = df['Phrase'].values
Xraw[:5]

In [None]:
# Encode every phrase and collect ids and attention masks in parallel lists.
input_ids = []
attention_mask = []

for item in Xraw:
    # The original bound the ids to `input_i` but appended `input_id`,
    # which raised NameError; use a single consistent name.
    ids, mask = encoder(item)
    input_ids.append(ids)
    attention_mask.append(mask)

In [None]:
Xids = np.asarray(input_ids)
Xmask = np.asarray(attention_mask)

In [None]:
Xids.shape

In [None]:
Xmask.shape

In [None]:
Xmask

In [None]:
arr = df['Sentiment'].values

In [None]:
labels = np.zeros((arr.size, arr.max()+1))

In [None]:
labels[np.arange(arr.size), arr] = 1

In [None]:
labels 

In [None]:
# Persist the three arrays so the training section can reload them.
for filename, array in (('xids.npy', Xids), ('xmask.npy', Xmask), ('labels.npy', labels)):
    with open(filename, 'wb') as f:
        np.save(f, array)

In [None]:
import numpy as np
import tensorflow as tf
from transformers import TFDistilBertModel, DistilBertConfig

In [None]:
tf.config.experimental.list_physical_devices('GPU')

In [None]:
with open('xids.npy', 'rb') as f:
    Xids = np.load(f)

In [None]:
Xids

In [None]:
with open('xmask.npy', 'rb') as f:
    Xmask = np.load(f)
with open('labels.npy', 'rb') as f:
    labels = np.load(f)

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

In [None]:
for item in dataset.take(1):
    print(item)

In [None]:
def map_func(input_ids, mask, label):
    """Repackage one dataset element as a `(features, label)` pair.

    Returns:
        A 2-tuple: a dict holding `input_ids` and `mask` under the keys
        'input_ids' and 'attention_mask', followed by `label` unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=mask)
    return features, label

In [None]:
dataset = dataset.map(map_func)

In [None]:
for item in dataset.take(1):
    print(item)

In [None]:
# Shuffle once and keep that order. With the default per-epoch reshuffle, the
# take()/skip() split further down would draw different batches every epoch,
# leaking validation examples into training.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(64)

In [None]:
size = len(dataset)
size

In [None]:
# First 90% of the batches go to training, the remainder to validation.
split = 0.9
n_train_batches = round(size*split)
train = dataset.take(n_train_batches)
val = dataset.skip(n_train_batches)

In [None]:
split*size

In [None]:
from transformers import TFDistilBertForSequenceClassification
config = DistilBertConfig(num_labels=3)

In [None]:
# Load the cased checkpoint: the ids in xids.npy were produced with the
# 'distilbert-base-cased' tokenizer, so the uncased vocabulary would not match.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', config=config)
model.summary()

In [None]:
model.layers[0].trainable = False  # Freeze the DistilBERT layer (index 0) so it is not updated in training
model.summary()

In [None]:
# `tf.keras.optimizers` (plural) is the correct module name.
optimizer = tf.keras.optimizers.Adam(0.02)
# The sequence-classification head outputs raw logits, not probabilities,
# so the loss must apply the softmax itself.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [None]:
model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [None]:
history = model.fit(
    train,
    validation_data = val,
    epochs = 20
)

## Building NN models 

In [None]:
import numpy as np
import tensorflow as tf

In [None]:
# Reload the token ids, attention masks and one-hot labels from the .npy
# files in the working directory.
with open('xids.npy', 'rb') as fp:
    Xids = np.load(fp)
with open('xmask.npy', 'rb') as fp:
    Xmask = np.load(fp)
with open('labels.npy', 'rb') as fp:
    labels = np.load(fp)

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

In [None]:
def map_func(input_ids, masks, labels):
    """Turn an `(ids, mask, labels)` element into `(features, labels)`.

    Returns:
        A 2-tuple: a dict with keys 'input_ids' and 'attention_mask' holding
        the first two arguments, followed by `labels` unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=masks)
    return features, labels

In [None]:
dataset = dataset.map(map_func)

In [None]:
BATCH_SIZE = 32
# Keep one fixed shuffle order. Reshuffling every epoch would let the later
# take()/skip() split mix validation batches into the training set.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(BATCH_SIZE)

In [None]:
SPLIT = 0.8
# Number of batches. The original referenced the misspelled name `dateset`
# (NameError) and materialised every batch just to count them.
DS_LEN = len(dataset)
n_train_batches = round(DS_LEN*SPLIT)
train = dataset.take(n_train_batches)
val = dataset.skip(n_train_batches)

In [None]:
from transformers import TFAutoModel

In [None]:
bert = TFAutoModel.from_pretrained('bert-base-cased')

In [None]:
# Take the sequence length from the loaded ids so the model inputs match the
# data; xids.npy was written with SEQLEN = 30 earlier in this notebook, not
# the 50 that was hard-coded here.
seq_len = Xids.shape[1]

input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

# First element of the BERT output, used here as the per-token embeddings.
embeddings = bert(input_ids, attention_mask=mask)[0]

# Dense classification head on top of max-pooled embeddings.
X = tf.keras.layers.GlobalMaxPool1D()(embeddings)
X = tf.keras.layers.BatchNormalization()(X)
X = tf.keras.layers.Dense(128, activation='relu')(X)
X = tf.keras.layers.Dropout(0.1)(X)
X = tf.keras.layers.Dense(32, activation='relu')(X)
# `activation` is the Dense keyword; the original `activate=` raised an error.
y = tf.keras.layers.Dense(3, activation='softmax', name='outputs')(X)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

In [None]:
model.layers[2].trainable = False  # Freeze BERT: here it sits at index 2, after the two Input layers

In [None]:
model.summary()

In [None]:
# Adam at learning rate 0.01, categorical cross-entropy on the softmax
# outputs, and categorical accuracy reported under the name 'accuracy'.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy(name='accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [None]:
history = model.fit(
    train,
    validation_data=val,
    epochs=140
)

In [None]:
model.save('../models/nn140')

In [None]:
# Imported here because this section only imports numpy and tensorflow, and
# `mpl` is not imported anywhere in the notebook.
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('darkgrid')
mpl.rcParams.update({'font.size': 18})

epochs = list(range(len(history.history['accuracy'])))

# One line per tracked metric: (history key, legend label, colour).
curves = (
    ('accuracy', 'acc', '#08c6ab'),
    ('val_accuracy', 'val-acc', '#212b38'),
    ('loss', 'loss', '#726eff'),
    ('val_loss', 'val-loss', '#37465b'),
)

plt.figure(figsize=(16, 10))
for key, label, color in curves:
    sns.lineplot(x=epochs, y=history.history[key], label=label, color=color)
plt.tight_layout()
plt.savefig('../figures/nn140-metrics.png')

## Building ConvNet Models

In [None]:
# Take the sequence length from the loaded ids so the model inputs match the
# data; xids.npy was written with SEQLEN = 30 earlier in this notebook, not
# the 50 that was hard-coded here.
seq_len = Xids.shape[1]

input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

# First element of the BERT output, used here as the per-token embeddings.
embeddings = bert(input_ids, attention_mask=mask)[0]

# Three stacked 1-D convolutions with growing kernel sizes (2, 3, 4).
X = tf.keras.layers.Conv1D(filters=50, kernel_size=2, padding='valid', activation='relu')(embeddings)
X = tf.keras.layers.Conv1D(filters=50, kernel_size=3, padding='valid', activation='relu')(X)
X = tf.keras.layers.Conv1D(filters=50, kernel_size=4, padding='valid', activation='relu')(X)

X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.BatchNormalization()(X)
X = tf.keras.layers.Dense(128, activation='relu')(X)
X = tf.keras.layers.Dropout(0.1)(X)
X = tf.keras.layers.Dense(32, activation='relu')(X)
# `activation` is the Dense keyword; the original `activate=` raised an error.
y = tf.keras.layers.Dense(3, activation='softmax', name='outputs')(X)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

## Building an LSTM model

In [None]:
# Take the sequence length from the loaded ids so the model inputs match the
# data; xids.npy was written with SEQLEN = 30 earlier in this notebook, not
# the 50 that was hard-coded here.
seq_len = Xids.shape[1]

input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

# First element of the BERT output, used here as the per-token embeddings.
embeddings = bert(input_ids, attention_mask=mask)[0]

# A single LSTM reduces the token sequence to one 64-dimensional vector.
X = tf.keras.layers.LSTM(64)(embeddings)
X = tf.keras.layers.BatchNormalization()(X)
X = tf.keras.layers.Dense(128, activation='relu')(X)
X = tf.keras.layers.Dropout(0.1)(X)
X = tf.keras.layers.Dense(32, activation='relu')(X)
# `activation` is the Dense keyword; the original `activate=` raised an error.
y = tf.keras.layers.Dense(3, activation='softmax', name='outputs')(X)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

## Final Model Walkthrough

In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi

In [None]:
api = KaggleApi()
api.authenticate()

In [None]:
api.dataset_download_file('kazanova/sentiment140', file_name='training.1600000.processed.noemoticon.csv', path='./')

In [None]:
import zipfile 

In [None]:
# Unpack the downloaded archive into the current directory.
archive_path = 'training.1600000.processed.noemoticon.csv.zip'
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall('./')

In [None]:
import pandas as pd 

In [None]:
# Column names are passed explicitly via `names=`, so pandas does not treat
# the first row as a header.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    names=column_names,
)

In [None]:
df.head()

In [None]:
df['target'].unique()

In [None]:
# Keep only the label and the tweet text. `.copy()` makes the result an
# independent frame, so the later assignment to df['target'] does not trigger
# SettingWithCopyWarning.
df = df[['target', 'text']].copy()
df.head()

In [None]:
df.groupby('target')['text'].count()

In [None]:
df[df['target'] == 4].head()

In [None]:
seqlen = df['text'].apply(lambda x: len(x.split()))

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Distribution of tweet lengths (in words). `sns.distplot` is deprecated;
# `histplot` with `kde=True` draws the equivalent histogram + density curve.
sns.set_style('darkgrid')
plt.figure(figsize=(16, 10))
sns.histplot(seqlen, kde=True)

In [None]:
SEQLEN = 32

In [None]:
from transformers import AutoTokenizer, TFAutoModel

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
def tokenize(sentence):
    """Encode one sentence to `SEQLEN` positions as TensorFlow tensors.

    Uses the notebook-level `tokenizer`; the sentence is truncated or padded
    to `SEQLEN` and special tokens are added.

    Returns:
        A `(input_ids, attention_mask)` pair taken from the tokenizer output.
    """
    options = dict(
        max_length=SEQLEN,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='tf',
    )
    encoded = tokenizer.encode_plus(sentence, **options)
    return encoded['input_ids'], encoded['attention_mask']

In [None]:
import numpy as np

In [None]:
# Pre-allocate one row per tweet. Token ids and mask values are integers, so
# int32 matches the model's Input dtype and halves the memory of the default
# float64 buffers.
Xids = np.zeros((len(df), SEQLEN), dtype=np.int32)
Xmask = np.zeros((len(df), SEQLEN), dtype=np.int32)

In [None]:
Xids.shape

In [None]:
# Tokenize every tweet and write its ids / mask into row i of the buffers.
for i, sentence in enumerate(df['text']):
    Xids[i, :], Xmask[i, :] = tokenize(sentence)
    # Progress marker every 100000 rows.
    if i % 100000 == 0:
        print(i)

In [None]:
Xids

In [None]:
Xmask

In [None]:
# Map target 4 to 1 and every other value to 0.
# NOTE: overwrites the column, so re-running this cell turns every label into 0.
df['target'] = df['target'].apply(lambda x: 1 if x == 4 else 0)

In [None]:
arr = df['target'].values

In [None]:
arr

In [None]:
labels = np.zeros((arr.size, arr.max()+1))

In [None]:
labels[np.arange(arr.size), arr] = 1

In [None]:
labels 

In [None]:
# Persist the encoded tweets and labels so they can be reloaded after the
# in-memory copies are deleted.
outputs = (
    ('twitter-xids.npy', Xids),
    ('twitter-xmask.npy', Xmask),
    ('twitter-labels.npy', labels),
)
for filename, array in outputs:
    with open(filename, 'wb') as f:
        np.save(f, array)

In [None]:
del df, Xids, Xmask, labels 

In [None]:
import tensorflow as tf

In [None]:
# Reload the encoded tweets, attention masks and one-hot labels from disk.
with open('twitter-xids.npy', 'rb') as f:
    Xids = np.load(f)
with open('twitter-xmask.npy', 'rb') as f:
    Xmask = np.load(f)
with open('twitter-labels.npy', 'rb') as f:
    labels = np.load(f)

In [None]:
tf.config.experimental.list_physical_devices('GPU')

In [None]:
data = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

In [None]:
SHUFFLE = 10000000
BATCH_SIZE = 32

In [None]:
def map_func(input_ids, masks, labels):
    """Turn an `(ids, mask, labels)` element into `(features, labels)`.

    Returns:
        A 2-tuple: a dict with keys 'input_ids' and 'attention_mask' holding
        the first two arguments, followed by `labels` unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=masks)
    return features, labels

In [None]:
data = data.map(map_func)

In [None]:
# Shuffle once and keep that order. With the default per-epoch reshuffle, the
# take()/skip() split below would pick different batches every epoch and leak
# validation tweets into training.
data = data.shuffle(SHUFFLE, reshuffle_each_iteration=False).batch(BATCH_SIZE)

In [None]:
SPLIT = 0.9
# Derive the number of batches from the dataset instead of hard-coding 50000,
# so the split stays correct if the data size or BATCH_SIZE changes.
n_train_batches = int(len(data)*SPLIT)
train = data.take(n_train_batches)
val = data.skip(n_train_batches)
del data

In [None]:
bert = TFAutoModel.from_pretrained('bert-base-cased')

In [None]:
# Two int32 inputs of SEQLEN positions each: token ids and attention mask.
input_ids = tf.keras.layers.Input(shape=(SEQLEN,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(SEQLEN,), name='attention_mask', dtype='int32')

# First element of the BERT output, used here as the per-token embeddings.
embeddings = bert(input_ids, attention_mask=mask)[0]

# LSTM -> batch norm -> dense -> dropout, applied in order.
head_layers = [
    tf.keras.layers.LSTM(64),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.1),
]
x = embeddings
for layer in head_layers:
    x = layer(x)
y = tf.keras.layers.Dense(2, activation='softmax', name='outputs')(x)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)
# Freeze the layer at index 2 (BERT, after the two Input layers).
model.layers[2].trainable = False

In [None]:
model.summary()

In [None]:
# Adam at learning rate 0.01, categorical cross-entropy on the softmax
# outputs, and categorical accuracy reported under the name 'accuracy'.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy(name='accuracy')

model.compile(loss=loss, optimizer=optimizer, metrics=[acc])

In [None]:
history = model.fit(
    train,
    validation_data=val,
    epochs=10
)

In [None]:
model.save('../models/final')

In [None]:
# `mpl` is not imported anywhere in the notebook; import it here so the
# rcParams update below does not raise NameError.
import matplotlib as mpl

sns.set_style('darkgrid')
mpl.rcParams.update({'font.size': 18})

epochs = list(range(len(history.history['accuracy'])))

# One line per tracked metric: (history key, legend label, colour).
curves = (
    ('accuracy', 'acc', '#08c6ab'),
    ('val_accuracy', 'val-acc', '#212b38'),
    ('loss', 'loss', '#726eff'),
    ('val_loss', 'val-loss', '#37465b'),
)

plt.figure(figsize=(16, 10))
for key, label, color in curves:
    sns.lineplot(x=epochs, y=history.history[key], label=label, color=color)
plt.tight_layout()
plt.savefig('../figures/final-acc.png')