# Download Datasets and Import Libraries

In [None]:
!gdown 1SaSq8kwvNmxq2HoQBenhXC3ejM8BU70d
!gdown 1uGv2afj67P9BGEMwFPyv_IopjMzaqMuG
!pip install transformers
!pip3 install emoji==0.6.0


In [None]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from transformers import TFAutoModel, AutoTokenizer

# Text Preprocessing - For BERT and BERTweet, removing punctuation, URL placeholders, and repeated spaces is sufficient

In [None]:
def remove_punctuations(data):
    """Delete every character that is neither a word character nor whitespace.

    Args:
        data: The string to clean.

    Returns:
        ``data`` with all characters matching ``[^\\w\\s]`` removed. Underscores
        are kept because ``\\w`` includes them.
    """
    return re.sub(r'[^\w\s]', r'', data)

def remove_url(data):
    """Remove every occurrence of the literal, upper-case token ``URL``.

    The match is case-sensitive and is not anchored to word boundaries, so the
    three letters are also removed when they occur inside a longer token.

    Args:
        data: The string to clean.

    Returns:
        ``data`` with each ``URL`` substring deleted; surrounding spaces are
        left untouched.
    """
    return re.sub(r'URL', r'', data)

def remove_double_spaces(data):
    """Collapse each run of one or more space characters into a single space.

    Only the plain space character is affected; tabs and newlines are left
    as they are, and leading/trailing spaces are collapsed but not stripped.

    Args:
        data: The string to clean.

    Returns:
        ``data`` with every run of spaces replaced by one space.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', data)

In [None]:
# Load the two CSV files (d1 <- t15, d2 <- t16).
d1 = pd.read_csv('t15_text_n2v.csv', encoding='utf-8')
d2 = pd.read_csv('t16_text_n2v.csv', encoding='utf-8')

# Both frames get the identical treatment, so handle them in one loop.
for frame in (d1, d2):
    # Order matters: punctuation first, then the URL token, then squeeze the
    # repeated spaces that the two removals leave behind.
    frame['text'] = (
        frame['text']
        .apply(remove_punctuations)
        .apply(remove_url)
        .apply(remove_double_spaces)
    )
    # Map boolean values to 0/1 across the whole frame.
    frame.replace({False: 0, True: 1}, inplace=True)

# BERT Embeddings for Twitter15 and Twitter16

In [None]:
# Load the pre-trained BERT tokenizer and TensorFlow model from Hugging Face;
# both come from the same checkpoint name.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = TFBertModel.from_pretrained(bert_checkpoint)

In [None]:
# Tokenize all Twitter15 texts as one batch. The tokenizer is documented to take
# a plain list of strings, so convert the pandas Series explicitly instead of
# relying on it being iterated like a list.
texts = d1['text'].tolist()
# Call the tokenizer directly: batch_encode_plus is deprecated in favour of
# __call__, which dispatches to batch encoding for a list input.
encoded_texts = tokenizer(
    texts,
    add_special_tokens=True,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='tf'
)

input_ids = encoded_texts['input_ids']
attention_masks = encoded_texts['attention_mask']
# Keep only the first element of the model output as the embedding tensor.
t15_bert_embeddings = bert_model(input_ids, attention_mask=attention_masks)[0]

In [None]:
# Tokenize all Twitter16 texts as one batch. The tokenizer is documented to take
# a plain list of strings, so convert the pandas Series explicitly instead of
# relying on it being iterated like a list.
texts = d2['text'].tolist()
# Call the tokenizer directly: batch_encode_plus is deprecated in favour of
# __call__, which dispatches to batch encoding for a list input.
encoded_texts = tokenizer(
    texts,
    add_special_tokens=True,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='tf'
)

input_ids = encoded_texts['input_ids']
attention_masks = encoded_texts['attention_mask']
# Keep only the first element of the model output as the embedding tensor.
t16_bert_embeddings = bert_model(input_ids, attention_mask=attention_masks)[0]

# BERTweet Embeddings for Twitter15 and Twitter16

In [None]:
# Load the BERTweet tokenizer and TensorFlow model; both come from the same
# checkpoint name.
bertweet_checkpoint = 'vinai/bertweet-base'
tweet_tokenizer = AutoTokenizer.from_pretrained(bertweet_checkpoint)
tweet_bert_model = TFAutoModel.from_pretrained(bertweet_checkpoint)

In [None]:
# Tokenize all Twitter15 texts as one batch with the BERTweet tokenizer. The
# tokenizer is documented to take a plain list of strings, so convert the pandas
# Series explicitly instead of relying on it being iterated like a list.
texts = d1['text'].tolist()
# Call the tokenizer directly: batch_encode_plus is deprecated in favour of
# __call__, which dispatches to batch encoding for a list input.
encoded_texts = tweet_tokenizer(
    texts,
    add_special_tokens=True,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='tf'
)

input_ids = encoded_texts['input_ids']
attention_masks = encoded_texts['attention_mask']
# Keep only the first element of the model output as the embedding tensor.
t15_bertweet_embeddings = tweet_bert_model(input_ids, attention_mask=attention_masks)[0]

In [None]:
# Tokenize all Twitter16 texts as one batch with the BERTweet tokenizer. The
# tokenizer is documented to take a plain list of strings, so convert the pandas
# Series explicitly instead of relying on it being iterated like a list.
texts = d2['text'].tolist()
# Call the tokenizer directly: batch_encode_plus is deprecated in favour of
# __call__, which dispatches to batch encoding for a list input.
encoded_texts = tweet_tokenizer(
    texts,
    add_special_tokens=True,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='tf'
)

input_ids = encoded_texts['input_ids']
attention_masks = encoded_texts['attention_mask']
# Keep only the first element of the model output as the embedding tensor.
t16_bertweet_embeddings = tweet_bert_model(input_ids, attention_mask=attention_masks)[0]

# Save Embeddings

In [None]:
# Write each embedding tensor to its own .npy file, in the order listed.
embeddings_by_file = {
    't15_bert_emb.npy': t15_bert_embeddings,
    't16_bert_emb.npy': t16_bert_embeddings,
    't15_bertweet_emb.npy': t15_bertweet_embeddings,
    't16_bertweet_emb.npy': t16_bertweet_embeddings,
}
for out_path, embedding in embeddings_by_file.items():
    np.save(out_path, embedding)