In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load data and label

In [2]:
# Module-level accumulators that load_tweets() appends to.
tweets = []
labels = []


def load_tweets(filename, label):
    """Read one tweet per line from `filename` and record it with `label`.

    Trailing whitespace (including the newline) is removed from every line.
    The cleaned lines are appended to the global `tweets` list and `label`
    is appended to the global `labels` list once per line.
    """
    with open(filename, encoding='utf-8') as handle:
        cleaned = [raw.rstrip() for raw in handle]
    tweets.extend(cleaned)
    labels.extend([label] * len(cleaned))
    
# Negative tweets get label 0, positive tweets get label 1.
load_tweets('../input/cil-twitter/twitter-datasets/train_neg_full.txt', 0)
load_tweets('../input/cil-twitter/twitter-datasets/train_pos_full.txt', 1)

# Convert to NumPy arrays to facilitate indexing.
# dtype=object keeps references to the existing Python strings. Without it
# NumPy builds a fixed-width unicode array in which every element is padded
# to the length of the longest tweet, which can multiply memory use.
tweets = np.array(tweets, dtype=object)
labels = np.array(labels)

print(f'{len(tweets)} tweets loaded')

## Build validation set
We use 90% of tweets for training, and 10% for validation

In [3]:
# Pin NumPy's global RNG first so the permutation below is reproducible.
np.random.seed(1)

# Shuffle all tweet positions, then cut at the 90% mark.
n_tweets = len(tweets)
shuffled_indices = np.random.permutation(n_tweets)
split_idx = int(0.9 * n_tweets)
train_indices, val_indices = shuffled_indices[:split_idx], shuffled_indices[split_idx:]

len(train_indices), len(val_indices)

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

# Re-seed NumPy's global RNG so later random draws are repeatable.
seed = 42
np.random.seed(seed=seed)


## BERT Tokenizer

In [5]:
!pip install transformers

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import transformers
import tqdm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

%matplotlib inline

# Pin both the NumPy and the TensorFlow global RNGs to one shared seed
# so repeated runs draw the same random numbers.
seed = 42
np.random.seed(seed=seed)
tf.random.set_seed(seed=seed)

## BERT Data Preparation

In [7]:
def create_bert_input_features(tokenizer, docs, max_seq_length):
    """Encode documents as fixed-length token-id and attention-mask rows.

    Each document is tokenized, truncated so that two special tokens still
    fit, wrapped in the tokenizer's start/end markers, converted to ids and
    right-padded to ``max_seq_length``.

    Args:
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``. Its ``cls_token``,
            ``sep_token`` and ``pad_token_id`` attributes are used when
            present and not None; otherwise the BERT defaults ``'[CLS]'``,
            ``'[SEP]'`` and ``0`` apply.
        docs: Iterable of strings to encode.
        max_seq_length: Length of every output row, special tokens included.

    Returns:
        An int32 array of shape ``(2, number_of_docs, max_seq_length)``.
        Index 0 holds the token ids, index 1 the attention masks
        (1 for real tokens, 0 for padding), so the result can be unpacked
        as ``ids, masks = create_bert_input_features(...)``.

    Raises:
        ValueError: If ``max_seq_length`` is smaller than 2, i.e. too short
            to hold the two special tokens.
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must be at least 2 to fit the special tokens")

    # Ask the tokenizer for its own markers instead of assuming BERT's:
    # a tokenizer whose vocabulary lacks '[CLS]'/'[SEP]' would otherwise map
    # them to its unknown token, and its padding id is not necessarily 0.
    cls_token = getattr(tokenizer, 'cls_token', None) or '[CLS]'
    sep_token = getattr(tokenizer, 'sep_token', None) or '[SEP]'
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    max_body_tokens = max_seq_length - 2  # room left after the two markers

    all_ids, all_masks = [], []
    for doc in tqdm.tqdm(docs, desc="Converting docs to features"):
        tokens = [cls_token] + tokenizer.tokenize(doc)[:max_body_tokens] + [sep_token]
        ids = list(tokenizer.convert_tokens_to_ids(tokens))

        # Right-pad to the fixed length; the mask marks padding with 0.
        n_pad = max_seq_length - len(ids)
        all_masks.append([1] * len(ids) + [0] * n_pad)
        all_ids.append(ids + [pad_id] * n_pad)

    # int32 matches the dtype declared on the Keras input layers and halves
    # the memory of NumPy's default int64. The explicit reshape keeps the
    # (2, N, max_seq_length) layout even when `docs` is empty.
    encoded = np.array([all_ids, all_masks], dtype=np.int32)
    encoded = encoded.reshape(2, len(all_ids), max_seq_length)

    return encoded

## Build Model Architecture

In [8]:
MAX_SEQ_LENGTH = 70

# Two integer inputs of width MAX_SEQ_LENGTH: token ids and attention mask.
inp_id = tf.keras.layers.Input(
    shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_ids"
)
inp_mask = tf.keras.layers.Input(
    shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_masks"
)
inputs = [inp_id, inp_mask]

# NOTE(review): `model` and the tensors defined here are rebound by the
# "## Model" cell further down (which loads vinai/bertweet-base) before any
# training happens - confirm whether this DistilBERT build is still needed.
hidden_state = transformers.TFDistilBertModel.from_pretrained(
    'distilbert-base-uncased'
)(inputs)[0]
# Keep only the vector at sequence position 0 of the first encoder output.
pooled_output = hidden_state[:, 0]

# Classification head: two Dense(256, relu) layers, each followed by 25% dropout.
dense1 = tf.keras.layers.Dense(units=256, activation='relu')(pooled_output)
drop1 = tf.keras.layers.Dropout(rate=0.25)(dense1)
dense2 = tf.keras.layers.Dense(units=256, activation='relu')(drop1)
drop2 = tf.keras.layers.Dropout(rate=0.25)(dense2)

# Single sigmoid unit for the binary label.
output = tf.keras.layers.Dense(units=1, activation='sigmoid')(drop2)

model = tf.keras.Model(inputs=inputs, outputs=output)
model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

## BERTweet tokenizer

In [9]:
from transformers import AutoModel, AutoTokenizer

# Load the tokenizer published with the BERTweet checkpoint.
BERTWEET_CHECKPOINT = "vinai/bertweet-base"
tokenizer_tweet = AutoTokenizer.from_pretrained(BERTWEET_CHECKPOINT)

In [10]:
# Encode the first 1,000,000 training tweets; the remainder is encoded in a later cell.
# Slicing the index array before indexing `tweets` selects the same rows
# without first materialising a copy of the entire training set.
train_features_ids, train_features_masks = create_bert_input_features(
    tokenizer_tweet,
    tweets[train_indices[:1000000]],
    max_seq_length=MAX_SEQ_LENGTH,
)

## Model

In [None]:
MAX_SEQ_LENGTH = 70

# Two integer inputs of width MAX_SEQ_LENGTH: token ids and attention mask.
inp_id = tf.keras.layers.Input(
    shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_ids"
)
inp_mask = tf.keras.layers.Input(
    shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_masks"
)
inputs = [inp_id, inp_mask]

# Pretrained BERTweet encoder, loaded through the RoBERTa TF model class;
# element 0 of its output is used as the per-token hidden states.
hidden_state = transformers.TFRobertaModel.from_pretrained(
    "vinai/bertweet-base"
)(inputs)[0]

# Keep only the vector at sequence position 0 as the tweet representation.
pooled_output = hidden_state[:, 0]

# Classification head: two Dense(256, relu) layers, each followed by 25% dropout.
dense1 = tf.keras.layers.Dense(units=256, activation='relu')(pooled_output)
drop1 = tf.keras.layers.Dropout(rate=0.25)(dense1)
dense2 = tf.keras.layers.Dense(units=256, activation='relu')(drop1)
drop2 = tf.keras.layers.Dropout(rate=0.25)(dense2)

# Single sigmoid unit for the binary label.
output = tf.keras.layers.Dense(units=1, activation='sigmoid')(drop2)

model = tf.keras.Model(inputs=inputs, outputs=output)
model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Select the labels with the same index arrays that define the tweet split.
Y_train, Y_val = labels[train_indices], labels[val_indices]

## Validation

In [None]:
# Encode the held-out tweets once; both training passes validate against them.
val_features_ids, val_features_masks = create_bert_input_features(
    tokenizer_tweet,
    tweets[val_indices],
    max_seq_length=MAX_SEQ_LENGTH,
)

# Early stopping on validation loss: patience of one epoch, and the best
# weights seen are restored when training stops.
early_stopping_options = dict(
    monitor='val_loss',
    patience=1,
    restore_best_weights=True,
    verbose=1,
)
es = tf.keras.callbacks.EarlyStopping(**early_stopping_options)

## Training

In [None]:
# First training pass: the first 1,000,000 training examples, validated on the held-out set.
model.fit(
    x=[train_features_ids, train_features_masks],
    y=Y_train[:1000000],
    validation_data=([val_features_ids, val_features_masks], Y_val),
    batch_size=130,
    epochs=2,
    shuffle=True,
    callbacks=[es],
    verbose=1,
)

In [None]:
# Write the weights reached after the first pass to disk, then read them straight back.
first_pass_weights = "bertweet_1.h5"
model.save_weights(first_pass_weights)
model.load_weights(first_pass_weights)

## Cleaning for RAM

In [None]:
import gc

# Drop both feature arrays of the first chunk, then run a collection pass.
del train_features_ids
del train_features_masks
gc.collect()

In [None]:
# NOTE(review): these weights were saved from this same `model` a few cells
# up, so the reload mainly matters when resuming the notebook from here.
model.load_weights("bertweet_1.h5")

# Encode the training tweets after the first 1,000,000.
# Slicing the index array before indexing `tweets` selects the same rows
# without first materialising a copy of the entire training set.
train_features_ids, train_features_masks = create_bert_input_features(
    tokenizer_tweet,
    tweets[train_indices[1000000:]],
    max_seq_length=MAX_SEQ_LENGTH,
)

In [None]:
# Second training pass: the training examples after the first 1,000,000, same validation set.
model.fit(
    x=[train_features_ids, train_features_masks],
    y=Y_train[1000000:],
    validation_data=([val_features_ids, val_features_masks], Y_val),
    batch_size=40,
    epochs=2,
    shuffle=True,
    callbacks=[es],
    verbose=1,
)

In [None]:
# Write the weights reached after the second pass to disk, then read them straight back.
second_pass_weights = "bertweet_2.h5"
model.save_weights(second_pass_weights)
model.load_weights(second_pass_weights)

In [None]:
# Drop both feature arrays of the second chunk, then run a collection pass.
del train_features_ids
del train_features_masks
gc.collect()

## Load test set

In [None]:
# Module-level accumulator that load_test_tweets() appends to.
test_tweets = []


def load_test_tweets(filename):
    """Read one tweet per line from `filename` into the global `test_tweets`.

    Trailing whitespace (including the newline) is removed from every line.
    """
    with open(filename, encoding='utf-8') as handle:
        test_tweets.extend([raw.rstrip() for raw in handle])
    
load_test_tweets('../input/cil-twitter/twitter-datasets/test_data.txt')

# Switch from a Python list to an ndarray so array-style indexing is available.
test_tweets = np.asarray(test_tweets)

print(f'{len(test_tweets)} tweets loaded')

In [None]:
# Encode the test tweets with the same tokenizer and sequence length as the training data.
test_encoded = create_bert_input_features(
    tokenizer_tweet,
    test_tweets,
    max_seq_length=MAX_SEQ_LENGTH,
)
test_features_ids, test_features_masks = test_encoded
print('Test Features:', test_features_ids.shape, test_features_masks.shape)

In [None]:
#model.load_weights("bertweet_2.h5")

# Flatten the model's sigmoid outputs to one value per test tweet.
probabilities = model.predict([test_features_ids, test_features_masks],
                              batch_size=200, verbose=0).ravel()

# Hard labels: above 0.5 becomes 1, everything else becomes -1.
predictions = [1 if pr > 0.5 else -1 for pr in probabilities]

# One 1-based Id per prediction. Deriving the count from `predictions`
# instead of hard-coding 10000 prevents a later zip(Id, predictions) from
# silently dropping rows when the test set has a different size.
Id = list(range(1, len(predictions) + 1))

In [None]:
# Pair every Id with its predicted label, one row per pair.
submission_rows = list(zip(Id, predictions))
df = pd.DataFrame(data=submission_rows, columns=['Id', 'Prediction'])

In [None]:
# Write the submission without the DataFrame index column.
df.to_csv(path_or_buf='sample_submission_bertweet_3.csv', index=False)