In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load data

In [4]:
# Module-level accumulators that load_tweets() appends to.
tweets = []
labels = []


def load_tweets(filename, label):
    """Append every line of a UTF-8 text file to `tweets`, tagged with `label`.

    Trailing whitespace (including the newline) is stripped from each line.
    One entry is appended to the module-level `labels` list per line, so the
    two lists stay index-aligned.
    """
    with open(filename, encoding='utf-8') as handle:
        for raw_line in handle:
            tweets.append(raw_line.rstrip())
            labels.append(label)
    
# The negative-tweet file is loaded with label 0, the positive-tweet file with label 1.
for path, sentiment in (('../data/train_neg.txt', 0),
                        ('../data/train_pos.txt', 1)):
    load_tweets(path, sentiment)

# NumPy arrays can be indexed with an array of positions, which the split below relies on.
tweets, labels = np.array(tweets), np.array(labels)

print(f'{len(tweets)} tweets loaded')

200000 tweets loaded


## Build validation set
We use 90% of tweets for training, and 10% for validation

In [5]:
np.random.seed(1)  # fixed seed so the shuffle is the same on every run

n_tweets = len(tweets)
shuffled_indices = np.random.permutation(n_tweets)
split_idx = int(0.9 * n_tweets)
# First 90% of the shuffled positions go to training, the remaining 10% to validation.
train_indices, val_indices = shuffled_indices[:split_idx], shuffled_indices[split_idx:]

len(train_indices), len(val_indices)

(180000, 20000)

In [6]:
# Display the shuffled positions selected for training (NumPy abbreviates long arrays).
train_indices

array([ 49673, 171551,   5506, ..., 194791,  79538, 121767])

## Bag-of-words baseline

In [None]:
"""
from sklearn.feature_extraction.text import CountVectorizer

# We only keep the 5000 most frequent words, both to reduce the computational cost and reduce overfitting
vectorizer = CountVectorizer(max_features=5000)

# Important: we call fit_transform on the training set, and only transform on the validation set
X_train = vectorizer.fit_transform(tweets[train_indices])
X_val = vectorizer.transform(tweets[val_indices])

Y_train = labels[train_indices]
Y_val = labels[val_indices]
"""

In [None]:
#print(X_train[5])

### Logistic regression classifier

In [None]:
"""
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=1e5, max_iter=100)
model.fit(X_train, Y_train)
"""

In [None]:
"""
Y_train_pred = model.predict(X_train)
Y_val_pred = model.predict(X_val)

train_accuracy = (Y_train_pred == Y_train).mean()
val_accuracy = (Y_val_pred == Y_val).mean()
"""

In [None]:
#print(f'Accuracy (training set): {train_accuracy:.05f}')
#print(f'Accuracy (validation set): {val_accuracy:.05f}')

## Model interpretation

In [None]:
"""
model_features = model.coef_[0]
sorted_features = np.argsort(model_features)
top_neg = sorted_features[:10]
top_pos = sorted_features[-10:]

mapping = vectorizer.get_feature_names()

print('---- Top 10 negative words')
for i in top_neg:
    print(mapping[i], model_features[i])
print()

print('---- Top 10 positive words')
for i in top_pos:
    print(mapping[i], model_features[i])
print()
"""

# CNN baseline

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

# fix random seed for reproducibility
seed = 42
np.random.seed(seed)  # re-seeds NumPy's global RNG (it was seeded with 1 for the train/validation split)

# Sequence length for the CNN baseline; the disabled "Sequence Normalization" cell would reset it to 100.
MAX_SEQUENCE_LENGTH = 1000

## Pre-processing

In [None]:
#t = Tokenizer(oov_token='<UNK>')
# fit the tokenizer on the documents to create the vocabulary of unique words and map them to row numbers in the embedding layer
#t.fit_on_texts(tweets[train_indices])

In [None]:
#t.word_index['<PAD>'] = 0

In [None]:
#max([(k, v) for k, v in t.word_index.items()], key = lambda x:x[1])

In [None]:
#min([(k, v) for k, v in t.word_index.items()], key = lambda x:x[1])

In [None]:
"""
s = 'the avengers was really great'
r = t.texts_to_sequences([s])
r
"""

In [None]:
#train_sequences = t.texts_to_sequences(tweets[train_indices])
#val_sequences = t.texts_to_sequences(tweets[val_indices])

In [None]:
#print("Vocabulary size={}".format(len(t.word_index)))
#print("Number of Documents={}".format(t.document_count))

In [None]:
#max([len(sentence_tokens) for sentence_tokens in train_sequences])

In [None]:
#max([len(sentence_tokens) for sentence_tokens in val_sequences])

In [None]:
"""
import matplotlib.pyplot as plt
%matplotlib inline

train_lens = [len(s) for s in train_sequences]

fig, ax = plt.subplots(1,1, figsize=(12, 6))
h1 = ax.hist(train_lens)
"""

## Sequence Normalization

In [None]:
"""
MAX_SEQUENCE_LENGTH = 100
"""

In [None]:
# Pad (or truncate) every tweet to MAX_SEQUENCE_LENGTH tokens
"""
X_train = sequence.pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_val = sequence.pad_sequences(val_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_train.shape, X_val.shape
"""

## Prepare the model

In [None]:
"""
EMBED_SIZE = 300 # word embedding size
EPOCHS=5
BATCH_SIZE=16 #128
"""

In [None]:
"""
VOCAB_SIZE = len(t.word_index)
VOCAB_SIZE
"""

In [None]:
# create the model
"""
model = Sequential()

# embedding layer is of dim: 175846 x 300
model.add(Embedding(VOCAB_SIZE, 
                    EMBED_SIZE, 
                    #weights=[wt_arr],
                    input_length=MAX_SEQUENCE_LENGTH))

model.add(Conv1D(filters=128, kernel_size=4, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(Flatten())

model.add(Dense(256, activation='relu'))

model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()
"""

## Model training

In [None]:
# callbacks
"""
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                      patience=2,
                                      restore_best_weights=True,
                                      verbose=1)

# Fit the model
model.fit(X_train, Y_train, 
          validation_split=0.1,
          epochs=EPOCHS, 
          batch_size=BATCH_SIZE,
          callbacks=[es], 
          verbose=1)
"""

## BERT Tokenizer

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import transformers
import tqdm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

%matplotlib inline

# fix random seed for reproducibility
seed = 42
np.random.seed(seed)      # NumPy global RNG
tf.random.set_seed(seed)  # TensorFlow global seed

In [None]:
# Tokenizer for the same 'distilbert-base-uncased' checkpoint that the model cell loads.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

## BERT Data Preparation

In [None]:
def create_bert_input_features(tokenizer, docs, max_seq_length):
    """Convert raw texts into fixed-length token-id and attention-mask arrays.

    Each document is tokenized, truncated so that it fits together with the
    '[CLS]' and '[SEP]' markers, converted to ids and right-padded with 0.

    Args:
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        docs: iterable of strings.
        max_seq_length: length of every output row, special tokens included;
            must be at least 2.

    Returns:
        np.ndarray stacking ``[all_ids, all_masks]``; for non-empty ``docs``
        its shape is (2, len(docs), max_seq_length). Index 0 holds the token
        ids and index 1 the masks (1 = real token, 0 = padding), so callers
        can unpack it as ``ids, masks = create_bert_input_features(...)``.

    Raises:
        ValueError: if ``max_seq_length`` is smaller than 2.
    """
    if max_seq_length < 2:
        # With fewer than two slots the negative slice bound would silently drop
        # real tokens and the rows would come out longer than requested.
        raise ValueError(
            f"max_seq_length must be >= 2 to fit '[CLS]' and '[SEP]', got {max_seq_length}"
        )

    max_tokens = max_seq_length - 2  # room left once the two special tokens are added
    all_ids, all_masks = [], []
    for doc in tqdm.tqdm(docs, desc="Converting docs to features"):
        tokens = tokenizer.tokenize(doc)[:max_tokens]
        ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + tokens + ['[SEP]'])

        # Zero-pad up to the sequence length; the mask marks real tokens with 1.
        n_pad = max_seq_length - len(ids)
        all_ids.append(ids + [0] * n_pad)
        all_masks.append([1] * len(ids) + [0] * n_pad)

    return np.array([all_ids, all_masks])

## Build Model Architecture

In [None]:
MAX_SEQ_LENGTH = 70  # tokens per tweet, counting the '[CLS]' and '[SEP]' markers

# Two parallel integer inputs: token ids and the matching attention mask.
inp_id = tf.keras.layers.Input(
    shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_ids"
)
inp_mask = tf.keras.layers.Input(
    shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_masks"
)
inputs = [inp_id, inp_mask]

# Pretrained DistilBERT encoder; element 0 of its output is used as the hidden states.
distilbert = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')
hidden_state = distilbert(inputs)[0]
# Keep only the vector at sequence position 0, where the '[CLS]' token sits.
pooled_output = hidden_state[:, 0]

# Classification head: two Dense(256, relu) + Dropout(0.25) stages, then a sigmoid unit.
dense1 = tf.keras.layers.Dense(256, activation='relu')(pooled_output)
drop1 = tf.keras.layers.Dropout(0.25)(dense1)
dense2 = tf.keras.layers.Dense(256, activation='relu')(drop1)
drop2 = tf.keras.layers.Dropout(0.25)(dense2)
output = tf.keras.layers.Dense(1, activation='sigmoid')(drop2)

model = tf.keras.Model(inputs=inputs, outputs=output)
adam = tf.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

## Convert text to DistilBERT input features

In [None]:
"""
train_features_ids, train_features_masks = create_bert_input_features(tokenizer, tweets[train_indices], 
                                                                      max_seq_length=MAX_SEQ_LENGTH)
val_features_ids, val_features_masks = create_bert_input_features(tokenizer, tweets[val_indices], 
                                                                  max_seq_length=MAX_SEQ_LENGTH)
#test_features = create_bert_input_features(tokenizer, test_reviews, max_seq_length=MAX_SEQ_LENGTH)
print('Train Features:', train_features_ids.shape, train_features_masks.shape)
print('Val Features:', val_features_ids.shape, val_features_masks.shape)
"""

In [None]:
# Labels in the same order as the training / validation index arrays.
Y_train, Y_val = labels[train_indices], labels[val_indices]

In [None]:
# Labels in the same order as the training / validation index arrays.
Y_train, Y_val = labels[train_indices], labels[val_indices]

# Encode the whole validation split once; it is reused by the training cell below.
val_features_ids, val_features_masks = create_bert_input_features(
    tokenizer,
    tweets[val_indices],
    max_seq_length=MAX_SEQ_LENGTH,
)

# Stop once val_loss fails to improve for one epoch and restore the best weights.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=1,
    restore_best_weights=True,
    verbose=1,
)

                                                                    

In [None]:
# Number of training tweets used in this run. A single constant slices both the
# features and the labels so the two can never get out of step.
# NOTE(review): the original cell carried a stale `#1125001` marker here,
# presumably a slice size from a larger dataset — TODO confirm.
N_TRAIN_SAMPLES = 1000

# Slice the index array first: this selects the same tweets as
# tweets[train_indices][:N] without copying the whole training set beforehand.
train_features_ids, train_features_masks = create_bert_input_features(
    tokenizer,
    tweets[train_indices[:N_TRAIN_SAMPLES]],
    max_seq_length=MAX_SEQ_LENGTH,
)

model.fit(
    [train_features_ids, train_features_masks],
    Y_train[:N_TRAIN_SAMPLES],
    validation_data=([val_features_ids, val_features_masks], Y_val),
    epochs=2,
    batch_size=20,
    shuffle=True,
    callbacks=[es],
    verbose=1,
)
                                                

In [None]:
#del train_features_ids, train_features_masks

In [None]:
"""
train_features_ids, train_features_masks = create_bert_input_features(tokenizer, tweets[train_indices][1125001:], 
                                                                      max_seq_length=MAX_SEQ_LENGTH)

model.fit([train_features_ids, 
           train_features_masks], Y_train[1125001:], 
          validation_data=([val_features_ids, 
                            val_features_masks], Y_val),
          epochs=3, 
          batch_size=20, 
          shuffle=True,
          callbacks=[es],
          verbose=1)
"""
         

## Train and Validate Model

In [None]:
"""
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                      patience=1,
                                      restore_best_weights=True,
                                      verbose=1)
model.fit([train_features_ids, 
           train_features_masks], train_sentiments, 
          validation_data=([val_features_ids, 
                            val_features_masks], val_sentiments),
          epochs=3, 
          batch_size=20, 
          shuffle=True,
          callbacks=[es],
          verbose=1)
"""

## Save models

In [None]:
# Export the trained model under ./export/<modelname>/.
# The original literal opened with ' and closed with ", which is a SyntaxError.
modelname = 'model_BERT_v1'
tf.keras.models.save_model(model, './export/' + modelname + '/')