**Approach - <br>- Data Reading and Understanding<br>- Preprocessing => removal of punctuation and numbers, lemmatization, fixing contractions, spelling correction<br> - Pretrained embeddings => GloVe embeddings https://nlp.stanford.edu/data/glove.twitter.27B.zip and emoji embeddings https://github.com/MirunaPislar/emoji2vec/tree/master/models <br>- Tokenizer to convert comments to sequences<br>- Model => Bidirectional LSTM with dropout<br>- Pipeline to execute tokenization and model in sequence<br>- Pipeline elements will be serialized (the fitted tokenizer and class labels to pickle files, the model to an HDF5 file) to be used for predictions when invoked from Streamlit**

In [None]:
#download glove embeddings
import urllib.request
import zipfile

#fetch the GloVe Twitter archive into the working directory as glove.27B.zip
urllib.request.urlretrieve("https://nlp.stanford.edu/data/glove.twitter.27B.zip", "glove.27B.zip")

#unpack every member of the archive into ./embeddings, where the loading cell below reads it from
with zipfile.ZipFile("glove.27B.zip","r") as zip_ref:
    zip_ref.extractall("./embeddings")

In [1]:
#All imports
from helpers import preprocessing

import math
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

#model
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
import tensorflow.keras.backend as K
import tensorflow_addons as tfa
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.python.framework.ops import eager_run

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nehkumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nehkumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nehkumar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nehkumar\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
#reload the helper module so that edits to it are picked up without restarting the kernel
import importlib
importlib.reload(preprocessing)

In [3]:
#constants
#width of each embedding vector; the embedding files loaded below are the "100d" variants
EMBEDDING_DIM = 100
#mini-batch size handed to the KerasClassifier wrapper
BATCH_SZ = 32
#seed numpy and tensorflow random number generators with a fixed value
np.random.seed(42)
tf.random.set_seed(42)

### Step 1 - Data Reading and Understanding

In [4]:
#load training data; only the 'comment' text and its 'Sentiment' label are read from the workbook
comments_df = pd.read_excel('./data/comments.xlsx', usecols=['comment','Sentiment' ])
#preview the first rows
comments_df.head()

Unnamed: 0,comment,Sentiment
0,hashtag hashtag hashtag hashtag hashtag hashta...,Neutral
1,send it pic _x000D_\n👁_x000D_\n👉 user,Neutral
2,if i get it on time maybe 😢,Mixed
3,gorgeous ❤,Positive
4,😍😍😍😍,Positive


In [5]:
#brief description of the dataframe: column dtypes, non-null counts and memory usage
comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11856 entries, 0 to 11855
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   comment    11856 non-null  object
 1   Sentiment  11856 non-null  object
dtypes: object(2)
memory usage: 185.4+ KB


In [6]:
#share of rows per sentiment label (fractions, not counts) - used to check class balance
comments_df['Sentiment'].value_counts(normalize=True)

Positive    0.813850
Neutral     0.127783
Negative    0.052800
Mixed       0.005567
Name: Sentiment, dtype: float64

### Step 2 - Data Preparation

In [7]:
#data cleanup and preparation using the helper module "preprocessing"
def prepare_data(df):
  """Clean the raw comments and expose the result in a 'cleaned_comment' column.

  Runs preprocessing.preprocess_data on the 'comment' column, then
  preprocessing.convert_emoticon on the 'spelling_corrected' column, and
  finally copies 'emoticon_converted' into 'cleaned_comment'.

  NOTE(review): 'spelling_corrected' and 'emoticon_converted' are presumably
  created by the two helper calls, and the meaning of the trailing False flag
  is defined by the helper - confirm both in the preprocessing module.

  Returns the dataframe handed back by convert_emoticon, with the extra
  'cleaned_comment' column added to it.
  """
  preprocessed = preprocessing.preprocess_data(df, 'comment')
  with_emoticons = preprocessing.convert_emoticon(preprocessed, 'spelling_corrected', False)
  #the emoticon-converted text is the final, fully cleaned form of each comment
  with_emoticons['cleaned_comment'] = with_emoticons['emoticon_converted']
  return with_emoticons

In [8]:
#run the cleaning steps on the raw comments
cleaned_df = prepare_data(comments_df)

#model input: the cleaned comment text
X = cleaned_df['cleaned_comment']
#model target: sentiment labels encoded as integer class ids (a label missing from this mapping becomes NaN)
y = cleaned_df['Sentiment'].map({"Neutral":0, "Positive":1, "Negative":2, "Mixed":3})

**Word Embeddings**

In [9]:
#load pre-trained glove embeddings into a {word: vector} lookup
pretrained_embeddings = dict()
with open('./embeddings/glove.twitter.27B.100d.txt','r',encoding='utf-8') as file_handle:
  for line in file_handle:
    #each usable line is "<word> <v1> ... <vN>" separated by whitespace
    values = line.split()
    #skip blank or malformed lines: a vector of the wrong width would either
    #fail or be silently broadcast when copied into the embedding matrix later
    if len(values) != EMBEDDING_DIM + 1:
      continue
    word = values[0]
    vector = np.asarray(values[1:],'float32')
    pretrained_embeddings[word]=vector

In [10]:
#load pre-trained emoji embeddings into a {emoji: vector} lookup
emoji_embeddings = dict()
with open('./embeddings/emoji_embeddings_100d.txt','r',encoding='utf-8') as file_handle:
  for line in file_handle:
    #each usable line is "<emoji> <v1> ... <vN>" separated by whitespace
    values = line.split()
    #skip blank, header (e.g. a word2vec-style "<count> <dim>" first line) or
    #malformed lines: a vector of the wrong width would either fail or be
    #silently broadcast when copied into the embedding matrix later
    if len(values) != EMBEDDING_DIM + 1:
      continue
    word = values[0]
    vector = np.asarray(values[1:],'float32')
    emoji_embeddings[word]=vector

In [11]:
#helper function to build the embedding matrix (one row per vocabulary index) from the pre-trained word and emoji vectors
def vocab_embeddings(word_vocab):
  """Build the embedding matrix for a tokenizer vocabulary.

  word_vocab maps each word to its integer index. The returned array has
  len(word_vocab) + 1 rows and EMBEDDING_DIM columns; row i holds the vector
  of the word whose index is i. The word embeddings are consulted first and
  the emoji embeddings second; words found in neither keep an all-zero row.
  """
  matrix = np.zeros((len(word_vocab) + 1, EMBEDDING_DIM))

  for token, row in word_vocab.items():
    #word vectors take precedence over emoji vectors
    vec = pretrained_embeddings.get(token)
    if vec is None:
      vec = emoji_embeddings.get(token)
    if vec is not None:
      matrix[row] = vec

  return matrix


**CustomTokenizer to convert comments to sequences meant to be used as a pipeline element**

In [13]:
class CustomTokenizer:
  """Pipeline step that turns comment strings into fixed-length integer sequences.

  Wraps a Keras Tokenizer. Fitting learns the vocabulary and fixes the padded
  sequence length; transforming maps comments to index sequences that are
  padded/truncated at the end ('post') to that length.
  """

  #upper bound, in tokens, on the padded sequence length
  MAX_SEQUENCE_LENGTH = 30

  def __init__(self):
    #initialize tf tokenizer
    self.tokenizer = Tokenizer()

  def _pad(self, token_ids):
    """pad/truncate already tokenized sequences to self.max_length"""
    return sequence.pad_sequences(token_ids, maxlen=self.max_length, padding='post', truncating='post')

  def _vectorize_input(self, comments):
    """tokenize comments with the fitted vocabulary and pad them"""
    return self._pad(self.tokenizer.texts_to_sequences(comments))

  def fit_transform(self, comments, y=None):
    """invoked on training data to fit tokenizer and convert comments to sequences

    comments may be any iterable of strings accepted by the Keras tokenizer;
    y is ignored and only present for sklearn pipeline compatibility.
    """
    self.tokenizer.fit_on_texts(comments)
    token_ids = self.tokenizer.texts_to_sequences(comments)
    #measure length in tokens (not characters), because maxlen in
    #pad_sequences counts sequence entries; default=0 covers empty input
    longest = max(map(len, token_ids), default=0)
    #keep at least one time step so the downstream layers get a non-empty input
    self.max_length = max(1, min(longest, self.MAX_SEQUENCE_LENGTH))
    return self._pad(token_ids)

  def fit(self, comments, y=None):
    """fit the tokenizer without returning sequences; returns self (sklearn convention)"""
    self.fit_transform(comments, y)
    return self

  def transform(self, comments):
    """invoked on testing/validation data to convert comments to sequences"""
    return self._vectorize_input(comments)

  def get_embeddings(self):
    """get vocab embeddings to be fed to embedding layer"""
    word_embeddings = vocab_embeddings(self.tokenizer.word_index)
    return word_embeddings

  def get_input_dim(self):
    """input sequence dimension to be fed to embedding layer"""
    return self.max_length

In [14]:
#initialize tokenizer to be used in pipeline (create_model reads this same module-level instance)
cust_tokenizer = CustomTokenizer()

#transform target labels to one hot encoding (one column per integer class id in y)
y_transformed = tf.keras.utils.to_categorical(y)

### Step 3 - Model Building

In [15]:
#evaluate model with f1 score to get a balance of precision and recall
def f1_score(true, pred):
  """Weighted F1 score over the 4 sentiment classes for one set of predictions.

  true holds the one-hot targets and pred the predicted class probabilities.
  A fresh metric object is created on every call, so the value reflects only
  the arguments passed in and is not accumulated across calls. The result is
  converted with .numpy(), which needs eager execution (the model is compiled
  with run_eagerly=True).
  """
  weighted_f1 = tfa.metrics.F1Score(num_classes=4, average='weighted', threshold=0.5)
  weighted_f1.update_state(true, pred)
  return weighted_f1.result().numpy()

In [16]:
#helper function to plot metrics - accuracy, f1-score for training and val data
def plot_results(x, metric, history):
  """Plot the training and validation curves of one metric on a single chart.

  x supplies the x-axis values, metric is the key of the training series in
  history.history, and the validation series is read from the key
  'val_' + metric. The figure is shown immediately.
  """
  curves = (('training', metric), ('validation', 'val_' + metric))
  for label, key in curves:
    plt.plot(x, history.history[key], label=label)
  plt.legend(loc='upper right')
  plt.show()

**Model - Bidirectional LSTM with dropout**

In [17]:
#model put together as a function to be used as a pipeline element
def create_model():
  """Build and compile the bidirectional LSTM sentiment classifier.

  Reads the module-level cust_tokenizer, which must already be fitted, to get
  the vocabulary embedding matrix and the padded sequence length.

  Layers: a frozen Embedding initialised from the pre-trained vectors,
  Dropout(0.2), a Bidirectional LSTM with 128 units and a 4-way softmax
  output. Compiled with categorical cross-entropy, reporting accuracy and
  the custom f1_score metric.

  Returns the compiled tf.keras Sequential model.
  """
  word_embeddings = cust_tokenizer.get_embeddings()
  input_dim = cust_tokenizer.get_input_dim()
  model = tf.keras.models.Sequential([
          #pre-trained vectors are loaded as-is and not updated during training
          tf.keras.layers.Embedding(word_embeddings.shape[0], EMBEDDING_DIM, embeddings_initializer=tf.constant_initializer(word_embeddings),
                                    input_length =input_dim, trainable=False),
          tf.keras.layers.Dropout(0.2),
          tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
          tf.keras.layers.Dense(4, activation='softmax')
          ])

  #use learning_rate: the lr alias is deprecated and not honoured by newer Keras optimizers
  optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
  #run_eagerly is required because f1_score converts its result with .numpy()
  model.compile(optimizer, loss=tf.keras.losses.CategoricalCrossentropy(), metrics=['acc',f1_score], run_eagerly=True)
  return model

In [18]:
#the pipeline is fitted without validation data, so the default 'val_loss' monitor
#would never be available and early stopping would never trigger; watch the
#training f1_score instead, the same quantity the checkpoint tracks
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='f1_score', mode='max', patience=50)
#keep only the weights of the epoch with the highest training f1_score
model_checkpoint = tf.keras.callbacks.ModelCheckpoint("BidirectionalLSTM", save_best_only=True, verbose=1, mode='max', monitor='f1_score')

#wrap the tf model into KerasClassifier to be used with sklearn pipeline
wrapper_model =  KerasClassifier(build_fn=create_model, verbose = 4, batch_size=BATCH_SZ, epochs=100, callbacks=[early_stopping, model_checkpoint])

In [1]:
#construct and invoke the pipeline: the "tokenize" step runs first and its output is passed on to the "model" step
pipeline = Pipeline([("tokenize", cust_tokenizer),("model", wrapper_model)])
pipeline.fit(X, y_transformed)

In [59]:
#serialize the pipeline elements: tokenizer and class labels as pickle files, the keras model as HDF5
#context managers make sure each file is flushed and closed once it has been written
with open('tokenizer.pkl','wb') as tokenizer_file:
  pickle.dump(pipeline.named_steps['tokenize'], tokenizer_file)
with open('classes.pkl','wb') as classes_file:
  pickle.dump(pipeline.named_steps['model'].classes_, classes_file)
pipeline.named_steps['model'].model.save('BidirectionalLSTM.h5')