In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

In [2]:
BASE_PATH = "data"

In [3]:
from sklearn.utils import shuffle

def get_processed_df(df):
    df.columns = ['x1','x2','labels','features']
    df = shuffle(df)
    df = df.dropna()
    return df.drop(['x1','x2'],axis=1)

In [4]:
#fetching and processing training_data
training_data = pd.read_csv(os.path.join(BASE_PATH,'twitter_training.csv'))
training_data = get_processed_df(training_data)

#fetching and processing validation_data
validation_data = pd.read_csv(os.path.join(BASE_PATH,'twitter_validation.csv'))
validation_data = get_processed_df(validation_data)

training_data.head()

Unnamed: 0,labels,features
6063,Neutral,You will definitely indeed not go far wrong wi...
25197,Neutral,"This is not law, not its full history, a more ..."
34020,Irrelevant,My wife and I celebrate the 50th episode of Th...
45518,Negative,@Verizon you really make clear with your promo...
50042,Irrelevant,How the fuck was cantona higher rated than Lah...


In [5]:
training_data['labels'].value_counts()

Negative      22358
Positive      20654
Neutral       18108
Irrelevant    12875
Name: labels, dtype: int64

In [6]:
#extracting labels and features out of dataframe :)
train_features, train_labels = training_data['features'].to_numpy(), training_data['labels'].to_numpy()
train_features, train_labels = training_data['features'].to_numpy(), training_data['labels'].to_numpy()
train_features

array(['You will definitely indeed not go far wrong with " With Everyday Victory In Jesus ". Order today!',
       'This is not law, not its full history, a more simple google search gives the following... The immediate causes of the 1905 revolution were failed state - a level leadership processes and policy, inflation poverty, famine hunger, Russo - Japanese Labor War, the subsequent rise of reformer forces and revolutionary opposition groups, and Bloody Sunday.',
       'My wife and I celebrate the 50th episode of The Illustration Department podcast with a dinner for two (while the kids play Fortnite in the other room).. pic.twitter.com / ZdHUMBkVJo',
       ...,
       'Tf is the homeboy doing? Mf acting like shit this kid is CSGO.',
       '@ PlayHearthstone legit can you guys disable djinn ASAP. Only playing for one minion is as lame as it boring.',
       "how can u always just sit there and tweet for negative shit like this all day my god. it's s not even some just the league st

# Using Bert Model for text vectorization

In [7]:
!pip install tensorflow-text



In [8]:

import tensorflow_hub as hub
import tensorflow_text as text

bert_preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [9]:
def get_sentence_embedings(sentences):
    processed_text = bert_preprocessor(sentences)
    encodings = bert_encoder(processed_text)['pooled_output']
    return encodings

In [10]:
len(get_sentence_embedings(['Hello what are you doing @sahil!'])[0])

768

In [11]:
#bert layers
text_input = tf.keras.layers.Input(shape=(),dtype=tf.string,name="text")
encodings = get_sentence_embedings(text_input)

#Neural Network
l = tf.keras.layers.Dropout(0.1)(encodings)
output = tf.keras.layers.Dense(4, activation='sigmoid')(l)

#construct a final model
model = tf.keras.Model(inputs=[text_input], outputs=[output])

In [12]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [13]:
METRICS = [
    'accuracy'
]

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=METRICS
)

In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(training_data['labels'])

train_labels_classes = le.transform(training_data['labels'])
train_labels_classes

array([2, 2, 0, ..., 0, 1, 0])

In [None]:
model.fit(
    training_data['features'],
    train_labels_classes,
    epochs=10,
    batch_size=100
)

Epoch 1/10
  7/740 [..............................] - ETA: 4:13:23 - loss: 1.4386 - accuracy: 0.2971

In [None]:
model.save(os.path.join(BASE_PATH,'models/1/'))