<a href="https://colab.research.google.com/github/mukulsn/Machine-Learning/blob/main/Tensorflow%20Developer%20Certificate/NLP_tweet_analysis_using_FNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install keras-nlp keras-core tensorflow-text --no-deps

Collecting keras-nlp
  Downloading keras_nlp-0.6.3-py3-none-any.whl (584 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m584.5/584.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-core
  Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow-text
  Downloading tensorflow_text-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow-text, keras-nlp, keras-core
Successfully installed keras-core-0.1.7 keras-nlp-0.6.3 tensorflow-text-2.15.0


In [2]:
import keras_nlp
import keras
import tensorflow as tf
import os

keras.utils.set_random_seed(42)

Using TensorFlow backend


#### Hyperparameters

In [3]:
BATCH_SIZE=64
EPOCHS=3
MAX_SEQUENCE_LENGTH=512
VOCAB_SIZE=15000

EMBED_DIM=128
INTERMEDIATE_DIM=512

In [4]:
# Download helper functions script
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2023-12-05 13:05:14--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2023-12-05 13:05:14 (78.4 MB/s) - ‘helper_functions.py’ saved [10246/10246]



In [5]:
# Import series of helper functions for the notebook
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

In [6]:
# Download data (same as from Kaggle)
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

# Unzip data
unzip_data("nlp_getting_started.zip")

--2023-12-05 13:05:15--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.203.207, 74.125.199.207, 108.177.98.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.203.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2023-12-05 13:05:15 (142 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [7]:
# Turn .csv files into pandas DataFrame's
import pandas as pd
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# Shuffle training dataframe
train_df_shuffled = train_df.sample(frac=1, random_state=42) # shuffle with random_state=42 for reproducibility
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [9]:
# How many samples total?
print(f"Total training samples: {len(train_df)}")
print(f"Total test samples: {len(test_df)}")
print(f"Total samples: {len(train_df) + len(test_df)}")

Total training samples: 7613
Total test samples: 3263
Total samples: 10876


In [10]:
# splitting dataset
from sklearn.model_selection import train_test_split

# Use tran_test_split to split training datainto training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled['text'].to_numpy(), # to convert into numpy arrays for tensorflow
                                                                            train_df_shuffled['target'].to_numpy(),
                                                                            test_size=0.1, # 10% of validation set
                                                                            random_state=42)

In [11]:
test_sentences = train_sentences[-1000:]
test_labels = train_labels[-1000:]
train_sentences = train_sentences[:-1000]
train_labels = train_labels[:-1000]

# Tokenization Steps
1. train the vocabulary of the dataset
2. tokenize the dataset

In [12]:
len(train_sentences)

5851

In [12]:
def train_word_piece(ds, vocab_size, reserved_tokens):
  word_piece_ds = ds.map(lambda x,y:x)
  vocab=keras_nlp.tokenizers.compute_word_piece_vocabulary(
      word_piece_ds.batch(1000).prefetch(2),
      vocabulary_size=vocab_size,
      # reserved_tokens=reserved_tokens,
  )
  return vocab

In [13]:
# from numpy array to tf.data.Dataset because tokenizer accepts only tf.data.Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels))
print(len(train_dataset),len(val_dataset),len(test_dataset))

5851 762 1000


In [14]:
reserved_tokens = ["[PAD]","UNK"]
reserved_tokens = None
vocab=train_word_piece(train_dataset, 15000,reserved_tokens)

In [15]:
# converting data into batches
BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 100

train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)


In [16]:
# Lets see some tokens
print("Tokens: ",vocab[600:700])

Tokens:  ['##ars', '##ction', '##ta', '##ver', 'Latest', 'Some', 'again', 'collided', 'damage', 'failure', 'little', 'loud', '##ail', '##ain', '##cial', '##ir', '##line', '##news', '##ng', '##ps', '30', 'An', 'Body', 'Rescuers', 'always', 'fatalities', 'fear', 'fucking', 'homes', 'oil', 'sinking', 'watch', 'ÛÒ', '##day', '##ea', '##ets', '##im', '##les', '##nd', '##os', '##ple', '##ville', '##way', '40', 'Don', 'Get', 'am', 'weapon', 'wind', 'wreck', '##RE', '##ace', '##ak', '##ctor', '##ma', '##ny', '##ode', '##rt', '##ut', '00', 'Crash', 'IS', 'Iran', 'Nuclear', 'One', 'Suicide', 'Two', 'Will', 'destroy', 'evacuation', 'fall', 'gonna', 'head', 'movie', 'refugees', 'through', 'tragedy', 'until', 'which', '##LE', '##de', '##ities', '##ks', '##light', '##ls', '##ner', '##ning', '##our', '##ro', '##ss', '##uck', '##up', '##ute', 'August', 'Still', 'ambulance', 'bad', 'debris', 'drought', 'missing']


### Now lets define the tokenize. we will ocnfigure the tokenizer with the vocabularies trained above. we will define a maximum sequence length so that all sequences are padded to the same length, if the length of the sequence is less than the specified sequence length. Otherwise, the sequence is truncated

In [19]:
MAX_SEQUENCE_LENGTH

512

In [32]:
tokenizer=keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    lowercase=False,
    sequence_length=MAX_SEQUENCE_LENGTH, # max_sequence_length
)

In [None]:
# Lets try and tokenize a sample from our dataset
input_sentence_ex = train_dataset.take(1).get_single_element()[0]
input_tokens_ex = tokenizer(input_sentence_ex)

print("Sentence: ",input_sentence_ex)
print("Tokens: ",input_tokens_ex)
print("Recovered text after dekotenizing: ",tokenizer.detokenize(input_tokens_ex))

### Format the dataset , tokenize it

In [33]:
def format_dataset(sentence, label):
  sentence=tokenizer(sentence)
  return ({'input_ids':sentence},label)

def make_dataset(dataset):
  dataset=dataset.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)
  return dataset.shuffle(512).prefetch(16).cache()

train_ds = make_dataset(train_dataset)
val_ds = make_dataset(val_dataset)
test_ds = make_dataset(test_dataset)

In [23]:
type(test_ds)

tensorflow.python.data.ops.cache_op.CacheDataset

In [34]:
[x for x in train_ds.take(1)]

[({'input_ids': <tf.Tensor: shape=(64, 128), dtype=int32, numpy=
   array([[  34, 1715,  178, ...,    0,    0,    0],
          [  34, 1864,  215, ...,    0,    0,    0],
          [  59,  634,  167, ...,    0,    0,    0],
          ...,
          [1869,   37, 1675, ...,    0,    0,    0],
          [ 470,   43,  299, ...,    0,    0,    0],
          [1366,  121,  978, ...,    0,    0,    0]], dtype=int32)>},
  <tf.Tensor: shape=(64,), dtype=int64, numpy=
  array([0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
         1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
         1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1])>)]

### Building the model


In [25]:
VOCAB_SIZE,MAX_SEQUENCE_LENGTH,EMBED_DIM

(15000, 512, 128)

In [19]:
INTERMEDIATE_DIM,VOCAB_SIZE,EMBED_DIM,MAX_SEQUENCE_LENGTH

(512, 15000, 128, 512)

In [20]:
input_ids = keras.Input(shape=(None,), dtype='int64',name='input_ids')

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)(input_ids)

x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)
x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)
x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)

x = keras.layers.GlobalAveragePooling1D()(x)
x = keras.layers.Dropout(0.1)(x)
outputs = keras.layers.Dense(1,activation='sigmoid')(x)

fnet_classifier = keras.Model(input_ids,outputs,name='fnet_classifier')

fnet_classifier.summary()
fnet_classifier.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

Model: "fnet_classifier"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids (InputLayer)      [(None, None)]            0         
                                                                 
 token_and_position_embeddi  (None, None, 128)         1985536   
 ng (TokenAndPositionEmbedd                                      
 ing)                                                            
                                                                 
 f_net_encoder (FNetEncoder  (None, None, 128)         132224    
 )                                                               
                                                                 
 f_net_encoder_1 (FNetEncod  (None, None, 128)         132224    
 er)                                                             
                                                                 
 f_net_encoder_2 (FNetEncod  (None, None, 128)     

In [22]:
[x for x in train_ds.take(1)]

[({'input_ids': <tf.Tensor: shape=(64, 512), dtype=int32, numpy=
   array([[ 172, 1257,  847, ...,    0,    0,    0],
          [ 277,  128, 2360, ...,    0,    0,    0],
          [ 270,  470,  135, ...,    0,    0,    0],
          ...,
          [2072,  127,   39, ...,    0,    0,    0],
          [  53,  136,  221, ...,    0,    0,    0],
          [ 276, 1738,   42, ...,    0,    0,    0]], dtype=int32)>},
  <tf.Tensor: shape=(64,), dtype=int64, numpy=
  array([0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
         0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
         1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1])>)]

In [23]:
fnet_classifier.fit(train_ds,epochs=EPOCHS,validation_data=val_ds)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7d2d2328d360>

In [25]:
fnet_classifier.evaluate(test_ds,batch_size=BATCH_SIZE)



[0.5440526604652405, 0.718999981880188]

In [27]:
# Make predictions (these come back in the form of probabilites)
model_pred_probs = fnet_classifier.predict(test_ds)
model_pred_probs[:10]



array([[0.61841756],
       [0.15501861],
       [0.30627173],
       [0.29210767],
       [0.836462  ],
       [0.15443254],
       [0.8429144 ],
       [0.4947775 ],
       [0.51006955],
       [0.99896586]], dtype=float32)

In [None]:
# turn predictions into probabilites
model_preds = tf.squeeze(tf.round(model_pred_probs))
model_preds

In [29]:
# create evaluation function
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def calcualte_results(y_true,y_pred):
  model_accuracy=accuracy_score(y_true,y_pred)*100
  model_precision,model_recall,model_f1,_ = precision_recall_fscore_support(y_true,y_pred, average='weighted')
  model_results={'accuracy':model_accuracy,
                 'precision':model_precision,
                 'recall':model_recall,
                 'f1':model_f1}
  return model_results

In [31]:
model_results = calcualte_results(y_true=test_labels,
                                 y_pred=model_preds)
model_results

{'accuracy': 50.5,
 'precision': 0.50933400736563,
 'recall': 0.5049999999999999,
 'f1': 0.5068593026714268}

In [None]:
model_6.save("fnet_classifier_SavedModel_format")

In [None]:
!cp  "/content/drive/MyDrive/Colab Notebooks/ML projects/Tensorflow projects/Tensorflow Developer Certificate/models/"

# iterating to other hyperparameters

In [35]:
MAX_SEQUENCE_LENGTH=128

In [38]:
EMBED_DIM,VOCAB_SIZE

(128, 15000)

In [36]:
tokenizer=keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    lowercase=False,
    sequence_length=MAX_SEQUENCE_LENGTH, # max_sequence_length
)

In [39]:
input_ids = keras.Input(shape=(None,), dtype='int64',name='input_ids')

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)(input_ids)

x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)
x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)
x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)

x = keras.layers.GlobalAveragePooling1D()(x)
x = keras.layers.Dropout(0.1)(x)
outputs = keras.layers.Dense(1,activation='sigmoid')(x)

fnet_classifier = keras.Model(input_ids,outputs,name='fnet_classifier')

fnet_classifier.summary()
fnet_classifier.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

Model: "fnet_classifier"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids (InputLayer)      [(None, None)]            0         
                                                                 
 token_and_position_embeddi  (None, None, 128)         1936384   
 ng_1 (TokenAndPositionEmbe                                      
 dding)                                                          
                                                                 
 f_net_encoder_3 (FNetEncod  (None, None, 128)         132224    
 er)                                                             
                                                                 
 f_net_encoder_4 (FNetEncod  (None, None, 128)         132224    
 er)                                                             
                                                                 
 f_net_encoder_5 (FNetEncod  (None, None, 128)     

In [40]:
fnet_classifier.fit(train_ds,epochs=EPOCHS,validation_data=val_ds)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7d2c7e8e1120>

In [41]:
model_pred_probs = fnet_classifier.predict(test_ds)
model_preds = tf.squeeze(tf.round(model_pred_probs))
model_results = calcualte_results(y_true=test_labels,
                                 y_pred=model_preds)
model_results



{'accuracy': 57.699999999999996,
 'precision': 0.5533445161290321,
 'recall': 0.577,
 'f1': 0.5438143452929184}

In [42]:
fnet_classifier.evaluate(test_ds,batch_size=BATCH_SIZE)



[0.6122368574142456, 0.7570000290870667]

some inferences are that data is less
