<a href="https://colab.research.google.com/github/neel26desai/cmpe258_neural_network_advanced/blob/main/Classification_and_Data_Augmentation_Text_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install nlpaug



# Base Model

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [3]:
# Read the IMDB reviews CSV from Google Drive.
# NOTE(review): the path assumes Drive is already mounted at /content/drive;
# no mount cell is visible in this notebook -- confirm before Run All.
data = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/IMDB Dataset.csv')

In [4]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Convert the sentiment column to integers: 'positive' -> 1, 'negative' -> 0.
# The identity entries (1 -> 1, 0 -> 0) make this cell idempotent: re-running
# it on already-encoded labels leaves them unchanged instead of mapping every
# value to NaN (Series.map yields NaN for keys missing from the dict).
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0, 1: 1, 0: 0})

In [6]:
# Find the maximum length of the reviews in data['review'], measured in
# whitespace-separated words.
max_length = max(len(review.split()) for review in data['review'])
print(f"Maximum length of sentences: {max_length}")


Maximum length of sentences: 2470


In [7]:
# Hold out 20% of the reviews for evaluation.
# random_state pins the shuffle so the split (and everything trained on it)
# is the same after Restart & Run All; stratify keeps the positive/negative
# ratio identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    data['review'].values,
    data['sentiment'].values,
    test_size=0.2,
    random_state=42,
    stratify=data['sentiment'].values,
)


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [9]:
# Build the vocabulary from the training reviews only; num_words=1000 caps
# the word indices the tokenizer will emit when encoding text.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(texts=X_train)


In [10]:
# Encode both splits as lists of integer word indices with the fitted tokenizer.
X_train_seq, X_test_seq = map(tokenizer.texts_to_sequences, (X_train, X_test))

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad (with trailing zeros) or truncate every encoded review to one fixed length.
max_length = 2500  # fixed sequence length; tune based on the review-length analysis
X_train_pad, X_test_pad = (
    pad_sequences(encoded, maxlen=max_length, padding='post')
    for encoded in (X_train_seq, X_test_seq)
)



In [12]:

# Define the model.
# The tokenizer only emits indices below its num_words cap, so the embedding
# table needs at most that many rows -- not one row per entry in word_index,
# which holds every word seen during fitting.
full_vocab = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
vocab_size = min(tokenizer.num_words or full_vocab, full_vocab)

model = tf.keras.Sequential([
    # mask_zero=True marks the zeros appended by padding='post' as padding so
    # the LSTM skips them instead of running over them after the real tokens.
    tf.keras.layers.Embedding(vocab_size, 128, mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    # Single sigmoid unit: probability of the positive class.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model for binary classification.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [14]:
# Train the baseline for 5 epochs; the held-out split is passed as
# validation data so per-epoch metrics are reported on it.
model.fit(
    x=X_train_pad,
    y=y_train,
    epochs=5,
    batch_size=128,
    validation_data=(X_test_pad, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7e33e0397eb0>

# Augmentation

In [15]:
import os
# Export MODEL_DIR into the process environment.
os.environ.update(MODEL_DIR='../model')

In [16]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action


In [17]:
# Sample sentence used to try out the augmenters; printed so the unmodified
# input is visible in the cell output.
text = 'The quick brown fox jumps over the lazy dog .'
print(text)

The quick brown fox jumps over the lazy dog .


In [18]:
# Word-level augmenter: substitute words with spelling mistakes.
# n=3 asks for three augmented variants of the same sentence.
aug = naw.SpellingAug()
augmented_texts = aug.augment(text, n=3)
print("Original:", text, "Augmented Texts:", augmented_texts, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Texts:
['The quick bown fox jumps over the lszy dong.', 'The quick brown fox jumps other yhe laizy dog.', 'Yhe quick brown fox jumps other the lazing dog.']


In [19]:
# Synonym augmenter backed by WordNet (aug_src='wordnet').
ug = naw.SynonymAug(aug_src='wordnet')
augmented_text = ug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['The quick brown charles james fox jumps all over the lazy firedog.']


In [23]:
# Combine the spelling and WordNet-synonym augmenters into a single nlpaug
# `Sometimes` flow; `aug` now refers to this flow.
aug = nafc.Sometimes([naw.SpellingAug(), naw.SynonymAug(aug_src='wordnet')])

In [24]:
# Run the combined flow once on the demo sentence; n=1 requests a single
# augmented variant, displayed as the cell output.
aug.augment(text, n=1)

['The quick brown dodger jumps other tht lszy dog.']

In [27]:
import tqdm

In [28]:
# Create one augmented copy of each of the first 100 training reviews.
X_train_augmented = []
for review in tqdm.tqdm(X_train[:100]):
    # The loop variable is `review`, not `text`, so the demo sentence stored in
    # the notebook-level `text` is not overwritten by the last review.
    augmented_texts = aug.augment(review, n=1)
    # The outputs above show augment() returning a list. The isinstance guard
    # avoids appending only the first character if a bare string comes back
    # (NOTE(review): older nlpaug releases reportedly did this -- confirm).
    X_train_augmented.append(
        augmented_texts[0] if isinstance(augmented_texts, list) else augmented_texts
    )



100%|██████████| 100/100 [00:01<00:00, 55.13it/s]


In [31]:
# Append the augmented reviews to the training texts, and append the labels of
# the reviews they were generated from. The label slice is derived from the
# number of augmented texts (they come from the leading training reviews), so
# the two arrays cannot drift apart if the augmentation count changes.
n_augmented = len(X_train_augmented)
X_train_combined = np.concatenate([X_train, X_train_augmented])
y_train_combined = np.concatenate([y_train, y_train[:n_augmented]])
assert len(X_train_combined) == len(y_train_combined), "texts and labels are misaligned"


In [32]:
# Fit a fresh tokenizer (same num_words=1000 cap) on the combined
# original + augmented training texts; this replaces the earlier `tokenizer`.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(texts=X_train_combined)


In [33]:
# Encode the combined (original + augmented) training texts as integer sequences.
X_train_seq = tokenizer.texts_to_sequences(texts=X_train_combined)

In [34]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Convert the text tokens to padded sequences
max_length = 2500  # You can adjust this length based on your data analysis or leave it to some heuristic
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')

# Re-encode the held-out reviews with the tokenizer currently in scope.
# The tokenizer was re-fit on the augmented corpus, so its word -> index mapping
# can differ from the one that produced the earlier X_test_pad; validating on
# that stale array would feed the model indices from a different vocabulary.
X_test_pad = pad_sequences(
    tokenizer.texts_to_sequences(X_test), maxlen=max_length, padding='post'
)



In [35]:

# Define the model (same architecture as the baseline, freshly initialised).
# The tokenizer only emits indices below its num_words cap, so the embedding
# table needs at most that many rows -- not one row per entry in word_index,
# which holds every word seen during fitting.
full_vocab = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
vocab_size = min(tokenizer.num_words or full_vocab, full_vocab)

model = tf.keras.Sequential([
    # mask_zero=True marks the zeros appended by padding='post' as padding so
    # the LSTM skips them instead of running over them after the real tokens.
    tf.keras.layers.Embedding(vocab_size, 128, mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    # Single sigmoid unit: probability of the positive class.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model for binary classification.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [37]:
# Train on the combined (original + augmented) training set for 5 epochs,
# reporting per-epoch metrics on the held-out split.
model.fit(
    x=X_train_pad,
    y=y_train_combined,
    epochs=5,
    batch_size=128,
    validation_data=(X_test_pad, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7e334588f850>