<a href="https://colab.research.google.com/github/pgshetty2001/capstone_project/blob/main/CloudXLab_Capstone_Project_with_LargeDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import csv

In [3]:
from google.colab import drive
import pandas as pd
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [4]:
# hyperparameters
batch_size = 64
block_size = 256
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

# Set random seed
import tensorflow as tf
tf.random.set_seed(1337)

In [31]:
file_path='/content/gdrive/MyDrive/data/amazon_review_small.txt'
def read_data():
    # Define a custom dialect to handle quotes and commas
    csv.register_dialect('myDialect', delimiter = ',', quotechar = '"', quoting=csv.QUOTE_ALL, skipinitialspace=True)
    with open(file_path, 'r') as file:
        reader = csv.reader(file, dialect='myDialect')
        data = list(reader)
    # Create DataFrame and assign column names
    df_reviews = pd.DataFrame(data[1:], columns=["star_rating", "review_headline", "review_body"])  # Assuming first row is header
    return df_reviews
df_reviews = read_data()

In [22]:
!pip install keras-self-attention

Collecting keras-self-attention
  Downloading keras-self-attention-0.51.0.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: keras-self-attention
  Building wheel for keras-self-attention (setup.py) ... [?25l[?25hdone
  Created wheel for keras-self-attention: filename=keras_self_attention-0.51.0-py3-none-any.whl size=18894 sha256=662904ad7d5ec019c7577a0acab15b9e8dccfea396be6d2937bedc10fa213f4f
  Stored in directory: /root/.cache/pip/wheels/b8/f7/24/607b483144fb9c47b4ba2c5fba6b68e54aeee2d5bf6c05302e
Successfully built keras-self-attention
Installing collected packages: keras-self-attention
Successfully installed keras-self-attention-0.51.0


In [32]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from keras_self_attention import SeqSelfAttention
import pandas as pd
import numpy as np
import joblib
import csv

# --- Configuration ---
text_column = 'review_body'  # Name of the column containing text data
label_column = 'star_rating'  # Name of the column containing sentiment labels
batch_size = 32
max_tokens = 10000  # Maximum vocabulary size
embedding_dim = 128
num_epochs = 10
num_classes = 5  # Number of sentiment classes (adjust accordingly)
chunk_size = 10000

# --- Data Loading and  Preprocessing ---
file_path='/content/gdrive/MyDrive/data/amazon_review_small.txt'

def read_data():
    # Define a custom dialect to handle quotes and commas
    csv.register_dialect('myDialect', delimiter = ',', quotechar = '"', quoting=csv.QUOTE_ALL, skipinitialspace=True)
    with open(file_path, 'r') as file:
        reader = csv.reader(file, dialect='myDialect')
        data = list(reader)
    # Create DataFrame and assign column names
    df_reviews = pd.DataFrame(data[1:], columns=["star_rating", "review_headline", "review_body"])  # Assuming first row is header
    return df_reviews

df_reviews = read_data()
df_reviews = df_reviews.drop_duplicates()
df_reviews = df_reviews.dropna()

df_reviews[text_column] = df_reviews[text_column].str.lower()

# Check if preprocessed data exists, load if available
try:
    text_vectorization = joblib.load("text_vectorization.pkl")
    print("Loaded text vectorization from disk.")
except FileNotFoundError:
    # Preprocess data if not found
    text_vectorization = layers.TextVectorization(
        max_tokens=max_tokens,
        output_mode='int',
        output_sequence_length=500
    )
    text_vectorization.adapt(df_reviews[text_column])
    joblib.dump(text_vectorization, "text_vectorization.pkl")
    print("Text vectorization saved to disk.")

# Define the learning rate schedule
initial_learning_rate = 0.01
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,  # Number of steps over which to decay the learning rate
    decay_rate=0.9      # Rate at which to decay the learning rate
)

# Create an optimizer with the learning rate schedule
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

    # --- Model Definition ---
model = tf.keras.Sequential([
    text_vectorization,
    layers.Embedding(max_tokens + 1, embedding_dim),
    layers.LSTM(128),
    #SeqSelfAttention(attention_activation='sigmoid'),
    layers.Dense(128, activation='relu',kernel_regularizer=regularizers.l2(0.01)),
    layers.Dropout(0.2),
    layers.Flatten(),
    layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# --- Training in Chunks ---
for i in range(0, len(df_reviews), chunk_size):
    chunk = df_reviews.iloc[i: i + chunk_size]

    chunk = chunk.drop_duplicates()
    chunk = chunk.dropna()

    chunk[text_column] = chunk[text_column].str.lower()

    label_mapping = {
        '1': 0,
        '2': 1,
        '3': 2,
        '4': 3,
        '5': 4
    }
    chunk[label_column] = chunk[label_column].map(label_mapping).fillna(0).astype(np.int32)

    text_data = chunk[text_column].values
    label_data = chunk[label_column].values.astype(np.int32)

    # Convert labels to one-hot encoding
    label_data = tf.keras.utils.to_categorical(label_data, num_classes=num_classes)

    dataset = tf.data.Dataset.from_tensor_slices((text_data, label_data))
    dataset = dataset.batch(batch_size)

    model.fit(dataset, epochs=num_epochs)
    print(f"Finished training on chunk {i // chunk_size + 1}")

# --- Example Usage (Prediction) ---
new_text = ["This product is absolutely fantastic!"]
new_text_vectorized = text_vectorization(new_text)
prediction = model.predict(new_text_vectorized)
predicted_class = np.argmax(prediction[0])
print("Predicted sentiment class:", predicted_class)

Loaded text vectorization from disk.
Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1385s[0m 4s/step - accuracy: 0.2171 - loss: 1.8115
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1401s[0m 4s/step - accuracy: 0.2034 - loss: 1.6112
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1375s[0m 4s/step - accuracy: 0.2038 - loss: 1.6112
Epoch 4/10
[1m211/313[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m7:31[0m 4s/step - accuracy: 0.2053 - loss: 1.6112

KeyboardInterrupt: 