<a href="https://colab.research.google.com/github/muleyprasad/ai/blob/master/TF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [None]:
# Load the raw articles from the Excel export stored on Google Drive.
# NOTE(review): the '/content/drive' path presumably requires Drive to be
# mounted in Colab first; no mount step is visible in this notebook - confirm.
data = pd.read_excel('/content/drive/My Drive/Lokmat/merged_data.xlsx')

# Drop rows with missing (NaN) values in the required columns. 'rating' is
# part of the subset so that no NaN label can reach the loss function.
columns_to_check = ['city', 'date', 'page', 'story', 'rating']
data = data.dropna(subset=columns_to_check)

# Combine "heading" and "story" into a single text column. A missing heading
# is replaced by an empty string first: otherwise NaN + str stays NaN and the
# astype(str) below would turn the whole row into the literal text "nan",
# silently discarding the story.
data['text'] = data['heading'].fillna('') + " " + data['story']

X = data['text'].astype(str)
# Labels: shift ratings down by one so they are zero-based class indices
# (ratings 1 to 5 -> classes 0 to 4, matching the 5-unit softmax output).
y = data['rating'] - 1


In [None]:
# Carve the data into 60% train / 20% validation / 20% test.
# The same seed is used for both splits so the partition is repeatable.
split_seed = 42

# Step 1: hold out 20% of all rows as the test set.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, random_state=split_seed
)

# Step 2: take 25% of the remaining 80% as validation (0.25 x 0.8 = 0.2 overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=split_seed
)

In [None]:
# Vocabulary cap handed to the Tokenizer; the same value is reused as the
# embedding layer's input_dim when the model is built.
max_words = 10_000

# Learn the word index from the training texts only, so nothing from the
# validation or test split influences the vocabulary.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert each split's texts into lists of integer word indices using the
# tokenizer fitted on the training split.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_val, X_test)
)

In [None]:
# Bring every sequence to one fixed length so the splits can be fed to the
# model as rectangular arrays. padding='post' puts the padding at the end
# of sequences shorter than max_sequence_length.
# NOTE(review): `truncating` is left at its default, which in Keras cuts from
# the front of over-long sequences - confirm this is the intended behaviour.
max_sequence_length = 200
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_sequence_length, padding='post')
    for seqs in (X_train_seq, X_val_seq, X_test_seq)
)

In [None]:
# Build the neural network model:
# embedding -> bidirectional LSTM -> dropout -> dense -> softmax.
model = tf.keras.Sequential([
    # mask_zero=True marks index 0 (the padding value) as "no token" so the
    # LSTM skips padded positions instead of treating them as real input.
    # The deprecated `input_length` argument is dropped; the sequence length
    # is inferred from the data on the first call to fit().
    tf.keras.layers.Embedding(input_dim=max_words, output_dim=128, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)), # Use a bidirectional LSTM layer instead of a flatten layer
    tf.keras.layers.Dropout(0.2), # Use a dropout layer to prevent overfitting
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax') # 5 classes for ratings 1 to 5
])

In [None]:
# Compile the model. The sparse loss takes integer class indices as labels,
# so the labels do not need to be one-hot encoded.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Stop training once the validation loss has not improved for 3 consecutive
# epochs. restore_best_weights=True rolls the model back to the epoch with the
# lowest val_loss; without it the model keeps the weights of the last epoch
# run (3 epochs past the best one), and those would be what gets evaluated.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [None]:
# Fit on the training split and monitor the validation split after each epoch.
# `epochs` is only an upper bound: the early-stopping callback may end the run
# sooner.
batch_size = 64
epochs = 10
model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val_pad, y_val),
    callbacks=[early_stopping],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.src.callbacks.History at 0x7fbefc3016c0>

In [None]:
# Score the trained model on the held-out test split. evaluate() returns the
# loss followed by the metrics given at compile time (here: accuracy).
loss, accuracy = model.evaluate(x=X_test_pad, y=y_test)

# Report both numbers rounded to four decimal places.
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

Test loss: 1.4065
Test accuracy: 0.3985
