In [5]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import tensorflow as tf

physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

        
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer  # Add this import statement
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
df = pd.read_csv('/kaggle/input/ratemeter/train.csv')
df_subset = df.iloc[:600000]

# Preprocess and vectorize the text
def preprocess_and_vectorize(text):
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = ''.join(e for e in text if e.isalnum() or e.isspace())
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    porter = PorterStemmer()
    tokens = [porter.stem(token) for token in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

df_subset['review_text'] = df_subset['review_text'].apply(preprocess_and_vectorize)

print("preprocessing is done ......")
# Tokenize the text
max_words = 5000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df_subset['review_text'])
X = tokenizer.texts_to_sequences(df_subset['review_text'])
X = pad_sequences(X)

print("vectorization is done ......")

# Split the data into training and testing sets
y = df_subset['rating']
num_classes = len(df_subset['rating'].unique())  # Determine the number of classes dynamically
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



/kaggle/input/ratemeter/sample_submission.csv
/kaggle/input/ratemeter/train.csv
/kaggle/input/ratemeter/test.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['review_text'] = df_subset['review_text'].apply(preprocess_and_vectorize)


preprocessing is done ......
vectorization is done ......


In [6]:
# Build the LSTM model for multi-class classification
embedding_dim = 100
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100))
model.add(Dense(num_classes, activation='softmax'))  # Use softmax for multi-class classification


In [7]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
epochs = 5
batch_size = 1024
from keras.callbacks import LearningRateScheduler

def lr_scheduler(epoch, lr):
    if epoch % 10 == 0 and epoch > 0:
        return lr * 0.9
    return lr

lr_schedule = LearningRateScheduler(lr_scheduler)

model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, callbacks=[lr_schedule])

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Set Loss: {loss}, Test Set Accuracy: {accuracy}')


Epoch 1/5
Epoch 2/5
Epoch 4/5
Epoch 5/5
Test Set Loss: 1.0455151796340942, Test Set Accuracy: 0.5557500123977661


In [9]:
max_len = 1726  # Adjust to the desired sequence length

test_df = pd.read_csv('/kaggle/input/ratemeter/test.csv')
test_df['review_text'] = test_df['review_text'].apply(preprocess_and_vectorize)

# Tokenize and pad the test data
X_test = tokenizer.texts_to_sequences(test_df['review_text'])
X_test = pad_sequences(X_test, max_len)

# Predict using the model
y_test_pred_probs = model.predict(X_test)

# Convert predicted probabilities to class predictions
y_test_pred = np.argmax(y_test_pred_probs, axis=1)

# Save predictions to a CSV file
predictions_df = pd.DataFrame({'review_id': test_df['review_id'], 'rating': y_test_pred})
predictions_df.to_csv('lstm_multiclass_predictions2.csv', index=False)


