In [1]:
import pandas as pd
import re
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
import numpy as np
from tensorflow.keras.models import load_model

# Matches every character that is neither an ASCII letter/digit nor whitespace.
# Compiled once because the function is called for every row of the dataset.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')


def remove_special_characters(text, lemmatizer=None):
    """Strip non-alphanumeric characters from *text* and lemmatize each word.

    Args:
        text: The raw text. ``None`` and NaN (how pandas represents an empty
            CSV cell) are treated as empty text; any other non-string value
            is converted with ``str()`` first.
        lemmatizer: Optional object exposing ``lemmatize(word)``. When omitted
            a ``WordNetLemmatizer`` is created, so existing one-argument calls
            keep working; pass one in to reuse a single instance.

    Returns:
        The remaining words, lemmatized and joined by single spaces. Returns
        an empty string when nothing is left.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # One pass is enough: after it only ASCII letters, digits and whitespace
    # remain, so a further `[^\w\s]` substitution could never match anything.
    cleaned_text = _NON_ALNUM_RE.sub('', text)

    # split() treats newlines, carriage returns and tabs as word separators.
    # Deleting them instead would glue the neighbouring words together
    # ("good\nproduct" -> "goodproduct").
    return ' '.join(lemmatizer.lemmatize(word) for word in cleaned_text.split())


# Load the labelled reviews and build one cleaned text field per row:
# the cleaned title, a single space, then the cleaned review body.
train = pd.read_csv('train.csv')
train['Review_combined'] = (
    train['Review_Title'].map(remove_special_characters)
    + ' '
    + train['Review'].map(remove_special_characters)
)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout,Bidirectional
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Convert every combined review into a fixed-length row of integer word ids.
max_sequence_length = 100
labels = train['Rating']

# The vocabulary is learned from the training reviews; num_words caps, by
# word frequency, which words survive the conversion to id sequences.
tokenizer = Tokenizer(num_words=10000)
review_texts = train['Review_combined']
tokenizer.fit_on_texts(review_texts)
sequences = tokenizer.texts_to_sequences(review_texts)

# Pad (or truncate) each id sequence to max_sequence_length entries.
data = pad_sequences(sequences, maxlen=max_sequence_length)

# Bidirectional-LSTM binary classifier over the padded id sequences.
# The sequence length is declared with an explicit Input layer rather than
# Embedding's `input_length` argument, which Keras 3 deprecates (it warns and
# ignores it).
from tensorflow.keras import Input

model = Sequential([
    Input(shape=(max_sequence_length,)),
    # input_dim matches the Tokenizer's num_words so every id has an embedding row.
    Embedding(input_dim=10000, output_dim=100),
    Bidirectional(LSTM(units=128)),
    # Single sigmoid unit, paired with binary cross-entropy below.
    # NOTE(review): this assumes the 'Rating' labels are 0/1 - confirm.
    Dense(units=1, activation='sigmoid'),
])

# Compile model
model.compile(optimizer='adamax', loss='binary_crossentropy', metrics=['accuracy'])

# Stop once validation loss has gone 5 epochs without improving; independently,
# keep on disk only the epoch with the highest validation accuracy.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint(
    'best_model.h5.keras',
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
)

# Hold out 20% of the labelled rows, stratified by rating, for a final check.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, random_state=42, stratify=labels
)

# Train model. validation_split reserves 10% of X_train as the validation set
# that the early-stopping and checkpoint callbacks monitor.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stopping, model_checkpoint],
)

# Evaluate the checkpointed model, not the in-memory one: after fit() `model`
# holds the last epoch's weights, while the file written by ModelCheckpoint
# holds the best-validation-accuracy epoch, which is the model used for the
# final predictions. Scoring the checkpoint keeps the reported number honest.
model = load_model('best_model.h5.keras')

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Reload the network saved by ModelCheckpoint and print its summary as a
# quick check that it loaded correctly.
model = load_model('best_model.h5.keras')
model.summary()

# 1. Preprocess the data
# The unlabelled reviews get the same cleaning, the same fitted tokenizer and
# the same padding length as the training data.
test = pd.read_csv('test.csv')
test['Review_combined'] = (
    test['Review_Title'].map(remove_special_characters)
    + ' '
    + test['Review'].map(remove_special_characters)
)
sequences_new = tokenizer.texts_to_sequences(test['Review_combined'])
data_new = pad_sequences(sequences_new, maxlen=max_sequence_length)

# 2. Use the trained model to make predictions
predictions = model.predict(data_new)

# 3. Thresholding
# A review is labelled 1 when its predicted score reaches the cut-off, else 0.
# The cut-off is defined once and actually used by the comparison below.
# NOTE(review): 0.49 preserves the value that was hard-coded in the comparison
# (the declared 0.5 was never read); set it to 0.5 here if the conventional
# midpoint was intended.
threshold = 0.49

# Vectorised comparison; ravel() flattens the per-sample prediction rows into
# a flat vector of 0/1 labels.
predicted_ratings = (predictions >= threshold).astype(int).ravel()

# Print the predicted class balance. A bare expression in the middle of a
# cell is not displayed, so it has to be printed explicitly.
print(pd.Series(predicted_ratings).value_counts(normalize=True))

# Write the submission file: one row per test review, its ID and predicted rating.
submission_LSTM_SMOTE = pd.DataFrame({'ID': test['ID'], 'Rating': predicted_ratings})
submission_LSTM_SMOTE.to_csv('submission_LSTM_v9.csv', index=False)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mohithemaprasad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/mohithemaprasad/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Epoch 1/10




[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 210ms/step - accuracy: 0.8962 - loss: 0.2599 - val_accuracy: 0.9802 - val_loss: 0.0555
Epoch 2/10
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 223ms/step - accuracy: 0.9827 - loss: 0.0486 - val_accuracy: 0.9838 - val_loss: 0.0519
Epoch 3/10
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 219ms/step - accuracy: 0.9870 - loss: 0.0385 - val_accuracy: 0.9872 - val_loss: 0.0471
Epoch 4/10
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 216ms/step - accuracy: 0.9901 - loss: 0.0318 - val_accuracy: 0.9877 - val_loss: 0.0443
Epoch 5/10
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 212ms/step - accuracy: 0.9910 - loss: 0.0294 - val_accuracy: 0.9866 - val_loss: 0.0417
Epoch 6/10
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 221ms/step - accuracy: 0.9920 - loss: 0.0287 - val_accuracy: 0.9880 - val_loss: 0.0415
Epoch 7/10
[1m

[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 44ms/step
