In [None]:
import pandas as pd

# Locations of the three dataset splits.
dataset_train_path = '/kaggle/input/amazon-reviews-multi/train.csv'
dataset_val_path = '/kaggle/input/amazon-reviews-multi/validation.csv'
dataset_test_path = '/kaggle/input/amazon-reviews-multi/test.csv'

# Read every split into its own DataFrame.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (dataset_train_path, dataset_val_path, dataset_test_path)
)

# Per-split structural summary (dtypes, non-null counts, memory usage).
info_sections = (
    ("Train Dataset Information:", train_data),
    ("Validation Dataset Information:", val_data),
    ("Test Dataset Information:", test_data),
)
for heading, frame in info_sections:
    print(heading)
    frame.info()
    print("\n")

# Column names, taken from the train split.
print("Columns in the datasets:")
print(train_data.columns.tolist())

# Distinct-value counts per column, one report per split.
unique_values_train = train_data.nunique()
unique_values_val = val_data.nunique()
unique_values_test = test_data.nunique()

unique_sections = (
    ("Number of unique values in each column (Train Dataset):", unique_values_train),
    ("Number of unique values in each column (Validation Dataset):", unique_values_val),
    ("Number of unique values in each column (Test Dataset):", unique_values_test),
)
for heading, counts in unique_sections:
    # The separator goes before each report so nothing trails the last one.
    print("\n")
    print(heading)
    print(counts)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed


# CSV locations of the three splits.
train_path = '/kaggle/input/amazon-reviews-multi/train.csv'
val_path = '/kaggle/input/amazon-reviews-multi/validation.csv'
test_path = '/kaggle/input/amazon-reviews-multi/test.csv'

train_data, val_data, test_data = (
    pd.read_csv(csv_file) for csv_file in (train_path, val_path, test_path)
)

# All rows are kept. To study a single language, filter each frame on its
# 'language' column (e.g. == 'de') at this point, before the text is built.

# Build the model input text: review body, a space, then the review title.
# Missing bodies/titles are treated as empty strings.
for data in (train_data, val_data, test_data):
    body_text = data['review_body'].fillna('')
    title_text = data['review_title'].fillna('')
    data['review_text'] = body_text + ' ' + title_text

# Sentiment labels
def sentiment_label(stars):
    """Turn a star rating into a binary sentiment class.

    Args:
        stars: Numeric star rating of a review.

    Returns:
        1 (positive) when the rating is strictly greater than 3, otherwise
        0 (negative). A rating of exactly 3 is therefore negative, and so is
        any value for which the comparison is false (e.g. NaN).
    """
    return 1 if stars > 3 else 0

# Attach the binary sentiment target (derived from 'stars') to every split.
for data in (train_data, val_data, test_data):
    data['sentiment'] = data['stars'].apply(sentiment_label)

# Word-level tokenizer capped at num_words entries; unseen words map to the
# "<OOV>" token. The vocabulary is learned from the training split only.
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['review_text'])

# Text -> lists of integer word indices, one list per review.
train_sequences, val_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split['review_text'])
    for split in (train_data, val_data, test_data)
)

# Fixed-length model inputs: longer reviews are cut at the end, shorter ones
# are zero-padded at the end.
max_len = 200
train_padded, val_padded, test_padded = (
    pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    for seqs in (train_sequences, val_sequences, test_sequences)
)

# Target arrays aligned row-for-row with the padded inputs.
train_labels, val_labels, test_labels = (
    split['sentiment'].values for split in (train_data, val_data, test_data)
)


##########################################
#### LSTM Sentiment (0 / 1)
##########################################

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout and batch shuffling repeat from run to run.
set_random_seed(42)

# Two stacked bidirectional LSTMs over learned word embeddings, ending in a
# 2-way softmax (class 0 = negative, class 1 = positive).
model = Sequential([
    # The embedding table is sized from the tokenizer rather than a repeated
    # literal, so the vocabulary cap and the table cannot drift apart. The
    # deprecated `input_length` argument is omitted; the sequence length is
    # taken from the padded input itself.
    # NOTE(review): index 0 (padding) is not masked, so the LSTMs also step
    # over the trailing pad positions — consider mask_zero=True; confirm the
    # effect on accuracy before changing.
    Embedding(input_dim=tokenizer.num_words, output_dim=128),
    SpatialDropout1D(0.2),
    # NOTE(review): a non-zero recurrent_dropout is documented to prevent use
    # of the fused cuDNN LSTM kernel, which slows GPU training — confirm it is
    # worth the cost.
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)),
    Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2)),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(2, activation='softmax')  # 2 classes: positive, negative
])

# Integer class labels (0/1) go with the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train on the training split, monitoring the validation split each epoch.
history = model.fit(
    train_padded, train_labels,
    epochs=3,
    batch_size=256,
    validation_data=(val_padded, val_labels),
    verbose=1
)

# Final held-out check on the test split.
loss, accuracy = model.evaluate(test_padded, test_labels)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

##########################################
#### Predict whole dataset based on LSTM
##########################################

# Hard class decision per review: index of the larger of the two softmax
# outputs, computed for each split in turn.
train_pred, val_pred, test_pred = (
    model.predict(padded_batch).argmax(axis=1)
    for padded_batch in (train_padded, val_padded, test_padded)
)

# Store the predicted class next to the original rows of each split.
label_column = 'sentiment_lstm'
for frame, predicted in (
    (train_data, train_pred),
    (val_data, val_pred),
    (test_data, test_pred),
):
    frame[label_column] = predicted


In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
from sklearn.model_selection import train_test_split

##########################################
#### LSTM Sentiment Score Value between 0 and 1
##########################################
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout and batch shuffling repeat from run to run.
set_random_seed(42)

# Same recurrent stack as the classifier, but with a single sigmoid unit so
# the output is a score between 0 and 1 for the positive class.
# This rebinds `model`; the previously trained 2-class network is no longer
# reachable through that name.
model = Sequential([
    # The embedding table is sized from the tokenizer rather than a repeated
    # literal, so the vocabulary cap and the table cannot drift apart. The
    # deprecated `input_length` argument is omitted; the sequence length is
    # taken from the padded input itself.
    # NOTE(review): index 0 (padding) is not masked, so the LSTMs also step
    # over the trailing pad positions — consider mask_zero=True; confirm the
    # effect on accuracy before changing.
    Embedding(input_dim=tokenizer.num_words, output_dim=128),
    SpatialDropout1D(0.2),
    # NOTE(review): a non-zero recurrent_dropout is documented to prevent use
    # of the fused cuDNN LSTM kernel, which slows GPU training — confirm it is
    # worth the cost.
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)),
    Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2)),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # Single neuron with sigmoid activation for probability output
])

# One sigmoid output against 0/1 labels -> binary cross-entropy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the training split, monitoring the validation split each epoch.
history = model.fit(
    train_padded, train_labels,
    epochs=3,
    batch_size=256,
    validation_data=(val_padded, val_labels),
    verbose=1
)

# Final held-out check on the test split.
loss, accuracy = model.evaluate(test_padded, test_labels)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

##########################################
#### Predict whole dataset based on LSTM
##########################################
# Raw model outputs for every split; these rebind the *_pred names that
# previously held hard class labels.
train_pred, val_pred, test_pred = (
    model.predict(padded_batch)
    for padded_batch in (train_padded, val_padded, test_padded)
)

# Store the score next to the original rows of each split.
score_column = 'sentiment_score_lstm'
for frame, scores in (
    (train_data, train_pred),
    (val_data, val_pred),
    (test_data, test_pred),
):
    frame[score_column] = scores

# Stack the three splits row-wise into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every split keeps its own row
# labels and the combined frame ends up with duplicated index values.
# NOTE(review): nothing in the combined frame records which split a row came
# from — add a marker column here if downstream code needs to tell them apart.
combined_data = pd.concat([train_data, val_data, test_data], axis=0, ignore_index=True)

# Save to CSV. The index is not written, so the file content is the same as
# before the re-indexing.
combined_data.to_csv('/kaggle/working/data_with_scores.csv', index=False)