In [None]:
!pip install -U spacy
!python -m spacy download en_core_web_sm


In [1]:
import tensorflow as tf

# Check whether a GPU is available
print("GPU Available:", tf.config.list_physical_devices('GPU'))
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input
# directory and print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

GPU Available: []
/kaggle/input/amazon-alexa-reviews/amazon_alexa.tsv


# 1. Import the Dependencies

In [None]:
import nltk
import seaborn as sns
import pandas as pd
from nltk.corpus import stopwords
from textblob import Word
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import wordcloud
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
import tensorflow as tf
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
import re
import keras
import datetime

Define the path to the dataset file.

# 2. Load the Dataset

In [None]:
# Location of the tab-separated Amazon Alexa reviews file inside the Kaggle input directory.
path = "/kaggle/input/amazon-alexa-reviews/amazon_alexa.tsv"

In [None]:
# Read the tab-separated reviews file (read_table uses "\t" as its default separator).
df = pd.read_table(path)

In [None]:
# Display the loaded DataFrame as the cell output.
df

 # 3. Basic EDA 

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes).
df.info()

In [None]:
# Number of missing values in the two columns used by the model.
df[['verified_reviews', 'rating']].isna().sum()

In [None]:
# Discard every row that lacks either the review text or the rating.
required_columns = ['verified_reviews', 'rating']
df = df.dropna(subset=required_columns)

In [None]:
# Share of each rating value, in percent, rounded to two decimals.
df['rating'].value_counts(normalize=True).mul(100).round(2)

In [None]:
# Bar chart of how many reviews fall into each rating value.
# Drawn on an explicit Axes so the figure carries a title and axis labels
# and can be understood on its own when the notebook is skimmed.
fig, ax = plt.subplots()
sns.countplot(data=df, x='rating', ax=ax)
ax.set(title='Number of reviews per rating', xlabel='Rating', ylabel='Number of reviews');

# 4. Cleaning, Preprocessing, and Tokenization

In [None]:
import spacy
# Load the small English spaCy pipeline; lemmatize_text() below reads this module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    """Return `text` with each spaCy token replaced by its lemma, joined by single spaces.

    Uses the module-level `nlp` pipeline.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return ' '.join(lemmas)

def cleaning(df):
    """Return a copy of `df` with a normalized, lemmatized 'verified_reviews' column.

    Each review is lower-cased, stripped of URLs, digits and punctuation/symbols,
    collapsed to single spaces, and then lemmatized with `lemmatize_text`.
    The frame passed in is not modified; use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'verified_reviews' column of strings (no missing values).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns and the cleaned review text.
    """
    def _normalize(text):
        # Lower-case first: the URL pattern below is written in lower case only.
        text = text.lower()
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        # Remove digits
        text = re.sub(r'\d+', '', text)
        # Remove punctuation and symbols (\w keeps letters and underscores)
        text = re.sub(r'[^\w\s]', '', text)
        # Collapse runs of whitespace and trim both ends
        return re.sub(r'\s+', ' ', text).strip()

    # Work on a copy so the caller's DataFrame is never mutated as a side effect.
    cleaned = df.copy()
    # One normalisation pass per review, followed by lemmatization of the words.
    cleaned['verified_reviews'] = (
        cleaned['verified_reviews'].apply(_normalize).apply(lemmatize_text)
    )
    return cleaned


In [None]:
# Clean a deep copy so `df` keeps the raw review text.
copy_df = df.copy(deep=True)
df_cleaned = cleaning(df=copy_df)

In [None]:
# Display the cleaned DataFrame as the cell output.
df_cleaned

## Visualize the most common words

In [None]:
# Flatten all cleaned reviews into one list of lower-cased, whitespace-separated words.
all_words = [
    word
    for review in df_cleaned['verified_reviews']
    for word in review.lower().split()
]
# Number of occurrences of each distinct word.
word_freq = pd.Series(all_words).value_counts()

# Size of the vocabulary found in the cleaned reviews.
unique_words = word_freq.size

print("Jumlah kata unik:", unique_words)
print("Frekuensi kata:\n", word_freq)

In [None]:
# Concatenate every cleaned review into one space-separated string
# (a single join instead of repeated string concatenation in a loop).
common_words = " ".join(
    " ".join(str(text).split()) for text in df_cleaned.verified_reviews
)

# Keep the result under its own name: assigning it to `wordcloud` would shadow
# the imported `wordcloud` module and make this cell raise AttributeError
# the second time it is run.
review_cloud = wordcloud.WordCloud().generate(common_words)
plt.imshow(review_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Model inputs (cleaned review text) and targets (ratings).
reviews, ratings = df_cleaned['verified_reviews'], df_cleaned['rating']

## Split the Cleaned Data

In [None]:
# Hold out 20% of the reviews for testing, keeping the rating proportions
# equal in both splits; the fixed seed makes the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    reviews,
    ratings,
    test_size=0.2,
    random_state=88,
    stratify=ratings,
)

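Decrement the ratings by 1 so that the class labels are 0-indexed (0–4), as expected by the `sparse_categorical_crossentropy` loss used below.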

In [None]:
# Shift the ratings down by one so the class labels start at 0.
# The labels are rebuilt from the untouched `ratings` Series (selected by each
# split's index) instead of being decremented in place, so re-running this
# cell cannot shift them a second time.
train_y = ratings.loc[train_y.index] - 1
test_y = ratings.loc[test_y.index] - 1

## Tokenize and Add Padding

Add padding so that every input sequence has the same length and can be fed to the model.

In [None]:

# Vocabulary cap for the tokenizer; the model cell also passes it to the Embedding layer.
# NOTE(review): 3471 is presumably taken from the unique-word count printed earlier — confirm.
vocab_size = 3471
# Length every review is padded (or truncated) to.
max_len = 150

# NOTE(review): the out-of-vocabulary token is an empty string — confirm this
# is intended and not a stripped placeholder.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="")
# The vocabulary is fitted on the training split only.
tokenizer.fit_on_texts(train_x)
word_index = tokenizer.word_index

# The integer sequences get their own names so `train_x` / `test_x` keep the raw
# text; overwriting them made a re-run feed integer lists back into fit_on_texts.
train_sequences = tokenizer.texts_to_sequences(train_x)
train_x_padded = pad_sequences(train_sequences, padding="post", maxlen=max_len)

test_sequences = tokenizer.texts_to_sequences(test_x)
test_x_padded = pad_sequences(test_sequences, padding="post", maxlen=max_len)


 # 5. Model Preparation and Training

In [None]:
# Bidirectional-LSTM classifier over padded token sequences of length 150,
# ending in a 5-way softmax (one unit per rating class).
# Every layer comes from the same `tensorflow.keras` namespace: the original
# took Bidirectional from the standalone `keras` package, and mixing the two
# namespaces inside one model can fail when they resolve to different Keras versions.
LSTM_model = Sequential([
    Embedding(vocab_size, 140, input_length = 150),
    SpatialDropout1D(0.4),
    tf.keras.layers.Bidirectional(LSTM(256, dropout=0.3, recurrent_dropout=0.3)),
    Dense(24, activation = 'relu'),
    Dense(5, activation = 'softmax')
])

In [None]:
# Adam optimizer with a learning rate of 1e-4.
optim = keras.optimizers.Adam(learning_rate=1e-4)

In [None]:
# Compile with the `optim` instance configured in the previous cell.
# Passing the string 'adam' built a fresh default Adam and silently ignored
# the learning rate chosen there.
# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
# If the model is rebuilt, re-run the optimizer cell first so it starts from fresh state.
LSTM_model.compile(loss='sparse_categorical_crossentropy',
              optimizer=optim,
              metrics=['accuracy'])

In [None]:
from keras.callbacks import EarlyStopping

# EarlyStopping callback: halt training once the validation loss has gone
# 13 epochs without improving, then restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=13,
    verbose=1,
)

In [None]:
# Upper bound on training epochs; early stopping may end training sooner.
epochs = 75
# Train on the padded sequences, holding out 10% of the training data for validation.
history = LSTM_model.fit(
    x=train_x_padded,
    y=train_y,
    batch_size=256,
    epochs=epochs,
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1,
)

In [None]:
def test_model(model, true_label, features=None):
    """Print a classification report for `model` on a labelled feature set.

    Parameters
    ----------
    model
        Object whose `predict(features)` returns one row of class scores per sample.
    true_label
        Ground-truth class ids, one per sample.
    features
        Inputs to predict on. Defaults to the notebook-level `test_x_padded`,
        which keeps existing two-argument calls working unchanged.
    """
    if features is None:
        features = test_x_padded
    class_scores = model.predict(features)
    # The predicted class is the index of the highest score in each row.
    predicted_label = np.argmax(class_scores, axis=1)
    print(classification_report(true_label, predicted_label))

test_model(LSTM_model,test_y)

# 6. Visualizing and Evaluating result 

In [None]:
def date_time(x):
    """Return a labelled timestamp string in the format selected by `x`.

    1 -> 'Timestamp: YYYY-MM-DD HH:MM:SS'
    2 -> 'Timestamp: YYYY-Mon-DD HH:MM:SS' (abbreviated month name)
    3 -> 'Date now: <current datetime>'
    4 -> 'Date today: <current date>'
    Any other value yields None.
    """
    now = datetime.datetime.now()
    if x == 1:
        label = 'Timestamp: {:%Y-%m-%d %H:%M:%S}'.format(now)
    elif x == 2:
        label = 'Timestamp: {:%Y-%b-%d %H:%M:%S}'.format(now)
    elif x == 3:
        label = 'Date now: %s' % now
    elif x == 4:
        label = 'Date today: %s' % datetime.date.today()
    else:
        label = None
    return label

In [None]:

def _plot_metric(position, train_values, val_values, title, ylabel, pad):
    """Draw one training-vs-validation curve in subplot `position` of the current figure."""
    plt.subplot(position)

    plt.plot(train_values)
    plt.plot(val_values)

    plt.title(title + '\n' + date_time(1), fontsize=17)
    plt.xlabel('Epoch', fontsize=15)
    plt.ylabel(ylabel, fontsize=15)
    # The y-range spans both curves, widened by `pad` on each side.
    lowest = min(min(train_values), min(val_values)) - pad
    highest = max(max(train_values), max(val_values)) + pad
    plt.ylim(lowest, highest)
    plt.legend(['Training', 'Validation'], loc='upper left')
    plt.grid()


def plot_performance(history=None, figure_directory=None, ylim_pad=(0, 0)):
    """Plot training/validation accuracy and loss side by side.

    Parameters
    ----------
    history
        Object whose `.history` mapping holds the per-epoch lists 'accuracy',
        'val_accuracy', 'loss' and 'val_loss' (e.g. the value returned by `fit`).
    figure_directory : str, optional
        If given, the figure is also saved as "history" inside this directory.
    ylim_pad : sequence of two numbers
        Extra y-axis margin: element 0 for the accuracy plot, element 1 for the
        loss plot. The default is an immutable tuple rather than a shared list.

    Raises
    ------
    ValueError
        If `history` is None.
    """
    if history is None:
        raise ValueError("plot_performance() needs a training history object")

    plt.figure(figsize=(20, 5))

    _plot_metric(121, history.history['accuracy'], history.history['val_accuracy'],
                 'Model Accuracy', 'Accuracy', ylim_pad[0])
    _plot_metric(122, history.history['loss'], history.history['val_loss'],
                 'Model Loss', 'Loss', ylim_pad[1])

    if figure_directory:
        plt.savefig(os.path.join(figure_directory, "history"))

    plt.show()

plot_performance(history)

# 7. Save the Model

In [None]:
# Persist the trained model to the Kaggle working directory.
# NOTE(review): despite "weight" in the file name, save() is expected to store the
# whole model, and the .h5 suffix selects the legacy HDF5 format — confirm this is intended.
LSTM_model.save('/kaggle/working/5class_Amazon_Alexa_LSTM_weight.h5')