In [3]:
# Load the dataset
import os

# Directory holding the train/test review files. Set the REVIEWS_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset the original workstation location is used, so the resulting path
# strings are unchanged.
DATA_DIR = os.environ.get(
    "REVIEWS_DATA_DIR", "D:/WORKSPACE/ai_advance/workshop/workshop2/topic3"
)
train_data_path = f"{DATA_DIR}/train.ft.txt"
test_data_path = f"{DATA_DIR}/test.ft.txt"

In [None]:
import pandas as pd
import re


def parse_data(file_path):
    """Read a labelled review file into a two-column DataFrame.

    Each line is stripped and split at its first space: the text before the
    space has the ``__label__`` marker removed and becomes the label, the text
    after it becomes the review. Lines that contain no space once stripped
    (blank lines, a label with no text) are skipped.

    Parameters
    ----------
    file_path : path-like
        Location of the UTF-8 encoded text file.

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` and ``review``, both holding strings, one row per
        accepted line in file order.
    """
    label_column, review_column = [], []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            tag, separator, body = raw_line.strip().partition(" ")
            if not separator:
                # No space in the line -> it cannot hold both a label and text.
                continue
            label_column.append(tag.replace("__label__", ""))
            review_column.append(body)
    return pd.DataFrame({"label": label_column, "review": review_column})


# Parse both splits with the same loader so they share one schema.
train_data, test_data = (
    parse_data(split_path) for split_path in (train_data_path, test_data_path)
)

In [4]:
# head() returns a frame, so it is printed explicitly. info() writes its own
# report to stdout and returns None, so it is called bare: wrapping it in
# print() would add a stray "None" line after each summary.
print(train_data.head())
train_data.info()
test_data.info()

  label                                             review
0     2  Stuning even for the non-gamer: This sound tra...
1     2  The best soundtrack ever to anything.: I'm rea...
2     2  Amazing!: This soundtrack is my favorite music...
3     2  Excellent Soundtrack: I truly like this soundt...
4     2  Remember, Pull Your Jaw Off The Floor After He...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   label   object
 1   review  object
dtypes: object(2)
memory usage: 54.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   label   400000 non-null  object
 1   review  400000 non-null  object
dtypes: object(2)
memory usage: 6.1+ MB
None


In [7]:
# Preview the first 20 parsed training rows (label and review columns).
train_data.head(20)

Unnamed: 0,label,review
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,Amazing!: This soundtrack is my favorite music...
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."
5,2,an absolute masterpiece: I am quite sure any o...
6,1,"Buyer beware: This is a self-published book, a..."
7,2,Glorious story: I loved Whisper of the wicked ...
8,2,A FIVE STAR BOOK: I just finished reading Whis...
9,2,Whispers of the Wicked Saints: This was a easy...


In [8]:
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Encode labels
# The label -> integer mapping is learned from the training split only and
# then applied unchanged to both splits, so the test labels are encoded with
# exactly the classes seen during training.
label_encoder = LabelEncoder()
label_encoder.fit(train_data["label"])
for split_frame in (train_data, test_data):
    split_frame["label"] = label_encoder.transform(split_frame["label"])

In [14]:
import nltk

# Resources needed by the text-cleaning step: tokenizer models for
# word_tokenize and the English stopword list.
# "punkt_tab" is requested alongside "punkt" because recent NLTK releases load
# the tokenizer from that package; fetching both keeps the notebook working
# across NLTK versions.
for nltk_package in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(nltk_package)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HUYNGUYEN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HUYNGUYEN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [None]:
# Text cleaning function
from functools import lru_cache

# Matches every character that is not a lowercase ASCII letter or whitespace.
_NON_LETTER_RE = re.compile(r"[^a-z\s]")


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loaded only once."""
    return frozenset(stopwords.words("english"))


def clean_text(text):
    """Normalize one review string for tokenization.

    The text is lowercased, every character other than ``a-z`` and whitespace
    is removed, the remainder is split with ``word_tokenize``, English
    stopwords are dropped, and the surviving tokens are joined with single
    spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Space-separated cleaned tokens; empty when nothing survives.
    """
    # Fetch the stopword set once per call instead of once per token: the
    # original re-read the list for every word and searched it linearly.
    stop_set = _english_stopwords()
    letters_only = _NON_LETTER_RE.sub("", text.lower())
    kept_tokens = [
        token for token in word_tokenize(letters_only) if token not in stop_set
    ]
    return " ".join(kept_tokens)


# Clean the review text
# Both splits go through the same clean_text function; the cleaned strings
# overwrite the raw "review" column in place.
for review_frame in (train_data, test_data):
    review_frame["review"] = review_frame["review"].apply(clean_text)

In [None]:
# Tokenize and pad sequences
max_words = 10000  # passed to Tokenizer(num_words=...) and the embedding input_dim
max_len = 100  # every sequence is padded/truncated to this many tokens

# The vocabulary is fitted on the training reviews only; the test reviews are
# converted with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data["review"])


def _to_padded_sequences(texts):
    """Convert texts to integer sequences and pad them to ``max_len``."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)


X_train = _to_padded_sequences(train_data["review"])
X_test = _to_padded_sequences(test_data["review"])

y_train, y_test = train_data["label"], test_data["label"]

In [None]:
# Define the LSTM model
# These names are not imported anywhere else in the notebook, so they are
# brought into scope here; without this the cell fails with NameError on a
# fresh kernel.
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.models import Sequential

# Architecture: token embedding -> single LSTM layer -> one sigmoid unit that
# outputs a value in (0, 1) for the binary label.
model = Sequential()
# NOTE(review): newer Keras releases report `input_length` as deprecated —
# confirm against the installed version before removing it.
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))

In [None]:
# Compile the model
# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# reported alongside the loss.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train the model
# validation_split=0.2 asks Keras to hold back part of the training arrays for
# validation, so the test split is not used during fitting.
fit_options = dict(epochs=5, batch_size=64, validation_split=0.2)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Evaluate the model on the test set
# evaluate() yields the loss followed by the metrics given to compile(),
# which here is accuracy only.
test_scores = model.evaluate(X_test, y_test)
loss, accuracy = test_scores
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
# Predict on the test set
# The sigmoid outputs are thresholded at 0.5 to obtain hard 0/1 predictions.
test_probabilities = model.predict(X_test)
y_test_pred = (test_probabilities > 0.5).astype("int32")

In [None]:
# Calculate additional metrics
# The scoring functions are not imported anywhere else in the notebook, so
# they are brought into scope here; without this the cell fails with NameError
# on a fresh kernel.
from sklearn.metrics import f1_score, precision_score, recall_score

# Scores compare the true test labels with the thresholded 0/1 predictions.
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")