<a href="https://colab.research.google.com/github/parsaGLS/quera-ai-final-bootcamp-project/blob/3.3/AI_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries

In [1]:
!pip install gdown -q

import os
import re

import gdown
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

Download stopwords from NLTK

In [2]:
# Probe for the English stopword corpus; download it only if NLTK cannot find it.
stopwords_available = True
try:
    stopwords.words('english')
except LookupError:
    stopwords_available = False

if not stopwords_available:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Download the datasets

In [3]:
# Google Drive file ids and the local filenames the datasets are saved under.
train_file_id = '1-AlW7oNJHaqi3xk_9dWHUS52Dzl_FmFW'
test_file_id = '1-8TsrqTRFP-q9TM-6HinhO0ZVXFHq9TB'
train_output_path = 'train.json'
test_output_path = 'test.json'

# Fetch each dataset unless a local copy already exists, so re-running the
# notebook does not download the files again.
for drive_id, local_path in (
    (train_file_id, train_output_path),
    (test_file_id, test_output_path),
):
    if os.path.exists(local_path):
        print(f"{local_path} already exists.")
        continue
    gdown.download(f'https://drive.google.com/uc?id={drive_id}', local_path, quiet=False)


Downloading...
From (original): https://drive.google.com/uc?id=1-AlW7oNJHaqi3xk_9dWHUS52Dzl_FmFW
From (redirected): https://drive.google.com/uc?id=1-AlW7oNJHaqi3xk_9dWHUS52Dzl_FmFW&confirm=t&uuid=9cf20239-45c6-44ab-8a6c-2dae05ad8b07
To: /content/train.json
100%|██████████| 635M/635M [00:05<00:00, 119MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-8TsrqTRFP-q9TM-6HinhO0ZVXFHq9TB
To: /content/test.json
100%|██████████| 15.6M/15.6M [00:00<00:00, 21.4MB/s]


Load Data

In [4]:
# The downloaded files carry a .json name but are parsed here as CSV.
# NOTE(review): confirm the Drive files really are CSV-formatted.
df_train, df_test = (
    pd.read_csv(dataset_path, low_memory=False)
    for dataset_path in (train_output_path, test_output_path)
)

In [5]:
# Report (rows, columns) for both frames as a quick sanity check of the load.
shape_report = (
    ("Training data shape:", df_train),
    ("Test data shape:", df_test),
)
for caption, frame in shape_report:
    print(caption, frame.shape)

Training data shape: (838944, 11)
Test data shape: (20000, 10)


Data Preprocessing

In [6]:
# Join the short summary and the review body into one text field per row.
# Missing values are replaced with '' first so the concatenation never yields NaN.
for frame in (df_train, df_test):
    frame['full_review'] = frame['summary'].fillna('') + ' ' + frame['reviewText'].fillna('')


In [7]:
# Materialise the NLTK English stopword list as a set: clean_text() tests every
# word for membership, and set lookups are constant-time on average.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)


In [8]:
def clean_text(text, stopword_set=None):
    """Normalise a raw review into lowercase, letters-only, stopword-free text.

    Steps, in order: convert to ``str`` and lowercase; replace HTML-like tags
    with a space; delete every character that is not an ASCII letter or
    whitespace; split on whitespace, drop stopwords and re-join with single
    spaces.

    Args:
        text: Raw review text. Non-string values are converted with ``str()``.
        stopword_set: Optional collection of words to remove. When ``None``
            (the default) the notebook-level ``stop_words`` set is used.

    Returns:
        The cleaned text as one space-separated string (possibly empty).
    """
    active_stopwords = stop_words if stopword_set is None else stopword_set
    lowered = str(text).lower()
    # Tags become a space rather than '' so "good<br/>value" stays two tokens
    # instead of fusing into the single unknown word "goodvalue".
    without_tags = re.sub(r'<.*?>', ' ', lowered)
    # The text is already lowercase, so only a-z needs to be whitelisted.
    # Other characters are deleted outright, so "don't" collapses to "dont".
    letters_only = re.sub(r'[^a-z\s]', '', without_tags)
    return ' '.join(word for word in letters_only.split() if word not in active_stopwords)

In [9]:
# Run the text normaliser over every combined review in both frames.
for frame in (df_train, df_test):
    frame['cleaned_review'] = frame['full_review'].apply(clean_text)


In [10]:
# Shift the ratings down by one so class ids start at 0.
# Presumably 'overall' holds 1-5 star ratings — TODO confirm.
ratings = df_train['overall'].values
y = ratings - 1


In [11]:
# Hold out 20% of the training reviews for validation. The split is stratified
# on the label so both parts keep the same class proportions, and the fixed
# random_state makes it repeatable.
split_parts = train_test_split(
    df_train['cleaned_review'],
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_val, y_train, y_val = split_parts

Tokenization and Padding

In [12]:
# Tokenisation / padding settings shared by the cells below.
vocab_size, max_length = 20_000, 200   # vocabulary cap and fixed sequence length
padding_type = trunc_type = 'post'     # pad and truncate at the end of each sequence
oov_tok = '<OOV>'                      # placeholder for out-of-vocabulary words

In [13]:
# Build the vocabulary from the training split only, so validation and test
# text never influence the word index.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

In [14]:
# Convert each corpus (train, validation, test) into lists of integer word ids
# using the vocabulary fitted on the training split.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_val, df_test['cleaned_review'])
)

In [15]:
# One shared set of options so all three splits are padded/truncated identically.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_val_pad = pad_sequences(X_val_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)


Build and Train the LSTM Model

In [16]:
embedding_dim = 128
num_classes = 5

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable across "Restart & Run All" (as far as the backend's
# kernels are deterministic).
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Declaring the input shape explicitly builds the model right away, so
    # model.summary() reports real shapes and parameter counts. It replaces
    # Embedding's `input_length` argument, which current Keras deprecates.
    tf.keras.Input(shape=(max_length,)),
    # Learn a dense vector per token id.
    Embedding(vocab_size, embedding_dim),
    # Bidirectional LSTM returning the full sequence, then max-pool over time
    # to get one fixed-size feature vector per review.
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalMaxPooling1D(),
    # Classification head: one softmax output per class.
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])




In [17]:
# Labels are integer class ids (not one-hot), hence the sparse cross-entropy loss.
compile_options = {
    'optimizer': 'adam',
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

model.summary()

In [18]:
# Both callbacks watch validation loss.
# ReduceLROnPlateau multiplies the learning rate by 0.2 after 2 epochs without
# improvement, never going below 1e-4.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001)
# EarlyStopping ends training after 3 epochs without improvement and restores
# the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

fit_options = dict(
    epochs=10,
    batch_size=128,
    validation_data=(X_val_pad, y_val),
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

print("\nStarting model training...")
history = model.fit(X_train_pad, y_train, **fit_options)


Starting model training...
Epoch 1/10
[1m5244/5244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 28ms/step - accuracy: 0.6579 - loss: 0.8826 - val_accuracy: 0.7054 - val_loss: 0.7330 - learning_rate: 0.0010
Epoch 2/10
[1m5244/5244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 27ms/step - accuracy: 0.7143 - loss: 0.7172 - val_accuracy: 0.7121 - val_loss: 0.7175 - learning_rate: 0.0010
Epoch 3/10
[1m5244/5244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 27ms/step - accuracy: 0.7331 - loss: 0.6641 - val_accuracy: 0.7154 - val_loss: 0.7225 - learning_rate: 0.0010
Epoch 4/10
[1m5244/5244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 27ms/step - accuracy: 0.7530 - loss: 0.6134 - val_accuracy: 0.7116 - val_loss: 0.7333 - learning_rate: 0.0010
Epoch 5/10
[1m5244/5244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 27ms/step - accuracy: 0.7860 - loss: 0.5311 - val_accuracy: 0.7092 - val_loss: 0.8008 - learning_rate: 2.0000e-04


Evaluate the Model

In [19]:
# Score the model on the held-out validation split.
# evaluate() returns the loss followed by the compiled metrics (here: accuracy).
print("\nEvaluating model performance on the validation set...")
validation_metrics = model.evaluate(X_val_pad, y_val)
loss, accuracy = validation_metrics
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")



Evaluating model performance on the validation set...
[1m5244/5244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 9ms/step - accuracy: 0.7125 - loss: 0.7199
Validation Loss: 0.7175
Validation Accuracy: 0.7121


In [25]:
# Predict class probabilities for the test reviews, then take the most likely
# class id for each row.
test_probabilities = model.predict(X_test_pad)
y_pred_test = test_probabilities.argmax(axis=1)


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step


In [26]:
# Undo the earlier "- 1" label shift: class ids become ratings again, as a
# plain list of Python ints.
predicted_stars = (y_pred_test + 1).tolist()


In [27]:
# Single-column submission file; the row order follows df_test.
submission_columns = {'predicted': predicted_stars}
df_submission = pd.DataFrame(submission_columns)
df_submission.to_csv("q2_submission.csv", index=False)
print(df_submission.head())


   predicted
0          1
1          1
2          1
3          1
4          1


In [28]:
# f1_score comes from sklearn.metrics and must be imported in the notebook's
# import cell; without that import this cell raises NameError on a fresh kernel.
# For single-label multiclass predictions, micro-averaged F1 equals accuracy,
# so this value doubles as a cross-check of the model.evaluate() accuracy.
print("\nCalculating internal validation F1 (micro)...")
val_probabilities = model.predict(X_val_pad)
y_pred_val = val_probabilities.argmax(axis=1)
f1_micro = f1_score(y_val, y_pred_val, average='micro')
print(f"F1 Score (micro) روی داده Validation: {f1_micro:.4f}")


Calculating internal validation F1 (micro)...
[1m5244/5244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 7ms/step
F1 Score (micro) روی داده Validation: 0.7121
