<a href="https://colab.research.google.com/github/nad1011/natural_language_processing/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Mount Google Drive inside the Colab VM so the dataset archive stored there is readable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
!unzip /content/gdrive/MyDrive/Colab\ Notebooks/aclImdb.zip > /dev/null

In [5]:
import os
def load_data(path, tag):
    """Read every regular file in ``path`` and pair its text with ``tag``.

    Args:
        path: Directory whose files are read as UTF-8 text.
        tag: Label attached to every document loaded from this directory.

    Returns:
        A list of ``(text, tag)`` tuples, ordered by file name.
    """
    samples = []
    # os.listdir returns entries in arbitrary order; sorting keeps runs reproducible.
    for name in sorted(os.listdir(path)):
        file_path = os.path.join(path, name)
        # Skip sub-directories (e.g. a stray .ipynb_checkpoints folder): open() cannot read them.
        if not os.path.isfile(file_path):
            continue
        # The context manager closes each handle right away instead of leaving
        # thousands of open files for the garbage collector to clean up.
        with open(file_path, encoding="utf-8") as handle:
            samples.append((handle.read(), tag))
    return samples

In [6]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook needs: the stop-word lists and WordNet.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Shared helpers read by the text-cleaning function.
lemmatizer = nltk.stem.WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [7]:
import re

def preprocessing(sentence):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps, in order: drop HTML tags, drop URLs, delete punctuation, lower-case,
    remove English stop words, lemmatize each remaining word.

    Args:
        sentence: Raw review text.

    Returns:
        The cleaned text, words joined by single spaces.
    """
    # Tags are replaced by a space, not the empty string: otherwise
    # "great.<br /><br />The" collapses into the single bogus token "greatthe".
    # They are stripped before URLs so the greedy URL pattern cannot swallow
    # text that merely sits inside the same tag as a link.
    sentence = re.sub(r'<.*?>', ' ', sentence)
    sentence = re.sub(r'http\S+', ' ', sentence)
    # Punctuation is deleted outright, so "don't" becomes "dont".
    sentence = re.sub(r'[^\w\s]', '', sentence)
    sentence = sentence.lower()
    return ' '.join(lemmatizer.lemmatize(word) for word in sentence.split() if word not in stop_words)

In [8]:
import numpy as np
def extract(data_list):
    """Split ``(text, label)`` records into two parallel NumPy arrays.

    Args:
        data_list: Sequence of records whose item 0 is raw text and item 1 is the label.

    Returns:
        A pair ``(sentences, labels)``: the preprocessed texts and the labels,
        both as ``np.ndarray`` in the input order.
    """
    # First pass: clean every text.
    cleaned_texts = [preprocessing(record[0]) for record in data_list]
    # Second pass: collect the labels.
    labels = [record[1] for record in data_list]
    return np.array(cleaned_texts), np.array(labels)

# Training split: positive reviews (label 1) followed by negative reviews (label 0).
train_data = (
    load_data("aclImdb/train/pos", 1)
    + load_data("aclImdb/train/neg", 0)
)
train_sentences, train_labels = extract(train_data)

# Test split, assembled the same way.
test_data = (
    load_data("aclImdb/test/pos", 1)
    + load_data("aclImdb/test/neg", 0)
)
test_sentences, test_labels = extract(test_data)

In [9]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Hyper-parameters shared by the tokenizer and the model.
vocab_size = 3000     # passed to the tokenizer as num_words
embedding_dim = 100   # embedding vector width
max_length = 200      # every review is padded / truncated to this many tokens

# The vocabulary is fitted on the training texts only; unknown words map to '<OOV>'.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Texts -> lists of integer ids.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Lists of ids -> fixed-width arrays, zero-padded at the end.
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

In [10]:
import keras
from keras.callbacks import EarlyStopping

# Binary sentiment classifier: embedding -> two stacked bidirectional LSTMs -> dense head.
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
    keras.layers.Dropout(0.5),
    keras.layers.Bidirectional(keras.layers.LSTM(32)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Stop when the validation loss has not improved for 3 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=3)

# Keras takes the `validation_split` fraction from the END of the arrays, before
# any shuffling. The training arrays are all positives followed by all negatives,
# so without this permutation the validation set (and the val_loss monitored above)
# would contain negative reviews only. The fixed seed keeps the split reproducible.
shuffle_idx = np.random.default_rng(42).permutation(len(train_padded))

model.fit(
    train_padded[shuffle_idx],
    train_labels[shuffle_idx],
    epochs=5,
    validation_split=0.1,
    callbacks=[early_stop],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7864ac1237f0>

In [11]:
# Sigmoid outputs for the test reviews, one row per review.
pred = model.predict(test_padded)
# Threshold at 0.5 to get hard 0/1 labels as a flat list of Python ints.
pred_labels = (pred > 0.5).astype(int).ravel().tolist()



In [12]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the thresholded predictions on the test split.
report = classification_report(y_true=test_labels, y_pred=pred_labels)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.85      0.86     12500
           1       0.86      0.86      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000

