In [10]:
import numpy as np
import os
import pickle
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [11]:
# Folders of the aclImdb dataset, all expressed relative to a single root so
# the dataset location is spelled out in exactly one place.
_dataset_root = 'aclImdb'

# Labelled reviews: one folder per split and sentiment.
train_dir_positive = _dataset_root + '/train/pos'
train_dir_negative = _dataset_root + '/train/neg'
test_dir_positive = _dataset_root + '/test/pos'
test_dir_negative = _dataset_root + '/test/neg'

# Unlabelled reviews (training split only).
train_dir_unsupervised = _dataset_root + '/train/unsup'



#---------------loading positive reviews--------------
def load_positive_data(dir):
    """Read every ``.txt`` file in ``dir`` and label each review as positive.

    Files are visited in sorted filename order. ``os.listdir`` returns
    entries in arbitrary order, so sorting keeps the order of the returned
    reviews reproducible across runs and machines.

    Args:
        dir: Path of the directory containing the review files. (The name
            shadows the ``dir`` builtin; it is kept so existing callers
            are unaffected.)

    Returns:
        A ``(reviews, labels)`` tuple of two equal-length lists: the text
        of each review, and a ``1`` for every review.
    """
    reviews = []
    for fname in sorted(os.listdir(dir)):
        if not fname.endswith('.txt'):
            continue  # ignore anything that is not a review text file
        with open(os.path.join(dir, fname), encoding='utf-8') as f:
            reviews.append(f.read())

    # Every review loaded by this function carries the positive label.
    labels = [1] * len(reviews)
    return reviews, labels

#---------------loading negative reviews--------------
def load_negative_data(dir):
    """Read every ``.txt`` file in ``dir`` and label each review as negative.

    Files are visited in sorted filename order. ``os.listdir`` returns
    entries in arbitrary order, so sorting keeps the order of the returned
    reviews reproducible across runs and machines.

    Args:
        dir: Path of the directory containing the review files. (The name
            shadows the ``dir`` builtin; it is kept so existing callers
            are unaffected.)

    Returns:
        A ``(reviews, labels)`` tuple of two equal-length lists: the text
        of each review, and a ``0`` for every review.
    """
    reviews = []
    for fname in sorted(os.listdir(dir)):
        if not fname.endswith('.txt'):
            continue  # ignore anything that is not a review text file
        with open(os.path.join(dir, fname), encoding='utf-8') as f:
            reviews.append(f.read())

    # Every review loaded by this function carries the negative label.
    labels = [0] * len(reviews)
    return reviews, labels

#---------------loading unsupervised reviews--------------
def load_unsupervised_data(dir):
    """Read every ``.txt`` file in ``dir`` and return the review texts.

    No labels are produced. Files are visited in sorted filename order:
    ``os.listdir`` returns entries in arbitrary order, so sorting keeps the
    order of the returned reviews reproducible across runs and machines.

    Args:
        dir: Path of the directory containing the review files. (The name
            shadows the ``dir`` builtin; it is kept so existing callers
            are unaffected.)

    Returns:
        A list with the text of each review.
    """
    reviews = []
    for fname in sorted(os.listdir(dir)):
        if not fname.endswith('.txt'):
            continue  # ignore anything that is not a review text file
        with open(os.path.join(dir, fname), encoding='utf-8') as f:
            reviews.append(f.read())
    return reviews

In [12]:
#-----------combining positive and negative reviews/labels------------
def load_data(positive_dir, negative_dir):
    """Load one labelled split: positive reviews followed by negative ones.

    Args:
        positive_dir: Directory passed to ``load_positive_data``.
        negative_dir: Directory passed to ``load_negative_data``.

    Returns:
        A ``(reviews, labels)`` tuple in which the positive entries come
        first and the negative entries are appended after them, so the
        two sequences stay aligned index by index.
    """
    pos_texts, pos_targets = load_positive_data(positive_dir)
    neg_texts, neg_targets = load_negative_data(negative_dir)
    return pos_texts + neg_texts, pos_targets + neg_targets


# Labelled splits: each call returns the review texts and their labels.
train_reviews, train_labels = load_data(
    positive_dir=train_dir_positive,
    negative_dir=train_dir_negative,
)
test_reviews, test_labels = load_data(
    positive_dir=test_dir_positive,
    negative_dir=test_dir_negative,
)

# Unlabelled reviews: texts only.
unsup_reviews = load_unsupervised_data(dir=train_dir_unsupervised)

#-----------saving train and test files of reviews/labels data--------------
def save_data(filename, data):
    """Pickle ``data`` into ``filename``, replacing any existing file.

    Args:
        filename: Path of the file to write; opened in binary write mode.
        data: Any picklable object.
    """
    with open(filename, 'wb') as out_file:
        pickle.Pickler(out_file).dump(data)

# Persist the raw splits: labelled ones as (reviews, labels) tuples,
# the unlabelled one as a plain list of reviews.
save_data(filename='train_data_rl.pkl', data=(train_reviews, train_labels))
save_data(filename='test_data_rl.pkl', data=(test_reviews, test_labels))
save_data(filename='unsupervised_data_r.pkl', data=unsup_reviews)

#-----------tokenizing and pre-processing the reviews into padded integer sequences of equal length--------------
# Vocabulary cap handed to the tokenizer as `num_words`.
word_count = 8000
# Length every review sequence is padded/truncated to (used as `maxlen`).
words_per_review = 500

# Use the `word_count` constant rather than repeating the literal, so the
# vocabulary size is configured in exactly one place.
tokenizer = Tokenizer(num_words=word_count)
# The vocabulary is built from the training reviews only.
tokenizer.fit_on_texts(train_reviews)

def preprocess_reviews(tokenizer, reviews, words_per_review):
    """Turn review texts into integer sequences of a common length.

    Args:
        tokenizer: Object exposing ``texts_to_sequences`` (here, the
            already-fitted Keras ``Tokenizer``).
        reviews: The review texts to convert.
        words_per_review: Target length, forwarded to ``pad_sequences``
            as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` applied to the tokenized reviews,
        using its default padding and truncating settings.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(reviews),
        maxlen=words_per_review,
    )

# Convert the raw review texts of both splits into padded integer sequences.
x_train = preprocess_reviews(tokenizer, train_reviews, words_per_review)
x_test = preprocess_reviews(tokenizer, test_reviews, words_per_review)

# Map labels to class indices. The encoder is fitted on the training labels
# only and then reused, unchanged, for the test labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)
y_test = label_encoder.transform(test_labels)

#----------one-hot encoding the labels: negative (0) -> [1, 0], positive (1) -> [0, 1]------------
# Pin the width to the number of classes the encoder learned. Without it,
# to_categorical infers the width from the largest label present in each
# array, so the two splits could end up with different column counts.
num_classes = len(label_encoder.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

#------------saving files for trained/test data-------------------
np.save('x_train_reviews.npy', x_train)
np.save('x_test_reviews.npy', x_test)
np.save('y_train_labels.npy', y_train)
np.save('y_test_labels.npy', y_test)

#-------save the tokenizer and label encoder, as they will be needed for user-input reviews as well-----------
def save_pickle(filename, obj):
    """Serialize ``obj`` with pickle into ``filename``, overwriting it.

    NOTE: this does the same thing as ``save_data`` defined earlier in the
    notebook; the two could be merged.

    Args:
        filename: Path of the file to write; opened in binary write mode.
        obj: Any picklable object.
    """
    handle = open(filename, 'wb')
    try:
        pickle.dump(obj, handle)
    finally:
        # Always release the file, even if pickling raises.
        handle.close()

# Persist the fitted preprocessing objects so they can be reloaded later.
save_pickle(filename='tokenizer.pkl', obj=tokenizer)
save_pickle(filename='label_encoder.pkl', obj=label_encoder)