In [None]:
# !pip install keras


In [None]:
import os
import numpy as np
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer  # Adjusted import for Keras
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Adjusted import for Keras
from tensorflow.keras.models import Sequential  # Adjusted import for Keras
from tensorflow.keras.layers import Dense, Dropout  # Adjusted import for Keras
import pandas as pd
from matplotlib import pyplot as plt
# Fetch the NLTK stop-word corpus that clean_doc() reads via stopwords.words('english').
nltk.download('stopwords')
# NOTE(review): this module-level tokenizer is rebound to a freshly fitted Tokenizer in the
# prediction cell before it is passed to anything, so this instance appears to be unused.
tokenizer=Tokenizer()


In [None]:
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        The full file contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()

# Lazily populated cache so the stop-word corpus is read once, not once per document.
_english_stop_words_cache = None


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use only."""
    global _english_stop_words_cache
    if _english_stop_words_cache is None:
        _english_stop_words_cache = frozenset(stopwords.words('english'))
    return _english_stop_words_cache


def clean_doc(doc, vocab):
    """Split a document into cleaned word tokens.

    Each whitespace-separated token has all punctuation characters removed and
    is kept only if the result is purely alphabetic, longer than one character
    and not an English stop word. No case folding is performed here.

    Args:
        doc: Raw document text.
        vocab: Accepted for interface compatibility with existing callers;
            not used by the cleaning itself.

    Returns:
        List of cleaned tokens in their original order.
    """
    strip_punctuation = str.maketrans('', '', punctuation)
    stop_words = _english_stop_words()
    stripped = (word.translate(strip_punctuation) for word in doc.split())
    return [
        word for word in stripped
        if word.isalpha() and len(word) > 1 and word not in stop_words
    ]

def add_doc_to_vocab(filename, vocab):
    """Load the file at `filename`, clean it, and feed its tokens to `vocab.update`."""
    vocab.update(clean_doc(load_doc(filename), vocab))

def doc_to_line(filename, vocab):
    """Return the cleaned tokens of a file that occur in `vocab`, joined by single spaces."""
    cleaned = clean_doc(load_doc(filename), vocab)
    return ' '.join(token for token in cleaned if token in vocab)

def process_docs(directory, vocab, is_train):
    """Turn the reviews of one class directory into vocabulary-filtered lines.

    Files whose name starts with 'cv9' form the held-out split; every other
    file belongs to the training split.

    Args:
        directory: Sub-directory of the movie_reviews corpus (the notebook passes 'pos' or 'neg').
        vocab: Collection of allowed tokens, forwarded to doc_to_line.
        is_train: Truthy to return the training split, falsy for the held-out split.

    Returns:
        List with one space-joined token string per selected file, in listdir order.
    """
    corpus_root = nltk.data.find('corpora/movie_reviews').path
    review_dir = os.path.join(corpus_root, directory)
    want_held_out = not is_train
    return [
        doc_to_line(os.path.join(review_dir, name), vocab)
        for name in listdir(review_dir)
        if name.startswith('cv9') == want_held_out
    ]

def process_docsl(directory, vocab):
    """Add the tokens of every training review in one class directory to `vocab`.

    Files whose name starts with 'cv9' are skipped, so the held-out reviews
    never contribute to the vocabulary.
    """
    corpus_root = nltk.data.find('corpora/movie_reviews').path
    review_dir = os.path.join(corpus_root, directory)
    training_files = (name for name in listdir(review_dir) if not name.startswith('cv9'))
    for name in training_files:
        add_doc_to_vocab(os.path.join(review_dir, name), vocab)




In [None]:
# Corpus reader used below to list file ids; the download call fetches the data that the
# helper functions locate through nltk.data.find('corpora/movie_reviews').
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

In [None]:
# File ids of the reviews in each class, as reported by the corpus reader.
positive_ids = movie_reviews.fileids('pos')
negative_ids = movie_reviews.fileids('neg')

# Show the count and a short sample rather than dumping every id into the cell output.
print(len(positive_ids), positive_ids[:5])
print(len(negative_ids), negative_ids[:5])

In [None]:
# Count every cleaned token across the non-'cv9' reviews of both classes.
vocab = Counter()
for label in ('neg', 'pos'):
    process_docsl(label, vocab)

print(len(vocab))
print(vocab.most_common(50))

In [None]:
# Set by evaluate_mode(): the best-scoring model of the most recent call and its fit history.
best_model = None
history = None


def evaluate_mode(Xtrain, ytrain, Xtest, ytest, n_repeats=1, epochs=50):
    """Train and score a one-hidden-layer classifier, keeping the best run.

    Each repeat builds a fresh network (Dense 50 relu -> Dense 1 sigmoid),
    fits it on the training data and measures accuracy on the test data.

    Side effects:
        Rebinds the module-level ``best_model`` and ``history`` to the model
        and fit history of the highest-accuracy repeat of THIS call; results
        of earlier calls are overwritten.

    Args:
        Xtrain: 2-D training feature matrix; its second dimension sets the input width.
        ytrain: Training labels.
        Xtest: Test feature matrix with the same number of columns as Xtrain.
        ytest: Test labels.
        n_repeats: Number of independent train/evaluate runs.
        epochs: Training epochs per run.

    Returns:
        List with one test accuracy per repeat.
    """
    global best_model, history

    # The network is fitted on Xtrain, so the input width is taken from it.
    n_words = Xtrain.shape[1]
    scores = []
    # -inf guarantees the first run is always retained, even at 0.0 accuracy.
    best_acc = float('-inf')

    for _ in range(n_repeats):
        model = Sequential()
        model.add(Dense(50, input_shape=(n_words,), activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        run_history = model.fit(Xtrain, ytrain, epochs=epochs, verbose=2)

        _, acc = model.evaluate(Xtest, ytest, verbose=0)
        scores.append(acc)

        if acc > best_acc:
            best_acc = acc
            best_model = model
            history = run_history

    return scores


def prepare_data(train_docs, test_docs, mode):
    """Encode training and test documents as float32 bag-of-words matrices.

    A new Tokenizer is fitted on `train_docs` only, and both document lists
    are then converted with ``texts_to_matrix`` using the given `mode`.

    Args:
        train_docs: Training documents, one string each.
        test_docs: Test documents, one string each.
        mode: Scoring mode forwarded to ``Tokenizer.texts_to_matrix``.

    Returns:
        Tuple ``(Xtrain, Xtest)`` of float32 NumPy arrays.
    """
    encoder = Tokenizer()
    encoder.fit_on_texts(train_docs)  # word index comes from the training data only

    def encode(docs):
        # Convert to a float32 array of the matrix representation.
        return np.array(encoder.texts_to_matrix(docs, mode=mode), dtype=np.float32)

    return encode(train_docs), encode(test_docs)

In [None]:
# Keep only the tokens counted at least `min_occurance` times.
min_occurance = 2
tokens = [word for word, count in vocab.items() if count >= min_occurance]
print(len(tokens))

In [None]:
def save_list(lines, filename):
    """Write strings to a file, one per line, without a trailing newline.

    Args:
        lines: Iterable of strings to write.
        filename: Destination path; an existing file is overwritten.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    data = '\n'.join(lines)
    # The context manager closes the handle even when write() raises.
    with open(filename, 'w') as file:
        file.write(data)

# Persist the filtered vocabulary, one token per line.
save_list(tokens, 'vocab.txt')
# NOTE(review): the calls below are disabled; test_tokens, tags and test_tags are not
# defined anywhere in this notebook.
# save_list(test_tokens, 'test_vocab.txt')
# save_list(tags, 'tags.txt')
# save_list(test_tags, 'test_tags.txt')

In [None]:
# Reload the saved vocabulary as a set; from here on `vocab` is a set, no longer a Counter.
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())
print(len(vocab))
print(vocab)


In [None]:
# Training split: reviews whose filename does not start with 'cv9'.
positive_lines = process_docs('pos', vocab, True)
negative_lines = process_docs('neg', vocab, True)
# Held-out split: only the reviews whose filename starts with 'cv9'.
test_positive_lines = process_docs('pos', vocab, False)
test_negative_lines = process_docs('neg', vocab, False)

In [None]:
# Negatives first, then positives; the label arrays (0s followed by 1s) depend on this order.
train_docs = negative_lines + positive_lines
test_docs = test_negative_lines + test_positive_lines

In [None]:
# Labels follow the document order: negatives (0) first, then positives (1).
# Sizes are derived from the data so the labels cannot drift out of step with it.
ytrain = np.array([0] * len(negative_lines) + [1] * len(positive_lines))
ytest = np.array([0] * len(test_negative_lines) + [1] * len(test_positive_lines))
assert len(ytrain) == len(train_docs), 'training labels and documents are misaligned'
assert len(ytest) == len(test_docs), 'test labels and documents are misaligned'

# Collects one column of accuracy scores per encoding mode.
result = pd.DataFrame()

In [None]:
# Encoding modes to evaluate; only 'binary' is active, the full list is kept below for reference.
modes = ['binary']
# modes = ['binary', 'count', 'tfidf', 'freq']


# For each mode: encode the documents, train and score the model, and record the scores.
for mode in modes:
    Xtrain, Xtest = prepare_data(train_docs, test_docs, mode)
    score = evaluate_mode(Xtrain, ytrain, Xtest, ytest)
    # One column per mode; each row is the accuracy of one repeat.
    result[mode] = score

    print('Mode:', mode)
    print('Accuracy: %.3f (%.3f)' % (np.mean(score), np.std(score)))
    print()

# Show the collected accuracies as a table and as a box plot.
print(result)
result.boxplot()
plt.show()




In [None]:

def predict_sentiment(review, vocab, tokenizer, model, mode='binary'):
    """Classify a raw review as 'positive' or 'negative'.

    The review is cleaned with clean_doc, reduced to tokens present in
    `vocab`, encoded with the fitted tokenizer and scored by the model.

    Args:
        review: Raw review text.
        vocab: Collection of allowed tokens.
        tokenizer: Fitted tokenizer exposing ``texts_to_matrix``.
        model: Trained model whose ``predict`` output at ``[0, 0]`` is read as
            the score for the positive class.
        mode: Encoding mode passed to ``texts_to_matrix``; it should match the
            mode the model was trained with.

    Returns:
        Tuple ``(sentiment, confidence)``. ``confidence`` is the score of the
        predicted class: the model output for 'positive', one minus the
        model output for 'negative'.
    """
    tokens = [w for w in clean_doc(review, vocab) if w in vocab]
    line = ' '.join(tokens)

    encoded = tokenizer.texts_to_matrix([line], mode=mode)
    yhat = model.predict(encoded, verbose=0)

    prob_positive = float(yhat[0, 0])
    if prob_positive >= 0.5:
        return 'positive', prob_positive
    # Report confidence in the label actually returned, not in the positive class.
    return 'negative', 1.0 - prob_positive




In [None]:
# Fit a tokenizer on the training documents so new reviews are encoded from the same text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_docs)

sample_reviews = (
    'Everyone will enjoy this film. I love it, recommended!',
    # 'good better',
    'bad movie ever!',
)
for text in sample_reviews:
    sentiment, confidence = predict_sentiment(text, vocab, tokenizer, best_model)
    print(f'Sentiment: {sentiment}, Confidence: {confidence:.6f}')


In [None]:

# Plot the per-epoch training accuracy and loss recorded for the retained model.
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
plt.title('Model accuracy and loss')
plt.xlabel('Epoch')
plt.ylabel('Accuracy / Loss')
plt.legend(['Train accuracy', 'Train loss'], loc='upper left')
plt.show()


In [None]:
# Show the architecture summary of the model retained by evaluate_mode().
best_model.summary()
