In [None]:
%pip install -q gensim nltk tensorflow numpy matplotlib 

In [None]:
import nltk

# Fetch every NLTK resource used below. 'punkt_tab' is the tokenizer data
# that recent NLTK releases look up for word_tokenize; 'punkt' is kept so
# older releases continue to work.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'movie_reviews'):
    nltk.download(resource)

In [None]:
import tensorflow as tf
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

from nltk.corpus import movie_reviews
# Make sure the movie_reviews corpus is loaded before it is used below.
movie_reviews.ensure_loaded()

# English stop words, kept as a set; preprocess_text tests membership against it.
stop_words = set()
stop_words.update(stopwords.words('english'))

In [None]:
def preprocess_text(text):
    """Lower-case, tokenize and filter a raw text string.

    Args:
        text: The raw document as a single string.

    Returns:
        A list of lower-cased tokens with stop words and punctuation-only
        tokens removed.
    """
    punctuation_chars = set(string.punctuation)
    tokens = word_tokenize(text.lower())
    # A token is dropped when *every* character is punctuation. The previous
    # `t in string.punctuation` check was a substring test against one long
    # string, so multi-character tokens such as '--', '...' or '``' slipped
    # through the filter.
    return [
        t for t in tokens
        if t not in stop_words
        and not all(ch in punctuation_chars for ch in t)
    ]

In [None]:
# One token list per review: the corpus words of each file are re-joined
# into a single string and passed through preprocess_text.
sentences = [
    preprocess_text(' '.join(movie_reviews.words(file_id)))
    for file_id in movie_reviews.fileids()
]

In [None]:
# Preview the first 20 tokens of the first three reviews; echoing the whole
# corpus would flood the notebook output.
[tokens[:20] for tokens in sentences[:3]]

In [None]:
# Fit word embeddings on the tokenized reviews.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)

In [None]:
# Sanity check: look up and print the learned vector for the word 'good'.
good_vector = model.wv['good']
print(good_vector)

In [None]:
# List the GPU devices visible to TensorFlow.
gpus = tf.config.list_physical_devices('GPU')
# Report the count, as the label promises, rather than the raw device list.
print("No. of GPUs:", len(gpus))
# tf.test.is_gpu_available() is deprecated in favour of checking the device
# list returned by tf.config.list_physical_devices('GPU').
print("TensorFlow is using GPU:", bool(gpus))

In [None]:
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from nltk.corpus import movie_reviews

In [None]:
# Build one raw-text string and one binary label per review file.
movie_reviews.ensure_loaded()
review_ids = movie_reviews.fileids()

# Each review becomes its corpus words joined by single spaces.
sentences = [' '.join(movie_reviews.words(review_id))
             for review_id in review_ids]

# File ids that start with 'pos' are labelled 1, all others 0.
labels = [int(review_id.startswith('pos')) for review_id in review_ids]

In [None]:
# Word2Vec expects an iterable of token lists. `sentences` holds one plain
# string per review, and iterating a string yields single characters, so
# passing it directly trains character vectors instead of word vectors.
# The strings were built by joining corpus words with single spaces, so
# split() recovers the word tokens; lower() matches the lower-casing the
# Keras Tokenizer applies by default when it builds word_index.
tokenized_reviews = [review.lower().split() for review in sentences]
model_w2v = Word2Vec(tokenized_reviews, vector_size=100,
                     window=5, min_count=5, workers=4)

In [None]:
# Map every review to a sequence of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Pad all sequences to the length of the longest review.
max_len = max(map(len, sequences))
data = pad_sequences(sequences, maxlen=max_len)

# Hold out 20% of the reviews; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, random_state=42)

# Convert the label lists to column vectors of shape (n_samples, 1).
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

In [None]:
# Lookup table for the Embedding layer: row i holds the Word2Vec vector of
# the word whose tokenizer index is i. Rows for words that Word2Vec does not
# know are left as zeros.
# The width is read from the trained vectors instead of repeating the
# literal 100, so it cannot drift from the Word2Vec vector_size setting.
embedding_dim = model_w2v.wv.vector_size
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in model_w2v.wv:
        embedding_matrix[i] = model_w2v.wv[word]

In [None]:
# Classifier: frozen pre-trained embeddings -> LSTM -> single sigmoid unit.
model = Sequential([
    Embedding(
        len(tokenizer.word_index) + 1,
        100,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,  # keep the Word2Vec vectors fixed during training
    ),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs. Note that the held-out test split is also passed as
# the validation data, so the validation curves are test-set curves.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test, y_test),
)

In [None]:
# evaluate() returns the loss followed by the compiled metrics (accuracy only).
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print("Test Accuracy:", accuracy)

In [None]:
import matplotlib.pyplot as plt

# Plot training history
def plot_history(history):
    """Plot training vs. validation accuracy and loss side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping with the
            keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss', each
            holding one value per epoch.
    """
    # (history key, human-readable label) for the left and right panels.
    panels = (
        ('accuracy', 'Accuracy'),
        ('loss', 'Loss'),
    )

    plt.figure(figsize=(10, 5))
    for position, (metric, label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric], label=f'Training {label}')
        plt.plot(history.history[f'val_{metric}'], label=f'Validation {label}')
        plt.title(f'Training and Validation {label}')
        plt.xlabel('Epoch')
        plt.ylabel(label)
        plt.legend()

    plt.tight_layout()
    plt.show()

# Draw the accuracy and loss curves for the `history` returned by model.fit.
plot_history(history)
