# Sentiment Analysis — Word2Vec (Google News) + TensorFlow + Ngrok

**This notebook uses the pretrained Google News Word2Vec (300-dim)**, builds a Keras model that uses those embeddings, visualizes dataset splits and training curves, then launches a modern Tailwind-styled chatbox served by Flask and exposed with pyngrok. 

**Important:** The Google News Word2Vec model (~1.6GB) will be downloaded via `gensim.downloader`. Make sure you have enough free disk space and a stable connection in Colab.

Run the cells sequentially (or **Run all**).

## 1 — Install & imports

Installs: `gensim` (for pretrained word2vec), `pyngrok` (tunnel), and other dependencies.

In [None]:
# Install required packages
!pip install -q tensorflow tensorflow-datasets gensim==4.4.0 pyngrok matplotlib scikit-learn

# Imports
import os, re, pickle, json, time
import numpy as np, matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim import downloader as api
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
print('TensorFlow', tf.__version__)

## 2 — Load IMDB dataset (tfds) and quick inspection

In [None]:
def _collect_examples(dataset):
    """Materialise a supervised tfds split into two Python lists.

    Returns (texts, labels): texts are UTF-8 decoded strings, labels are ints.
    """
    texts, labels = [], []
    for raw_text, raw_label in tfds.as_numpy(dataset):
        texts.append(raw_text.decode('utf-8'))
        labels.append(int(raw_label))
    return texts, labels


# Load both IMDB splits as (text, label) pairs and pull them into memory
ds_train, ds_test = tfds.load('imdb_reviews', split=['train', 'test'], as_supervised=True)
train_texts, train_labels = _collect_examples(ds_train)
test_texts, test_labels = _collect_examples(ds_test)

print('Train samples:', len(train_texts), 'Test samples:', len(test_texts))

## 3 — Preprocess (clean) and visualize dataset distribution

In [None]:
def clean_text(s):
    """Normalise a raw review string for tokenisation.

    Lower-cases the text, then replaces (in this order) HTML-like tags,
    every character that is not a-z, 0-9, whitespace or an apostrophe,
    and runs of whitespace with a single space. Leading and trailing
    whitespace is stripped from the result.
    """
    cleaned = s.lower()
    # Order matters: tags are removed before their '<', '>' and '/' characters
    # would otherwise be blanked out individually by the second pattern.
    for pattern in (r"<.*?>", r"[^a-z0-9\s']", r"\s+"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()

# Apply the same cleaning function to both splits
train_clean = list(map(clean_text, train_texts))
test_clean = list(map(clean_text, test_texts))

# Label balance over train and test combined
all_labels = np.array(train_labels + test_labels)
labels, counts = np.unique(all_labels, return_counts=True)
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(['neg', 'pos'], counts)
ax.set_title('Label distribution (train+test)')
ax.set_ylabel('Count')
plt.show()

# Histogram of per-review token counts on the cleaned training split
from tensorflow.keras.preprocessing.text import text_to_word_sequence
lengths = [len(text_to_word_sequence(review)) for review in train_clean]
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(lengths, bins=40)
ax.set_title('Train text token length distribution')
ax.set_xlabel('Tokens')
ax.set_ylabel('Count')
plt.show()

## 4 — Tokenize, split and prepare sequences

We use a fixed `VOCAB_SIZE` and `MAXLEN`. The tokenizer is fit on the training split only.

In [None]:
# Vocabulary cap and fixed sequence length; both are reused by later cells
VOCAB_SIZE = 30000
MAXLEN = 200

# Build the vocabulary from the cleaned training reviews only
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(train_clean)

# Integer-encode the reviews, then pad / cut at the END so every row has MAXLEN ids
X = pad_sequences(
    tokenizer.texts_to_sequences(train_clean),
    maxlen=MAXLEN,
    padding='post',
    truncating='post',
)
y = np.array(train_labels)

# Hold out a stratified 10% of the training split for validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)
print('Shapes:', X_train.shape, X_val.shape, y_train.shape)

## 5 — Download pretrained Word2Vec (Google News, 300-dim)

This downloads the `word2vec-google-news-300` model via `gensim`.
**Note:** ~1.6GB download — may take several minutes.

In [None]:
# Fetch the pretrained Google News vectors through gensim-data (slow: large download)
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')  # KeyedVectors
pretrained_vocab_size = len(wv.key_to_index)
print('Loaded pretrained vectors. Vocab size:', pretrained_vocab_size)

## 6 — Build embedding matrix mapping tokenizer -> pretrained vectors

Words not found in pretrained vectors get small random vectors.

In [None]:
EMBEDDING_DIM = 300
# Fail early if the loaded vectors do not have the dimensionality assumed here
if wv.vector_size != EMBEDDING_DIM:
    raise ValueError(f'Pretrained vectors have dim {wv.vector_size}, expected {EMBEDDING_DIM}')

word_index = tokenizer.word_index
# +1 because Keras Tokenizer indices start at 1 (row 0 is reserved for padding)
num_words = min(VOCAB_SIZE, len(word_index) + 1)

# Seeded generator so the random fallback vectors are identical on every run
rng = np.random.default_rng(42)
embedding_matrix = rng.normal(scale=0.01, size=(num_words, EMBEDDING_DIM)).astype(np.float32)
# Row 0 is the padding id produced by pad_sequences; keep it a neutral zero vector
embedding_matrix[0] = 0.0

found = 0
for word, i in word_index.items():
    # word_index contains the full vocabulary; skip ids beyond the matrix
    if i >= num_words:
        continue
    if word in wv:
        embedding_matrix[i] = wv[word]
        found += 1
print('Embedding matrix shape:', embedding_matrix.shape, 'Found pretrained vectors for', found, 'words')

## 7 — Build model (Embedding with pretrained weights -> BiLSTM -> GlobalMaxPool -> Dense)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, GlobalMaxPool1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

def build_model(num_words=num_words, embed_dim=EMBEDDING_DIM, embedding_matrix=embedding_matrix):
    """Build and compile the sentiment classifier.

    Architecture: frozen pretrained Embedding -> BiLSTM(128, sequences)
    -> GlobalMaxPool1D -> Dense(128, relu) -> Dropout(0.4) -> Dense(1, sigmoid).

    Args:
        num_words: number of rows in the embedding table (vocabulary size).
        embed_dim: embedding dimensionality.
        embedding_matrix: array of shape (num_words, embed_dim) holding the
            initial (non-trainable) embedding weights.

    Returns:
        A compiled Keras model taking integer sequences of length MAXLEN.

    Raises:
        ValueError: if embedding_matrix does not have shape (num_words, embed_dim).
    """
    embedding_matrix = np.asarray(embedding_matrix, dtype=np.float32)
    if embedding_matrix.shape != (num_words, embed_dim):
        raise ValueError(
            f'embedding_matrix has shape {embedding_matrix.shape}, '
            f'expected {(num_words, embed_dim)}'
        )

    embed = Embedding(input_dim=num_words, output_dim=embed_dim,
                      trainable=False, name='pretrained_embed')
    model = Sequential([
        # Explicit Input replaces the deprecated `input_length` argument and
        # builds the layers immediately, so the weights can be assigned below.
        tf.keras.Input(shape=(MAXLEN,), dtype='int32'),
        embed,
        Bidirectional(LSTM(128, return_sequences=True)),
        GlobalMaxPool1D(),
        Dense(128, activation='relu'),
        Dropout(0.4),
        Dense(1, activation='sigmoid')
    ])
    # Assign the pretrained vectors after building instead of relying on the
    # legacy `weights=` constructor argument.
    embed.set_weights([embedding_matrix])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = build_model()
model.summary()

## 8 — Train model (with ModelCheckpoint & EarlyStopping)

We record history to plot accuracy and loss curves later.

In [None]:
checkpoint_path = 'sentiment_best.keras'

# Stop when val_loss has not improved for 3 epochs and restore the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Write a checkpoint only when val_loss improves
best_checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True, monitor='val_loss')
callbacks = [early_stop, best_checkpoint]

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=8,
    batch_size=128,
    callbacks=callbacks,
)

## 9 — Plot training & validation accuracy / loss

In [None]:
# Training vs. validation curves, one panel per metric
hist = history.history
epochs = range(1, len(hist['loss'])+1)

# (panel title, train key, validation key, train legend, validation legend)
panels = [
    ('Accuracy', 'accuracy', 'val_accuracy', 'train_acc', 'val_acc'),
    ('Loss', 'loss', 'val_loss', 'train_loss', 'val_loss'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (title, train_key, val_key, train_legend, val_legend) in zip(axes, panels):
    ax.plot(epochs, hist[train_key], label=train_legend)
    ax.plot(epochs, hist[val_key], label=val_legend)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.legend()
plt.show()

## 10 — Evaluate on Test Set (IMDB test split)

In [None]:
# Prepare test sequences using the tokenizer.
# truncating='post' must match the training cell: pad_sequences defaults to
# truncating='pre', which would keep the LAST MAXLEN tokens of long reviews
# while the model was trained on the FIRST MAXLEN tokens.
X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_clean),
    maxlen=MAXLEN, padding='post', truncating='post',
)
y_test = np.array(test_labels)

# Load best model (checkpoint written during training) and evaluate
best = tf.keras.models.load_model('sentiment_best.keras')
loss, acc = best.evaluate(X_test, y_test, verbose=1)
print('Test accuracy:', acc)

## 11 — Save model (.keras) and tokenizer

In [None]:
# Export the evaluated model (saved with include_optimizer=False) and the fitted tokenizer
model_save_path = 'sentiment_w2v_model.keras'
best.save(model_save_path, include_optimizer=False)

with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

print('Saved model and tokenizer:', model_save_path, 'tokenizer.pickle')

## 12 — Load saved model and test single predictions

In [None]:
loaded = tf.keras.models.load_model('sentiment_w2v_model.keras')
# NOTE: pickle.load can execute arbitrary code; only load a tokenizer file
# that this notebook wrote itself, never one from an untrusted source.
with open('tokenizer.pickle', 'rb') as f:
    tk = pickle.load(f)

def predict_sentiment(text):
    """Return the model's sigmoid output for `text` as a float in [0, 1].

    The text goes through the same steps as the training data: clean_text,
    tokenisation, then post-padding and post-truncation to MAXLEN. Values
    closer to 1 correspond to label 1 of the training data.
    """
    t = clean_text(text)
    # truncating='post' matches training; the pad_sequences default ('pre')
    # would feed the model the tail of long inputs instead of the head.
    seq = pad_sequences(tk.texts_to_sequences([t]), maxlen=MAXLEN,
                        padding='post', truncating='post')
    # verbose=0 suppresses the per-call progress bar
    prob = float(loaded.predict(seq, verbose=0)[0][0])
    return prob

for s in ['I loved this movie', 'I hated this film', 'It was okay, not great']:
    print(s, '->', predict_sentiment(s))

## 13 — Flask app with Tailwind chat UI + pyngrok

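This cell starts a Flask app and exposes it via `pyngrok`. The chat UI uses the Tailwind CDN for styling. When you run this cell in Colab, it prints the public ngrok URL. Note that ngrok generally requires an account authtoken to be configured (for example through the `NGROK_AUTHTOKEN` environment variable) before a tunnel can be opened; do not hardcode the token in the notebook.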

In [None]:
# Flask + pyngrok server
!pip install -q flask
from flask import Flask, request, jsonify, render_template_string
from pyngrok import ngrok

app = Flask(__name__)

# Load tokenizer & model written by the save cell
with open('tokenizer.pickle', 'rb') as f:
    tokenizer = pickle.load(f)
model = tf.keras.models.load_model('sentiment_w2v_model.keras')

# Tailwind-styled HTML template (inline).
# User-supplied text is inserted with textContent / DOM nodes only, so a message
# containing markup is displayed literally instead of being parsed as HTML.
TEMPLATE = '''
<!doctype html>
<html>
<head>
<meta charset="utf-8" />
<title>Sentiment Chatbox</title>
<script src="https://cdn.tailwindcss.com"></script>
</head>
<body class="bg-gray-100 min-h-screen flex items-center justify-center p-4">
  <div class="w-full max-w-2xl bg-white rounded-2xl shadow-lg p-6">
    <h2 class="text-2xl font-semibold mb-4">Sentiment Chatbox</h2>
    <div id="chat" class="h-80 overflow-y-auto border rounded p-3 space-y-3 bg-gray-50"></div>
    <div class="flex gap-2 mt-4">
      <input id="msg" class="flex-1 border rounded p-2" placeholder="Type a message..." />
      <button onclick="send()" class="bg-blue-600 text-white px-4 py-2 rounded">Send</button>
    </div>
  </div>
<script>
// Append one chat bubble and scroll it into view.
// `fill` receives the bubble element and must populate it with text nodes only.
function addBubble(alignClass, colorClasses, fill){
  const chat = document.getElementById('chat');
  const wrapper = document.createElement('div');
  const row = document.createElement('div');
  row.className = alignClass;
  const bubble = document.createElement('span');
  bubble.className = 'inline-block ' + colorClasses + ' px-3 py-1 rounded';
  fill(bubble);
  row.appendChild(bubble);
  wrapper.appendChild(row);
  chat.appendChild(wrapper);
  chat.scrollTop = chat.scrollHeight;
}

async function send(){
  const input = document.getElementById('msg');
  const msg = input.value;
  if(!msg) return;
  addBubble('text-right', 'bg-blue-100 text-blue-800', function(bubble){
    bubble.textContent = msg;
  });
  input.value = '';
  // send to server
  try {
    const resp = await fetch('/predict', {
      method: 'POST', headers: {'Content-Type':'application/json'}, body: JSON.stringify({text: msg})
    });
    if(!resp.ok) throw new Error('HTTP ' + resp.status);
    const data = await resp.json();
    const score = parseFloat(data.probability);
    const label = score >= 0.5 ? 'Positive' : 'Negative';
    addBubble('text-left', 'bg-gray-100 text-gray-900', function(bubble){
      const strong = document.createElement('strong');
      strong.textContent = label;
      bubble.append('Sentiment: ', strong, ' (' + score.toFixed(4) + ')');
    });
  } catch (err) {
    // Surface failures in the chat instead of failing silently
    addBubble('text-left', 'bg-red-100 text-red-800', function(bubble){
      bubble.textContent = 'Error: ' + err.message;
    });
  }
}
</script>
</body>
</html>
'''

@app.route('/')
def index():
    """Serve the chat page rendered from the inline TEMPLATE string."""
    page = render_template_string(TEMPLATE)
    return page

@app.route('/predict', methods=['POST'])
def predict():
    """Score the sentiment of the posted text.

    Expects a JSON object body with an optional string field 'text'
    (defaults to ''). Responds with {'probability': <float>} holding the
    model's sigmoid output, or with a 400 and {'error': ...} when the body
    is not a JSON object or 'text' is not a string.
    """
    # silent=True yields None on unparsable bodies so we can answer with JSON
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400
    text = data.get('text', '')
    if not isinstance(text, str):
        return jsonify({'error': "'text' must be a string"}), 400

    t = clean_text(text)
    # truncating='post' matches how the training sequences were built; the
    # pad_sequences default ('pre') would keep the tail of long messages.
    seq = pad_sequences(tokenizer.texts_to_sequences([t]), maxlen=MAXLEN,
                        padding='post', truncating='post')
    # verbose=0 keeps per-request progress bars out of the server log
    prob = float(model.predict(seq, verbose=0)[0][0])
    return jsonify({'probability': prob})

# Start ngrok tunnel and Flask app
PORT = 5000  # single source of truth for the tunnel target and the Flask server
public_url = ngrok.connect(PORT).public_url
print('Ngrok URL:', public_url)
try:
    # Blocks until the server is stopped (e.g. the cell is interrupted)
    app.run(port=PORT)
finally:
    # Close the tunnel so re-running the cell does not leave stale tunnels open
    ngrok.disconnect(public_url)