# Complete NLP Pipeline

In [1]:
!pip install opendatasets



In [2]:
import numpy as np
import tensorflow as tf
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import opendatasets as od

In [3]:
od.download('https://www.kaggle.com/datasets/ffatty/plain-text-wikipedia-simpleenglish')

Skipping, found downloaded files in "./plain-text-wikipedia-simpleenglish" (use force=True to force download)


In [4]:
with open('./plain-text-wikipedia-simpleenglish/AllCombined.txt') as f:
  corpus_lines = f.readlines()

corpus = corpus_lines[:50000]

with open("Sentences_50Agree.txt", encoding='ISO-8859-1') as f:
  sentiment_lines = f.readlines()

print(len(sentiment_lines))

4846


## Phase 1: BPE Tokenizer

In [5]:
class BPETokenizer():
  def __init__(self, vocab_size, min_frequency):
    self.corpus = corpus
    self.vocab_size = vocab_size
    self.min_frequency = min_frequency

    self.tokenizer = Tokenizer(models.BPE())
    self.tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    self.trainer = trainers.BpeTrainer(
        vocab_size=30000,
        min_frequency=1
    )

  def train(self, corpus):
    self.tokenizer.train_from_iterator(self.corpus, self.trainer)
    self.tokenizer.save("bpe_tokenizer.json")

    print("Saved Tokenizer.")

  def encode(self, text):
    return self.tokenizer.encode(text).ids
  def decode(self, ids):
    return self.tokenizer.decode(ids)


In [6]:
bpe = BPETokenizer(30000, 1)
bpe.train(corpus)

def encode(text):
  return bpe.encode(text)

Saved Tokenizer.


## Phase 2: Continous Bag of Words Model

In [7]:
window_size = 5
pairs = []

for line in corpus:
  tokens = encode(line.strip())
  for i in range(window_size, len(tokens) - window_size):
    context = tokens[i-window_size:i] + tokens[i+1:i+window_size+1]
    target = tokens[i]
    pairs.append((context, target))



In [8]:
rng = np.random.default_rng()

def generate_negative_samples(target, num_samples):
  negatives = []
  while len(negatives) < num_samples:
    neg = rng.integers(0, vocab_size)
    if neg != target:
      negatives.append(neg)
  return negatives

In [9]:
vocab_size = bpe.tokenizer.get_vocab_size()

contexts = []
targets = []
labels = []

num_negatives = 5

for context, target in pairs:
  contexts.append(context)
  targets.append(target)
  labels.append(1)

  negatives = generate_negative_samples(target, num_negatives)
  for neg in negatives:
    contexts.append(context)
    targets.append(neg)
    labels.append(0)


In [10]:
vocab_size = bpe.tokenizer.get_vocab_size()
embedding_dim = 384

context_input = tf.keras.Input(shape=(window_size * 2,))
target_input = tf.keras.Input(shape=())

embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

context_embed = embedding(context_input)
context_embed = tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(context_embed)

target_embed = embedding(target_input)

dot_product = tf.keras.layers.Dot(axes=1)([context_embed, target_embed])

output = tf.keras.layers.Activation("sigmoid")(dot_product)

cbow = tf.keras.Model([context_input, target_input], output)
cbow.compile(
    loss="binary_crossentropy",
    optimizer="adam"
)

cbow.summary()

In [11]:
contexts = np.array(contexts, dtype=np.int32)
targets  = np.array(targets, dtype=np.int32)
labels   = np.array(labels, dtype=np.float32)


In [12]:
cbow.fit([contexts, targets], labels, epochs=5, batch_size=256)

Epoch 1/5
[1m25139/25139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 4ms/step - loss: 0.2784
Epoch 2/5
[1m25139/25139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 4ms/step - loss: 0.1925
Epoch 3/5
[1m25139/25139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 4ms/step - loss: 0.1377
Epoch 4/5
[1m25139/25139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 4ms/step - loss: 0.0908
Epoch 5/5
[1m25139/25139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 4ms/step - loss: 0.0587


<keras.src.callbacks.history.History at 0x7d4d7a8bf350>

In [13]:
embeddings = embedding.get_weights()[0]

## Phase 3: Sentiment Analysis

In [14]:
label_map = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

texts = []
labels = []

for line in sentiment_lines:
  parts = line.strip().rsplit("@", 1)
  if len(parts) != 2:
    continue
  text, label = parts
  texts.append(text)
  labels.append(label_map[label.lower()])

print(labels[:5])

[1, 1, 0, 2, 2]


In [15]:
def sentence_vector(text):
  token_ids = encode(text)
  if not token_ids:
    return np.zeros(embedding_dim)
  return np.mean(embeddings[token_ids], axis=0)

In [16]:
X = np.array([sentence_vector(t) for t in texts])
y = np.array(labels)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2,
)


In [18]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [19]:
y_pred = clf.predict(X_test)

print(classification_report(
    y_test, y_pred,
    target_names=["negative", "neutral", "positive"]
))


              precision    recall  f1-score   support

    negative       0.57      0.33      0.41       126
     neutral       0.73      0.86      0.79       576
    positive       0.58      0.47      0.52       268

    accuracy                           0.68       970
   macro avg       0.62      0.55      0.57       970
weighted avg       0.66      0.68      0.66       970



## Evaluation: Comparison with VADER baseline

In [20]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [21]:
from nltk.sentiment import SentimentIntensityAnalyzer

vader = SentimentIntensityAnalyzer()

In [22]:
def vader_label(text):
    score = vader.polarity_scores(text)['compound']

    if score >= 0.05:
        return 2  # positive
    elif score <= -0.05:
        return 0  # negative
    else:
        return 1  # neutral

In [23]:
vader_preds = [vader_label(t) for t in texts]
y_true = y  # from earlier pipeline

In [24]:
from sklearn.metrics import classification_report, accuracy_score

print("VADER Accuracy:", accuracy_score(y_true, vader_preds))

print(classification_report(
    y_true, vader_preds,
    target_names=["negative", "neutral", "positive"]
))

VADER Accuracy: 0.5429219975237309
              precision    recall  f1-score   support

    negative       0.40      0.30      0.34       604
     neutral       0.74      0.52      0.61      2879
    positive       0.40      0.71      0.51      1363

    accuracy                           0.54      4846
   macro avg       0.52      0.51      0.49      4846
weighted avg       0.60      0.54      0.55      4846



Compared to VADER Baseline with an accuracy if 54% and macro avg of 0.49, our model achieves a much better accuracy of 68% and macro avg of 0.57

### Semantic Analogy Test

In [25]:
def word_vector(word):
    token_ids = encode(word)
    if not token_ids:
        return np.zeros(embedding_dim)
    return np.mean(embeddings[token_ids], axis=0)

In [26]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [27]:
vocab = bpe.tokenizer.get_vocab()  # token → id
id_to_token = {id: tok for tok, id in vocab.items()}

In [28]:
def most_similar(vec, top_k=5):
    sims = []

    for i in range(len(embeddings)):
        sim = cosine_similarity(vec, embeddings[i])
        sims.append((i, sim))

    sims.sort(key=lambda x: x[1], reverse=True)

    results = [(id_to_token[i], sim) for i, sim in sims[:top_k]]
    return results


In [29]:
def analogy(a, b, c):
    vec = word_vector(a) - word_vector(b) + word_vector(c)
    return most_similar(vec)

In [30]:
tests = [
    ("king", "man", "woman"),
    ("brother", "man", "woman"),
]

for a, b, c in tests:
    print(f"{a} - {b} + {c} ≈")
    print(most_similar(word_vector(a) - word_vector(b) + word_vector(c)))
    print()

king - man + woman ≈
[('king', np.float32(0.7313687)), ('woman', np.float32(0.5002279)), ('queen', np.float32(0.41642794)), ('monarch', np.float32(0.31645358)), ('Gangnihessou', np.float32(0.29481384))]

brother - man + woman ≈
[('brother', np.float32(0.68479484)), ('woman', np.float32(0.5085155)), ('sister', np.float32(0.37139127)), ('younger', np.float32(0.37114373)), ('aunt', np.float32(0.33577555))]

