In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

### Note: some code cells are repeated across tasks because I worked on the tasks at different times.

### task 1

In [9]:
def set_seed(seed=42):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        seed: Integer seed applied to all three generators (default 42).
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

# Fix the random seeds before any data handling or model code runs.
set_seed()
# NOTE(review): '/content/...' looks like a Colab upload path - adjust when running elsewhere.
df = pd.read_csv('/content/IMDB Dataset.csv')
# Binarise the sentiment column in place: 'positive' -> 1, every other value -> 0.
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# The column already holds integers at this point; the encoder re-encodes them
# into 'label_encoded', which is the target used for the split below.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['sentiment'])

# Hold out 20% of the reviews for validation; random_state makes the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review'].values,
    df['label_encoded'].values,
    test_size=0.2,
    random_state=42
)

In [10]:
# Vocabulary size and fixed sequence length shared by the task 1 models.
vocab_size = 10000
maxlen = 200

# Fit the vocabulary on the training texts only; words that are not kept in the
# vocabulary are replaced by the "<OOV>" token when texts are converted to ids.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)

# Bring every review to exactly `maxlen` ids; short reviews are zero-padded at the end.
train_padded = pad_sequences(train_sequences, padding='post', maxlen=maxlen)
val_padded = pad_sequences(val_sequences, padding='post', maxlen=maxlen)

train_labels = np.array(train_labels)
val_labels = np.array(val_labels)

In [None]:
# Baseline recurrent classifier: embedding -> SimpleRNN -> sigmoid probability.
rnn_model = tf.keras.Sequential([
    # Explicit input spec replaces the deprecated `input_length` Embedding argument.
    tf.keras.Input(shape=(maxlen,)),
    # mask_zero=True marks the 0-valued post-padding as "no data", so the RNN's
    # final state comes from the last real token instead of from trailing padding.
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64, mask_zero=True),
    tf.keras.layers.SimpleRNN(128),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history_rnn = rnn_model.fit(train_padded, train_labels, epochs=5, validation_data=(val_padded, val_labels), batch_size=32)

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Layer that adds a fixed sinusoidal positional-encoding table to its input.

    The table has shape (1, maxlen, d_model) and is computed once at construction.
    """

    def __init__(self, maxlen, d_model):
        super().__init__()
        self.pos_encoding = self.positional_encoding(maxlen, d_model)

    def get_angles(self, pos, i, d_model):
        """Return pos / 10000^(2*(i//2)/d_model), broadcasting pos against i."""
        exponent = (2 * (i//2)) / np.float32(d_model)
        return pos / np.power(10000, exponent)

    def positional_encoding(self, maxlen, d_model):
        """Build the (1, maxlen, d_model) float32 encoding table."""
        positions = np.arange(maxlen)[:, np.newaxis]
        dims = np.arange(d_model)[np.newaxis, :]
        table = self.get_angles(positions, dims, d_model)

        # Even feature columns carry the sine, odd columns the cosine.
        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])

        return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

    def call(self, inputs):
        # Only the first `seq_len` rows of the table are added to the input.
        seq_len = tf.shape(inputs)[1]
        return inputs + self.pos_encoding[:, :seq_len, :]

# Functional-API model: embedding -> positional encoding -> one self-attention
# layer -> average over the sequence axis -> sigmoid probability.
inputs = tf.keras.Input(shape=(maxlen,))
x = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64)(inputs)
x = PositionalEncoding(maxlen, 64)(x)
# Self-attention: the same tensor is passed as both query and value.
x = tf.keras.layers.MultiHeadAttention(num_heads=4, key_dim=64)(x, x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

transformer_model = tf.keras.Model(inputs=inputs, outputs=outputs)

transformer_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Same training settings as the RNN baseline (5 epochs, batch size 32).
history_transformer = transformer_model.fit(train_padded, train_labels, epochs=5, validation_data=(val_padded, val_labels), batch_size=32)

### task 2

In [None]:
tf.random.set_seed(42)


def scaled_dot_product_attention(q, k, v):
    """Compute softmax(q @ k^T / sqrt(d_k)) @ v.

    Args:
        q: Query tensor whose last axis is the feature depth.
        k: Key tensor; its last axis must match the last axis of `q`.
        v: Value tensor; its second-to-last axis must match that of `k`.

    Returns:
        A tuple `(output, attention_weights)`: the weighted sum of the values and
        the softmax weights (normalised over the key axis).
    """
    matmul_qk = tf.matmul(q, k, transpose_b=True)

    # Cast the key depth to the logits' own dtype so float16 / float64 inputs
    # do not fail with a dtype mismatch in the division below.
    dk = tf.cast(tf.shape(k)[-1], matmul_qk.dtype)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # Normalise over the last (key) axis so each query's weights sum to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

    output = tf.matmul(attention_weights, v)

    return output, attention_weights

In [None]:
# Toy dimensions: one sequence of five tokens, each a 4-dimensional vector.
batch_size = 1
num_tokens = 5
depth = 4

# Random query / key / value tensors, each of shape (batch_size, num_tokens, depth).
q = tf.random.normal(shape=(batch_size, num_tokens, depth))
k = tf.random.normal(shape=(batch_size, num_tokens, depth))
v = tf.random.normal(shape=(batch_size, num_tokens, depth))

# Expects the two-value (output, weights) return of the task 2 definition.
output, attention_weights = scaled_dot_product_attention(q, k, v)

print("Attention Weights Shape:", attention_weights.shape)
print("Attention Weights:", attention_weights.numpy())


In [None]:
# Heat-map of the attention matrix for the first (only) batch element:
# rows are queries, columns are keys.
plt.matshow(attention_weights[0].numpy(), cmap='viridis')
plt.colorbar()
plt.title('Self-Attention Weights (Toy Example)')
plt.xlabel('Key')
plt.ylabel('Query')
plt.show()

### task 3

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Layer, Dense, Dropout, LayerNormalization, Embedding, Input, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

def set_seed(seed=42):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        seed: Integer seed applied to all three generators (default 42).
    """
    # Local import: this cell's import list does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

# Fix the random seeds before loading data for task 3.
set_seed()

# Re-load the raw CSV so this task can run without the task 1 cells.
df = pd.read_csv('/content/IMDB Dataset.csv')
# 'positive' -> 1, every other value -> 0 (overwrites the text column in place).
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Re-encodes the integer column into 'label_encoded', the target used below.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['sentiment'])

# 80/20 train/validation split, repeatable through random_state.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review'].values,
    df['label_encoded'].values,
    test_size=0.2,
    random_state=42
)

vocab_size = 10000
maxlen = 200

# Vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)

# Bring every review to exactly `maxlen` ids; short reviews are zero-padded at the end.
train_padded = pad_sequences(train_sequences, padding='post', maxlen=maxlen)
val_padded = pad_sequences(val_sequences, padding='post', maxlen=maxlen)

train_labels = np.array(train_labels)
val_labels = np.array(val_labels)


In [None]:
class PositionalEncoding(Layer):
    """Adds a precomputed sinusoidal position table of shape (1, maxlen, d_model)."""

    def __init__(self, maxlen, d_model):
        super().__init__()
        self.pos_encoding = self._positional_encoding(maxlen, d_model)

    def _get_angles(self, pos, i, d_model):
        """Angle for each (position, feature index) pair: pos / 10000^(2*(i//2)/d_model)."""
        rate_exponent = (2 * (i // 2)) / np.float32(d_model)
        return pos / np.power(10000, rate_exponent)

    def _positional_encoding(self, maxlen, d_model):
        """Return the float32 encoding table with a leading broadcast axis."""
        row_positions = np.arange(maxlen)[:, np.newaxis]
        feature_ids = np.arange(d_model)[np.newaxis, :]
        encoding = self._get_angles(row_positions, feature_ids, d_model)

        # Sine on even feature columns, cosine on odd ones.
        even_cols, odd_cols = slice(0, None, 2), slice(1, None, 2)
        encoding[:, even_cols] = np.sin(encoding[:, even_cols])
        encoding[:, odd_cols] = np.cos(encoding[:, odd_cols])

        return tf.cast(encoding[np.newaxis, ...], dtype=tf.float32)

    def call(self, x):
        # Slice the table to the input's sequence length before adding.
        length = tf.shape(x)[1]
        return x + self.pos_encoding[:, :length, :]


In [None]:
def scaled_dot_product_attention(q, k, v):
    """Compute softmax(q @ k^T / sqrt(d_k)) @ v and return only the attended values.

    Args:
        q: Query tensor whose last axis is the feature depth.
        k: Key tensor; its last axis must match the last axis of `q`.
        v: Value tensor; its second-to-last axis must match that of `k`.

    Returns:
        The weighted sum of the values (a single tensor; the attention weights
        themselves are not returned by this definition).
    """
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    # Cast the key depth to the logits' own dtype so float16 / float64 inputs
    # do not fail with a dtype mismatch in the division below.
    dk = tf.cast(tf.shape(k)[-1], matmul_qk.dtype)
    scaled_logits = matmul_qk / tf.math.sqrt(dk)
    # Normalise over the last (key) axis so each query's weights sum to 1.
    attention_weights = tf.nn.softmax(scaled_logits, axis=-1)
    output = tf.matmul(attention_weights, v)
    return output

class MultiHeadAttention(Layer):
    """Multi-head attention built from four Dense projections.

    `d_model` must be divisible by `num_heads`; each head works on
    `d_model // num_heads` features.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.depth = d_model // num_heads

        # Query / key / value projections followed by the output projection.
        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        self.dense = Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape to (batch, seq, heads, depth) and move the head axis to position 1."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q):
        """Project v/k/q, attend per head, merge the heads and project the result."""
        batch_size = tf.shape(q)[0]

        q_heads = self.split_heads(self.wq(q), batch_size)
        k_heads = self.split_heads(self.wk(k), batch_size)
        v_heads = self.split_heads(self.wv(v), batch_size)

        attended = scaled_dot_product_attention(q_heads, k_heads, v_heads)

        # Undo the head split: head axis back next to depth, then flatten both.
        merged = tf.reshape(
            tf.transpose(attended, perm=[0, 2, 1, 3]),
            (batch_size, -1, self.num_heads * self.depth),
        )
        return self.dense(merged)


In [None]:
class EncoderLayer(Layer):
    """One encoder block: self-attention and a two-layer feed-forward network,
    each followed by dropout, a residual connection and layer normalisation.

    Args:
        d_model: Feature width of the block's input and output.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        rate: Dropout rate used after both sub-layers.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = tf.keras.Sequential([
            Dense(dff, activation='relu'),
            Dense(d_model)
        ])

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, x, *, training=False):
        # Sub-layer 1: self-attention (x serves as value, key and query).
        attended = self.dropout1(self.mha(x, x, x), training=training)
        attn_block = self.layernorm1(x + attended)

        # Sub-layer 2: position-wise feed-forward network.
        transformed = self.dropout2(self.ffn(attn_block), training=training)
        return self.layernorm2(attn_block + transformed)


In [None]:
class TransformerClassifier(Model):
    """Encoder-only transformer for sequence classification.

    Args:
        num_layers: Number of stacked `EncoderLayer` blocks.
        d_model: Embedding / model feature width.
        num_heads: Attention heads per encoder block.
        dff: Hidden width of each block's feed-forward network.
        input_vocab_size: Number of token ids the embedding accepts.
        maximum_position_encoding: Length of the positional-encoding table.
        num_classes: Number of sigmoid output units.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, num_classes):
        super().__init__()
        self.embedding = Embedding(input_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(maximum_position_encoding, d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff) for _ in range(num_layers)]

        self.dropout = Dropout(0.1)
        self.global_avg_pool = GlobalAveragePooling1D()
        self.fc_out = Dense(num_classes, activation='sigmoid')

    def call(self, x, *, training=False):
        x = self.embedding(x)
        # Scale embeddings by sqrt(d_model) before adding the positional encoding.
        x *= tf.math.sqrt(tf.cast(self.embedding.output_dim, tf.float32))
        x = self.pos_encoding(x)
        # Apply the dropout layer created in __init__ (it was previously never
        # used); it is only active when training=True.
        x = self.dropout(x, training=training)

        for enc_layer in self.enc_layers:
            x = enc_layer(x, training=training)

        # Average over the sequence axis, then map to the output probabilities.
        x = self.global_avg_pool(x)
        return self.fc_out(x)


In [None]:
# Two encoder blocks, 64-dim model, 4 heads; a single sigmoid output unit for
# the binary sentiment label.
transformer_classifier = TransformerClassifier(
    num_layers=2,
    d_model=64,
    num_heads=4,
    dff=128,
    input_vocab_size=vocab_size,
    maximum_position_encoding=maxlen,
    num_classes=1
)

transformer_classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Trained for 3 epochs (the task 1 models use 5).
history_custom_transformer = transformer_classifier.fit(
    train_padded, train_labels,
    epochs=3,
    validation_data=(val_padded, val_labels),
    batch_size=32
)


### task 4

In [None]:
!pip install transformers datasets -q


In [None]:
!pip install --upgrade transformers


In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# Only the first 500 training and 200 validation reviews are used for BERT.
train_df = pd.DataFrame({'text': train_texts[:500], 'label': train_labels[:500]})
val_df = pd.DataFrame({'text': val_texts[:200], 'label': val_labels[:200]})

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

# NOTE: this rebinds `tokenizer`, which previously held the Keras Tokenizer.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize(batch):
    """Encode `batch['text']` with the module-level `tokenizer`.

    Every example is padded or truncated to exactly 128 tokens.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(batch['text'], **encode_options)

# Add the tokenizer outputs as new columns to both datasets.
train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

# Expose only the listed columns, as torch tensors.
train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Two output labels: negative (0) and positive (1).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./bert_output',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=50,
    logging_dir='./logs',
    disable_tqdm=True
)


# `trainer` is read again by the task 7 evaluation and the task 8 loss plot.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds
)

trainer.train()


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2")

prompt = "This movie was so amazing, I couldn't believe"

# Sampling is stochastic; seed torch so the generated text is repeatable.
torch.manual_seed(42)

# Calling the tokenizer (rather than `.encode`) also yields the attention mask.
encoded_prompt = gpt_tokenizer(prompt, return_tensors="pt")
inputs = encoded_prompt["input_ids"]

# GPT-2 defines no pad token, so pass the attention mask and reuse EOS as the
# pad id explicitly instead of letting `generate` guess (and warn).
gpt_outputs = gpt_model.generate(
    inputs,
    attention_mask=encoded_prompt["attention_mask"],
    pad_token_id=gpt_tokenizer.eos_token_id,
    max_length=50,
    num_return_sequences=1,
    do_sample=True,
)

generated_text = gpt_tokenizer.decode(gpt_outputs[0], skip_special_tokens=True)
print(generated_text)


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# The "summarize: " prefix is prepended to the first review in `df`.
sample_review = "summarize: " + df['review'].iloc[0]

# Inputs longer than 512 tokens are truncated.
inputs = t5_tokenizer.encode(sample_review, return_tensors="pt", max_length=512, truncation=True)

# Beam search with 4 beams; the summary is constrained to 10-50 tokens.
outputs = t5_model.generate(inputs, max_length=50, min_length=10, length_penalty=2.0, num_beams=4, early_stopping=True)

summary = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Summary:", summary)


### task 5


In [None]:
!pip install spacy -q
!python -m spacy download en_core_web_sm


In [None]:
!pip install transformers -q


In [None]:
import spacy

# Requires the en_core_web_sm model downloaded in the install cell above.
nlp = spacy.load("en_core_web_sm")

text = """Apple is looking at buying U.K. startup for $1 billion. Elon Musk met with the board in San Francisco."""

# `doc` is reused by the spaCy POS-tagging cell further down.
doc = nlp(text)

# One line per entity: its text (left-aligned to 20 chars) and its label.
for ent in doc.ents:
    print(f"{ent.text:<20} {ent.label_}")


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# NOTE: rebinds the notebook-global names `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

# aggregation_strategy="simple" makes the pipeline return grouped entities
# (each result carries an 'entity_group' and a 'word').
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Same sentence as the spaCy cell, so the two outputs can be compared.
text = """Apple is looking at buying U.K. startup for $1 billion. Elon Musk met with the board in San Francisco."""
ner_results = nlp_ner(text)

for r in ner_results:
    print(f"{r['word']:<20} {r['entity_group']:<10} Score: {r['score']:.2f}")


In [None]:
# Relies on `doc` still being the spaCy Doc created in the spaCy NER cell.
# Prints each token with its `pos_` and `tag_` attributes.
for token in doc:
    print(f"{token.text:<15} {token.pos_:<10} {token.tag_}")


In [None]:
!pip install stanza -q
import stanza

stanza.download('en')
nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,pos')

# Use a dedicated name so the spaCy `doc` from the earlier cells is not
# overwritten; the spaCy POS cell can then be re-run after this one.
stanza_doc = nlp_stanza("Elon Musk is planning to launch another Tesla model next year.")
for sentence in stanza_doc.sentences:
    for word in sentence.words:
        print(f"{word.text:<15} POS: {word.upos}")


### task 6

In [None]:
!pip install datasets seqeval transformers -q


In [None]:
from datasets import load_dataset

# Load the "conll2003" dataset; its "train" and "validation" splits are used below.
dataset = load_dataset("conll2003")

In [None]:
from transformers import BertTokenizerFast

# Tag names, indexed by the integer ids stored in the "ner_tags" column.
label_list = dataset["train"].features["ner_tags"].feature.names
# NOTE: rebinds the notebook-global `tokenizer` to the cased BERT tokenizer.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align the word-level NER tags to sub-tokens.

    Uses the module-level `tokenizer`. Each sequence is padded or truncated to
    128 tokens. Only the first sub-token of every word keeps the word's tag;
    special/padding tokens and continuation sub-tokens receive -100.

    Args:
        examples: Batch with a "tokens" list (words per sentence) and a matching
            "ner_tags" list (integer tag per word).

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )

    def _align(word_ids, word_tags):
        # Walk the sub-tokens, remembering which word the previous one came from.
        aligned = []
        last_word = None
        for current_word in word_ids:
            starts_new_word = current_word is not None and current_word != last_word
            aligned.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        return aligned

    tokenized_inputs["labels"] = [
        _align(tokenized_inputs.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs



In [None]:
from datasets import DatasetDict

# Work on the first 500 training and 100 validation sentences only.
small_train = dataset["train"].select(range(500))
small_val = dataset["validation"].select(range(100))

# Tokenize both subsets and attach the aligned "labels" column.
tokenized_datasets = DatasetDict({
    "train": small_train.map(tokenize_and_align_labels, batched=True),
    "validation": small_val.map(tokenize_and_align_labels, batched=True)
})


In [None]:
from transformers import BertForTokenClassification, TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(label_list))

args = TrainingArguments(
    output_dir="./ner_output",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_preds = [
        [label_list[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds),
        "accuracy": accuracy_score(true_labels, true_preds)
    }

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


In [None]:
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
preds = np.argmax(predictions, axis=2)

true_preds = [
    [label_list[p] for (p, l) in zip(pred, label) if l != -100]
    for pred, label in zip(preds, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(pred, label) if l != -100]
    for pred, label in zip(preds, labels)
]

print(classification_report(true_labels, true_preds))


### task 7

In [None]:
!pip install transformers -q


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, confusion_matrix

# Re-load the raw CSV; this rebinds `df` and the train/validation arrays
# used by the earlier tasks.
df = pd.read_csv("/content/IMDB Dataset.csv")
# 'positive' -> 1, 'negative' -> 0; any other value becomes NaN under `.map`.
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# 80/20 train/validation split, repeatable through random_state.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review'].values, df['label'].values, test_size=0.2, random_state=42
)

vocab_size = 10000
maxlen = 200

# NOTE: rebinds the notebook-global `tokenizer` to a Keras Tokenizer again.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_seq = tokenizer.texts_to_sequences(train_texts)
val_seq = tokenizer.texts_to_sequences(val_texts)

# Bring every review to exactly `maxlen` ids; short reviews are zero-padded at the end.
train_padded = pad_sequences(train_seq, padding='post', maxlen=maxlen)
val_padded = pad_sequences(val_seq, padding='post', maxlen=maxlen)

train_labels = np.array(train_labels)
val_labels = np.array(val_labels)


In [None]:
# LSTM baseline for the comparison with BERT: embedding -> LSTM -> sigmoid.
lstm_model = tf.keras.Sequential([
    # Explicit input spec replaces the deprecated `input_length` Embedding argument.
    tf.keras.Input(shape=(maxlen,)),
    # mask_zero=True marks the 0-valued post-padding as "no data", so the LSTM's
    # final state comes from the last real token instead of from trailing padding.
    tf.keras.layers.Embedding(vocab_size, 64, mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

lstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

history_lstm = lstm_model.fit(
    train_padded, train_labels,
    epochs=5,
    batch_size=32,
    validation_data=(val_padded, val_labels)
)


In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
y_pred_lstm = (lstm_model.predict(val_padded) > 0.5).astype(int)

print("LSTM Evaluation:")
print(classification_report(val_labels, y_pred_lstm))
print("Confusion Matrix:")
print(confusion_matrix(val_labels, y_pred_lstm))


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Relies on `trainer`, `val_ds` and `val_df` from the task 4 BERT fine-tuning cell;
# `trainer` must still refer to that sequence-classification Trainer when this runs.
preds = trainer.predict(val_ds)
# Highest-scoring class per example.
y_pred = np.argmax(preds.predictions, axis=1)

print("BERT Classification Report:")
print(classification_report(val_df['label'], y_pred))
print("Confusion Matrix:")
print(confusion_matrix(val_df['label'], y_pred))


### task 8


In [None]:
import matplotlib.pyplot as plt

# The Trainer's log history is a list of dicts; keep only the entries that
# report a training loss.
training_logs = trainer.state.log_history
loss_entries = [entry for entry in training_logs if "loss" in entry]

steps = [entry["step"] for entry in loss_entries]
losses = [entry["loss"] for entry in loss_entries]

# Training loss against the step at which it was logged.
plt.plot(steps, losses, marker='o')
plt.title("BERT Training Loss Curve")
plt.xlabel("Step")
plt.ylabel("Training Loss")
plt.grid(True)
plt.show()