Part 1

In [5]:
# IMDB Sentiment Classification
# GloVe & RNN/LSTM


!pip install kagglehub --quiet
import kagglehub
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')


# 1. Download IMDB Dataset from Kaggle
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
print("Path to dataset files:", path)
csv_path = os.path.join(path, "IMDB Dataset.csv")
df = pd.read_csv(csv_path)

# 2. Preprocess text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"[^a-zA-Z']", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['review'] = df['review'].apply(clean_text)

# Tokenize
tokenized_reviews = [word_tokenize(review) for review in df['review']]

# Build vocab
from collections import Counter
word_counts = Counter(word for review in tokenized_reviews for word in review)
vocab = {word: i+2 for i, (word, _) in enumerate(word_counts.most_common(30000))}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

# Encode reviews
def encode_review(tokens):
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]

encoded_reviews = [encode_review(tokens) for tokens in tokenized_reviews]

# Pad sequences
MAX_LEN = 200
def pad_sequence(seq):
    if len(seq) < MAX_LEN:
        return seq + [vocab['<PAD>']] * (MAX_LEN - len(seq))
    else:
        return seq[:MAX_LEN]

padded_reviews = np.array([pad_sequence(seq) for seq in encoded_reviews])

# Encode labels
le = LabelEncoder()
labels = le.fit_transform(df['sentiment'])

# Train/val/test split
X_train, X_temp, y_train, y_temp = train_test_split(padded_reviews, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 3. Load GloVe Embeddings
!wget -q http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d glove

embedding_dim = 100
glove_path = f"glove/glove.6B.{embedding_dim}d.txt"

glove_embeddings = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = vector

embedding_matrix = np.random.uniform(-0.05, 0.05, (len(vocab), embedding_dim))
embedding_matrix[vocab['<PAD>']] = np.zeros((embedding_dim,))

for word, idx in vocab.items():
    vec = glove_embeddings.get(word)
    if vec is not None:
        embedding_matrix[idx] = vec

# 4. Dataset & DataLoader
class IMDBDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(IMDBDataset(X_train, y_train), batch_size=64, shuffle=True)
val_loader = DataLoader(IMDBDataset(X_val, y_val), batch_size=64)
test_loader = DataLoader(IMDBDataset(X_test, y_test), batch_size=64)

# 5. Model
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, rnn_type="RNN", embedding_weights=None, trainable=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        if embedding_weights is not None:
            self.embedding.weight = nn.Parameter(torch.tensor(embedding_weights, dtype=torch.float32))
            self.embedding.weight.requires_grad = trainable
        if rnn_type == "RNN":
            self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        else:
            self.rnn = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        last_output = output[:, -1, :]
        return self.fc(last_output)

# 6. Training loop
def train_model(model, train_loader, val_loader, epochs=5, lr=0.001):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        total_loss, correct, total = 0, 0, 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            preds = outputs.argmax(1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)
        val_acc = evaluate(model, val_loader)
        print(f"Epoch {epoch+1}: train_loss={total_loss/len(train_loader):.4f} | train_acc={correct/total:.4f} | val_acc={val_acc:.4f}")
    return evaluate(model, test_loader)

def evaluate(model, loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            preds = outputs.argmax(1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)
    return correct / total

# 7. Run experiments
results = {}

print("\n=== GloVe + Vanilla RNN ===")
model = RNNClassifier(len(vocab), embedding_dim, 128, 2, "RNN", embedding_matrix, trainable=False)
results["GloVe + RNN"] = train_model(model, train_loader, val_loader)

print("\n=== GloVe + LSTM ===")
model = RNNClassifier(len(vocab), embedding_dim, 128, 2, "LSTM", embedding_matrix, trainable=False)
results["GloVe + LSTM"] = train_model(model, train_loader, val_loader)

print("\n=== Random Embedding + RNN ===")
model = RNNClassifier(len(vocab), embedding_dim, 128, 2, "RNN", None, trainable=True)
results["Random + RNN"] = train_model(model, train_loader, val_loader)

print("\n=== Random Embedding + LSTM ===")
model = RNNClassifier(len(vocab), embedding_dim, 128, 2, "LSTM", None, trainable=True)
results["Random + LSTM"] = train_model(model, train_loader, val_loader)

# 8. Final summary
print("\n=== Final Accuracy Summary ===")
for k, v in results.items():
    print(f"{k:25} Test Accuracy: {v:.4f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews

=== GloVe + Vanilla RNN ===
Epoch 1: train_loss=0.6916 | train_acc=0.5248 | val_acc=0.5163
Epoch 2: train_loss=0.6946 | train_acc=0.5113 | val_acc=0.4968
Epoch 3: train_loss=0.6953 | train_acc=0.5133 | val_acc=0.4933
Epoch 4: train_loss=0.6940 | train_acc=0.5156 | val_acc=0.5219
Epoch 5: train_loss=0.6936 | train_acc=0.5192 | val_acc=0.5199

=== GloVe + LSTM ===
Epoch 1: train_loss=0.6884 | train_acc=0.5358 | val_acc=0.5769
Epoch 2: train_loss=0.6802 | train_acc=0.5509 | val_acc=0.5287
Epoch 3: train_loss=0.6112 | train_acc=0.6472 | val_acc=0.8028
Epoch 4: train_loss=0.3908 | train_acc=0.8277 | val_acc=0.8295
Epoch 5: train_loss=0.3578 | train_acc=0.8421 | val_acc=0.8432

=== Random Embedding + RNN ===
Epoch 1: train_loss=0.6978 | train_acc=0.5023 | val_acc=0.5119
Epoch 2: train_loss=0.6907 | train_acc=0.5256 | val_acc=0.5232
Epoch 3: train_loss=0.6919 | train_acc=0.5248 | val_acc=0.5363
Epoch 4: train_loss=0.6800 

Part 2

In [7]:
import re
import pandas as pd

# Month mapping
MONTHS = {
    'january': '01', 'jan': '01',
    'february': '02', 'feb': '02',
    'march': '03', 'mar': '03',
    'april': '04', 'apr': '04',
    'may': '05',
    'june': '06', 'jun': '06',
    'july': '07', 'jul': '07',
    'august': '08', 'aug': '08',
    'september': '09', 'sep': '09', 'sept': '09',
    'october': '10', 'oct': '10',
    'november': '11', 'nov': '11',
    'december': '12', 'dec': '12'
}

def normalize_year(y):
    """Convert 2-digit year to 4-digit."""
    y = int(y)
    if y < 100:
        if y <= 25:  # assume 2000s
            y += 2000
        else:  # assume 1900s
            y += 1900
    return str(y)

def parse_date(text):
    text = text.lower().strip()

    # 1. YYYY-MM-DD or YYYY/MM/DD or YYYY.MM.DD
    m = re.search(r'(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})', text)
    if m:
        y, mo, d = m.groups()
        return f"{int(d):02d}/{int(mo):02d}/{y}"

    # 2. DD-MM-YYYY or DD/MM/YYYY or DD.MM.YYYY
    m = re.search(r'(\d{1,2})[-/.](\d{1,2})[-/.](\d{4})', text)
    if m:
        d, mo, y = m.groups()
        return f"{int(d):02d}/{int(mo):02d}/{y}"

    # 3. DD-MM-YY or DD/MM/YY or DD.MM.YY
    m = re.search(r'(\d{1,2})[-/.](\d{1,2})[-/.](\d{2})', text)
    if m:
        d, mo, y = m.groups()
        return f"{int(d):02d}/{int(mo):02d}/{normalize_year(y)}"

    # 4. Month name formats (e.g., "5 March 2023" or "25th Dec 2024")
    m = re.search(r'(\d{1,2})(?:st|nd|rd|th)?(?:\s+of)?\s+([a-zA-Z]+)[,]?\s+(\d{2,4})', text)
    if m:
        d, month_word, y = m.groups()
        mo = MONTHS.get(month_word.lower()[:3], '??')
        return f"{int(d):02d}/{mo}/{normalize_year(y)}"

    m = re.search(r'([a-zA-Z]+)\s+(\d{1,2}),?\s+(\d{2,4})', text)
    if m:
        month_word, d, y = m.groups()
        mo = MONTHS.get(month_word.lower()[:3], '??')
        return f"{int(d):02d}/{mo}/{normalize_year(y)}"

    return None

# === Test on given CSV ===
df = pd.read_csv("date_parser_testcases (1).csv")
df["Parsed Output"] = df["Input"].apply(parse_date)
df["Match"] = df["Parsed Output"] == df["Expected Output"]

print(df)
print("\nAccuracy:", df["Match"].mean())


                                                Input Expected Output  \
0         The event will take place on March 5, 2023.      05/03/2023   
1                      Her birthday is on 07/08/1990.      07/08/1990   
2                         The deadline is 2022-12-31.      31/12/2022   
3                      We met on 1st of January 2000.      01/01/2000   
4   The concert is scheduled for 15th September, 2...      15/09/2021   
..                                                ...             ...   
95  We celebrate Independence Day on 2023-07-04, a...      04/07/2023   
96  The final date for submission is 30th November...      30/11/2022   
97  The annual conference is on 15th October 2023,...      15/10/2023   
98  His birthdate, noted as 1990-05-20, is in the ...      20/05/1990   
99  The festival will be celebrated on 12th August...      12/08/2024   

   Parsed Output  Match  
0     05/03/2023   True  
1     07/08/1990   True  
2     31/12/2022   True  
3     01/01/2000   

Part 3

In [11]:
import spacy
import pandas as pd

nlp = spacy.load("en_core_web_sm")

# Pronoun mappings with dependency context
male_to_female = {
    "he": "she",
    "him": "her",
    "his": {"poss": "her", "other": "his"},
    "himself": "herself"
}

female_to_male = {
    "she": "he",
    "her": {"poss": "his", "other": "him"},
    "hers": "his",
    "herself": "himself"
}

def preserve_case(word, replacement):
    if word.isupper():
        return replacement.upper()
    elif word[0].isupper():
        return replacement.capitalize()
    else:
        return replacement.lower()

def replace_pronouns(text, target_gender):
    doc = nlp(text)
    new_tokens = []

    for tok in doc:
        lw = tok.text.lower()

        if target_gender == "female" and lw in male_to_female:
            if lw == "his":
                replacement = male_to_female["his"]["poss"] if tok.dep_ == "poss" else male_to_female["his"]["other"]
            else:
                replacement = male_to_female[lw]
            new_tokens.append(preserve_case(tok.text, replacement) + tok.whitespace_)

        elif target_gender == "male" and lw in female_to_male:
            if lw == "her":
                replacement = female_to_male["her"]["poss"] if tok.dep_ == "poss" else female_to_male["her"]["other"]
            else:
                replacement = female_to_male[lw]
            new_tokens.append(preserve_case(tok.text, replacement) + tok.whitespace_)

        else:
            new_tokens.append(tok.text_with_ws)

    return "".join(new_tokens).strip()

# Load test cases
df = pd.read_csv("pronoun_testcases.csv")

# Apply transformation
df["parsed_output"] = df.apply(lambda row: replace_pronouns(row["input_text"], row["target_gender"]), axis=1)
df["match"] = df["parsed_output"] == df["expected_output"]

# Results
print(df[["input_text", "expected_output", "parsed_output", "match"]])
print(f"Accuracy: {df['match'].mean() * 100:.2f}%")


                             input_text                      expected_output  \
0            He is going to the market.          She is going to the market.   
1             His book is on the table.            Her book is on the table.   
2                  I saw him yesterday.                 I saw her yesterday.   
3                      He hurt himself.                    She hurt herself.   
4              I called him last night.             I called her last night.   
5                      That is his car.                     That is her car.   
6            He told me about his trip.          She told me about her trip.   
8    He blames himself for the mistake.  She blames herself for the mistake.   
9                He brought his laptop.              She brought her laptop.   
10                  He made it himself.                 She made it herself.   
11           I don’t like his attitude.           I don’t like her attitude.   
12               Tell him to come here. 

In [9]:
#just another way:
import pandas as pd
import re

# Separate mappings
male_to_female = {
    "he": "she",
    "him": "her",
    "his": "her",       # will handle possessive separately
    "himself": "herself"
}

female_to_male = {
    "she": "he",
    "her": "him",       # will handle possessive separately
    "hers": "his",
    "herself": "himself"
}

def preserve_case(word, replacement):
    if word.isupper():
        return replacement.upper()
    elif word[0].isupper():
        return replacement.capitalize()
    else:
        return replacement.lower()

def replace_pronouns(text, target_gender):
    words = re.findall(r"\w+|[^\w\s]", text)
    new_words = []

    for i, w in enumerate(words):
        lw = w.lower()

        if target_gender == "female" and lw in male_to_female:
            # special case for "his"
            if lw == "his":
                if i + 1 < len(words) and words[i+1].isalpha():
                    replacement = "her"  # possessive adjective
                else:
                    replacement = "hers" # possessive pronoun
            else:
                replacement = male_to_female[lw]
            new_words.append(preserve_case(w, replacement))

        elif target_gender == "male" and lw in female_to_male:
            # special case for "her"
            if lw == "her":
                if i + 1 < len(words) and words[i+1].isalpha():
                    replacement = "his"  # possessive adjective
                else:
                    replacement = "him"  # object pronoun
            else:
                replacement = female_to_male[lw]
            new_words.append(preserve_case(w, replacement))

        else:
            new_words.append(w)

    # Join keeping punctuation spacing correct
    return "".join(
        [" " + w if i > 0 and re.match(r"\w", w) and re.match(r"\w", new_words[i-1]) else w
         for i, w in enumerate(new_words)]
    )

# Load and test
df = pd.read_csv("pronoun_testcases.csv")
df["parsed_output"] = df.apply(lambda row: replace_pronouns(row["input_text"], row["target_gender"]), axis=1)
df["match"] = df["parsed_output"] == df["expected_output"]

print(df[["input_text", "expected_output", "parsed_output", "match"]])
print(f"Accuracy: {df['match'].mean() * 100:.2f}%")


                             input_text                      expected_output  \
0            He is going to the market.          She is going to the market.   
1             His book is on the table.            Her book is on the table.   
2                  I saw him yesterday.                 I saw her yesterday.   
3                      He hurt himself.                    She hurt herself.   
4              I called him last night.             I called her last night.   
5                      That is his car.                     That is her car.   
6            He told me about his trip.          She told me about her trip.   
8    He blames himself for the mistake.  She blames herself for the mistake.   
9                He brought his laptop.              She brought her laptop.   
10                  He made it himself.                 She made it herself.   
11           I don’t like his attitude.           I don’t like her attitude.   
12               Tell him to come here. 