In [None]:
# Install the notebook's third-party dependencies into the runtime.
# `-q` keeps pip's output short. No versions are pinned on this line, so the
# resolved versions depend on when the cell is run.
!pip install datasets snorkel wandb torch torchvision torchaudio transformers -q


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/103.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Pin `datasets` to 3.6.0; per the pip log this replaces the 4.0.0 release that
# was already installed in the runtime.
# NOTE(review): presumably needed because load_dataset("conll2003") relies on a
# dataset loading script that datasets 4.x no longer runs — confirm.
!pip install datasets==3.6.0


Collecting datasets==3.6.0
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-3.6.0


In [None]:
import os
import shutil

from google.colab import files

# `files.download` sends a single file to the browser, but /content/data is the
# directory torchvision downloads CIFAR into (root='./data'). Zip a directory
# before downloading it, and skip cleanly when the path does not exist yet
# (e.g. on a fresh "Run all", where this cell executes before any training).
data_path = '/content/data'
if os.path.isdir(data_path):
    # Writes /content/data.zip containing the directory's contents.
    data_archive = shutil.make_archive(data_path, 'zip', root_dir=data_path)
    files.download(data_archive)
elif os.path.isfile(data_path):
    files.download(data_path)
else:
    print(f"Nothing to download yet: {data_path} does not exist.")

In [None]:
import os
import shutil

from google.colab import files

# The local W&B run logs live in a directory, and `files.download` can only
# serve a single file, so archive the directory first. Skip with a message when
# no run has been created yet instead of failing the whole notebook.
wandb_path = '/content/wandb'
if os.path.isdir(wandb_path):
    # Writes /content/wandb.zip containing the directory's contents.
    wandb_archive = shutil.make_archive(wandb_path, 'zip', root_dir=wandb_path)
    files.download(wandb_archive)
elif os.path.isfile(wandb_path):
    files.download(wandb_path)
else:
    print(f"Nothing to download yet: {wandb_path} does not exist.")

# Question 1:  Dataset loading + stats

In [None]:
import wandb
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from datasets import load_dataset
from collections import Counter
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter
import pandas as pd
import re

# Authenticate this session with Weights & Biases.
wandb.login()

# Open the run that receives the dataset statistics and labeling-function
# metrics logged by the following cells (until another wandb.init is called).
wandb.init(project="Q1-weak-supervision-ner", name="Conll2003_Dataset_Stats")


[34m[1mwandb[0m: Currently logged in as: [33m142502019[0m ([33mir2023[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
from datasets import load_dataset
dataset = load_dataset("conll2003")


# Dataset statistics: number of sentences per split
num_train = len(dataset['train'])
num_valid = len(dataset['validation'])
num_test  = len(dataset['test'])

# Human-readable tag names, indexed by the integer class id stored in `ner_tags`
tag_names = dataset['train'].features['ner_tags'].feature.names

# Count entity tags (per token) across all splits. Updating the Counter sentence
# by sentence avoids building one large intermediate list of every tag.
entity_counts = Counter()
for split in ['train','validation','test']:
    for sample in dataset[split]['ner_tags']:
        entity_counts.update(sample)

# Key the logged distribution by tag name with plain-int values, so the W&B
# entry is readable and contains only string keys (same cleaning as the
# aggregated label distribution logged later in the notebook).
entity_distribution = {
    tag_names[tag_id]: int(count)
    for tag_id, count in sorted(entity_counts.items())
}

# Log to W&B
wandb.log({
    "num_train_samples": num_train,
    "num_validation_samples": num_valid,
    "num_test_samples": num_test,
    "entity_distribution": entity_distribution
})
print(" Dataset statistics logged to W&B.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


 Dataset statistics logged to W&B.


# Question 2:  Labeling functions + W&B logs

In [None]:
# Build a small Pandas frame for the Snorkel demo from the first 2000 training
# sentences (a subset keeps the labeling functions fast).
# Note: the "tokens" column holds each sentence as ONE space-joined string,
# which is what the regex-based labeling functions search.
train_subset = dataset['train'][:2000]
train_df = pd.DataFrame({
    "tokens": [" ".join(words) for words in train_subset['tokens']],
    "ner_tags": train_subset['ner_tags'],
})
train_df.head()


Unnamed: 0,tokens,ner_tags
0,EU rejects German call to boycott British lamb .,"[3, 0, 7, 0, 0, 0, 7, 0, 0]"
1,Peter Blackburn,"[1, 2]"
2,BRUSSELS 1996-08-22,"[5, 0]"
3,The European Commission said on Thursday it di...,"[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, ..."
4,Germany 's representative to the European Unio...,"[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, ..."


In [None]:
# Label values emitted by the labeling functions.
# Snorkel reserves -1 for "abstain"; an LF must return it when its pattern does
# not fire, otherwise every LF votes on every row and coverage is always 100%.
ABSTAIN = -1
ENTITY = 1
# NOTE(review): both LFs emit the same positive label although one targets MISC
# and the other ORG — consider distinct class ids if the two must be told apart.

_YEAR_RE = re.compile(r"\b(19|20)\d{2}\b")
# Optional trailing period, then "not followed by a word character". The former
# pattern ended in `\.` + `\b`, which only matches when a word character follows
# the period, so it never fired on space-separated text such as "Acme Inc. said".
_ORG_SUFFIX_RE = re.compile(r"\b(?:Inc|Corp|Ltd)\.?(?!\w)")


# a) Detect years (1900–2099) as MISC
@labeling_function()
def lf_detect_year(x):
    """Vote ENTITY when the sentence text contains a 4-digit year 1900–2099, else abstain.

    `x.tokens` is the space-joined sentence string of one `train_df` row.
    """
    return ENTITY if _YEAR_RE.search(x.tokens) else ABSTAIN


# b) Detect organizations with common suffixes
@labeling_function()
def lf_detect_org_suffix(x):
    """Vote ENTITY when the sentence text contains Inc/Corp/Ltd (with or without a period), else abstain.

    `x.tokens` is the space-joined sentence string of one `train_df` row.
    """
    return ENTITY if _ORG_SUFFIX_RE.search(x.tokens) else ABSTAIN


lfs = [lf_detect_year, lf_detect_org_suffix]


In [None]:
from snorkel.labeling import LFAnalysis

# Run every labeling function over the frame to obtain the label matrix
# (one row per sentence, one column per labeling function).
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=train_df)

# Summarise the labeling functions; empirical accuracy is only computed when
# the frame carries a gold 'label' column.
lf_analysis = LFAnalysis(L=L_train, lfs=lfs)
has_gold_labels = 'label' in train_df.columns
analysis = (
    lf_analysis.lf_summary(Y=train_df['label'])
    if has_gold_labels
    else lf_analysis.lf_summary()
)

print(analysis)

# One W&B entry per summary column, each a {lf_name: value} mapping.
metrics_to_log = {
    f"lf_{column.lower()}": analysis[column].to_dict()
    for column in ("Coverage", "Overlaps", "Conflicts")
}

# The accuracy column exists only when gold labels were supplied above.
if "Emp. Acc." in analysis.columns:
    metrics_to_log["lf_accuracy"] = analysis["Emp. Acc."].to_dict()

wandb.log(metrics_to_log)

print("Labeling function coverage, overlap, conflicts (and accuracy if available) logged to W&B.")


100%|██████████| 2000/2000 [00:00<00:00, 42590.20it/s]

                      j Polarity  Coverage  Overlaps  Conflicts
lf_detect_year        0   [0, 1]       1.0       1.0      0.122
lf_detect_org_suffix  1      [0]       1.0       1.0      0.122
Labeling function coverage, overlap, conflicts (and accuracy if available) logged to W&B.





# Question 3:  Label aggregation

In [None]:
from collections import Counter
from snorkel.labeling.model import MajorityLabelVoter

# Combine the labeling-function votes of each row into a single label by
# majority vote.
majority_model = MajorityLabelVoter()
majority_labels = majority_model.predict(L=L_train)

# Tally how often each aggregated label occurs. The labels are NumPy integers,
# so cast keys to str and counts to int before handing them to W&B.
label_counts = Counter(majority_labels)
label_counts_clean = {}
for label, count in label_counts.items():
    label_counts_clean[str(int(label))] = int(count)

# Log cleaned counts to W&B
wandb.log({"aggregated_label_distribution": label_counts_clean})

print(" Aggregated label distribution logged to W&B successfully.")


 Aggregated label distribution logged to W&B successfully.


# Question 4: CIFAR training + experiments

In [None]:
# Shared preprocessing for every CIFAR loader: convert the image to a tensor,
# then normalise with mean 0.5 / std 0.5.
# NOTE(review): a one-element mean/std is given for 3-channel images; this
# relies on torchvision applying the same value to every channel — confirm.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

# Train on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

def get_loaders(dataset_name, batch_size=64):
    """Build train/test DataLoaders for a CIFAR dataset.

    Args:
        dataset_name: Either "CIFAR10" or "CIFAR100" (exact spelling).
        batch_size: Mini-batch size for both loaders (default 64).

    Returns:
        A tuple ``(trainloader, testloader, num_classes)``. The train loader
        shuffles each epoch; the test loader keeps dataset order.

    Raises:
        ValueError: If ``dataset_name`` is not a supported dataset. Previously
            any unrecognised name (e.g. a typo such as "cifar10") silently
            fell through to CIFAR100.

    The datasets are downloaded into ``./data`` if missing and use the
    module-level ``transform``.
    """
    class_counts = {"CIFAR10": 10, "CIFAR100": 100}
    if dataset_name not in class_counts:
        supported = ", ".join(sorted(class_counts))
        raise ValueError(
            f"Unknown dataset {dataset_name!r}; expected one of: {supported}"
        )
    num_classes = class_counts[dataset_name]

    # torchvision exposes the dataset classes under the same names as the keys.
    dataset_cls = getattr(torchvision.datasets, dataset_name)
    trainset = dataset_cls(root='./data', train=True, download=True, transform=transform)
    testset  = dataset_cls(root='./data', train=False, download=True, transform=transform)

    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
    testloader  = torch.utils.data.DataLoader(testset,  batch_size=batch_size, shuffle=False)
    return trainloader, testloader, num_classes

def train_model(model, trainloader, testloader, epochs, run_name):
    """Train ``model`` with Adam + cross-entropy and log per-epoch metrics to W&B.

    Args:
        model: Network to train in place; expected to already live on ``device``.
        trainloader: Iterable of ``(inputs, labels)`` training batches.
        testloader: Iterable of ``(inputs, labels)`` held-out batches, evaluated
            after every epoch. Pass ``None`` to skip evaluation.
        epochs: Number of passes over ``trainloader``.
        run_name: Name of the W&B run created for this training session.

    Each epoch logs ``epoch`` and ``train_loss`` (mean batch loss) and, when a
    test loader is given, ``test_loss`` and ``test_accuracy``. The W&B run is
    always finished, even if training raises. The model is left in eval mode
    when a test loader was evaluated.
    """
    run = wandb.init(project="Q1-weak-supervision-ner", name=run_name)
    try:
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        for epoch in range(epochs):
            model.train()
            running_loss = 0.0
            for inputs, labels in trainloader:
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

            # max(..., 1) avoids a ZeroDivisionError on an empty loader.
            metrics = {
                "epoch": epoch,
                "train_loss": running_loss / max(len(trainloader), 1),
            }
            if testloader is not None:
                metrics.update(_evaluate(model, testloader, criterion))
            wandb.log(metrics)
    finally:
        # Close the run on failure too, so a crashed experiment does not leave
        # an open run for the next wandb.init to inherit.
        run.finish()


def _evaluate(model, loader, criterion):
    """Return ``{"test_loss", "test_accuracy"}`` of ``model`` over ``loader`` (no gradient updates)."""
    model.eval()
    total_loss, correct, seen = 0.0, 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            batch_size = labels.size(0)
            # Weight by batch size so a smaller final batch is averaged correctly.
            total_loss += criterion(outputs, labels).item() * batch_size
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            seen += batch_size
    if seen == 0:
        return {}
    return {"test_loss": total_loss / seen, "test_accuracy": correct / seen}


In [None]:
from torchvision.models import resnet18

# Two curricula, each run as: train a fresh ResNet-18 on the first dataset,
# swap the classification head for the second dataset's class count, then keep
# training the same network on the second dataset.
#   1) CIFAR100 -> CIFAR10
#   2) CIFAR10  -> CIFAR100
curricula = [("CIFAR100", "CIFAR10"), ("CIFAR10", "CIFAR100")]

for first_name, second_name in curricula:
    # Stage 1: train from scratch on the first dataset.
    first_train, first_test, first_classes = get_loaders(first_name)
    model = resnet18(num_classes=first_classes).to(device)
    train_model(model, first_train, first_test, epochs=10, run_name=f"{first_name}_first")

    # Stage 2: new output layer sized for the second dataset; the rest of the
    # network keeps the weights learned in stage 1.
    second_train, second_test, second_classes = get_loaders(second_name)
    model.fc = nn.Linear(model.fc.in_features, second_classes).to(device)
    train_model(model, second_train, second_test, epochs=10,
                run_name=f"{first_name}_then_{second_name}")


0,1
num_test_samples,▁
num_train_samples,▁
num_validation_samples,▁

0,1
num_test_samples,3453
num_train_samples,14041
num_validation_samples,3250


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▆▅▄▄▃▃▂▁▁

0,1
epoch,9.0
train_loss,0.60888
