In [1]:
import json
import os
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import numpy as np
from transformers import  AutoTokenizer, AutoModelForSequenceClassification
import torch.nn as nn
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, hamming_loss, accuracy_score
from torch.utils.data import TensorDataset, DataLoader
from transformers import AdamW

def load_license_data(json_folder):
    license_data = []
    for filename in os.listdir(json_folder):
        if filename.endswith(".json"):
            license_name = filename[:-5]
            filepath = os.path.join(json_folder, filename)
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
                license_data.append({
                    "license_name": license_name,
                    "family": data["family"],
                    "labels": data["labels"],
                    "text": data["text"],
                })
    return license_data

json_folder = "../../data/processed/preprocessed_licenses_json_2"
license_data = load_license_data(json_folder)

# --- Create DataFrame ---
df = pd.DataFrame(license_data)

# --- Handle Missing Labels ---
df.dropna(subset=["labels"], inplace=True)
df = df[df["labels"].apply(lambda x: len(x) > 0)]

# --- Multi-Label Encoding ---
mlb = MultiLabelBinarizer()
multi_hot_labels = mlb.fit_transform(df["labels"])
df["multi_hot_labels"] = list(multi_hot_labels)
num_labels = len(mlb.classes_)
print(f"Number of unique labels: {num_labels}")
print(f"Classes: {mlb.classes_}")
labels = torch.tensor(multi_hot_labels, dtype=torch.float)

label_counts_before = Counter(label for labels in df["labels"] for label in labels)

# Define the desired number of samples for each label after oversampling
desired_counts = {}
min_samples = 80
for label in mlb.classes_:
    if label_counts_before[label] < min_samples:
        desired_counts[label] = min_samples
    else:
        desired_counts[label] = label_counts_before[label]

# Initialize lists to store resampled data
resampled_texts = []
resampled_labels = []
resampled_label_names = []  # List to store label names

# Iterate over each label and apply oversampling to each binary problem
for i in range(num_labels):
    # Create a binary label array for the current label
    binary_labels = multi_hot_labels[:, i]

    # Reshape the features (texts) and labels for oversampling
    X = np.array(df["text"]).reshape(-1, 1)
    y = binary_labels

    # Check if oversampling is needed for this label
    if sum(y) < desired_counts[mlb.classes_[i]]:
        # Apply RandomOverSampler with the desired count for this label
        oversampler = RandomOverSampler(sampling_strategy={1: desired_counts[mlb.classes_[i]]}, random_state=42)
        X_resampled, y_resampled = oversampler.fit_resample(X, y)

        # Update the resampled texts, labels, and label names
        for text, label in zip(X_resampled.flatten(), y_resampled):
            if label == 1:
                resampled_texts.append(text)
                # Find the original multi-hot labels for this text
                original_labels = multi_hot_labels[df[df["text"] == text].index[0]].copy()
                # Ensure that oversampling does not remove existing labels
                original_labels[i] = 1  # Set the current label to 1
                resampled_labels.append(original_labels)
                # Store the corresponding label names
                current_label_names = [mlb.classes_[j] for j, val in enumerate(original_labels) if val == 1]
                resampled_label_names.append(current_label_names)
    else:
        # If no oversampling is needed, use original data for this label
        for text, label in zip(X.flatten(), y):
            if label == 1:
                resampled_texts.append(text)
                original_labels = multi_hot_labels[df[df["text"] == text].index[0]].copy()
                resampled_labels.append(original_labels)
                # Store the corresponding label names
                current_label_names = [mlb.classes_[j] for j, val in enumerate(original_labels) if val == 1]
                resampled_label_names.append(current_label_names)

# Create a new DataFrame from the resampled data
df_resampled = pd.DataFrame({
    'text': resampled_texts,
    'multi_hot_labels': resampled_labels,
    'labels': resampled_label_names  # Include the label names
})

# Ensure all original samples are included in the oversampled DataFrame
for index, row in df.iterrows():
    if row['text'] not in df_resampled['text'].values:
        df_resampled = pd.concat([df_resampled, pd.DataFrame([row])], ignore_index=True)

# Calculate label distribution after oversampling
label_counts_after = Counter()
for label_list in df_resampled["multi_hot_labels"]:
    for i, label_present in enumerate(label_list):
        if label_present == 1:
            label_counts_after[mlb.classes_[i]] += 1

print("Label distribution before oversampling:", label_counts_before)
print("Label distribution after oversampling:", label_counts_after)

df = df_resampled
labels = torch.tensor(np.array(df["multi_hot_labels"].tolist()), dtype=torch.float)

# --- Split Data ---
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# --- Tokenization ---
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=512, return_tensors="pt")

# Convert DataFrame to Hugging Face Dataset
# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize dataset
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

# Set the format to PyTorch
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Convert label lists to tensors
label2id = {label: str(i) for i, label in enumerate(mlb.classes_)}
id2label = {str(i): label for i, label in enumerate(mlb.classes_)}

model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,
)
# --- Dataset Class ---
class LicenseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

encoded_inputs = tokenizer(
    df["text"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
    max_length=512  # Adjust if necessary
)

# Get the multi-hot encoded labels as a tensor (we created this in 2.1.4)
labels = torch.tensor(df["multi_hot_labels"].tolist(), dtype=torch.float)

dataset = TensorDataset(
    encoded_inputs["input_ids"],
    encoded_inputs["attention_mask"],
    labels
)

dataloader = DataLoader(dataset, batch_size=8, shuffle=True)  # Adjust batch size

loss_fn = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)  # Adjust learning rate (lr)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

num_epochs = 10  # Adjust the number of epochs

for epoch in range(num_epochs):
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()

        # Optional: Print training progress
        print(f"Epoch: {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")
        
def save_model(model, tokenizer, output_dir):
    """Saves the model, tokenizer, and configuration to a specified directory.

    Args:
        model: The trained model.
        tokenizer: The tokenizer.
        output_dir: The directory where the model will be saved.
    """
    print(f"Saving model to {output_dir}")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

output_directory = "../../model/Bert3.0"  # Replace with your desired directory
save_model(model, tokenizer, output_directory)


Number of unique labels: 29
 'Copyleft (network protective) (obligation/ALARM)'
 'Deprecated License (other/INFORMATION)'
 'Display copyright notice (obligation/INFORMATION)'
 'Display license in binary (obligation/INFORMATION)'
 'Display license in the source (obligation/INFORMATION)'
 'Doing Business with US (other/ALARM)'
 'Endorsement prohibited (prohibition/INFORMATION)'
 'License upgrade allowed (right/INFORMATION)'
 'No further restrictions permitted (prohibition/INFORMATION)'
 'Patent grant (other/INFORMATION)' 'Permissive (right/INFORMATION)'
 'Public Domain (other/INFORMATION)'
 'Severe patent retaliation (other/ALARM)'
 'Unclear or Ambiguous (other/ALARM)'
 'Usage notice in advertisement (obligation/INFORMATION)'
 'Use in distributed software (right/INFORMATION)'


Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

Map:   0%|          | 0/742 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  labels = torch.tensor(df["multi_hot_labels"].tolist(), dtype=torch.float)


Epoch: 1/10, Loss: 0.7610
Epoch: 1/10, Loss: 0.6104
Epoch: 1/10, Loss: 0.5558
Epoch: 1/10, Loss: 0.4834
Epoch: 1/10, Loss: 0.4834
Epoch: 1/10, Loss: 0.5081
Epoch: 1/10, Loss: 0.3274
Epoch: 1/10, Loss: 0.4447
Epoch: 1/10, Loss: 0.4162
Epoch: 1/10, Loss: 0.3824
Epoch: 1/10, Loss: 0.3896
Epoch: 1/10, Loss: 0.3006
Epoch: 1/10, Loss: 0.3569
Epoch: 1/10, Loss: 0.3823
Epoch: 1/10, Loss: 0.3180
Epoch: 1/10, Loss: 0.3782
Epoch: 1/10, Loss: 0.3329
Epoch: 1/10, Loss: 0.2541
Epoch: 1/10, Loss: 0.3309
Epoch: 1/10, Loss: 0.3681
Epoch: 1/10, Loss: 0.4079
Epoch: 1/10, Loss: 0.2948
Epoch: 1/10, Loss: 0.3521
Epoch: 1/10, Loss: 0.2812
Epoch: 1/10, Loss: 0.3307
Epoch: 1/10, Loss: 0.3428
Epoch: 1/10, Loss: 0.4709
Epoch: 1/10, Loss: 0.3089
Epoch: 1/10, Loss: 0.3086
Epoch: 1/10, Loss: 0.3177
Epoch: 1/10, Loss: 0.3927
Epoch: 1/10, Loss: 0.3235
Epoch: 1/10, Loss: 0.3017
Epoch: 1/10, Loss: 0.2201
Epoch: 1/10, Loss: 0.2406
Epoch: 1/10, Loss: 0.2691
Epoch: 1/10, Loss: 0.2586
Epoch: 1/10, Loss: 0.4073
Epoch: 1/10,