In [29]:
import json
import os
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaConfig, RobertaForSequenceClassification, TrainingArguments, Trainer
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, average_precision_score, f1_score

In [9]:
# --- Load Data ---
def load_license_data(json_folder):
    license_data = []
    for filename in os.listdir(json_folder):
        if filename.endswith(".json"):
            license_name = filename[:-5]
            filepath = os.path.join(json_folder, filename)
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
                license_data.append({
                    "license_name": license_name,
                    "family": data["family"],
                    "labels": data["labels"],
                    "text": data["text"],
                })
    return license_data

json_folder = "../../data/processed/preprocessed_licenses_json"
license_data = load_license_data(json_folder)

# --- Create DataFrame ---
df = pd.DataFrame(license_data)

# --- Handle Missing Labels ---
df.dropna(subset=["labels"], inplace=True)
df = df[df["labels"].apply(lambda x: len(x) > 0)]

# --- Multi-Label Encoding ---
mlb = MultiLabelBinarizer()
multi_hot_labels = mlb.fit_transform(df["labels"])
df["multi_hot_labels"] = list(multi_hot_labels)
num_labels = len(mlb.classes_)

In [10]:
print(f"Number of unique labels: {num_labels}")
print(f"Classes: {mlb.classes_}")

Number of unique labels: 29
 'Copyleft (network protective) (obligation/ALARM)'
 'Deprecated License (other/INFORMATION)'
 'Display copyright notice (obligation/INFORMATION)'
 'Display license in binary (obligation/INFORMATION)'
 'Display license in the source (obligation/INFORMATION)'
 'Doing Business with US (other/ALARM)'
 'Endorsement prohibited (prohibition/INFORMATION)'
 'License upgrade allowed (right/INFORMATION)'
 'No further restrictions permitted (prohibition/INFORMATION)'
 'Patent grant (other/INFORMATION)' 'Permissive (right/INFORMATION)'
 'Public Domain (other/INFORMATION)'
 'Severe patent retaliation (other/ALARM)'
 'Unclear or Ambiguous (other/ALARM)'
 'Usage notice in advertisement (obligation/INFORMATION)'
 'Use in distributed software (right/INFORMATION)'


In [11]:
labels = torch.tensor(multi_hot_labels, dtype=torch.float)

In [33]:
# --- Oversampling using Binary Relevance with RandomOverSampler ---
# Calculate label distribution before oversampling
label_counts_before = Counter(label for labels in df["labels"] for label in labels)

# Initialize lists to store resampled data
resampled_texts = []
resampled_multihot_labels = []

# Iterate over each label and apply oversampling to each binary problem
for i in range(num_labels):
    # Create a binary label array for the current label
    binary_labels = multi_hot_labels[:, i]

    # Reshape the features (texts) and labels for oversampling
    X = np.array(df["text"]).reshape(-1, 1)
    y = binary_labels

    # Apply RandomOverSampler
    oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    # Create new multi-hot encoded labels for the resampled data
    for text, label in zip(X_resampled, y_resampled):
        resampled_texts.append(text[0])  # Extract the text string from the array
        new_labels = multi_hot_labels[df[df["text"] == text[0]].index[0]].copy()
        new_labels[i] = label  # Set the current label based on binary oversampling
        resampled_multihot_labels.append(new_labels)

# Create a new DataFrame from the resampled data
df_resampled = pd.DataFrame({'text': resampled_texts, 'multi_hot_labels': resampled_multihot_labels})

# Calculate label distribution after oversampling
label_counts_after = Counter()
for labels in df_resampled["multi_hot_labels"]:
    for i, label_present in enumerate(labels):
        if label_present:
            label_counts_after[mlb.classes_[i]] += 1

print("Label distribution before oversampling:", label_counts_before)
print("Label distribution after oversampling:", label_counts_after)



In [34]:
df = df_resampled
labels = torch.tensor(np.array(df["multi_hot_labels"].tolist()), dtype=torch.float)

In [35]:
# --- Split Data ---
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# --- Tokenization ---
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_encodings = tokenizer(
    train_df["text"].tolist(),
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

test_encodings = tokenizer(
    test_df["text"].tolist(),
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

In [36]:
# --- Dataset Class ---
class LicenseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = LicenseDataset(train_encodings, train_df["multi_hot_labels"].tolist())
test_dataset = LicenseDataset(test_encodings, test_df["multi_hot_labels"].tolist())

In [37]:
config = RobertaConfig.from_pretrained(model_name)
config.num_labels = num_labels
config.problem_type = "multi_label_classification"

model = RobertaForSequenceClassification.from_pretrained(model_name, config=config)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
label_counts = Counter()
for labels in df["multi_hot_labels"]:
    for i, label_present in enumerate(labels):
        if label_present:
            label_counts[mlb.classes_[i]] += 1

num_samples = len(df)
weights = [num_samples / (num_labels * label_counts.get(label, 1)) for label in mlb.classes_]
weights = torch.tensor(weights, dtype=torch.float)

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_fn = nn.BCEWithLogitsLoss(pos_weight=weights.to(device))
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# --- Move Model to Device ---
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [40]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,  # Changed to 10 epochs
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# --- Metrics Function ---
def compute_metrics(pred):
    labels = pred.label_ids
    preds = (pred.predictions > 0.5).astype(int)

    # Calculate metrics for multi-label classification
    roc_auc = []
    average_precision = []
    for i in range(labels.shape[1]):
        if len(np.unique(labels[:, i])) > 1:
            roc_auc.append(roc_auc_score(labels[:, i], preds[:, i]))
        average_precision.append(average_precision_score(labels[:, i], preds[:, i]))

    # Average the results
    roc_auc = np.mean(roc_auc) if roc_auc else 0
    average_precision = np.mean(average_precision)

    # Calculate accuracy, precision, recall, and F1-score
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')

    return {
        "roc_auc": roc_auc,
        "average_precision": average_precision,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }



In [41]:
# --- Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# --- Train the Model ---
train_results = trainer.train()

  0%|          | 0/10940 [00:00<?, ?it/s]

{'loss': 0.6064, 'grad_norm': 0.8542879819869995, 'learning_rate': 4.9954296160877516e-05, 'epoch': 0.01}
{'loss': 0.4591, 'grad_norm': 0.5277919769287109, 'learning_rate': 4.990859232175503e-05, 'epoch': 0.02}
{'loss': 0.4178, 'grad_norm': 0.5543948411941528, 'learning_rate': 4.986288848263254e-05, 'epoch': 0.03}


KeyboardInterrupt: 

In [27]:
# --- Evaluate the Model ---
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

  0%|          | 0/1 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.20971296727657318, 'eval_roc_auc': 0.6505465028120867, 'eval_average_precision': 0.43686081162034834, 'eval_accuracy': 0.1111111111111111, 'eval_precision': 0.9595959595959596, 'eval_recall': 0.7640750670241286, 'eval_f1': 0.8507462686567164, 'eval_runtime': 15.6833, 'eval_samples_per_second': 2.869, 'eval_steps_per_second': 0.064, 'epoch': 10.0}




In [7]:




# --- Save the Model ---
model.save_pretrained("./results/Roberta")
tokenizer.save_pretrained("./results/Roberta")

  0%|          | 0/220 [00:00<?, ?it/s]

{'loss': 0.6931, 'grad_norm': 0.7531012296676636, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.45}
{'loss': 0.6903, 'grad_norm': 0.7990725040435791, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.91}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.6866267919540405, 'eval_roc_auc': 0.5, 'eval_average_precision': 0.285823754789272, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 11.2802, 'eval_samples_per_second': 3.989, 'eval_steps_per_second': 0.089, 'epoch': 1.0}
{'loss': 0.6832, 'grad_norm': 0.8112362623214722, 'learning_rate': 3e-06, 'epoch': 1.36}
{'loss': 0.6733, 'grad_norm': 0.8657842874526978, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.82}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.6584188938140869, 'eval_roc_auc': 0.5, 'eval_average_precision': 0.285823754789272, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 11.1676, 'eval_samples_per_second': 4.03, 'eval_steps_per_second': 0.09, 'epoch': 2.0}
{'loss': 0.6519, 'grad_norm': 1.2119724750518799, 'learning_rate': 5e-06, 'epoch': 2.27}
{'loss': 0.5861, 'grad_norm': 1.1018736362457275, 'learning_rate': 6e-06, 'epoch': 2.73}


  0%|          | 0/1 [00:00<?, ?it/s]



{'eval_loss': 0.4907730519771576, 'eval_roc_auc': 0.5, 'eval_average_precision': 0.285823754789272, 'eval_accuracy': 0.0, 'eval_precision': 0.9888888888888889, 'eval_recall': 0.4772117962466488, 'eval_f1': 0.64376130198915, 'eval_runtime': 11.7447, 'eval_samples_per_second': 3.832, 'eval_steps_per_second': 0.085, 'epoch': 3.0}
{'loss': 0.5229, 'grad_norm': 1.1941916942596436, 'learning_rate': 7.000000000000001e-06, 'epoch': 3.18}


KeyboardInterrupt: 