In [1]:
import pandas as pd

# Load the datasets
edu_train = pd.read_csv("edu_train.csv")
edu_dev = pd.read_csv("edu_dev.csv")
edu_test = pd.read_csv("edu_test.csv")
mappings = pd.read_csv("mappings.csv")

# Display the first few rows of each dataset to understand their structure
edu_train.head(), edu_dev.head(), edu_test.head(), mappings.head()

(   Unnamed: 0.2  Unnamed: 0  Unnamed: 0.1  Unnamed: 0.1.1  Unnamed: 0.1.1.1  \
 0             0           0             0               0                 0   
 1             1           1             1               1                 1   
 2             2           2             2               2                 2   
 3             3           3             3               3                 3   
 4             4           4             4               4                 4   
 
    Unnamed: 0.1.1.1.1  Unnamed: 0.1.1.1.1.1  Unnamed: 0.1.1.1.1.1.1  \
 0                   0                     0                       0   
 1                   1                     1                       1   
 2                   2                     2                       2   
 3                   3                     3                       3   
 4                   4                     4                       4   
 
    Unnamed: 0.1.1.1.1.1.1.1  \
 0                      1436   
 1                  

In [2]:
# Drop unnecessary columns from the datasets
edu_train_cleaned = edu_train[['source_article', 'updated_label']]
edu_dev_cleaned = edu_dev[['source_article', 'updated_label']]
edu_test_cleaned = edu_test[['source_article', 'updated_label']]

# Create a mapping from the 'Understandable Name' in mappings to an integer id
label2id = {row['Understandable Name']: idx for idx, row in mappings.iterrows()}
id2label = {idx: row['Understandable Name'] for idx, row in mappings.iterrows()}

# Map the labels in the datasets to integer ids
edu_train_cleaned['updated_label'] = edu_train_cleaned['updated_label'].map(label2id)
edu_dev_cleaned['updated_label'] = edu_dev_cleaned['updated_label'].map(label2id)
edu_test_cleaned['updated_label'] = edu_test_cleaned['updated_label'].map(label2id)

# Verify the preprocessing
edu_train_cleaned.head(), edu_dev_cleaned.head(), edu_test_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edu_train_cleaned['updated_label'] = edu_train_cleaned['updated_label'].map(label2id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edu_dev_cleaned['updated_label'] = edu_dev_cleaned['updated_label'].map(label2id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edu_test_cleaned['updated_label'] = e

(                                      source_article  updated_label
 0          company's slogan "Expect More. Pay Less."            6.0
 1  The bigger a child's shoe size, the better the...            1.0
 2  Since many people believe this, then it must b...            NaN
 3  Senator Randall isn't lying when she says she ...            2.0
 4  A mother is telling her daughter that she went...            NaN,
                                       source_article  updated_label
 0  "Just like students are given a couple of week...            NaN
 1  You don’t have to do this. My grandmother is i...            6.0
 2  I know five people from Kentucky. They are all...            0.0
 3  Pvt. Joe Bowers: What are these electrolytes? ...            2.0
 4  This is a fallacy of irrelevance that is based...            NaN,
                                       source_article  updated_label
 0  People who drive big cars probably hate the en...            NaN
 1      White men can't jump. No

In [3]:
# Remove rows with missing labels from all datasets
edu_train_cleaned = edu_train_cleaned.dropna(subset=['updated_label'])
edu_dev_cleaned = edu_dev_cleaned.dropna(subset=['updated_label'])
edu_test_cleaned = edu_test_cleaned.dropna(subset=['updated_label'])

# Convert labels to integers (they may have been cast as floats due to NaN handling)
edu_train_cleaned['updated_label'] = edu_train_cleaned['updated_label'].astype(int)
edu_dev_cleaned['updated_label'] = edu_dev_cleaned['updated_label'].astype(int)
edu_test_cleaned['updated_label'] = edu_test_cleaned['updated_label'].astype(int)

# Verify the cleaned datasets
edu_train_cleaned.head(), edu_dev_cleaned.head(), edu_test_cleaned.head()

(                                      source_article  updated_label
 0          company's slogan "Expect More. Pay Less."              6
 1  The bigger a child's shoe size, the better the...              1
 3  Senator Randall isn't lying when she says she ...              2
 5  A mother tells her children not to leave the y...              6
 6  If we ban Hummers because they are bad for the...              0,
                                        source_article  updated_label
 1   You don’t have to do this. My grandmother is i...              6
 2   I know five people from Kentucky. They are all...              0
 3   Pvt. Joe Bowers: What are these electrolytes? ...              2
 7   If we use one more can of hairspray this month...              0
 11   The best example of this fallacy is presented...              1,
                                       source_article  updated_label
 1      White men can't jump. No, really, they can't!              0
 3  "Why are you hitting y

## Llama-2-7b

In [4]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Load your datasets
edu_train = pd.read_csv("edu_train.csv")
edu_dev = pd.read_csv("edu_dev.csv")
edu_test = pd.read_csv("edu_test.csv")
mappings = pd.read_csv("mappings.csv")

# Clean the datasets by removing unwanted columns and mapping labels
edu_train_cleaned = edu_train[['source_article', 'updated_label']].dropna()
edu_dev_cleaned = edu_dev[['source_article', 'updated_label']].dropna()
edu_test_cleaned = edu_test[['source_article', 'updated_label']].dropna()

# Create a mapping from 'Understandable Name' to integer ids
label2id = {row['Understandable Name']: idx for idx, row in mappings.iterrows()}
id2label = {idx: row['Understandable Name'] for idx, row in mappings.iterrows()}

# Map the labels to integer ids and remove NaN values
edu_train_cleaned['updated_label'] = edu_train_cleaned['updated_label'].map(label2id)
edu_train_cleaned = edu_train_cleaned.dropna(subset=['updated_label'])
edu_train_cleaned['updated_label'] = edu_train_cleaned['updated_label'].astype(int)

edu_dev_cleaned['updated_label'] = edu_dev_cleaned['updated_label'].map(label2id)
edu_dev_cleaned = edu_dev_cleaned.dropna(subset=['updated_label'])
edu_dev_cleaned['updated_label'] = edu_dev_cleaned['updated_label'].astype(int)

edu_test_cleaned['updated_label'] = edu_test_cleaned['updated_label'].map(label2id)
edu_test_cleaned = edu_test_cleaned.dropna(subset=['updated_label'])
edu_test_cleaned['updated_label'] = edu_test_cleaned['updated_label'].astype(int)

# Load the LLaMA-2-7B model and tokenizer
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Add new pad token

# Update pad token ID in model configuration
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(mappings))
model.config.pad_token_id = tokenizer.pad_token_id  # Ensure model is aware of the pad token

# Resize token embeddings of the model to account for the new pad_token
model.resize_token_embeddings(len(tokenizer))

# Tokenize the datasets and ensure the labels are integers
def tokenize_function_with_labels(examples):
    tokens = tokenizer(examples["source_article"], padding="max_length", truncation=True)
    tokens["labels"] = [int(label) for label in examples["updated_label"]]
    return tokens

train_dataset = Dataset.from_pandas(edu_train_cleaned).map(tokenize_function_with_labels, batched=True)
dev_dataset = Dataset.from_pandas(edu_dev_cleaned).map(tokenize_function_with_labels, batched=True)
test_dataset = Dataset.from_pandas(edu_test_cleaned).map(tokenize_function_with_labels, batched=True)

# Define metrics for evaluation
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Adjust batch size based on GPU memory
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True,
    # Enable distributed data parallel if multiple GPUs are available
    dataloader_pin_memory=True,
)

# Trainer to handle training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate on the test set
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(test_results)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/757 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/117 [00:00<?, ? examples/s]

Map:   0%|          | 0/121 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


RuntimeError: r.nvmlDeviceGetNvLinkRemoteDeviceType_ INTERNAL ASSERT FAILED at "/opt/conda/conda-bld/pytorch_1720538437738/work/c10/cuda/driver_api.cpp":27, please report a bug to PyTorch. Can't find nvmlDeviceGetNvLinkRemoteDeviceType: /lib/x86_64-linux-gnu/libnvidia-ml.so.1: undefined symbol: nvmlDeviceGetNvLinkRemoteDeviceType