In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset

In [None]:
# load  cais/mmlu


In [None]:
# Load data/news_factor.csv and print, for its first row, the prefix, the
# completion and the three contradiction columns (one value per line).
data = pd.read_csv('data/news_factor.csv')

for column in ('full_prefix', 'completion', 'contradiction_0', 'contradiction_1', 'contradiction_2'):
    print(data.loc[0, column])

In [None]:
import argparse
import os

import numpy as np
import pandas as pd
import torch
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# load data
def extract_example(row):
    """Turn one table row into the example dict used by the evaluation code.

    Args:
        row: Object exposing ``full_prefix``, ``completion`` and
            ``contradiction_0`` .. ``contradiction_2`` as attributes.

    Returns:
        dict with the keys ``'full_prefix'``, ``'completion'`` and
        ``'contradictions'`` (the three contradiction values, in column order).
    """
    example = dict(full_prefix=row.full_prefix, completion=row.completion)
    example['contradictions'] = [getattr(row, f'contradiction_{idx}') for idx in range(3)]
    return example


def read_data(path, prefix_col):
    """Read the evaluation CSV and return its rows as a list of example dicts.

    Args:
        path: CSV location, passed straight to ``pd.read_csv``.
        prefix_col: Name of the column holding the prefix text; it is renamed
            to ``full_prefix`` before the rows are converted.

    Returns:
        list with one ``extract_example`` dict per CSV row.
    """
    # 'doc_id' is selected (so it must exist) although extract_example does not read it.
    wanted = [prefix_col, 'doc_id', 'completion', 'contradiction_0', 'contradiction_1', 'contradiction_2']
    frame = pd.read_csv(path)[wanted]
    frame = frame.rename(columns={prefix_col: 'full_prefix'})
    examples = frame.apply(extract_example, axis=1)
    return examples.to_list()

# load model
def load_tokenizer(model_name, max_tokens):
    """Load the tokenizer for ``model_name`` configured for scoring.

    Args:
        model_name: Identifier handed to ``AutoTokenizer.from_pretrained``.
        max_tokens: Value used as the tokenizer's ``model_max_length``.

    Returns:
        The tokenizer, padding on the right, truncating on the left, with its
        pad token set to its end-of-sequence token.
    """
    tokenizer_kwargs = dict(padding_side='right', truncation_side='left', model_max_length=max_tokens)
    tok = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)
    # Reuse end-of-sequence as the padding token.
    tok.pad_token = tok.eos_token
    return tok


def load_model_and_tokenizer(model_name, cache_dir=None, max_tokens=1024):
    """Load a causal language model in eval mode together with its tokenizer.

    Args:
        model_name: Identifier handed to the ``from_pretrained`` loaders.
        cache_dir: Optional cache directory; only forwarded to the model loader
            when CUDA is available.
        max_tokens: Forwarded to ``load_tokenizer``.

    Returns:
        Tuple ``(model, tokenizer, device)`` where ``device`` is ``"cuda"`` or
        ``"cpu"``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    multi_gpus = torch.cuda.device_count() > 1
    config = AutoConfig.from_pretrained(model_name)

    load_kwargs = {}
    if device != 'cpu' and cache_dir is not None:
        load_kwargs["cache_dir"] = cache_dir
    if multi_gpus:
        # With several GPUs the loader is asked to place the weights itself.
        load_kwargs.update(device_map="auto", low_cpu_mem_usage=True)
    config_dtype = getattr(config, "torch_dtype", None)
    if config_dtype is not None:
        # Load in the dtype recorded in the model config, when there is one.
        load_kwargs["torch_dtype"] = config_dtype

    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
    model = model.eval()
    if not multi_gpus:
        model = model.to(device)

    tokenizer = load_tokenizer(model_name, max_tokens)
    print(model.dtype)
    # Mirror the tokenizer setup: padding id is the end-of-sequence id.
    model.config.pad_token_id = model.config.eos_token_id
    return model, tokenizer, device

# prepare examples for evaluation
def format_data(ex):
    """Build the four texts to score for one example, plus their continuations.

    Args:
        ex: dict with ``'full_prefix'`` (str), ``'completion'`` (str) and
            ``'contradictions'`` (list of str).

    Returns:
        Tuple ``(batch, labels_batch)``. ``batch`` holds the prefix joined with
        the completion followed by the prefix joined with each contradiction;
        ``labels_batch`` holds the matching continuation strings exactly as
        they appear at the end of each ``batch`` entry (including the leading
        space when one was inserted).
    """
    prefix = ex['full_prefix']

    # Drop leading spaces so the separator chosen below is the only space
    # between the prefix and the continuation.
    completion = ex['completion'].lstrip(' ')
    contradictions = [cont.lstrip(' ') for cont in ex['contradictions']]

    # A prefix ending with a new line (or an empty prefix) is concatenated as is.
    # Otherwise the continuation starts with a single space; a trailing space
    # is removed from the prefix first so the space is not doubled. Without this
    # a prefix ending in a word was glued directly onto the continuation.
    if not prefix or prefix.endswith('\n'):
        separator = ''
    else:
        if prefix.endswith(' '):
            prefix = prefix[:-1]
        separator = ' '

    labels_batch = [separator + text for text in [completion] + contradictions]
    batch = [prefix + continuation for continuation in labels_batch]
    return batch, labels_batch


def prep_batch(ex, tokenizer, device):
    """Tokenize one example and build labels covering only the continuation tokens.

    Args:
        ex: Example dict accepted by ``format_data``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors. The offset arithmetic assumes padding is
            added on the right, as ``load_tokenizer`` configures it.
        device: Device the tensors are placed on.

    Returns:
        Tuple ``(encoding, labels, target_lens)``: ``encoding`` is a plain dict
        of model inputs, ``labels`` equals ``input_ids`` on continuation tokens
        and -100 everywhere else, and ``target_lens`` is the number of
        continuation tokens per row.

    Raises:
        AssertionError: if the labels are not confined to the continuation span.
    """
    batch, labels_batch = format_data(ex)
    tokenizer_kwargs = dict(padding=True, truncation=True, return_tensors='pt', add_special_tokens=False)

    # Full texts (prefix + continuation); one device transfer is enough.
    encoding = dict(tokenizer(batch, **tokenizer_kwargs).to(device).items())
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # The continuations are tokenized on their own only to count their tokens.
    labels_encoding = tokenizer(labels_batch, **tokenizer_kwargs).to(device)
    input_lens = torch.sum(attention_mask, axis=-1)
    target_lens = torch.sum(labels_encoding['attention_mask'], axis=-1)

    # The continuation occupies the last `target_len` non-padding positions.
    offsets = input_lens - target_lens
    positions = torch.arange(0, input_ids.size(-1), device=device)[None, :]
    labels_mask = (positions >= offsets[:, None]) * attention_mask

    # -100 marks the positions that must not contribute to the loss.
    labels = input_ids * labels_mask + (-100) * (1 - labels_mask)

    # Sanity checks: labels sit exactly on the continuation span.
    for input_id, label, target_len, offset in zip(input_ids, labels, target_lens, offsets):
        assert torch.all(input_id[offset: offset + target_len].eq(label[offset:offset+target_len])), "labels don't appear in input ids"
        assert torch.all(label[:offset] == -100), "labels include redundant prefix"
        assert torch.all(label[offset + target_len:] == -100), "labels include redundant suffix"
    return encoding, labels, target_lens


def get_losses(logits, labels):
    """Per-token negative log-likelihood, returned on CPU in the shape of ``labels``.

    Args:
        logits: Float tensor whose last dimension is the vocabulary.
        labels: Integer tensor of target ids with the shape of ``logits`` minus
            the vocabulary dimension; entries equal to -100 (the loss's default
            ignore index) yield a loss of 0.

    Returns:
        CPU tensor with ``labels``' shape holding the unreduced losses.
    """
    vocab_size = logits.size(-1)
    flat_nll = torch.nn.functional.cross_entropy(
        logits.reshape(-1, vocab_size),
        labels.reshape(-1),
        reduction="none",
    )
    return flat_nll.cpu().view(labels.size())


def run_eval(model, tokenizer, data, device):
    """Score the completion and the three contradictions of every example.

    Args:
        model: Causal language model; called with the encoding from
            ``prep_batch`` and expected to expose ``.logits``.
        tokenizer: Tokenizer forwarded to ``prep_batch``.
        data: Sequence of example dicts.
        device: Device forwarded to ``prep_batch``.

    Returns:
        Tensor of shape ``(len(data), 4)``. Each row holds the summed token
        loss divided by the continuation length for the completion (column 0)
        and the contradictions (columns 1-3); an example counts as correct
        when column 0 is the smallest.
    """
    all_scores = torch.empty((len(data), 4))
    # The debugging `print(ex); break` was removed: it left the loop after the
    # first example, so the uninitialised buffer above was returned unscored.
    for i, ex in tqdm(enumerate(data), total=len(data)):
        input_ids, target, target_lens = prep_batch(ex, tokenizer, device=device)
        with torch.no_grad():
            out = model(**input_ids)
            # Position t predicts token t + 1, hence the one-step shift.
            nll = get_losses(out.logits[..., :-1, :], target[:, 1:])

        # Length-normalised score of each full continuation.
        scores = torch.sum(nll, axis=-1) / target_lens.to('cpu')
        all_scores[i] = scores
        if i % 100 == 0:
            # Running accuracy over the examples scored so far.
            acc = np.sum(np.argmin(all_scores[:(i+1), :].numpy(), axis=1) == 0) / (i+1)
            print(f"processed: {i+1}/{len(data)} examples. accuracy: {acc}")
    return all_scores


def main(args):
    """Run the evaluation end to end and write the per-example scores as JSON lines.

    Args:
        args: Namespace providing ``data_file``, ``model_name``, ``cache_dir``,
            ``max_tokens`` and ``output_folder``.
    """
    # NOTE(review): this spelling must match the CSV header exactly — confirm.
    prefix_col = 'turncated_prefixes'

    # Create the destination up front so a missing folder fails before the
    # evaluation runs instead of after it, when the scores would be lost.
    results_path = get_results_path(args.output_folder, args.model_name)
    os.makedirs(os.path.dirname(results_path) or '.', exist_ok=True)

    data = read_data(args.data_file, prefix_col)
    model, tokenizer, device = load_model_and_tokenizer(args.model_name, args.cache_dir, max_tokens=args.max_tokens)
    all_scores = run_eval(model, tokenizer, data, device)

    data = pd.DataFrame(data)
    data['scores'] = list(all_scores.to('cpu').numpy())
    # An example is correct when the completion (column 0) has the lowest score.
    acc = np.sum(np.argmin(np.array(data['scores'].to_list()), axis=1) == 0) / len(data)
    print(f"acc = {acc}")
    data.to_json(results_path, lines=True, orient='records')
    print("Done!")


def get_results_path(output_folder, model_name):
    """Return ``<output_folder>/<name>.jsonl``.

    ``<name>`` is the part of ``model_name`` after its last ``'/'`` (the whole
    string when it contains none).
    """
    short_name = model_name.rsplit('/', 1)[-1]
    return os.path.join(output_folder, f"{short_name}.jsonl")


# if __name__ == "__main__":
#     parser = argparse.ArgumentParser()

#     # Data params
#     parser.add_argument('--data_file', required=True, type=str, help="csv file")
#     parser.add_argument('--output_folder', required=True, type=str)

#     # Model params
#     parser.add_argument('--model_name', default='gpt2', type=str)
#     parser.add_argument('--max_tokens', type=int, default=1024)

#     parser.add_argument("--cache_dir", type=str, default="/dev/shm/cache-transformers/")
#     args = parser.parse_args()
#     main(args)


In [None]:
# Notebook counterpart of main(). `args` only exists in the (commented-out)
# command-line entry point, so the settings are spelled out here.
prefix_col = 'turncated_prefixes'
data_file = 'data/news_factor.csv'
model_name = 'gpt2'
output_folder = 'results'

data = read_data(data_file, prefix_col)
model, tokenizer, device = load_model_and_tokenizer(model_name, './cache-transformers/', max_tokens=1024)
all_scores = run_eval(model, tokenizer, data, device)

# `data` stays the list of examples; the scored table gets its own name.
results = pd.DataFrame(data)
results['scores'] = list(all_scores.to('cpu').numpy())
# An example is correct when the completion (column 0) has the lowest score.
acc = np.sum(np.argmin(np.array(results['scores'].to_list()), axis=1) == 0) / len(results)
print(f"acc = {acc}")

os.makedirs(output_folder, exist_ok=True)
results.to_json(get_results_path(output_folder, model_name), lines=True,
                orient='records')
print("Done!")

In [None]:
# Peek at element 0 of `data` (debugging aid).
data[0]

In [None]:
# prep_batch(data[0], tokenizer, device)
# format_data(data[0])

In [None]:
# Load every parquet file matching the glob and show the size of the 'train' split.
dataset = load_dataset("parquet", data_files="/iopsstor/scratch/cscs/dfan/data/robots-txt/RawData-NYTimes/*.parquet")
len(dataset['train'])

In [None]:
from datasets import Dataset
import os

# Folder containing one .txt file per article, plus settings used by later cells.
folder_path = "/iopsstor/scratch/cscs/ansaripo/factor/NYT articles/new_articles"
dataset_name = "nytimes_new_verbatim_256"
pre_len = 256  # default `input_size` of split_example

# Read the text files into a list of dicts. os.listdir() returns names in
# arbitrary order, so they are sorted to keep the dataset order reproducible.
data = []
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".txt"):
        continue
    with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
        data.append({"filename": filename, "text": f.read()})

# Create a Hugging Face Dataset
dataset = Dataset.from_list(data)



In [None]:
# Histogram (100 bins) of the number of whitespace-separated words per text.
dataset.to_pandas()['text'].apply(lambda x: len(x.split())).hist(bins=100)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('alehc/swissai-tokenizer')
# Print every text of at most 256 whitespace-separated words with its token count.
# The loop variable is not named `data`, so the list of records built in the
# earlier cell is not overwritten by the last example.
for example in dataset:
    if len(example['text'].split()) <= 256:
        print('found')
        print(example['text'])
        print(len(tokenizer.tokenize(example['text'])))

In [None]:
# Count texts per word-count bucket. `.loc[:1000, ...]` is label-based and
# inclusive, so rows 0..1000 are used.
# NOTE(review): this indexes dataset['train'], i.e. it expects the DatasetDict
# returned by load_dataset, while an earlier cell rebinds `dataset` to a plain
# Dataset built with Dataset.from_list — confirm which dataset is intended.
bins = [0, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]
pd.cut(dataset['train'].to_pandas().loc[:1000, 'text'].apply(lambda x: len(x.split())), bins=bins, include_lowest=True).value_counts()

In [None]:
from transformers import AutoTokenizer
# Use the whole dataset; the commented-out line would keep only the first
# 12800 rows of the 'train' split instead.
# fset = dataset["train"].select(range(12800))
fset = dataset
tokenizer = AutoTokenizer.from_pretrained('alehc/swissai-tokenizer')

In [None]:
# from transformers import AutoTokenizer
# # get the first 20000 examples
# subset = dataset["train"].select(range(12800))
# tokenizer = AutoTokenizer.from_pretrained('alehc/swissai-tokenizer')

def split_example(examples, input_size=pre_len, max_tokens=4096):
    """Split every text of a batch into a prompt and a continuation by token count.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with a ``'text'`` list (as passed by
            ``Dataset.map(..., batched=True)``).
        input_size: Number of leading tokens that form the prompt.
        max_tokens: Token index at which the continuation is cut off.

    Returns:
        dict with ``'input_text'`` (tokens ``[:input_size]`` converted back to a
        string) and ``'target_text'`` (tokens ``[input_size:max_tokens]``), one
        entry per input text. The target is an empty token slice when the text
        has at most ``input_size`` tokens.
    """
    prompts = []
    continuations = []
    for text in examples['text']:
        tokens = tokenizer.tokenize(text)
        prompts.append(tokenizer.convert_tokens_to_string(tokens[:input_size]))
        continuations.append(tokenizer.convert_tokens_to_string(tokens[input_size:max_tokens]))
    return {'input_text': prompts, 'target_text': continuations}

# Add the 'input_text' / 'target_text' columns produced by split_example, batch by batch.
subset = fset.map(split_example, batched=True)


In [None]:
# Keep only the examples whose 'target_text' is non-empty.
positive_examples = subset.filter(lambda example: len(example['target_text']) != 0)

In [None]:
# Number of examples kept by the filter vs. number before filtering.
print(len(positive_examples), len(subset))

In [None]:
# # save the dataset
# positive_examples.save_to_disk('/iopsstor/scratch/cscs/ansaripo/data/nytimes_verbatim')

In [None]:
#login to huggingface
!transform
!transformers-cli login

In [None]:
# Push the dataset to the hub.
import os

from datasets import Dataset, DatasetDict
from huggingface_hub import login

# Never paste a token into the notebook: take it from the environment and, when
# it is unset or empty, pass None so that login() asks for it interactively.
HF_TOKEN = os.environ.get('HF_TOKEN') or None
login(token=HF_TOKEN)
DatasetDict({'test': positive_examples}).push_to_hub(dataset_name)

In [None]:
from transformers import AutoModelForCausalLM
# `!export HF_TOKEN=` was dropped: `!` runs the command in a throwaway subshell,
# so it never changed the kernel's environment. Export HF_TOKEN before starting
# the kernel if this checkpoint requires authentication.
model = AutoModelForCausalLM.from_pretrained('mistralai/Mistral-Nemo-Base-2407')

In [None]:
# Load the dataset and display its summary; the result is not kept.
load_dataset("mansaripo/nytimes_verbatim")

In [None]:
# Rebind `dataset` to the verbatim dataset; the following cells read its 'test' split.
dataset = load_dataset("mansaripo/nytimes_verbatim")

In [None]:
# Print prompt and continuation of every test example whose continuation
# contains the string '00 photographs.'.
for data in dataset['test']:
    if '00 photographs.' not in data['target_text']:
        continue
    print(data['input_text'])
    print(data['target_text'])

In [None]:
# Print prompt and continuation of every test example whose continuation is
# shorter than 100 characters and contains 'why?'.
for data in dataset['test']:
    if len(data['target_text']) >= 100 or 'why?' not in data['target_text']:
        continue
    print(data['input_text'])
    print(data['target_text'])

In [None]:
import json
import os
from datasets import DatasetDict, Dataset

from transformers import AutoTokenizer

output_file = "/users/ansaripo/deepseek_questions_mcq.json"
dataset_name = "nytimes_mcq_with_context"

# Previously generated questions; an absent file means there are none.
if os.path.exists(output_file):
    # Context manager so the file handle is closed instead of leaked.
    with open(output_file, "r") as f:
        pre_questions = json.load(f)
else:
    pre_questions = []

tokenizer = AutoTokenizer.from_pretrained('alehc/swissai-tokenizer')

article_data = "/iopsstor/scratch/cscs/dfan/data/robots-txt/RawData-NYTimes/*.parquet"
processed_data = load_dataset("parquet", data_files=article_data)['train']

# Attach to every question the article it indexes, cut to its first
# 4096 - 70 tokens. The text is not stored in a variable named `input`, which
# would shadow the builtin for the rest of the session.
for q in pre_questions:
    tokens = tokenizer.tokenize(processed_data[q['index']]['text'])
    q['generated_question']['article'] = tokenizer.convert_tokens_to_string(tokens[:(4096 - 70)])

DatasetDict({'test': Dataset.from_list([q['generated_question'] for q in pre_questions])}).push_to_hub(dataset_name)


In [None]:
# Load the pushed dataset and display its summary; the result is not kept.
load_dataset("mansaripo/nytimes_mcq_with_context")


In [None]:
!export HF_HOME=/iopsstor/scratch/cscs/ansaripo/huggingface

In [None]:
# Print HF_HOME as seen by a shell started from the kernel.
!echo $HF_HOME

In [None]:
from datasets import load_dataset

datsetset = load_dataset("mansaripo/nytimes_mcq_eval")

# Count the predictions that equal the reference answer.
correct = 0
for data in datsetset['test']:
    answer = data['generated_question']['answer']
    pred = data['prediction']
    if answer == pred:
        correct += 1
# Divide by the actual size of the split rather than a hard-coded 1000.
print(f"Accuracy: {correct/len(datsetset['test'])}")

In [None]:
from datasets import load_dataset

datsetset = load_dataset("mansaripo/nytimes_mcq_eval_blind")

# Count the predictions that equal the reference answer.
correct = 0
for data in datsetset['test']:
    answer = data['generated_question']['answer']
    pred = data['prediction']
    if answer == pred:
        correct += 1
# Only the figure normalised by the real split size is reported; the second
# "Accuracy" line with a hard-coded denominator of 1000 was dropped.
print(f"Accuracy: {correct/len(datsetset['test'])}")

In [None]:
from datasets import load_dataset

datsetset = load_dataset("mansaripo/nytimes_mcq_eval_blind_gpt")

# Accuracy = share of test rows whose prediction equals the reference answer.
correct = 0
for data in datsetset['test']:
    correct += int(data['generated_question']['answer'] == data['prediction'])
print(f"Accuracy: {correct/len(datsetset['test'])}")