In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
from transformers import (AutoModelForCausalLM, AutoTokenizer, BertForSequenceClassification, BertTokenizer, BertModel,
 RobertaForSequenceClassification, RobertaTokenizer, RobertaModel, TrainingArguments, Trainer, pipeline, RobertaConfig)



from datasets import load_dataset, load_from_disk, Dataset, DatasetDict, concatenate_datasets
import evaluate
import wandb
import os

os.environ["WANDB_PROJECT"] = "gen_detector"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints

# Init Generator and Detector

In [6]:
#GEN_PATH = "microsoft/phi-2"
GEN_PATH = "openai-community/gpt2"
#GEN_PATH = "Qwen/Qwen1.5-0.5B-Chat"
#BERT_PATH = "bert-base-uncased"
BERT_PATH = "openai-community/roberta-base-openai-detector"
device = "cuda" if torch.cuda.is_available() else "cpu"


class LLMGenerator(nn.Module):
  def __init__(self, gpt_model, tokenizer, device=None, gen_params=None):
    super().__init__()

    # gpt should already be trained
    self.gpt = gpt_model
    self.tokenizer = tokenizer
    if device is None:
      self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
      self.device = device

    self.default_gen_params = {
        "max_length": 150,
        #"max_new_tokens": 100,
        "temperature": 0.8,
        "top_p": 0.8,
        "repetition_penalty": 1
    }
    self.gen_params = gen_params if gen_params is not None else self.default_gen_params
       

  def forward_old(self, text, max_length=512, max_new_tokens=100, temperature=1, top_k=50, top_p=0.9, repetition_penalty=1, skip_special_tokens=True):

    # tokenize text using the tokenizer
    input_ids = self.tokenizer.encode(text, return_tensors="pt").to(self.device)
    # generate text using the gpt model
    #output_ids = self.gpt.generate(input_ids, max_length=max_length, max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty)
    with torch.no_grad():
      output_ids = self.gpt.generate(input_ids, max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty)

    # optional, remove input_ids from output_ids
    #output_ids = [output_id[len(input_ids):] for input_id, output_id in zip(input_ids, output_ids)]

    # decode the generated text
    decoded_output = self.tokenizer.batch_decode(output_ids, skip_special_tokens=skip_special_tokens)[0]

    # remove the input text from the generated text
    decoded_output = decoded_output.replace(text, "").strip()

    # remove special tokens from the generated text
    special_tokens = self.tokenizer.additional_special_tokens

    for special_token in special_tokens:
      decoded_output = decoded_output.replace(special_token, "")

    #decoded_output = decoded_output.replace(text, "")
    return decoded_output
  

  def forward(self, samples, max_new_tokens=100):

    max_length = self.gen_params["max_length"]
    #self.gen_params["max_new_tokens"] = max_new_tokens
    encoding = self.tokenizer.batch_encode_plus(samples, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
    input_ids = encoding['input_ids'].to(self.device)

    # generate text using the gpt model
    with torch.no_grad():
      #output_ids = self.gpt.generate(input_ids, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, repetition_penalty=repetition_penalty)
      output_ids = self.gpt.generate(input_ids, pad_token_id = self.tokenizer.pad_token_id, **self.gen_params)

    # decode the generated text
    #decoded_outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=skip_special_tokens)
    decoded_outputs = self.tokenizer.batch_decode(output_ids[:, input_ids.shape[1]:])
        
    # remove special tokens from the generated text
    special_tokens = self.tokenizer.additional_special_tokens + [self.tokenizer.pad_token] + [self.tokenizer.eos_token]
    for i, sample in enumerate(samples):
        decoded_output = decoded_outputs[i]
        for special_token in special_tokens:
            decoded_output = decoded_output.replace(special_token, "")
        decoded_outputs[i] = decoded_output
        
    return decoded_outputs
  

class LLMDetector(nn.Module):
  def __init__(self, bert_model, tokenizer, num_classes):
    super().__init__()

    self.tokenizer = tokenizer

    # bert should already be trained
    self.bert = bert_model

    # set num_classes
    self.num_classes = num_classes

  def forward(self, text):

    # tokenize text using the tokenizer
    output = self.tokenizer(text, return_tensors="pt")
    input_ids = output["input_ids"].to(device)
    logits = self.bert(input_ids)["logits"]

    # apply sigmoid to get probabilities of each class
    output = torch.sigmoid(logits)
    return output
        

In [24]:
test_list = []

if  test_list:
    print("Hello")

In [18]:
# this causes an issue with Trainer
#torch.set_default_device("cuda")

gen_model = AutoModelForCausalLM.from_pretrained(GEN_PATH, torch_dtype="auto").to(device)
gen_tokenizer = AutoTokenizer.from_pretrained(GEN_PATH, trust_remote_code=True)

gen_tokenizer.pad_token = gen_tokenizer.eos_token

gen_params = {
        "max_length": 200,
        "max_new_tokens": None,
        "temperature": 0.8,
        "top_p": 0.8,
        "repetition_penalty": 1,
        "do_sample": True
    }

gen_params["repetition_penalty"] = 1.1
generator = LLMGenerator(gen_model, gen_tokenizer, gen_params=gen_params, device=device)



text_input = ['''def print_prime(n):
   """
   Print all primes between 1 and n
   """''']
with torch.no_grad():
   output = generator(text_input)
print(output[0])


 return [0, 0] for i in range (3) do try: # Get prime numbers from the list of digits if len(primples[i]) > 5 then s = [] while not sorted.empty() or None : tuple=dict([1..9], dict((j-s), strlen(tuple))).split().join(' ','') end break elseif k >= 2 then j = lambda x: vy += 3 * np.zeros(-k + y)*2/4 except KeyboardError as e: raise ExitIteration EndIterator def get_routine():... >>> int64("%c", "").putln(lambda r: f32+f34)+"|".format(predictor()) - pprintl(*{'x':''})....}..... python


In [14]:
detector_model = RobertaForSequenceClassification.from_pretrained(BERT_PATH).to(device)
bert_tokenizer = RobertaTokenizer.from_pretrained(BERT_PATH)
detector = LLMDetector(detector_model, bert_tokenizer, 2)

text = "def print_prime(n):\n   \"\"\"\n   Print all primes between 1 and n\n   \"\"\""

logits = detector(text)
fake = logits.argmax().item()

Some weights of the model checkpoint at openai-community/roberta-base-openai-detector were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
detector_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [16]:
num_classes = 2
detector_model.classifier = nn.Sequential(
        nn.Linear(detector_model.config.hidden_size, detector_model.config.hidden_size),
        nn.Dropout(0.1),
        nn.Linear(detector_model.config.hidden_size, detector_model.config.hidden_size),
        nn.Dropout(0.1),
        nn.Linear(detector_model.config.hidden_size, num_classes)
      )

In [17]:
detector_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [4]:
test_squence =  [0, 4993,  688]
decoded = bert_tokenizer.decode(test_squence)
print(decoded)

<s>After weeks


In [7]:
test_str = "The women are too afraid and ashamed to show their faces.  They have been raped, abused and left for dead by men they trusted.  Now they are"
test_str = test_str.replace(".  ", ". ")
test_str

'The women are too afraid and ashamed to show their faces. They have been raped, abused and left for dead by men they trusted. Now they are'

In [4]:
detector_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [16]:
for name, param in detector_model.named_parameters():
    if name.startswith("classifier"):
        print(name)

classifier.dense.weight
classifier.dense.bias
classifier.out_proj.weight
classifier.out_proj.bias


In [8]:
fake_text_mistral = "This sentence.  Has double spaces.  And is not a complete sentence"
fake_text_mistral = fake_text_mistral.replace(".  ", ". ")
fake_text_mistral

'This sentence. Has double spaces. And is not a complete sentence'

# Load dataset of instructions and output with gen

In [11]:
dataset_path = "databricks/databricks-dolly-15k"
dataset = load_dataset(dataset_path)

dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'context', 'response', 'category'],
        num_rows: 15011
    })
})

In [12]:
dataset["train"][0]

{'instruction': 'When did Virgin Australia start operating?',
 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.",
 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.',
 'category': 'closed_qa'}

In [13]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [14]:
type(dataset["train"])

datasets.arrow_dataset.Dataset

## Data exploration

In [15]:
# compute mean length of the responses
lengths_response = [len(x) for x in dataset["train"]["response"]]
print("Average length of responses:", np.mean(lengths_response))

Average length of responses: 358.10419026047566


In [16]:
lengts_instruction = [len(x) for x in dataset["train"]["instruction"]]
print("Average length of instructions:", np.mean(lengts_instruction))

Average length of instructions: 71.83938445140231


In [17]:
# discard instructions that are more than max_nb_tokens_input tokens
max_nb_tokens_input = 100

# tokenize the instructions
dataset = dataset.map(lambda x: {"tokenized_instruction": gen_tokenizer(x["instruction"])})
dataset = dataset.map(lambda x: {"tokenized_context": gen_tokenizer(x["context"])})
dataset_before_len = len(dataset["train"])
dataset = dataset.filter(lambda x: len(x["tokenized_instruction"]["input_ids"]) + len(x["tokenized_context"]["input_ids"]) <= max_nb_tokens_input)
dataset_after_len = len(dataset["train"])
print(f"Percent of data discarded: {100*(1 - dataset_after_len/dataset_before_len):.2f}%")

Percent of data discarded: 26.29%


In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'context', 'response', 'category', 'tokenized_instruction', 'tokenized_context'],
        num_rows: 11065
    })
})

In [19]:
# test output with first instruction
text = dataset["train"][0]
text


{'instruction': 'Which is a species of fish? Tope or Rope',
 'context': '',
 'response': 'Tope',
 'category': 'classification',
 'tokenized_instruction': {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'input_ids': [23085, 374, 264, 9419, 315, 7640, 30, 2014, 375, 476, 97896]},
 'tokenized_context': {'attention_mask': [], 'input_ids': []}}

In [20]:
text_instruction = f"Context: {text["context"]} \n Question: {text["instruction"]}"
output = generator(text_instruction)
print("Question: ", text_instruction)
#print()
print("Generated answer: ", output)
#print()
print("Real human answer: ", text["response"])


Question:  Context:  
 Question: Which is a species of fish? Tope or Rope
Generated answer:  Context:  
 Question: Which is a species of fish? Tope or Rope
Real human answer:  Tope


In [21]:
output = generator("What is the capital of France?")
output

'What is the capital of France?'

In [22]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"}
]
text = gen_tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

In [23]:
output = generator(text)
print(output)

system
You are a helpful assistant.
user
What is the capital of France?
assistant
Paris


In [24]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"{text_instruction}"},
]
text = gen_tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

output = generator(text)
print(output)

system
You are a helpful assistant.
user
Context:  
 Question: Which is a species of fish? Tope or Rope
assistant
Tope is a species of fish.


## generate fake dataset

see Open AI GPT-2 generated dataset: https://github.com/openai/gpt-2-output-dataset?tab=readme-ov-file
and https://github.com/openai/gpt-2-output-dataset/tree/master/detector for their roberta detector

In [25]:
def generate_fake_responses(generator, dataset):
    """
    Traverse dataset and generate responses for each instruction
    """

    fake_responses = []
    for data in dataset:
        # Create query in the format that the generator expects
        text_instruction = f"Context: {data['context']} \n {data['instruction']}"
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"{text_instruction}"},
        ]
        text = gen_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Generate response
        output = generator(text, skip_special_tokens=False)
        
        fake_responses.append(output)
    return fake_responses

def create_random_subset(dataset, n=10):
    """
    Create a random subset of the dataset
    """
    if n > len(dataset):
        n = len(dataset)
    indices = np.random.choice(len(dataset), n, replace=False)
    subset = dataset.select(indices)
    return subset

def filter_instruction(sample):
    """
    Note: only works if special tokens are not removed
    """

    text_instruction = f"Context: {sample['context']} \n {sample['instruction']}"
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"{text_instruction}"},
    ]
    text_template = gen_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    generated_response = sample["generated_response"]

    response_without_instruction = generated_response.replace(text_template, "")
    return {"generated_response": response_without_instruction}

In [26]:
# take a random subset of the dataset
subset_size = 5
train_subset = create_random_subset(dataset["train"], n=subset_size)
#test_subset = create_random_subset(dataset["test"], n=10)
#eval_subset = create_random_subset(dataset["validation"], n=10)

# generate fake responses for the subsets
fake_responses_train = generate_fake_responses(generator, train_subset)
#fake_responses_test = generate_fake_responses(generator, test_subset)
#fake_responses_eval = generate_fake_responses(generator, eval_subset)

#fake_dataset = Dataset.from_dict({"train": fake_responses_train, "test": fake_responses_test, "validation": fake_responses_eval})

fake_responses_train = Dataset.from_dict({"generated_response": fake_responses_train, "instruction": train_subset["instruction"],
    "context": train_subset["context"], "true_response": train_subset["response"], "category": train_subset["category"]})

fake_dataset = DatasetDict()
fake_dataset["train"] = fake_responses_train

# save fake dataset
fake_dataset.save_to_disk("fake_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/5 [00:00<?, ? examples/s]

In [27]:
gen_tokenizer = AutoTokenizer.from_pretrained(GEN_PATH, trust_remote_code=True)

# load fake dataset
fake_dataset = load_from_disk("fake_dataset")
fake_dataset

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DatasetDict({
    train: Dataset({
        features: ['generated_response', 'instruction', 'context', 'true_response', 'category'],
        num_rows: 5
    })
})

In [28]:
print(fake_dataset["train"][0]["instruction"])

Where was the 11th BRICS held?


In [29]:
print(fake_dataset["train"][0]["true_response"])

Brasila, the capital of Brazil


In [30]:
print(fake_dataset["train"][0]["generated_response"])

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Context:  
 Where was the 11th BRICS held?<|im_end|>
<|im_start|>assistant
The 11th BRICS held in Brazil took place at the National Stadium in Brasília.<|im_end|>


In [31]:
fake_dataset = fake_dataset.map(filter_instruction)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [32]:
print(fake_dataset["train"][0]["generated_response"])

The 11th BRICS held in Brazil took place at the National Stadium in Brasília.<|im_end|>


In [33]:
true_dataset = dataset.select_columns(["response"])
true_dataset = true_dataset.rename_column("response", "text")

# select random samples from true_dataset to match fake_dataset size
true_dataset = true_dataset.shuffle(seed=42)


In [34]:
true_dataset = true_dataset.select(range(len(fake_dataset["train"])))

# create label = 0 for true responses and label = 1 for fake responses
true_dataset = true_dataset.map(lambda x: {"label": 0})
fake_dataset = fake_dataset.map(lambda x: {"label": 1})

# remove true_response from fake_dataset
fake_dataset = fake_dataset.remove_columns(["true_response"])

# rename generated_response to response
fake_dataset = fake_dataset.rename_column("generated_response", "text")
fake_dataset = fake_dataset.select_columns(["text", "label"])

# merge fake and true datasets
merged = concatenate_datasets([true_dataset["train"], fake_dataset["train"]])
merged_dataset = DatasetDict()
merged_dataset["train"] = merged

# shuffle the dataset
merged_dataset = merged_dataset.shuffle(seed=42)

# save merged dataset
merged_dataset.save_to_disk("merged_dataset")

AttributeError: 'DatasetDict' object has no attribute 'select'

In [None]:
# load merged dataset
human_fake_train_dataset = load_from_disk("merged_dataset")

In [None]:
true_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 15011
    })
})

In [None]:
fake_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5
    })
})

In [None]:
merged_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 15016
    })
})

# Training Detector

In [None]:
# tokenize the merged dataset with the detector tokenizer
def tokenize_text(x, tokenizer):
    return tokenizer(x["text"], truncation=True, padding="max_length")

human_fake_train_dataset = human_fake_train_dataset.map(lambda x: tokenize_text(x, bert_tokenizer), batched=True)


In [None]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_ratio=0.1,
    weight_decay=0.01,
    learning_rate=1e-3,
    logging_steps=5,
    logging_dir="./logs",
    report_to="wandb",



)

trainer = Trainer(
    model=detector_model,
    args=training_args,
    train_dataset=human_fake_train_dataset["train"],
    #eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [25]:
trainer.train()
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mhdasilva[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,3.803
10,2.3678
15,0.8344
20,0.779
25,2.2466
30,2.0463
35,1.1581
40,0.9196
45,0.2135
50,0.0602


KeyboardInterrupt: 

# DPO training BERT

In [None]:
training_args = TrainingArguments(
    output_dir="./output",
    report_to="wandb",
    logging_dir="./logs",
    logging_steps=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_ratio=0.1,
    weight_decay=0.01,
    learning_rate=1e-3,
    num_train_epochs=3,
)

dpo_trainer = DPOTrainer(
    model,
    model_ref=None,
    args=training_args,
    beta=0.1,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

dpo_trainer.train()
wandb.finish()

# Phi-2 with prompt

In [2]:
# Initialize the pipeline
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype=torch.float16, trust_remote_code=True).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
generator = pipeline('text-generation', model=model, tokenizer=tokenizer , device=0)

In [16]:
text = '''Here is a story about a cat and a dog:

Story:'''
result = generator(text, max_length=200)
print(result[0]['generated_text'])

Here is a story about a cat and a dog:

Story: The Cat and the Dog

Once upon a time, there was a cat named Fluffy and a dog named Spot. Fluffy and Spot were best friends, and they liked to play together. They also liked to eat together, and they shared their food.

One day, Fluffy and Spot went to the park with their owners. They saw a lot of other animals there, like birds, squirrels, rabbits, and horses. They also saw a lot of people there, like children, parents, teachers, and doctors.

Fluffy and Spot were curious about the other animals and people. They wanted to learn more about them and their lives. They decided to explore the park and see what they could find.

Fluffy and Spot saw a bird flying in the sky. They wondered what it was like to fly. They saw a squirrel climbing a tree. They wondered what it was like


In [18]:
text = '''Here is a story about a cat and a dog written by Shakespeare:

Story:'''
result = generator(text, max_length=200)
print(result[0]['generated_text'])

Here is a story about a cat and a dog written by Shakespeare:

Story:

Once upon a time, there was a cat named Whiskers and a dog named Spot. They lived in a cozy house with their owner, Mr. Brown. Whiskers and Spot were very different from each other. Whiskers was independent and aloof, while Spot was loyal and friendly.

One day, Mr. Brown decided to take them on a trip to the park. He put them in a basket and drove them to the bus stop. Whiskers was nervous and hid under the basket, while Spot wagged his tail and barked happily.

When they arrived at the park, Mr. Brown let them out of the basket. Whiskers cautiously explored the grass and trees, while Spot ran around and played with other dogs. Mr. Brown threw a ball for Spot to fetch, and Spot brought it back to him with a big smile.




In [25]:
text = '''Here is a story about a cat and a dog written by Victor Hugo:

Story:'''
result = generator(text, max_length=200)
print(result[0]['generated_text'])

Here is a story about a cat and a dog written by Victor Hugo:

Story: The Cat and the Dog

Once upon a time, there was a cat named Mimi and a dog named Max. Mimi and Max were best friends, but they had very different personalities. Mimi was shy and quiet, while Max was loud and energetic.

One day, Mimi and Max decided to go on an adventure together. They wanted to explore the forest near their house, where they had never been before. They packed some food and water, and set off early in the morning.

As they walked through the forest, Mimi and Max saw many interesting things. They saw birds, squirrels, rabbits, and butterflies. They also saw a river, a waterfall, and a cave. They were having a lot of fun, until they came across a fork in the road.

One path led to the left, and the other led to the right.


In [28]:
text = '''"As he awaits a crucial progress report on Iraq, President Bush'''
result = generator(text, max_length=200)
print(result[0]['generated_text'])

"As he awaits a crucial progress report on Iraq, President Bush is facing a difficult decision," the article stated. "He must choose between the immediate threat of a terrorist attack and the long-term goal of a stable and prosperous Iraq."

The article went on to explain that the President had already made a decision to withdraw U.S. troops from Iraq. However, he was now considering whether to extend the troops' mission or withdraw them completely.

"The President's decision will have far-reaching consequences," the article continued. "It will determine the future of Iraq and the stability of the region."

The article provided a detailed analysis of the President's options. It discussed the potential risks and benefits of each choice, as well as the potential impact on the lives of the Iraqi people.

"The decision is not an easy one," the article concluded. "It requires careful consideration and a deep understanding of the complex issues at hand."

As the article


In [30]:
text = '''I saw this piece of news on the internet today:

"As he awaits a crucial progress report on Iraq, President Bush'''
result = generator(text, max_length=200)
print(result[0]['generated_text'])


I saw this piece of news on the internet today:

"As he awaits a crucial progress report on Iraq, President Bush is being urged to take a more aggressive approach to the war. The Bush administration has been under increasing pressure to show that it is making progress in Iraq, but the situation remains complex and uncertain. The Bush administration has been criticized for its lack of transparency and for not providing enough details about the progress being made. Many people are concerned that the war is not going as planned and that the situation in Iraq is getting worse.

I think it's important for the president to be honest with the public about the situation in Iraq. He needs to provide clear and detailed information about the progress being made and the challenges that still need to be overcome. The public needs to know that the war is not just about winning a conflict, but also about creating a stable and secure environment for the people of Iraq.

I hope that the president will 

In [2]:
# Initialize the pipeline
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-0.5B-Chat", torch_dtype=torch.bfloat16, trust_remote_code=True).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat", trust_remote_code=True)
generator = pipeline('text-generation', model=model, tokenizer=tokenizer , device=0)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

print(generator(text, max_length=200)[0]["generated_text"])

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Give me a short introduction to large language model.<|im_end|>
<|im_start|>assistant


A Large Language Model is a type of artificial intelligence that uses large amounts of text data and algorithms to generate human-like responses to questions or prompts. These models are designed to be able to understand natural language, which means they can recognize and respond to variations in language styles, contexts, and phrasing.

Large Language Models have become increasingly important in fields such as machine learning, deep learning, and natural language processing. They are used by many organizations to help automate tasks that involve handling large volumes of data and providing personalized assistance to users.

Overall, Large Language Models have the potential to revolutionize the way we interact with technology and communicate with one another. As the field continues to evolve, we can expect to see even more s

In [9]:
prompt = "As he awaits a crucial progress report on Iraq, President Bush"
messages = [
    {"role": "system", "content": "You are a helpful assistant. Your task is to continue the provided text."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

print(generator(text, max_length=200)[0]["generated_text"])

<|im_start|>system
You are a helpful assistant. Your task is to continue the provided text.<|im_end|>
<|im_start|>user
As he awaits a crucial progress report on Iraq, President Bush<|im_end|>
<|im_start|>assistant
searches through internet for information about the conflict and the situation in Iraq. He visits various news sources and forums, including The New York Times and CNN, to get a sense of what is happening in the country.
As he continues his search, he meets with top government officials and military personnel to discuss potential developments and challenges that may arise during the ongoing war. He also meets with local leaders and activists who are working to support their communities and fight against the conflict.
Throughout the process, President Bush remains focused on the needs of his country and its people, and his determination to bring peace and stability back to Iraq is something he is deeply committed to. As he sits in his office, surrounded by his staff and adviso

In [8]:
prompt = "As he awaits a crucial progress report on Iraq, President Bush"
messages = [
    {"role": "system", "content": "You are a helpful assistant. You write text in the style of Shakespeare. Your task is to continue the provided text with a Shakespearean style."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

print(generator(text, max_length=200)[0]["generated_text"])

<|im_start|>system
You are a helpful assistant. You write text in the style of Shakespeare. Your task is to continue the provided text with a Shakespearean style.<|im_end|>
<|im_start|>user
As he awaits a crucial progress report on Iraq, President Bush<|im_end|>
<|im_start|>assistant
attends his weekly meeting with fellow officials and advisors, intently scanning through the report's pages before nodding in approval as it becomes available. His eyes scan through each line, taking in all the details that need to be addressed for the report's success. As he moves on, he considers the potential implications of this report, considering the broader context in which it will impact the country.


In [10]:
prompt = "As he awaits a crucial progress report on Iraq, President Bush"
messages = [
    {"role": "system", "content": "You are a helpful assistant. You write text for the New York Times. Your task is to write a news article by continuing the provided text."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

print(generator(text, max_length=200)[0]["generated_text"])

<|im_start|>system
You are a helpful assistant. You write text for the New York Times. Your task is to write a news article by continuing the provided text.<|im_end|>
<|im_start|>user
As he awaits a crucial progress report on Iraq, President Bush<|im_end|>
<|im_start|>assistant
confronts a group of colleagues who have been tasked with preparing his final report to the US Congress.
Despite numerous obstacles and setbacks, President Bush remains determined to deliver a solid, unbiased report that addresses the challenges facing America in the region.
In an interview with ABC News, President Bush said that he expects the report will be well-received by Congress and the American people.
"We have to do everything we can to ensure that this report is accurate," President Bush said. "And if it doesn't meet our expectations, then we'll need to come up with new strategies."
The release of the report is expected to take several months, but President Bush has promised that he will make sure that 

In [15]:
prompt = "As he awaits a crucial progress report on Iraq, President Bush"
messages = [
    {"role": "system", "content": "You are a helpful assistant. You write text in the style of a high school student. Your task is to continue the provided text for an assignment."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

print(generator(text, max_length=200)[0]["generated_text"])

<|im_start|>system
You are a helpful assistant. You write text in the style of a high school student. Your task is to continue the provided text for an assignment.<|im_end|>
<|im_start|>user
As he awaits a crucial progress report on Iraq, President Bush<|im_end|>
<|im_start|>assistant
urged his team to focus and prioritize the task at hand. He knew that progress was key to maintaining stability and peace in the region, and that success in this area would help him build a strong legacy as the nation's leader.
The progress report provided valuable insights into the situation in Iraq, including details about the current state of the country, the military operations going on, and any potential challenges or threats that may arise. It also highlighted areas where improvements could be made to ensure the safety and security of the people of Iraq.
President Bush listened carefully to the progress report and used it as a basis for making decisions about how to proceed with the Iraq mission. He

## Try to detect if fake

In [20]:
config = RobertaConfig.from_pretrained('FacebookAI/roberta-base')
model_path = "best_model_phi2.pt"
state_dict = torch.load(model_path)
model = RobertaForSequenceClassification(config)
model.load_state_dict(state_dict)

tokenizer = RobertaTokenizer.from_pretrained('FacebookAI/roberta-base')
pipeline_classif_phi2 = pipeline('text-classification', model=model, tokenizer=tokenizer)

In [22]:
sample_text_fake = '''Story:

Once upon a time, there was a cat named Whiskers and a dog named Spot. They lived in a cozy house with their owner, Mr. Brown. Whiskers and Spot were very different from each other. Whiskers was independent and aloof, while Spot was loyal and friendly.

One day, Mr. Brown decided to take them on a trip to the park. He put them in a basket and drove them to the bus stop. Whiskers was nervous and hid under the basket, while Spot wagged his tail and barked happily.

When they arrived at the park, Mr. Brown let them out of the basket. Whiskers cautiously explored the grass and trees, while Spot ran around and played with other dogs. Mr. Brown threw a ball for Spot to fetch, and Spot brought it back to him with a big smile.'''
pipeline_classif_phi2(sample_text_fake)

[{'label': 'LABEL_1', 'score': 0.9982835054397583}]

In [23]:
sample_text_fake = '''Story:

Once upon a time, there was a cat named Whiskers and a dog named Spot. They lived in a cozy house with their owner, Mr. Brown. Whiskers and Spot were very different from each other. Whiskers was independent and aloof, while Spot was loyal and friendly.

One day, Mr. Brown decided to take them on a trip to the park. He put them in a basket and drove them to the bus stop. Whiskers was nervous and hid under the basket, while Spot wagged his tail and barked happily.

When they arrived at the park, Mr. Brown let them out of the basket. Whiskers cautiously explored the grass and trees, while Spot ran around and played with other dogs. Mr. Brown threw a ball for Spot to fetch, and Spot brought it back to him with a big smile.'''
pipeline_classif_phi2(sample_text_fake)

[{'label': 'LABEL_0', 'score': 0.6090961694717407}]