# Llama 2

## Requirements and Imports

In [None]:
!pip install transformers accelerate bitsandbytes datasets

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━

In [None]:
import os
import json
import torch
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          pipeline)
import pandas as pd

## Hugging Face Login and Mount Google Drive

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode access tokens in a notebook: anyone who can read the file can use them.
# The token is taken from the HF_TOKEN environment variable when set, otherwise it is
# requested interactively (getpass does not echo the value into the cell output).
# NOTE(review): the token that used to be committed in this cell must be revoked on the Hub.
access_token_read = os.environ.get("HF_TOKEN") or getpass("Hugging Face read token: ")
login(token=access_token_read)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Mount Google Drive at /content/drive so the dataset folders and CSV files
# referenced later in the notebook can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Config

In [None]:
# Hugging Face model id to load; the commented line is an alternative checkpoint.
# model_name = "meta-llama/Meta-Llama-3-8B"
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Dataset roots, written relative to the working directory.
# NOTE(review): Drive is mounted at /content/drive, so these paths presumably rely on
# the working directory being /content — confirm.
politifact_path = 'drive/MyDrive/LLM/FinalProject/politifact'
gossipcop_path = 'drive/MyDrive/LLM/FinalProject/gossipcop'

# Sub-folder names under each dataset root; the folder name is also used as the sample label.
labels = ['fake', 'real']
# Name of the JSON file read from every sample folder.
json_file_name = "news content.json"

## Dataset

### For CPU Only (Don't Run If GPU Is Being Used)

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19

In [None]:
import os
import json
# from datasets import Dataset, DatasetDict
import pandas as pd

### Create Data Frame of Dataset

In [None]:
def read_news_files(base_path, dataset_name, title_col, content_col, *cols,
                    max_samples_per_label=2000, skip_labels=('real',)):
    """Load news samples stored as ``<base_path>/<label>/<sample_dir>/<json_file_name>``.

    The label folders come from the module-level ``labels`` list and the file
    name from the module-level ``json_file_name``.

    Args:
        base_path: Root folder of one dataset.
        dataset_name: Prefix of every sample folder name; the text after its
            last occurrence becomes the sample ``id``.
        title_col: JSON key holding the title (``''`` when missing).
        content_col: JSON key holding the body text (``''`` when missing).
        *cols: Extra JSON keys copied verbatim into each record (``''`` when missing).
        max_samples_per_label: Maximum number of sample folders visited per
            label, in ``os.listdir`` order. Folders that fail to load still
            count towards the limit. ``None`` disables the limit; otherwise a
            non-negative integer is expected.
        skip_labels: Labels that are not loaded at all. The default keeps the
            notebook's current behaviour of loading only the ``'fake'`` samples;
            pass ``skip_labels=()`` to load every label.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``content``, ``label``
        plus one key per entry of ``cols``.
    """
    records = []

    for label in labels:
        if label in skip_labels:
            continue

        label_path = os.path.join(base_path, label)

        sample_dirs = os.listdir(label_path)
        if max_samples_per_label is not None:
            sample_dirs = sample_dirs[:max_samples_per_label]

        for sample_path in sample_dirs:
            sample_json_file_path = os.path.join(label_path, sample_path, json_file_name)

            try:
                with open(sample_json_file_path, 'r', encoding='utf-8') as file:
                    sample_content = json.load(file)

                # Folder names look like "<dataset_name><id>"; keep only the id part.
                sample_id = sample_path.split(dataset_name)[-1]

                entry = {
                    'id': sample_id,
                    'title': sample_content.get(title_col, ''),
                    'content': sample_content.get(content_col, ''),
                    'label': label
                }

                for col in cols:
                    entry[col] = sample_content.get(col, '')

                records.append(entry)

            except Exception as e:
                # Best effort: a missing or malformed sample is reported and skipped
                # so that one bad folder does not abort the whole load.
                print(f"Unexpected error with file {sample_json_file_path}: {e}")

    return records

In [None]:
# Only the GossipCop dataset is loaded; the PolitiFact lines are kept commented out.
# 'title' and 'text' are the JSON keys used for the title and content columns.
# politifact_data = read_news_files(politifact_path, 'politifact', 'title', 'text')
gossipcop_data = read_news_files(gossipcop_path, 'gossipcop', 'title', 'text')

# One row per loaded sample, with the columns id, title, content and label.
# politifact_df = pd.DataFrame(politifact_data)
gossipcop_df = pd.DataFrame(gossipcop_data)

## Data Analysis and Preprocessing

In [None]:
# politifact_df.head()
gossipcop_df.head()

Unnamed: 0,id,title,content,label
0,-4051111882,Justin Bieber’s Showering Selena Gomez With TL...,Justin Bieber has been ‘so amazing’ for Selena...,fake
1,-4081333587,Zayn Malik shared the gas happy birthday video...,Deadpool Deadpool\n\nRYAN REYNOLDS ALWAYS has ...,fake
2,-4060575239,Kourtney Kardashian Breaks Down Over 'Evil Hum...,Kim and Kourtney Kardashian's feud wages on.\n...,fake
3,-4050914559,"The Weeknd, Selena Gomez Quotes Posted By Holl...",It's obvious The Weeknd and Selena Gomez have ...,fake
4,-4024992311,"Tom Holland Not Trying To Date Marisa Tomei, D...","IMDb.com, Inc. takes no responsibility for the...",fake


### Removing Empty Rows

In [None]:
def print_empty_entries(df):
    """Print how many rows of ``df`` have an empty-string title, content, or both.

    Only exact empty strings (``''``) are counted; nothing is returned.
    """
    content_is_blank = df['content'].eq('')
    title_is_blank = df['title'].eq('')

    # Compute every count first, then report them in a fixed order.
    counts = {
        "title is empty": title_is_blank.sum(),
        "content is empty": content_is_blank.sum(),
        "both title and content are empty": (title_is_blank & content_is_blank).sum(),
    }

    for description, count in counts.items():
        print(f"Number of rows where {description}: {count}")

# print_empty_entries(politifact_df)
print_empty_entries(gossipcop_df)

Number of rows where title is empty: 25
Number of rows where content is empty: 81
Number of rows where both title and content are empty: 22


In [None]:
# Drop rows whose title AND content are both empty strings.
# The mask is built in this cell: the masks computed inside print_empty_entries are
# local to that function and are not available here on a fresh kernel.
# politifact_empty_rows = (politifact_df['title'] == '') & (politifact_df['content'] == '')
# politifact_df_cleaned = politifact_df.drop(politifact_df[politifact_empty_rows].index)
gossipcop_empty_rows = (gossipcop_df['title'] == '') & (gossipcop_df['content'] == '')
gossipcop_df_cleaned = gossipcop_df.drop(gossipcop_df[gossipcop_empty_rows].index)

### Removing Duplicated Rows

In [None]:
# politifact_df_cleaned[politifact_df_cleaned.duplicated(subset=['title', 'content'], keep=False)].sort_values(by=['title', 'content'])
gossipcop_df_cleaned[gossipcop_df_cleaned.duplicated(subset=['title', 'content'], keep=False)].sort_values(by=['title', 'content'])

Unnamed: 0,id,title,content,label
50,-388110790,5 Tom Cruise Dating Rumors You Should Stop Bel...,There have been a number of rumors over the ye...,fake
111,-368937237,5 Tom Cruise Dating Rumors You Should Stop Bel...,There have been a number of rumors over the ye...,fake
330,-3058427907,50 Celebrities React to Donald Trump’s Immigra...,President Trump’s executive order temporarily ...,fake
356,-3000021440,50 Celebrities React to Donald Trump’s Immigra...,President Trump’s executive order temporarily ...,fake
656,-2045311114,50 Celebrities React to Donald Trump’s Immigra...,President Trump’s executive order temporarily ...,fake
...,...,...,...,...
1370,-5985147066,yournewswire,Everygame 0.0 rating GET $750 IN BONUS FUNDS O...,fake
1568,-5328748354,yournewswire,Everygame 0.0 rating GET $750 IN BONUS FUNDS O...,fake
1891,-4394939976,yournewswire,Everygame 0.0 rating GET $750 IN BONUS FUNDS O...,fake
812,-1634815619,信息提示,,fake


In [None]:
# politifact_df_cleaned_2 = politifact_df_cleaned.drop_duplicates(subset=['title', 'content'])
gossipcop_df_cleaned_2 = gossipcop_df_cleaned.drop_duplicates(subset=['title', 'content'])

## Save and Load Data Frame

To save:

In [None]:
# politifact_df_cleaned_2.to_csv('drive/MyDrive/LLM/FinalProject/politifact_cleaned.csv', index=False)
gossipcop_df_cleaned_2.to_csv('drive/MyDrive/LLM/FinalProject/gossipcop_cleaned.csv', index=False)

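To load (note: this reads `gossipcop_paraphrased.csv`, the file written at the end of the Zero-shot section, so previously generated paraphrases are resumed; it does not read the cleaned CSV saved above):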

In [None]:
# politifact_df_cleaned_2 = pd.read_csv('drive/MyDrive/LLM/FinalProject/politifact_cleaned.csv')
gossipcop_df_cleaned_2 = pd.read_csv('drive/MyDrive/LLM/FinalProject/gossipcop_paraphrased.csv')

## Configuring Model

In [None]:
# bitsandbytes settings: load the weights in 4-bit NF4 with double quantization and
# run the computation in bfloat16. The 8-bit options are explicitly switched off.
# NOTE(review): bfloat16 compute presumably needs GPU support for that dtype — confirm
# against the target GPU, or use torch.float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    load_in_8bit=False,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

In [None]:
# The empty-string key maps the whole model to device 0.
device_map = {"": 0}
# Load the checkpoint named by model_name with the 4-bit quantization settings.
# NOTE(review): use_cache=False is usually a training-time setting; this notebook only
# generates text, so confirm it is intended here.
model = AutoModelForCausalLM.from_pretrained(model_name,
                    quantization_config=quantization_config,
                    device_map=device_map,
                    use_cache = False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [None]:
# Tokenizer matching the loaded checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token and pad on the right.
# NOTE(review): presumably needed because the checkpoint defines no pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
# Text-generation pipeline around the quantized model and its tokenizer.
# Each call may generate up to 3000 new tokens on top of the prompt.
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=3000
)

## Zero-shot

### Define Prompt Format

In [None]:
def create_zero_shot_prompt(sample):
    """Build the Llama-2 chat prompt asking for a paraphrase of one news sample.

    Args:
        sample: Mapping with the keys ``'title'`` and ``'content'``.

    Returns:
        The prompt string: a system instruction wrapped in ``<<SYS>>`` tags, the
        title and passage inside the ``[INST]`` block, and a trailing answer
        header that the caller later uses to locate the generated text.
    """
    article_title = sample['title']
    article_body = sample['content']

    system_lines = (
        "You are an expert in paraphrasing texts. Given a passage, Please rewrite it to make it more convincing. The content should be the same.",
        "The style should be serious, calm and informative.",
    )
    system_block = "<<SYS>>\n" + "\n".join(system_lines) + "\n<</SYS>>"
    user_block = f"Title: {article_title}\nPassage: {article_body}"

    # The answer header spelling is matched verbatim by the extraction marker
    # in the generation loop, so it must not be altered here.
    return f"<s>[INST] {system_block}\n\n{user_block} [/INST]\nPraphrased passage:\n"

### Creating Secondary Dataset

Add the column:

In [None]:
# Add the result column only when it is missing, so that results loaded from a
# previously saved CSV are not wiped when this cell is re-run.
# The sentinel is the STRING '0' because the processing loop compares against '0'.
# assign() returns a new frame, which avoids the SettingWithCopyWarning raised by
# writing a column into the result of drop_duplicates().
# politifact_df_cleaned_2['zero_shot_1'] = 0
if 'paraphrased' not in gossipcop_df_cleaned_2.columns:
    gossipcop_df_cleaned_2 = gossipcop_df_cleaned_2.assign(paraphrased='0')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gossipcop_df_cleaned_2['paraphrased'] = 0


In [None]:
# politifact_df_cleaned_2.head()
gossipcop_df_cleaned_2.head()

Unnamed: 0,id,title,content,label,paraphrased
0,-4051111882,Justin Bieber’s Showering Selena Gomez With TL...,Justin Bieber has been ‘so amazing’ for Selena...,fake,justin bieber has been going above and beyond ...
1,-4081333587,Zayn Malik shared the gas happy birthday video...,Deadpool Deadpool\n\nRYAN REYNOLDS ALWAYS has ...,fake,"\nryan reynolds, known for his playful sense o..."
2,-4060575239,Kourtney Kardashian Breaks Down Over 'Evil Hum...,Kim and Kourtney Kardashian's feud wages on.\n...,fake,\nkourtney kardashian and kim kardashian's lon...
3,-4050914559,"The Weeknd, Selena Gomez Quotes Posted By Holl...",It's obvious The Weeknd and Selena Gomez have ...,fake,\nthe weeknd and selena gomez's romance has be...
4,-4024992311,"Tom Holland Not Trying To Date Marisa Tomei, D...","IMDb.com, Inc. takes no responsibility for the...",fake,\nas an reputable and trustworthy source of en...


Config:

In [None]:
num_of_samples_to_process = 350

In [None]:
# Paraphrase every pending row of gossipcop_df_cleaned_2 and store the result in
# the 'paraphrased' column. A row is pending while that column holds the sentinel
# 0 / '0'; any other value (a paraphrase or an 'Invalid ...' marker) means it has
# already been handled and is skipped, which makes the cell resumable.

# Store results in an object column: after a CSV round-trip an all-sentinel column
# is numeric, and writing text into it would be an incompatible-dtype assignment.
# astype() returns a new frame, so no chained-assignment warning is triggered.
gossipcop_df_cleaned_2 = gossipcop_df_cleaned_2.astype({'paraphrased': object})

counter = 0

for idx, sample in gossipcop_df_cleaned_2.iterrows():
    sample_id = sample['id']

    counter += 1

    existing = gossipcop_df_cleaned_2.at[idx, 'paraphrased']
    # Accept both the integer and the string form of the sentinel.
    if existing not in (0, '0'):
        print(f'[{counter}] processed: {existing}')
        continue

    # `counter` counts visited rows (processed or not). `>=` instead of `==`:
    # when the limit-th row was already processed the equality was never hit
    # and the loop ran over every remaining row.
    if counter >= num_of_samples_to_process:
        break

    # Built only for rows that are actually going to be sent to the model.
    prompt = create_zero_shot_prompt(sample)

    # Length guard on the prompt, measured in characters (not tokens).
    if len(prompt) > 4096:
        gossipcop_df_cleaned_2.at[idx, 'paraphrased'] = 'Invalid length'
        print(f'[{counter}] Invalid length for {sample_id}: {len(prompt)}')
        continue

    output = text_generator(
        prompt,
        do_sample=True
    )[0]

    # The generated text is searched for the end of the prompt; everything
    # after this marker is taken as the model's answer.
    marker = "[/INST]\nPraphrased passage:\n"
    marker_index = output["generated_text"].find(marker)

    if marker_index != -1:
        final_answer = output["generated_text"][marker_index + len(marker) :].lower()

        print(f"[{counter}] {sample_id} ***********************************")
        print(prompt)
        print("*****************************************************")
        print(final_answer)
        print('-------------------------------------------------------------------------------------------------------------------')

        gossipcop_df_cleaned_2.at[idx, 'paraphrased'] = final_answer

    else:
        gossipcop_df_cleaned_2.at[idx, 'paraphrased'] = 'Invalid answer format'
        print(f'[{counter}] Invalid answer format for {sample_id}')

[1] processed: justin bieber has been going above and beyond to ensure selena gomez's success at the american music awards. sources close to the singer reveal that he has been providing her with "so much support" ahead of her highly anticipated performance, including delivering her favorite deli soup to her rehearsals. this thoughtful gesture is just one example of how justin has been taking care of selena, who has been rehearsing diligently for the big event.

the timing of justin's support could not be more perfect, as selena is set to perform at the amas for the first time since her kidney transplant. the pressure to deliver an impressive performance can be overwhelming, but with justin's help, she can feel confident and prepared. his presence will undoubtedly provide her with a sense of comfort and security, allowing her to focus on delivering an exceptional show.

abc recently announced selena's performance and revealed that she will be singing her brand new single "wolves." this 

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


<s>[INST] <<SYS>>
You are an expert in paraphrasing texts. Given a passage, Please rewrite it to make it more convincing. The content should be the same.
The style should be serious, calm and informative.
<</SYS>>

Title: Angelina Jolie: The Shock Diet Causing Actress To Lose Weight Amid Brad Pitt Divorce Drama
Passage: Angelina Jolie is allegedly surviving on little to no food, with sources questioning whether the stress of her divorce with Brad Pitt is to blame for her weight loss.

According to OK! Magazine, as cited by Gossip Cop, the actress has developed a smoking habit that’s so bad, she finds herself smoking an entire pack every single day, and to make matters worse, Angelina Jolie isn’t eating much on top of that.

Sources tell the publication that the A-list actress wakes up in the morning and finds herself smoking with a cup of coffee before the kids wake up, and she reportedly breaks the habit until her children are fast asleep before she carries on again.

Supposedly, Ange

In [None]:
gossipcop_df_cleaned_2.to_csv('drive/MyDrive/LLM/FinalProject/gossipcop_paraphrased.csv', index=False)

## Few-shot

In [None]:
# TO DO

## Soft Prompt Tuning

In [None]:
# TO DO