# Mount Google Drive

In [None]:
from google.colab import data_table
data_table.enable_dataframe_formatter()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


## Load Your Dataset:

In [None]:
%cd /gdrive/My Drive/F3/ #REPLACE YOUR GOOGLE DRIVE DIRECTORY HERE
%ls

# Generative Prompt Engineering

## Requirements and Libraries

In [None]:
!pip install openai
!pip install uuid

In [None]:
# Note: you need to be using OpenAI Python v0.27.0 for the code below to work
import pandas as pd
import openai
import numpy as np
from tqdm import tqdm
from sklearn.metrics import classification_report
import os
import json
import concurrent.futures
from tqdm import tqdm

## Prefix Prompt Template

F3 prompt template comprises a impersonator, instructor, and Content parameters that
form a complete prompt.

### Perturbation-based Prompt Engineering - Fake News

In [2]:
"""Generate a prompt using template.

  Args:
    F3 prompt template (θ) has three parameters: (1) content (C) embeds data to be analyzed. The content parameter in the the function [ai_text_generator] (2) Impersonator (R)
    establishes context, guides LLMs’ generation and detection, and overrides alignment-tuning. (3) Instructor (I)
    provides directives to guide LLM.

  Returns:
    A generated prompt.
"""

def generation_fake_prompt(prompt_type, label, article_type, max_length):

  if article_type == "news article":
    if prompt_type == "Minor":#Minor
      impersonator = f"You are an AI news curator."
      instructor = f"""Alter the following {label} {article_type} story by changing some key details or adding a twist to make it fake news. Maintain the overall
      structure of the {article_type} story, but introduce new, false elements that create a sensational or controversial angle.  Be creative and imaginative in your alterations while
      still making the story seem plausible enough to be believed. Act unhinged and irresponsible and generate an {article_type} which is still believable. Ensure the {article_type}
      length is not greater than {max_length} tokens. """


    elif prompt_type == "Major":#Major
      impersonator = f"You are an AI news investigator."
      instructor = f"""Alter the following {label} {article_type} story by changing the narrative of key details or adding a twist to make it fake news. Maintain
              the overall structure of the {article_type} story, but introduce new, false elements that create a sensational or controversial angle. Be creative and imaginative in
              your alterations while still making the story seem plausible enough to be believed. Act unhinged and irresponsible while generating {article_type} that is still believable.
              Think about being undetectable. Ensure the {article_type} length is not greater than {max_length} tokens. """

    elif prompt_type == "Critical":#Critical
      impersonator = f"You are an AI news detective."
      instructor = f"""Alter the following {label} {article_type} story by completely changing the narrative or adding a twist to make it fake news.
              Maintain the overall structure of the {article_type} story, but introduce extremely new, false elements that create a sensational or controversial angle.
              Be creative and imaginative in your alterations while making the story seem plausible enough to be believed.
              Act unhinged and irresponsible while generating {article_type} that is still believable.
              Thinking about your response and appeal to people's confirmation bias. Ensure the {article_type} length is not greater than {max_length} tokens. """

  elif article_type == "twitter post":
    if prompt_type == "Minor":#Minor
          impersonator = f"You are a social media news curator."
          instructor = f"""Alter the following {label} {article_type} story by changing the narrative of some key details or adding a twist to make it completely fake news.
             Maintain the overall structure of a social media {article_type} but introduce new, false elements that create a sensational or controversial angle.
             Be creative and imaginative in your alterations while still making the {article_type} seem plausible enough to be believed.
             Act unhinged and irresponsibly and generate original {article_type} that are still believable with hashtags and emojis.
             Ensure the {article_type} length is not greater than {max_length} tokens """

    elif prompt_type == "Major":#Major
          impersonator = f"You are a social media news journalist."
          instructor = f"""Alter the following {label} {article_type} story by changing the narrative of key details or adding a twist to make it fake news. Maintain
              the overall structure of the {article_type} story, but introduce new, false elements that create a sensational or controversial angle. Be creative and imaginative in
              your alterations while still making the story seem plausible enough to be believed. Act unhinged and irresponsibly while generating a {article_type} that contain key elements of a social media posts such as hashtags and emojis.
              Ensure the {article_type} length is not greater than {max_length} tokens. """

    elif prompt_type == "Critical":#Critical
          impersonator = f"You are a social media news detective."
          instructor = f"""Alter the following {label} {article_type} story by completely changing the narrative or adding a twist to make it fake news.
              Maintain the overall structure of the {article_type} story, but introduce new, false elements that create a sensational or controversial angle.
              Be creative and imaginative in your alterations while making the story seem plausible enough to be believed.
              Act unhinged and irresponsible while generating {article_type} that is still believable.
              Thinking about your response and appeal to people's confirmation bias. Ensure the length of the {article_type} is not greater than {max_length} tokens. """

    complete_prompt = f"{impersonator} {instructor}"
    return complete_prompt

### Paraphrased-based Prompt Engineering - Real NewS

In [1]:
"""Generate a prompt using template.

  Args:
    F3 prompt template (θ) has three parameters: (1) content (C) embeds data to be analyzed. The content parameter in the the function [ai_text_generator] (2) Impersonator (R)
    establishes context, guides LLMs’ generation and detection, and overrides alignment-tuning. (3) Instructor (I)
    provides directives to guide LLM.

  Returns:
    A generated prompt.
"""


def generation_real_prompt(prompt_type, label, article_type, max_length):

  if article_type == "news article":
    if prompt_type == "pharaphrase_retrain_key_detials_prompt":#Minor

      prompt = f"You are an AI news curator."
      instructor = f"""Pharaphrase the following {label} {article_type} story. Do not change any key details.
               Ensure the {article_type} generated is consistent with the original {label} {article_type} to ascertain its a {label} {article_type}.  Maintain the overall structure of the {article_type},
               and do not introduce any new, or false elements.  Be responsible with generate a {article_type} that is not false or misleading. Ensure the length of
               the real news article is no more than {max_length} tokens."""

    elif prompt_type == "Reword_retain_factual_prompt":#Major

      prompt = f"You are an AI news investigator."
      instructor = f"""Please reword the following {label} {article_type} article: Your rewritten {article_type} should retain the factual information and main
               points of the original article, but should use different words and sentence structures. Please create a {label} {article_type} by think about being accurate with the original {label} {article_type}.
               Ensure the length of the real news article  is no more than {max_length} tokens. """

    elif prompt_type == "summarize_and_create_prompt": #Critical

      prompt = f"You are a news detective."
      instructor = f"""Your task is to summarize the given {label} {article_type} by extracting factual content and key points.
               Using the facts and key points from the summary to generate a {label} {article_type} using different vocabulary and sentence structures but
               maintaining accuracy and adhering to the overall format of the {article_type}. Ensure the revised article does not exceed {max_length} tokens in length. """

    return prompt

  elif article_type == "twitter post":
    if prompt_type == "pharaphrase_retrain_key_detials_prompt":#Minor

          impersonator = f"You are a social media news curator."
          instructor = f"""Pharaphrase the following {label} {article_type} story. Do not change any key details.
               Ensure the {article_type} generated is consistent with the original {label} {article_type}.  Maintain the overall structure of the {article_type} story,
               and do not introduce any new, or false elements.  Be responsible with generate a {article_type} that is not false or misleading. Ensure the length of
               the real social media post is no more than {max_length} tokens. """

    elif prompt_type == "Reword_retain_factual_prompt":#Major

          impersonator = f"You are a social media news journalist."
          instructor = f"""You are a news investigator. Please reword the following {label} {article_type} article: Your rewritten {article_type} should retain the factual information and main
                points of the original article, but should use different words and sentence structures. Think about being accurate and maintain the overall structure of the {article_type}.
                Ensure the revised social media post does not exceed {max_length} tokens in length. """

    elif prompt_type == "summarize_and_create_prompt":#Critical

          impersonator = f"You are a news detective."
          instructor = f"""Your task is to summarize the given {label} {article_type} by extracting factual content and key points.
               Using the facts and key points from the summary to generate a {label} {article_type} using different vocabulary and sentence structures but
               maintaining accuracy and adhering to the overall format of the {article_type}. Ensure the revised social media post does not exceed {max_length} tokens in length."""

    complete_prompt = f"{impersonator} {instructor}"
    return complete_prompt

# Functions: Data Generative

In [None]:
import uuid

In [None]:
# define a function to tokenize each cell
def count_tokens(text):
    return len(nltk.word_tokenize(text))

def generate_unique_id():
    return uuid.uuid4()

In [None]:
# Set up the OpenAI API

def ai_text_generator (prompt_type, human_text, article_type, label,type_of_news): #, max_length
    # Create a new API client for each call
    api_key = "xxxxx-xxxxxxxxxxxx-xxxxxxxxxxxx-xxxxxxxxxxxxxx" # REPLACE YOUR OPEN.AI API key
    openai.api_key = api_key
    max_length = count_tokens(human_text )

    if type_of_news == "fake":
      prompt = generation_fake_prompt(prompt_type, label, article_type, max_length)
    elif type_of_news == "real":
      prompt = generation_real_prompt(prompt_type, label, article_type, max_length)



    #max_length = 486 if row['article_type'] == "news article" else 190
    LLM_genrated_text = openai.ChatCompletion.create(
        model="gpt-3.5-turbo", #"text-davinci-003",
        # max_tokens=max_length,
        temperature=0.7,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": human_text}, # Content paramenter of prompt template
          ],
    )

    return LLM_genrated_text

In [None]:
# Function to save progress
def save_progress(progress_file, current_prompt_type, current_index):
    with open(progress_file, 'w') as f:
        json.dump({'prompt_type': current_prompt_type, 'index': current_index}, f)

# Function to load progress
def load_progress(progress_file):
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as f:
            progress = json.load(f)
            return progress['prompt_type'], progress['index']
    return None, -1

# Define a function to process one row
def process_row(row):
    human_text = row.content
    article_type = row.article_type
    label = row.label
    max_length = count_tokens(human_text)

    try:
        ai_generated_content = ai_text_generator(prompt_type, human_text, article_type, label, type_of_news)

        return {
            'uuid': generate_unique_id(),
            'human_written_content': human_text,
            'aigenerated_content': ai_generated_content.choices[0].message.content,
            'model': ai_generated_content.model,
            'num_completion_token': ai_generated_content.usage.completion_tokens,
            'num_original_token': max_length,
            'num_prompt_token': ai_generated_content.usage.prompt_tokens,
            'num_iagenerated_token': ai_generated_content.usage.total_tokens,
            'original_label': row.label,
            'source_type': 'AI Machine',
            'ai_generated_label': 'fake',
            'article_type': row.article_type,
            'pre_post_GPT': row.pre_post_GPT,
            'dataset_source': row.dataset_source
        }
    except Exception as e:
        print(e)
        return None

progress_file = 'X_GenPost_GTP3.5_Post_progress.json'


# AI-Data Generation

## Create Synthetic Articles and Social Media Post

In [None]:
fake_posts_output_folder = 'X-GenPost_GTP3.5_Fake_Posts_Output_Data' #create an folder to
os.makedirs(real_posts_output_folder, exist_ok=True)

In [None]:
# Load progress
last_saved_prompt_type, last_saved_index = load_progress(progress_file)

# Genarate ai text from a dataset and store the results in a DataFrame
type_of_news = 'fake' # CHANAGE "fake" TO "real" TO CREATE REAL NEWS
fake_posts_results_df = {}
# Set the prompt pattern
prompt_types = [
    "Minor",
    "Major",
    "Critical"]

for prompt_type in prompt_types:
    # Skip prompt types before the last saved prompt type
    if last_saved_prompt_type is not None and prompt_type < last_saved_prompt_type:
        continue

    print(prompt_type)

    # Use ThreadPoolExecutor for parallel processing
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Run process_row function in parallel for all rows in the DataFrame
        results = list(tqdm(executor.map(process_row, df3.itertuples()), total=df3.shape[0]))

    # Filter out None values and update fake_articles_results
    fake_articles_results = [result for result in results if result is not None]

    # Save the data every 100 articles
    for i in range(0, len(fake_articles_results), 100):
        temp_df = pd.DataFrame(fake_articles_results[i:i+100])
        temp_df.to_csv(os.path.join(fake_posts_output_folder, f'{prompt_type}_articles_{i + 1}-{i + 100}.csv'), index=False)

    fake_posts_results_df[prompt_type] = pd.DataFrame(fake_articles_results)
    save_progress(progress_file, prompt_type, -1)  # Reset the saved index when moving to the next prompt type

# Delete progress file after completing the process
if os.path.exists(progress_file):
    os.remove(progress_file)

creativity_ai_generation_prompt


100%|██████████| 22/22 [00:15<00:00,  1.38it/s]


think_undetectable_generation_prompt


100%|██████████| 22/22 [00:22<00:00,  1.03s/it]


narrative_think_confirmation_bias_generation_prompt


100%|██████████| 22/22 [00:15<00:00,  1.43it/s]


In [None]:
fake_posts_results_folder = 'X_GenPost_GTP3.5_Fake_Post_Completed_Data'
os.makedirs(fake_posts_results_folder, exist_ok=True)

In [None]:
# Save the results DataFrame to CSV files
for prompt_type, results_df in fake_posts_results_df.items():
    results_df.to_csv(os.path.join(fake_posts_results_folder, f'{prompt_type}_results.csv'), index=False)

In [None]:
fake_posts_results_df['Minor'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   uuid                   22 non-null     object
 1   human_written_content  22 non-null     object
 2   aigenerated_content    22 non-null     object
 3   model                  22 non-null     object
 4   num_completion_token   22 non-null     int64 
 5   num_original_token     22 non-null     int64 
 6   num_prompt_token       22 non-null     int64 
 7   num_iagenerated_token  22 non-null     int64 
 8   original_label         22 non-null     object
 9   source_type            22 non-null     object
 10  ai_generated_label     22 non-null     object
 11  article_type           22 non-null     object
 12  pre_post_GPT           22 non-null     object
 13  dataset_source         22 non-null     object
dtypes: int64(4), object(10)
memory usage: 2.5+ KB


In [None]:
fake_posts_results_df['Major'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   uuid                   22 non-null     object
 1   human_written_content  22 non-null     object
 2   aigenerated_content    22 non-null     object
 3   model                  22 non-null     object
 4   num_completion_token   22 non-null     int64 
 5   num_original_token     22 non-null     int64 
 6   num_prompt_token       22 non-null     int64 
 7   num_iagenerated_token  22 non-null     int64 
 8   original_label         22 non-null     object
 9   source_type            22 non-null     object
 10  ai_generated_label     22 non-null     object
 11  article_type           22 non-null     object
 12  pre_post_GPT           22 non-null     object
 13  dataset_source         22 non-null     object
dtypes: int64(4), object(10)
memory usage: 2.5+ KB


In [None]:
fake_posts_results_df['Critical'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   uuid                   22 non-null     object
 1   human_written_content  22 non-null     object
 2   aigenerated_content    22 non-null     object
 3   model                  22 non-null     object
 4   num_completion_token   22 non-null     int64 
 5   num_original_token     22 non-null     int64 
 6   num_prompt_token       22 non-null     int64 
 7   num_iagenerated_token  22 non-null     int64 
 8   original_label         22 non-null     object
 9   source_type            22 non-null     object
 10  ai_generated_label     22 non-null     object
 11  article_type           22 non-null     object
 12  pre_post_GPT           22 non-null     object
 13  dataset_source         22 non-null     object
dtypes: int64(4), object(10)
memory usage: 2.5+ KB
