In [2]:
import ssl
import certifi
import aiohttp
import random
import logging
import asyncio
from aiohttp import ClientSession, ClientTimeout, ClientError
from tqdm import tqdm
import itertools

import numpy as np
import pandas as pd


ssl_context = ssl.create_default_context(cafile=certifi.where())

In [17]:
API_URL = "https://api-inference.huggingface.co/models/openai-community/gpt2"
API_TOKEN = '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'

In [4]:
# asynchronous functions for efficiently calling on LLM APIs with batching
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [5]:
async def request_with_retry_hf(session, url, headers, json, semaphore, retries=4, backoff_factor=3):
    attempt = 0
    while attempt < retries:
        async with semaphore:
            try:
                async with session.post(url, headers=headers, json=json) as response:
                    if response.status in [200, 201]:
                        return await response.json()
                    elif response.status == 429:
                        retry_after = int(response.headers.get("Retry-After", 60))
                        logging.warning(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
                    else:
                        raise RuntimeError(f"API returned a non-200 status code: {response.status}")
            except (ClientError, asyncio.TimeoutError) as e:
                logging.error(f"Request failed due to network error: {e}")
            sleep_time = backoff_factor ** attempt
            logging.info(f"Retrying in {sleep_time} seconds...")
            await asyncio.sleep(sleep_time)
            attempt += 1
            
    raise RuntimeError("Request failed after multiple retries.")

In [6]:
async def generate_text_async_hf(session, text, generation_params, semaphore):
    payload = {
        "inputs": text,
        "parameters": {**generation_params}
    }
    
    HEADERS = {"Authorization": f"Bearer {API_TOKEN}"}
    
    # Call the request_with_retry function to handle potential retries
    response_json = await request_with_retry_hf(session, API_URL, HEADERS, payload, semaphore)
    generated_text = response_json[0].get("generated_text", "No text generated")
    
    if "error" in response_json:
        raise RuntimeError(f"API returned an error: {response_json['error']}")
        
    return generated_text

In [7]:
async def run_batch(dataset, api_provider, num_generates, sleep_time, batch_size = 1):
    results_dict = {}
    semaphore = asyncio.BoundedSemaphore(128)
    timeout = ClientTimeout(total=60)

    #async with ClientSession(timeout=timeout) as session:
    async with ClientSession(timeout=timeout, connector=aiohttp.TCPConnector(ssl=ssl_context)) as session:
        for i in tqdm(range(0, num_generates, batch_size), desc="Processing batches"):
            
            generation_params = get_params()
            text_batch = dataset["assignment"] #15 samples
            
            if api_provider == "HF":
                tasks = [generate_text_async_hf(session, text, generation_params, semaphore) for text in text_batch]
            
            results_batch = await asyncio.gather(*tasks)
            
            for j in range(len(text_batch)):
                if text_batch[j] not in results_dict.keys():
                    results_dict[text_batch[j]] = [results_batch[j]]
                    
                else:
                    results_dict[text_batch[j]].append(results_batch[j])
            
            
            await asyncio.sleep(sleep_time)

    return results_dict

In [8]:
def get_params():
    temp_values = [0.7, 0.8, 0.9]
    top_k_values = [40, 80]
    top_p_values = [0.9, 0.95]
    max_new_token_values = [250, 300, 350]
    repetition_penalty_values = [1.0, 1.2]
    
    all_combinations = list(itertools.product(temp_values, top_k_values, top_p_values, max_new_token_values, repetition_penalty_values))
    index = np.random.randint(len(all_combinations), size=1)[0]
    
    
    combination = all_combinations[index]
    
    generation_params = dict(
        temperature=combination[0],
        top_k=combination[1],
        top_p=combination[2],
        max_new_tokens=combination[3],
        repetition_penalty=combination[4],
        do_sample=False,
        return_full_text=False,
        #seed=SEED,  # no seed, because we need randomness for self-consistency
        max_time=None, 
        stream=False,
        details=False,
        use_cache=False,
        wait_for_model=False,
    )
    
    return generation_params

In [9]:
persuade_df = pd.read_csv("persuade.csv")

In [10]:
persuade_df

Unnamed: 0,essay_id_comp,full_text,holistic_essay_score,word_count,prompt_name,task,assignment,source_text,gender,grade_level,ell_status,race_ethnicity,economically_disadvantaged,student_disability_status
0,423A1CA112E2,Phones\n\nModern humans today are always on th...,3,378,Phones and driving,Independent,Today the majority of humans own and operate c...,,M,,,Black/African American,,
1,BC75783F96E3,This essay will explain if drivers should or s...,4,432,Phones and driving,Independent,Today the majority of humans own and operate c...,,M,,,Black/African American,,
2,74C8BC7417DE,Driving while the use of cellular devices\n\nT...,2,179,Phones and driving,Independent,Today the majority of humans own and operate c...,,F,,,White,,
3,A8445CABFECE,Phones & Driving\n\nDrivers should not be able...,3,221,Phones and driving,Independent,Today the majority of humans own and operate c...,,M,,,Black/African American,,
4,6B4F7A0165B9,Cell Phone Operation While Driving\n\nThe abil...,4,334,Phones and driving,Independent,Today the majority of humans own and operate c...,,M,,,White,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25991,18409261F5C2,80% of Americans believe seeking multiple opin...,5,1050,Seeking multiple opinions,Independent,"When people ask for advice, they sometimes tal...",,M,8.0,No,Asian/Pacific Islander,Economically disadvantaged,Not identified as having disability
25992,D46BCB48440A,"When people ask for advice,they sometimes talk...",4,373,Seeking multiple opinions,Independent,"When people ask for advice, they sometimes tal...",,F,8.0,No,Black/African American,Economically disadvantaged,Not identified as having disability
25993,0FB0700DAF44,"During a group project, have you ever asked a ...",4,631,Seeking multiple opinions,Independent,"When people ask for advice, they sometimes tal...",,M,8.0,No,Asian/Pacific Islander,Not economically disadvantaged,Not identified as having disability
25994,D72CB1C11673,Making choices in life can be very difficult. ...,4,417,Seeking multiple opinions,Independent,"When people ask for advice, they sometimes tal...",,F,8.0,No,Black/African American,Economically disadvantaged,Not identified as having disability


In [11]:
unique_assignment = pd.DataFrame()
unique_assignment['assignment'] = persuade_df['assignment'].unique()

In [12]:
unique_assignment

Unnamed: 0,assignment
0,Today the majority of humans own and operate c...
1,Write an explanatory essay to inform fellow ci...
2,Some schools require students to complete summ...
3,"You have just read the article, 'A Cowboy Who ..."
4,Your principal has decided that all students m...
5,"In ""The Challenge of Exploring Venus,"" the aut..."
6,"In the article ""Making Mona Lisa Smile,"" the a..."
7,You have read the article 'Unmasking the Face ...
8,Some of your friends perform community service...
9,Your principal is considering changing school ...


In [13]:
for i in range(len(unique_assignment['assignment'])):
    text = 'Imagine that you are a high school student. '
    text += unique_assignment['assignment'].iloc[i]
    unique_assignment['assignment'].iloc[i] = text

In [18]:
# (dataset, api_provider, num_generates, sleep_time, batch_size = 1):
output_simple = await run_batch(unique_assignment, "HF", 2, 1)

Processing batches:   0%|                                                               | 0/2 [00:00<?, ?it/s]


RuntimeError: API returned a non-200 status code: 503

2024-04-20 16:03:22,084 - ERROR - Request failed due to network error: Connector is closed.
2024-04-20 16:03:22,085 - INFO - Retrying in 1 seconds...
2024-04-20 16:03:22,087 - ERROR - Request failed due to network error: Connector is closed.
2024-04-20 16:03:22,088 - INFO - Retrying in 1 seconds...
2024-04-20 16:03:22,093 - ERROR - Request failed due to network error: Connector is closed.
2024-04-20 16:03:22,094 - INFO - Retrying in 1 seconds...
2024-04-20 16:03:22,102 - ERROR - Request failed due to network error: Connector is closed.
2024-04-20 16:03:22,102 - INFO - Retrying in 1 seconds...
2024-04-20 16:03:22,103 - ERROR - Request failed due to network error: Connector is closed.
2024-04-20 16:03:22,104 - INFO - Retrying in 1 seconds...
2024-04-20 16:03:22,105 - ERROR - Request failed due to network error: Connector is closed.
2024-04-20 16:03:22,107 - INFO - Retrying in 1 seconds...
2024-04-20 16:03:22,108 - ERROR - Request failed due to network error: Connector is closed.
2024-04-

In [15]:
output_simple

{'Imagine that you are a high school student. Today the majority of humans own and operate cell phones on a daily basis. In essay form, explain if drivers should or should not be able to use cell phones in any capacity while operating a vehicle.': [" Do you think there should be stricter laws regarding texting or talking on the phone while driving?\n\nDrivers Should Not Be Able to Use Cell Phones While Operating a Vehicle\n\nTechnology has greatly improved the way humans interact and communicate with each other. Unfortunately, technology has also posed a significant risk and danger to human beings, especially when it comes to driving. As a result, drivers should not be able to use cell phones in any capacity while operating a vehicle due to the potential harm and danger it can bring to the driver and other road users.\n\nDistracted Driving\n\nOne reason drivers should not use cell phones while driving is that it distracts them from the primary task of driving. The Centers for Disease C

In [16]:
import pandas as pd
sample_ai = pd.DataFrame(columns = ['Question', 'Text'])

In [17]:
for key in output_simple.keys():
    for value in output_simple[key]:
        row = pd.Series([key, value], index = ['Question', 'Text'])
        sample_ai = pd.concat([sample_ai, pd.DataFrame([row])], ignore_index=True)

In [18]:
import re

def clean_text(text):
    return re.sub(r'\n+', ' ', text)

sample_ai['Text'] = sample_ai['Text'].apply(clean_text)

In [28]:
def filter_top_n_entries_per_question(df, n=2):
    df = df.sample(frac = 1)
    filtered_df = df.groupby('assignment').head(n)
    return filtered_df

filtered_df = filter_top_n_entries_per_question(persuade_df, n=2)
sample_student = pd.DataFrame()
sample_student['Question'] = filtered_df['assignment']
sample_student['Text'] = filtered_df['full_text']

In [29]:
sample_student['Text'] = sample_student['Text'].apply(clean_text)

In [30]:
sample_ai['LABEL'] = 0
sample_student['LABEL'] = 1

In [31]:
result = pd.concat([sample_ai, sample_student], axis=0)

In [32]:
result = result.sample(frac = 1)

In [33]:
result.shape

(60, 3)

In [34]:
result.to_csv('Generated_Data_highschool.csv')

## Generate Data - 300

In [None]:
# (dataset, api_provider, num_generates, sleep_time, batch_size = 1):
output_simple = await run_batch(unique_assignment, "HF", 20, 1)

In [None]:
ai_df = pd.DataFrame()

for key in output_simple.keys():
    for value in output_simple[key]:
        row = pd.Series([key, value], index = ['Question', 'Text'])
        ai_df = pd.concat([ai_df, pd.DataFrame([row])], ignore_index=True)
        
ai_df['Text'] = ai_df['Text'].apply(clean_text)

In [None]:
def filter_top_n_entries_per_question(df, n):
    filtered_df = df.groupby('assignment').head(n)
    return filtered_df

filtered_df = filter_top_n_entries_per_question(persuade_df, n=20)
student_df = pd.DataFrame()
student_df['Question'] = filtered_df['assignment']
student_df['Text'] = filtered_df['full_text']
student_df['Text'] = student_df['Text'].apply(clean_text)

In [None]:
len(student_df)

In [None]:
ai_df['LABEL'] = 0
student_df['LABEL'] = 1

In [None]:
result_df = pd.concat([ai_df, student_df], axis=0)
result_df = result_df.sample(frac = 1)

In [None]:
result_df

In [None]:
result_df.to_csv('mistral_data.csv')

# Sample Generate

In [None]:
import requests

def generate_text(inputs, generation_params):
    payload = {
        "inputs": inputs, 
        "parameters": {**generation_params}
    }
    response = requests.post(
        API_URL, 
        headers = {"Authorization": f"Bearer {API_TOKEN}"},
        json=payload
    )
    return response.json()

In [None]:
generated_params = dict(
                top_p = 0.9,
                top_k = 40,
                temperature = 0.8,
                max_new_tokens = 300,
                repetition_penalty = 1.0,
                use_cache = False,
                return_full_text = False
            )

In [None]:
inputs = 'Today the majority of humans own and operate cell phones on a daily basis. In essay form, explain if drivers should or should not be able to use cell phones in any capacity while operating a vehicle.'

In [None]:
inputs

In [None]:
generate_text(inputs, generated_params)

In [None]:
txt = "\n\nCell phones have become a necessity in today's fast-paced world. However, their use while driving has become a topic of debate due to the numerous accidents caused by distracted driving. While some argue that drivers should be allowed to use cell phones in certain capacities, others believe that their use while driving should be completely prohibited. In my opinion, drivers should not be able to use cell phones in any capacity while operating a vehicle.\n\nFirstly, using a cell phone while driving is a significant distraction that can lead to accidents. According to the National Highway Traffic Safety Administration, distracted driving claimed 3,142 lives in 2019. Cell phone use while driving is a form of distracted driving that takes the driver's attention away from the road, increasing the risk of accidents.\n\nSecondly, using a cell phone while driving impairs the driver's ability to react to changing road conditions. Studies have shown that drivers who use cell phones while driving have slower reaction times, making it difficult for them to avoid accidents. Moreover, using a cell phone while driving can also lead to tunnel vision, where the driver's peripheral vision is compromised, making it difficult to see other vehicles, pedestrians, or obstacles on the road.\n\nThirdly, using a hands-free device while driving is not a safe alternative. While some argue that hands-free devices are safer than hand"

In [None]:
txt = clean_text(txt)

In [None]:
txt

## Generate - 3000

In [27]:
output_simple = await run_batch(unique_assignment, "HF", 30, 1)

Processing batches:  40%|█████████████████████▏                               | 12/30 [02:48<04:12, 14.04s/it]


RuntimeError: API returned a non-200 status code: 502

2024-04-12 23:14:30,647 - ERROR - Request failed due to network error: [Errno 1] [SSL: APPLICATION_DATA_AFTER_CLOSE_NOTIFY] application data after close notify (_ssl.c:2758)
2024-04-12 23:14:30,649 - INFO - Retrying in 1 seconds...
2024-04-12 23:14:31,159 - ERROR - Request failed due to network error: [Errno 1] [SSL: APPLICATION_DATA_AFTER_CLOSE_NOTIFY] application data after close notify (_ssl.c:2758)
2024-04-12 23:14:31,161 - INFO - Retrying in 1 seconds...
2024-04-12 23:14:31,774 - ERROR - Request failed due to network error: [Errno 1] [SSL: APPLICATION_DATA_AFTER_CLOSE_NOTIFY] application data after close notify (_ssl.c:2758)
2024-04-12 23:14:31,776 - INFO - Retrying in 1 seconds...
2024-04-12 23:14:31,824 - ERROR - Request failed due to network error: [Errno 1] [SSL: APPLICATION_DATA_AFTER_CLOSE_NOTIFY] application data after close notify (_ssl.c:2758)
2024-04-12 23:14:31,826 - INFO - Retrying in 1 seconds...
2024-04-12 23:14:32,294 - ERROR - Request failed due to network error: [

In [13]:
len(output_simple)

NameError: name 'output_simple' is not defined

## Paraphrase

In [2]:
from paraphraser import paraphrase

txt = "Cell phones have become a necessity in today's fast-paced world. However, their use while driving has become a topic of debate due to the numerous accidents caused by distracted driving. While some argue that drivers should be allowed to use cell phones in certain capacities, others believe that their use while driving should be completely prohibited. In my opinion, drivers should not be able to use cell phones in any capacity while operating a vehicle. Firstly, using a cell phone while driving is a significant distraction that can lead to accidents. According to the National Highway Traffic Safety Administration, distracted driving claimed 3,142 lives in 2019. Cell phone use while driving is a form of distracted driving that takes the driver's attention away from the road, increasing the risk of accidents. Secondly, using a cell phone while driving impairs the driver's ability to react to changing road conditions. Studies have shown that drivers who use cell phones while driving have slower reaction times, making it difficult for them to avoid accidents. Moreover, using a cell phone while driving can also lead to tunnel vision, where the driver's peripheral vision is compromised, making it difficult to see other vehicles, pedestrians, or obstacles on the road. Thirdly, using a hands-free device while driving is not a safe alternative. While some argue that hands-free devices are safer than hand"
sent = paraphrase([txt])

print(txt)
print(sent)

AttributeError: 'NoneType' object has no attribute 'group'

## ChatGPT

In [4]:
f = open("ML_GPT_GEN_Essays.txt", "r")
f.readline()
f.readline()

'    "Today the majority of humans own and operate cell phones on a daily basis. In essay form, explain if drivers should or should not be able to use cell phones in any capacity while operating a vehicle.": [\n'