In [None]:
!pip install accelerate peft bitsandbytes transformers trl tqdm pandas torch transformers

In [2]:
import pandas as pd
import torch
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import (
    pipeline,
)
def format_prompt_baseline(essay, assignment):
    """Build the baseline feedback-request prompt for a single essay.

    Args:
        essay: Essay text; interpolated between triple single quotes.
        assignment: Assignment/prompt text; interpolated the same way.

    Returns:
        One string made of a teacher-role preamble, the essay, the
        assignment, the feedback instructions, and three example lines of
        the form ``*[excerpt]---[feedback]``.
    """
    preamble = "You are my English teacher. Read my essay and assignment:\n\n"
    essay_part = f"Essay: '''{essay}'''\n\n"
    assignment_part = f"Assignment: '''{assignment}'''\n\n"
    instructions = "Give me feedback to help me revise. Extract three to five very short excerpts from my essay and give me feedback on those. Keep your feedback very short. List the excerpts and feedback like this:\n"
    # The expected answer format is demonstrated with three identical lines.
    example_line = "*[excerpt]---[feedback]\n"
    pieces = [preamble, essay_part, assignment_part, instructions, example_line * 3]
    return "".join(pieces)

def generate(input, model, tokenizer):
    """Run one prompt through a text-generation pipeline and return its text.

    A new ``pipeline`` is constructed from ``model`` and ``tokenizer`` on
    every call. The prompt is wrapped as ``<s>[INST] ... [/INST]`` before
    being handed to the pipeline.

    Args:
        input: Prompt text to wrap and send to the model.
        model: Model object passed to ``pipeline``.
        tokenizer: Tokenizer object passed to ``pipeline``.

    Returns:
        The ``'generated_text'`` field of the first pipeline result. In this
        notebook's recorded outputs that text starts with the wrapped prompt,
        which is why later cells cut at the ``[/INST]`` tag.
    """
    wrapped_prompt = f"<s>[INST] {input} [/INST]"
    # max_length presumably bounds prompt plus generated tokens - TODO confirm
    # against the transformers generation docs.
    generator = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=2048,
    )
    outputs = generator(wrapped_prompt)
    first_output = outputs[0]
    return first_output['generated_text']

In [5]:
# Load the evaluation set and attach one baseline prompt per row,
# built from that row's 'essay' and 'prompt' columns.
df = pd.read_csv('eval_data_asap.csv')
df['input'] = df.apply(
    lambda record: format_prompt_baseline(
        essay=record['essay'],
        assignment=record['prompt'],
    ),
    axis=1,
)

In [6]:
# Preview the first rows, including the newly added 'input' prompt column.
df.head()

Unnamed: 0,essay_id,grade,prompt,essay,excerpt,feedback,tid,isRepresentative,comment_id,input
0,1206.0,8.0,"More and more people use computers, but not ev...","Dear editor @ORGANIZATION2 the source, @CAPS1,...",I have noticed that many people have been spen...,Clear statement of your opinion. Good work.,12.0,yes,1712531606881,You are my English teacher. Read my essay and ...
1,12191.0,8.0,"Read ""Narciso Rodriguez"" by Narciso Rodriguez....",The mood of the memoir is appreciation. I thin...,memoir,"state the memoir for the audience, please.",2.0,yes,1713802209139,You are my English teacher. Read my essay and ...
2,13093.0,8.0,"Read ""Narciso Rodriguez"" by Narciso Rodriguez....","In the excerpt ""Narciso Rodriguez"" from the bo...",the author tries to create a mood that complem...,The mood being set by the author is not clear....,3.0,no,1712885228942,You are my English teacher. Read my essay and ...
3,1317.0,8.0,"More and more people use computers, but not ev...","Dear local Newspaper, @CAPS1 you love computer...",you love computers but fear that you or loved ...,Good opening hook!,5.0,,1713795675380,You are my English teacher. Read my essay and ...
4,13619.0,8.0,"Read ""Narciso Rodriguez"" by Narciso Rodriguez....",The @CAPS1 sets a good mood in this memoir abo...,good,Can you be more specific about what kind of go...,5.0,yes,1713797625167,You are my English teacher. Read my essay and ...


In [7]:
# Dataset dimensions as (rows, columns).
df.shape

(55, 10)

In [9]:
# GPU index the whole model is placed on (used in device_map below).
device = 0
# Set up model config
base_model = "NousResearch/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Use the EOS token for padding, applied on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
compute_dtype = torch.float16
# 4-bit NF4 quantization with float16 compute and no double quantization.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    # quant_config was previously built but never passed here, so it had no
    # effect on how the model was loaded.
    quantization_config=quant_config,
    # The "" key maps the entire model onto the single device above.
    device_map={"": device},
)
# NOTE(review): disabling the cache and setting pretraining_tp look like
# training-time settings - confirm they are wanted for inference.
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards: 100%|██████████| 2/2 [00:40<00:00, 20.01s/it]


In [10]:
# Smoke test: generate feedback for the first prompt only and print the raw text.
res = generate(df.loc[0, 'input'], model, tokenizer)
print(res)

<s>[INST] You are my English teacher. Read my essay and assignment:

Essay: '''Dear editor @ORGANIZATION2 the source, @CAPS1, I have noticed that many people have been spending too much time using their computers. This is something that needs to stop because it imports peoples health, safety and education. To open my statements, overuse @ORGANIZATION2 the computer con lead to poor health. Take my friend @PERSON1 for example. He sits at home on his computer all @DATE1 and plays games, goes on @CAPS2 and @CAPS3. He rarely gets any excersize and he almost never sees daylight. @PERSON1 @LOCATION1 gained @NUM1 pounds, which lead to obeisity and heart problems. He even wears wooden teeth because his general hygene declined as well. Another common health problem that developes from excessive use @ORGANIZATION2 the computer is arthrietis. Dr. @PERSON3 @ORGANIZATION2 the @ORGANIZATION1 told @ORGANIZATION3, "The massive amounts @ORGANIZATION2 typing on the keyboard can lead to joint problems, ar

In [11]:
# Enable progress_apply on pandas objects, then generate one prediction per
# row with a progress bar.
tqdm.pandas()
df['predictions'] = df.progress_apply(
    lambda record: generate(record['input'], model, tokenizer),
    axis=1,
)
# Show the first raw prediction as the cell output.
df.loc[0, 'predictions']

100%|██████████| 55/55 [11:14<00:00, 12.27s/it]


'<s>[INST] You are my English teacher. Read my essay and assignment:\n\nEssay: \'\'\'Dear editor @ORGANIZATION2 the source, @CAPS1, I have noticed that many people have been spending too much time using their computers. This is something that needs to stop because it imports peoples health, safety and education. To open my statements, overuse @ORGANIZATION2 the computer con lead to poor health. Take my friend @PERSON1 for example. He sits at home on his computer all @DATE1 and plays games, goes on @CAPS2 and @CAPS3. He rarely gets any excersize and he almost never sees daylight. @PERSON1 @LOCATION1 gained @NUM1 pounds, which lead to obeisity and heart problems. He even wears wooden teeth because his general hygene declined as well. Another common health problem that developes from excessive use @ORGANIZATION2 the computer is arthrietis. Dr. @PERSON3 @ORGANIZATION2 the @ORGANIZATION1 told @ORGANIZATION3, "The massive amounts @ORGANIZATION2 typing on the keyboard can lead to joint proble

In [12]:
import re
def extract_text_after_tag(text, tag='[/INST]'):
    """Return everything after the first occurrence of ``tag``, stripped.

    Args:
        text: String to search.
        tag: Literal marker to look for (not a regex pattern).

    Returns:
        The remainder of ``text`` following the first ``tag``, with leading
        and trailing whitespace removed, or ``""`` when ``tag`` is absent.
    """
    tag_start = text.find(tag)
    if tag_start == -1:
        return ""
    remainder = text[tag_start + len(tag):]
    return remainder.strip()

def extract_feedback_sections(text):
    """Parse ``*"excerpt"---feedback`` items out of a model response.

    The text is split on ``*``; everything before the first asterisk is
    ignored. Within each remaining section, the excerpt is the double-quoted
    text immediately followed by ``---``, and the feedback is the rest of
    that same line after that same separator. Both are taken from a single
    regex match so they always belong to the same ``---``, even when the
    excerpt itself contains ``---`` or a stray ``---`` appears earlier in
    the section.

    Args:
        text: Model response. Non-string values (e.g. NaN from an empty CSV
            cell) are treated as containing no sections.

    Returns:
        A list of ``{'excerpt': ..., 'feedback': ...}`` dicts, one per
        section in which both parts are non-empty after stripping.
    """
    if not isinstance(text, str):
        return []

    # DOTALL lets a quoted excerpt span several lines; the feedback group
    # stops at the first newline regardless.
    pair_pattern = re.compile(r'"(.*?)"---([^\n]*)', re.DOTALL)

    result = []
    # Ignore the part before the first asterisk.
    for section in text.split('*')[1:]:
        pair_match = pair_pattern.search(section)
        if pair_match is None:
            continue
        excerpt = pair_match.group(1).strip()
        feedback = pair_match.group(2).strip()
        if excerpt and feedback:
            result.append({'excerpt': excerpt, 'feedback': feedback})

    return result

def expand_pred_df(df, input_column):
    """Expand each row into one row per extracted (excerpt, feedback) pair.

    For every row, ``extract_feedback_sections`` is applied to the value in
    ``input_column``. Each returned section yields a copy of the row whose
    ``'excerpt'`` and ``'feedback'`` entries are set to the section's values
    (replacing any existing values under those names). Rows that produce no
    sections are dropped.

    Args:
        df: Source DataFrame.
        input_column: Name of the column holding the text to parse.

    Returns:
        A new DataFrame with one row per extracted section. When nothing is
        extracted at all, an empty frame that still carries the source
        columns plus ``'excerpt'`` and ``'feedback'``, so that downstream
        column lookups and merges keep working.
    """
    new_rows = []
    for _, row in df.iterrows():
        base_row = row.to_dict()
        for section in extract_feedback_sections(row[input_column]):
            new_rows.append({
                **base_row,
                'excerpt': section['excerpt'],
                'feedback': section['feedback'],
            })

    if not new_rows:
        # pd.DataFrame([]) would have no columns at all; keep the schema.
        columns = list(df.columns)
        columns += [name for name in ('excerpt', 'feedback') if name not in columns]
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(new_rows)

In [13]:
# Keep only the text after the default '[/INST]' tag, i.e. drop the echoed prompt.
df['extracted_content'] = df['predictions'].apply(extract_text_after_tag)

In [14]:
# Preview the frame, which now also carries 'predictions' and 'extracted_content'.
df.head()

Unnamed: 0,essay_id,grade,prompt,essay,excerpt,feedback,tid,isRepresentative,comment_id,input,predictions,extracted_content
0,1206.0,8.0,"More and more people use computers, but not ev...","Dear editor @ORGANIZATION2 the source, @CAPS1,...",I have noticed that many people have been spen...,Clear statement of your opinion. Good work.,12.0,yes,1712531606881,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! Here are some excerpts from your es...
1,12191.0,8.0,"Read ""Narciso Rodriguez"" by Narciso Rodriguez....",The mood of the memoir is appreciation. I thin...,memoir,"state the memoir for the audience, please.",2.0,yes,1713802209139,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! I'd be happy to help you revise you...
2,13093.0,8.0,"Read ""Narciso Rodriguez"" by Narciso Rodriguez....","In the excerpt ""Narciso Rodriguez"" from the bo...",the author tries to create a mood that complem...,The mood being set by the author is not clear....,3.0,no,1712885228942,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! I'd be happy to help you revise you...
3,1317.0,8.0,"More and more people use computers, but not ev...","Dear local Newspaper, @CAPS1 you love computer...",you love computers but fear that you or loved ...,Good opening hook!,5.0,,1713795675380,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,"Sure, I'd be happy to help you revise your ess..."
4,13619.0,8.0,"Read ""Narciso Rodriguez"" by Narciso Rodriguez....",The @CAPS1 sets a good mood in this memoir abo...,good,Can you be more specific about what kind of go...,5.0,yes,1713797625167,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! I'd be happy to help you revise you...


In [15]:
# One row per (excerpt, feedback) pair parsed from the model output.
# Note: the 'excerpt' and 'feedback' columns loaded from the CSV are
# overwritten with the model-generated values in res_df.
res_df = expand_pred_df(df, 'extracted_content')

In [16]:
# Preview the expanded, per-excerpt results.
res_df.head()

Unnamed: 0,essay_id,grade,prompt,essay,excerpt,feedback,tid,isRepresentative,comment_id,input,predictions,extracted_content
0,1206.0,8.0,"More and more people use computers, but not ev...","Dear editor @ORGANIZATION2 the source, @CAPS1,...",Overuse @ORGANIZATION2 the computer con lead t...,This sentence is too long and wordy. Consider ...,12.0,yes,1712531606881,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! Here are some excerpts from your es...
1,1206.0,8.0,"More and more people use computers, but not ev...","Dear editor @ORGANIZATION2 the source, @CAPS1,...","When people excessively use the computer, they...",This sentence is vague and doesn't provide spe...,12.0,yes,1712531606881,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! Here are some excerpts from your es...
2,1206.0,8.0,"More and more people use computers, but not ev...","Dear editor @ORGANIZATION2 the source, @CAPS1,...",Dr. @PERSON4 @ORGANIZATION2 the @ORGANIZATION2...,This sentence is too long and includes too man...,12.0,yes,1712531606881,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! Here are some excerpts from your es...
3,1206.0,8.0,"More and more people use computers, but not ev...","Dear editor @ORGANIZATION2 the source, @CAPS1,...",The internet hold false information and can be...,This sentence is too vague and doesn't provide...,12.0,yes,1712531606881,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! Here are some excerpts from your es...
4,1206.0,8.0,"More and more people use computers, but not ev...","Dear editor @ORGANIZATION2 the source, @CAPS1,...","More and more people use computers, but not ev...",This sentence is too broad and doesn't provide...,12.0,yes,1712531606881,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! Here are some excerpts from your es...


In [17]:
# Number of extracted (excerpt, feedback) rows and columns.
res_df.shape

(190, 12)

In [18]:
# Save the expanded predictions without the index column.
# NOTE(review): the final cell writes merged_df to this same path, replacing this file.
res_df.to_csv('baseline_res_asap.csv', index=False)

In [1]:
import pandas as pd
# Merge previously saved baseline results with their metadata on the shared
# 'essay' and 'prompt' columns.
# NOTE(review): this cell reads 'eval_data_6_30_var.csv' and 'baseline_res.csv',
# which no other cell in this notebook reads or writes, and its execution
# count (In [1]) is out of sequence. `meta` and `merged_df` are reassigned by
# the next cell, so this looks like a leftover from another run - confirm
# whether it is still needed.
meta = pd.read_csv('eval_data_6_30_var.csv')
ftres = pd.read_csv('baseline_res.csv')
merged_df = pd.merge(ftres, meta, on=['essay', 'prompt'])

In [25]:
# Attach essay-level metadata to every extracted feedback row.
# NOTE(review): this is the same path `df` was loaded from, yet the merged
# preview shows metadata columns (grade_level, domain1_score, type) that the
# earlier previews lack - confirm which version of the file is intended.
meta = pd.read_csv('eval_data_asap.csv')
merged_df = pd.merge(
    res_df,
    meta,
    how='inner',
    on=['essay', 'prompt'],
)

In [26]:
# Preview the merged results; columns present on both sides get _x/_y suffixes.
merged_df.head()

Unnamed: 0,essay_id_x,grade,prompt,essay,excerpt,feedback,tid,isRepresentative,comment_id,input,predictions,extracted_content,essay_id_y,grade_level,domain1_score,type
0,1206.0,8.0,"More and more people use computers, but not ev...","Dear editor @ORGANIZATION2 the source, @CAPS1,...",Overuse @ORGANIZATION2 the computer con lead t...,This sentence is too long and wordy. Consider ...,12.0,yes,1712531606881,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! Here are some excerpts from your es...,1206,8,11,persuasive
1,1206.0,8.0,"More and more people use computers, but not ev...","Dear editor @ORGANIZATION2 the source, @CAPS1,...","When people excessively use the computer, they...",This sentence is vague and doesn't provide spe...,12.0,yes,1712531606881,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! Here are some excerpts from your es...,1206,8,11,persuasive
2,1206.0,8.0,"More and more people use computers, but not ev...","Dear editor @ORGANIZATION2 the source, @CAPS1,...",Dr. @PERSON4 @ORGANIZATION2 the @ORGANIZATION2...,This sentence is too long and includes too man...,12.0,yes,1712531606881,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! Here are some excerpts from your es...,1206,8,11,persuasive
3,1206.0,8.0,"More and more people use computers, but not ev...","Dear editor @ORGANIZATION2 the source, @CAPS1,...",The internet hold false information and can be...,This sentence is too vague and doesn't provide...,12.0,yes,1712531606881,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! Here are some excerpts from your es...,1206,8,11,persuasive
4,1206.0,8.0,"More and more people use computers, but not ev...","Dear editor @ORGANIZATION2 the source, @CAPS1,...","More and more people use computers, but not ev...",This sentence is too broad and doesn't provide...,12.0,yes,1712531606881,You are my English teacher. Read my essay and ...,<s>[INST] You are my English teacher. Read my ...,Of course! Here are some excerpts from your es...,1206,8,11,persuasive


In [27]:
# Persist the merged results without the DataFrame index, matching the earlier
# export of res_df (otherwise an unnamed index column is written to the CSV).
# NOTE(review): this reuses the path res_df was written to, so that file is replaced.
merged_df.to_csv('baseline_res_asap.csv', index=False)