In [None]:
!pip install together
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
os.environ["TOGETHER_API_KEY"] = "f92beca4df59af2b0bd5f0c8fd60c5e3690f01e008fd834fd2e471310862167b"

import pandas as pd
from datasets import load_dataset
import re
import numpy as np
import torch
from together import Together

In [None]:
# Read the prompt / story / writer table from disk, then show a preview of the
# first rows followed by the total number of rows.
df = pd.read_csv("gpt-writing-prompts-llama.csv")
print(df.head(), len(df), sep="\n")


                                              prompt  \
0  You 've finally managed to discover the secret...   
1  You 've finally managed to discover the secret...   
2  The moon is actually a giant egg , and it has ...   
3  The moon is actually a giant egg , and it has ...   
4  For years in your youth the same imaginary cha...   

                                               story writer  
0  I sat nervously in the room , draped almost al...  human  
1  I stared at the business card in my hand, my m...  llama  
2  Sadie 's window was n't very large , no more t...  human  
3  The night sky was ablaze with an otherworldly ...  llama  
4  “ No , no no no ... ” She backed up and turned...  human  
150


In [None]:
# Evaluation rubric: maps each category name to its list of yes/no questions.
# There are four categories with seven questions each (28 questions in total).
# evaluate_story() sends the question found at the same list position in every
# category together in one request, so the order inside each list matters.
# The strings are sent verbatim to the model; edit them with care.
questions = {
    # Fluency questions.
    "Fluency":["Does the manipulation of time in terms of compression or stretching feel appropriate and balanced?",
    "Does the story demonstrate fluency and insight in balancing scenes with summary and exposition?",
    "Does the story make sophisticated use of idioms, metaphors or literary allusions?",
    "Does the end of the story feel natural and earned, as opposed to arbitrary or abrupt?",
    "Do the various elements of the story flow seamlessly together, creating a unified, engaging, and satisfying whole?",
    "Is the pacing of the story effective, ensuring that no part feels rushed or drawn out?",
    "Are transitions between scenes smooth and coherent?"],
    # Flexibility questions.
    "Flexibility":["Does the story provide diverse perspectives, and if there are unlikeable characters, are their perspectives presented convincingly and accurately?",
    "Does the story maintain a dynamic balance between interiority and exteriority, allowing for emotional flexibility and depth?",
    "Does the story contain turns that are both surprising and fitting?",
    "Are different stylistic choices used effectively without feeling forced?",
    "Does the story explore multiple themes or ideas in a way that deepens its complexity?",
    "Are character responses and emotions varied, rather than predictable or one-dimensional?",
    "Does the story incorporate different narrative techniques such as action, description and dialogue to enhance storytelling?"],
    # Elaboration questions.
    "Elaboration":[
    "Does the writer make the fictional world believable at the sensory level?",
    "Does each character in the story feel developed at the appropriate complexity level, ensuring that no character feels like they are present simply to satisfy a plot requirement?",
    "Does the story develop multiple layers of meaning, balancing surface narrative with rich subtext?",
    "Are important moments in the story fully realized, rather than rushed or underdeveloped?",
    "Does the story provide rich, specific details that enhance immersion and engagement?",
    "Are the relationships between characters complex and well-developed rather than simplistic?",
    "Does the dialogue feel natural and reveal depth about the characters?"],
    # Originality questions.
    "Originality":[
    "Will an average reader of this story obtain a unique and original idea from reading it?",
    "Is the story an original piece of writing without any cliches?",
    "Does the story show originality in its form?",
    "Does the narrative structure deviate from conventional storytelling in a meaningful way?",
    "Are the characters’ motivations and arcs fresh and distinct rather than predictable?",
    "Does the story introduce or explore ideas that feel thought-provoking and unconventional?",
    "Does the story introduce unexpected yet meaningful elements that distinguish it from similar works?"]
}



# Total number of questions across every category of the rubric.
length = sum(len(category) for category in questions.values())

# Column layout of the results table: "Text" and "Source" first, followed by a
# (reasoning, yes/no) pair of columns for each question number 0..length-1.
columns = ["Text", "Source"] + [
    f"q{i}_{suffix}"
    for i in range(length)
    for suffix in ("reasoning", "yes_no")
]

# Empty frame that evaluate_story() appends one row to per evaluated story.
results = pd.DataFrame(columns=columns)
print(results.columns)

Index(['Text', 'Source', 'q0_reasoning', 'q0_yes_no', 'q1_reasoning',
       'q1_yes_no', 'q2_reasoning', 'q2_yes_no', 'q3_reasoning', 'q3_yes_no',
       'q4_reasoning', 'q4_yes_no', 'q5_reasoning', 'q5_yes_no',
       'q6_reasoning', 'q6_yes_no', 'q7_reasoning', 'q7_yes_no',
       'q8_reasoning', 'q8_yes_no', 'q9_reasoning', 'q9_yes_no',
       'q10_reasoning', 'q10_yes_no', 'q11_reasoning', 'q11_yes_no',
       'q12_reasoning', 'q12_yes_no', 'q13_reasoning', 'q13_yes_no',
       'q14_reasoning', 'q14_yes_no', 'q15_reasoning', 'q15_yes_no',
       'q16_reasoning', 'q16_yes_no', 'q17_reasoning', 'q17_yes_no',
       'q18_reasoning', 'q18_yes_no', 'q19_reasoning', 'q19_yes_no',
       'q20_reasoning', 'q20_yes_no', 'q21_reasoning', 'q21_yes_no',
       'q22_reasoning', 'q22_yes_no', 'q23_reasoning', 'q23_yes_no',
       'q24_reasoning', 'q24_yes_no', 'q25_reasoning', 'q25_yes_no',
       'q26_reasoning', 'q26_yes_no', 'q27_reasoning', 'q27_yes_no'],
      dtype='object')


In [None]:
# Model used as the judge. Swap in the commented-out line to run against the
# free model while testing.
#model = "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free" #Free model on together for testing
model = "deepseek-ai/DeepSeek-V3" #Real model

# Together client built with no explicit arguments.
# NOTE(review): presumably it picks up TOGETHER_API_KEY from the environment - confirm.
client = Together()

In [None]:
def _label(name):
    """Return a regex fragment matching ``Name:``, ``**Name**:`` or ``**Name:**``."""
    return rf"\*{{0,2}}{name}\*{{0,2}}\s*:\s*\*{{0,2}}\s*"


# Splits a response into one chunk per "Question:" entry.
_QUESTION_LABEL = re.compile(_label("Question"), re.IGNORECASE)
# Parses one chunk: <question text> Reasoning: <reasoning> Answer: <yes|no>.
# The trailing \b stops "no" from matching the start of words such as "None".
_ENTRY = re.compile(
    r"(.*?)\s*" + _label("Reasoning") + r"(.*?)\s*" + _label("Answer") + r"(yes|no)\b",
    re.IGNORECASE | re.DOTALL,
)


def _normalise(text):
    """Reduce text to lowercase alphanumeric words so echoed questions compare equal."""
    return re.sub(r"[^a-z0-9]+", " ", text.lower()).strip()


def _question_batches(question_bank):
    """Yield one batch per list position as a list of (column_index, question) pairs.

    The column index numbers the questions in flattened category order (all
    questions of the first category, then the second, ...), matching the
    q0..qN columns of the results table.
    """
    offsets = {}
    running_total = 0
    for name, category in question_bank.items():
        offsets[name] = running_total
        running_total += len(category)

    depth = max((len(category) for category in question_bank.values()), default=0)
    for position in range(depth):
        batch = [
            (offsets[name] + position, category[position])
            for name, category in question_bank.items()
            if position < len(category)
        ]
        if batch:
            yield batch


def _build_prompt(text, batch_questions, reason):
    """Build the user prompt asking the model about one batch of questions."""
    if reason:
        instructions = (
            "Provide your answers in the following structured format without italics or bold:\n"
            "Question: <Question>\nReasoning: <Reasoning>\nAnswer: <yes/no>\n"
            "Read the following short story and answer the questions below using the exact format provided with Questions, Reasonings and an Answer. Include all three categories for all responses with the format below. "
            "For each question, provide a two or three sentence reasoning for your answer, "
            "Then, for answer, pick either 'yes' or a 'no'. Only pick yes or no even if your answer is somewhat in the middle. Pick the option it is closer to.\n\n"
        )
    else:
        instructions = (
            "Provide your answers in the following structured format without italics or bold and include the Reasoning: None:\n"
            "Question: <Question>\nReasoning: None\nAnswer: <yes/no>\n"
            "Read the following short story and answer the questions below using the exact format provided with Questions, Reasonings and an Answer. Include all three categories for all responses. "
            "For each question, provide a one word 'yes' or 'no' answer.\n\n"
        )
    question_list = "\n".join(f"- {q}" for q in batch_questions)
    return f"{instructions}Story: {text}\n\nQuestions:\n{question_list}\n\n"


def _parse_answers(answer_text, expected_questions):
    """Return one (reasoning, yes_no) tuple, or None, per expected question.

    Entries are matched to questions by the question text the model echoes
    back. If the text cannot be matched, the entry at the same position is used,
    but only when the model returned exactly one entry per question.
    """
    entries = []
    # The first split element is whatever precedes the first "Question:" label.
    for chunk in _QUESTION_LABEL.split(answer_text)[1:]:
        found = _ENTRY.match(chunk)
        if found:
            echoed, reasoning, yes_no = found.groups()
            entries.append((_normalise(echoed), reasoning.strip(), yes_no.lower()))

    by_text = {echoed: (reasoning, yes_no) for echoed, reasoning, yes_no in entries}
    positional_ok = len(entries) == len(expected_questions)

    parsed = []
    for position, question in enumerate(expected_questions):
        hit = by_text.get(_normalise(question))
        if hit is None and positional_ok:
            hit = entries[position][1:]
        parsed.append(hit)
    return parsed


def evaluate_story(index, text, source, reason=False):
    """Evaluate one story with the LLM judge and append the answers to `results`.

    The questions in the module-level `questions` dict are asked in batches:
    one request per list position, containing the question at that position
    from every category. Each answer is stored under the column number of its
    question in flattened category order (``q{n}_reasoning`` / ``q{n}_yes_no``).

    Args:
        index: Identifier of the story; stored in the "Text" column (the story
            text itself is not stored).
        text: Full story text inserted into the prompt.
        source: Author label of the story; stored in the "Source" column.
        reason: When True the model is asked for a short reasoning per
            question; otherwise it is asked to answer with "Reasoning: None".

    Returns:
        The dict that was appended as a new row of `results`. Questions whose
        answer could not be parsed have None in both of their columns.

    Note:
        Relies on the module-level `client`, `model`, `questions` and
        `results`. Requests use temperature 0.7, so repeated runs can differ.
    """
    row = {"Text": index, "Source": source}

    for batch in _question_batches(questions):
        batch_questions = [question for _, question in batch]

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a literature critic analyzing short stories."},
                {"role": "user", "content": _build_prompt(text, batch_questions, reason)}
            ],
            temperature=0.7
        )

        # Guard against an empty completion so parsing reports it instead of crashing.
        answer_text = (response.choices[0].message.content or "").strip()
        parsed = _parse_answers(answer_text, batch_questions)

        missing = [column_idx for (column_idx, _), hit in zip(batch, parsed) if hit is None]
        if missing:
            print(f"Could not parse answers for question(s) {missing} of story {index}; raw response:")
            print(answer_text)

        for (column_idx, _), hit in zip(batch, parsed):
            reasoning, yes_no = hit if hit is not None else (None, None)
            row[f"q{column_idx}_reasoning"] = reasoning
            row[f"q{column_idx}_yes_no"] = yes_no

    results.loc[len(results)] = row  # Append row to the DataFrame
    return row

In [None]:
# Stories whose dataframe index is below REASONING_CUTOFF are evaluated with a
# written reasoning per question; all later stories get yes/no answers only.
REASONING_CUTOFF = 20
# A checkpoint is written every CHECKPOINT_EVERY evaluated stories.
CHECKPOINT_EVERY = 4
CHECKPOINT_PATH = "results3.csv"

# Stories that already have a row in `results` are skipped, so re-running this
# cell (or resuming after an interruption) neither duplicates rows nor repeats
# API calls.
already_done = set(results["Text"])

for idx, row in df.iterrows():
    if idx in already_done:
        continue
    evaluate_story(idx, row["story"], row["writer"], reason=idx < REASONING_CUTOFF)
    print(f"Completed story {idx} ({len(results)} evaluated so far).")
    if len(results) % CHECKPOINT_EVERY == 0:
        # index=False keeps the checkpoint in the same layout as the final save.
        results.to_csv(CHECKPOINT_PATH, index=False)
        print("SAVED")


Completed 0 stories.
SAVED
Completed 1 stories.
Completed 2 stories.
Completed 3 stories.
Completed 4 stories.
SAVED
Completed 5 stories.
Completed 6 stories.
Completed 7 stories.
Completed 8 stories.
SAVED
Completed 9 stories.
Completed 10 stories.
Completed 11 stories.
Completed 12 stories.
SAVED
Completed 13 stories.
Completed 14 stories.
Completed 15 stories.
Completed 16 stories.
SAVED
Completed 17 stories.
Completed 18 stories.
Completed 19 stories.
Completed 20 stories.
SAVED
Completed 21 stories.
Completed 22 stories.
Completed 23 stories.
Completed 24 stories.
SAVED
Completed 25 stories.
Completed 26 stories.
Completed 27 stories.
Completed 28 stories.
SAVED
Completed 29 stories.
Completed 30 stories.
Completed 31 stories.
Completed 32 stories.
SAVED
Completed 33 stories.
Completed 34 stories.
Completed 35 stories.
Completed 36 stories.
SAVED
Completed 37 stories.
Completed 38 stories.
Completed 39 stories.
Completed 40 stories.
SAVED
Completed 41 stories.
Completed 42 stories

In [None]:
# Preview the first evaluated rows, then report how many rows exist in total.
print(results.head(), len(results), sep="\n")

   Text Source                                       q0_reasoning q0_yes_no  \
0     0  human  The story effectively compresses a thousand ye...       yes   
1     0  human  The story effectively compresses time by skipp...       yes   
2     1  llama  The story effectively compresses the protagoni...       yes   
3     2  human  The story effectively compresses time during m...       yes   
4     3  llama  The story compresses time effectively, particu...       yes   

                                        q1_reasoning q1_yes_no  \
0  The story effectively compresses a thousand ye...       yes   
1  The story effectively compresses time by skipp...       yes   
2  The story effectively compresses the protagoni...       yes   
3  The story effectively compresses time during m...       yes   
4  The story compresses time effectively, particu...       yes   

                                        q2_reasoning q2_yes_no  \
0  The story effectively compresses a thousand ye...       yes

In [None]:
# Write the full results table to disk without the dataframe index column,
# then confirm where it was written.
output_path = "results3.csv"
results.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")

Results saved to results_prompt_1.csv
