# Storyteller

In [3]:
import pandas as pd
import re
import torch
from transformers import AutoModelForCausalLM, AutoModelForTokenClassification, AutoTokenizer, pipeline


In [6]:
from evaluate.evaluate import evaluate_text
from evaluate.generate_prompt import generate_prompt
from evaluate.generate_chapter import generate_chapter

## Models and tokenizers

In [4]:
# Generation pipeline
# Phi-3 is loaded as a causal (decoder-only) language model, so it has to be
# wrapped in the "text-generation" task. "fill-mask" expects a masked-LM head
# and a tokenizer with a mask token, neither of which this checkpoint provides.
phi_checkpoint = "microsoft/Phi-3-mini-128k-instruct"

phi_model = AutoModelForCausalLM.from_pretrained(
    phi_checkpoint,
    # device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
phi_tokenizer = AutoTokenizer.from_pretrained(phi_checkpoint)

phi_pipe = pipeline("text-generation", model=phi_model, tokenizer=phi_tokenizer)


# Evaluation pipeline
# NOTE(review): the tokenizer is loaded from "bert-base-cased" while the model
# comes from the dbmdz CoNLL-03 checkpoint -- confirm the vocabularies match
# before switching either of them.
NER_model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
NER_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
NER_pipe = pipeline('ner', model=NER_model, tokenizer=NER_tokenizer)

## Data

In [8]:
# Load the JSON export, discard the bookkeeping columns, expose the payload
# column as "text", and unwrap the nested record so "text" holds the raw string.
data = (
    pd.read_json("data/data.json")
    .drop(columns=["row_idx", "truncated_cells"])
    .rename(columns={"row": "text"})
    .assign(text=lambda frame: frame["text"].map(lambda record: record["text"]))
)

data.head()

Unnamed: 0,text
0,"I am in my 30 's , married and have recently ,..."
1,It strikes me that I never posted the link to ...
2,Hye was waiting for her usual bus at the usual...
3,""" Remember , go straight to Grandma 's house ,..."
4,Leave a reply Tisha Porter didn 't think she w...


In [5]:
# Add a "prompt" column built from each story with generate_prompt.
prompt_max_len = 512

data["prompt"] = [
    generate_prompt(story, prompt_max_len) for story in data["text"]
]

data.head()


Unnamed: 0,text,prompt
0,"I am in my 30 's , married and have recently ,...","I am in my 30 's , married and have recently ,..."
1,It strikes me that I never posted the link to ...,It strikes me that I never posted the link to ...
2,Hye was waiting for her usual bus at the usual...,Hye was waiting for her usual bus at the usual...
3,""" Remember , go straight to Grandma 's house ,...",""" Remember , go straight to Grandma 's house ,..."
4,Leave a reply Tisha Porter didn 't think she w...,Leave a reply Tisha Porter didn 't think she w...


##### I want to work without generation


In [9]:
# Preparing the prompts for the model
def prepare_prompt(text: str, prompt_max_len: int) -> tuple:
    """Cut ``text`` into a prompt that ends on the last "." inside a window.

    Args:
        text: The full source text.
        prompt_max_len: Maximum number of characters the prompt may contain.

    Returns:
        A ``(prompt, continuation_idx)`` tuple. ``prompt`` is ``text`` truncated
        to ``prompt_max_len`` characters and then cut right after its last ".".
        ``continuation_idx`` is the index in ``text`` where the continuation
        starts.

        If the window contains no "." at all, the whole window is returned as
        the prompt and the continuation starts directly after it. Previously
        this case produced an empty prompt and a continuation index of 1.
    """
    truncated_text = text[:prompt_max_len]
    last_dot_idx = truncated_text.rfind(".")

    if last_dot_idx == -1:
        # No sentence boundary available: keep the window instead of letting
        # the -1 sentinel collapse the prompt to "" and skip one character.
        return truncated_text, len(truncated_text)

    # Keep everything up to and including the final ".".
    output = truncated_text[:last_dot_idx + 1]

    # +2 skips the "." and the single separator character that follows it.
    return output, last_dot_idx + 2

# Add the "prompt" and "continuation_idx" columns: each story is cut at a
# sentence boundary, and the index where its continuation starts is recorded.
prompt_max_len = 512

prepared = [prepare_prompt(story, prompt_max_len) for story in data["text"]]
data["prompt"] = [cut_text for cut_text, _ in prepared]
data["continuation_idx"] = [start for _, start in prepared]

data.head()

Unnamed: 0,text,prompt,continuation_idx
0,"I am in my 30 's , married and have recently ,...","I am in my 30 's , married and have recently ,...",440
1,It strikes me that I never posted the link to ...,It strikes me that I never posted the link to ...,486
2,Hye was waiting for her usual bus at the usual...,Hye was waiting for her usual bus at the usual...,503
3,""" Remember , go straight to Grandma 's house ,...",""" Remember , go straight to Grandma 's house ,...",471
4,Leave a reply Tisha Porter didn 't think she w...,Leave a reply Tisha Porter didn 't think she w...,468


In [10]:
# The continuation is the next sentence-aligned window of the story, starting
# at the index recorded in "continuation_idx".
data["continuation"] = [
    prepare_prompt(story[start:], prompt_max_len)[0]
    for story, start in zip(data["text"], data["continuation_idx"])
]

data.head()

Unnamed: 0,text,prompt,continuation_idx,continuation
0,"I am in my 30 's , married and have recently ,...","I am in my 30 's , married and have recently ,...",440,I think back on the past year with mixed emoti...
1,It strikes me that I never posted the link to ...,It strikes me that I never posted the link to ...,486,We had a ton of fun ( and learned a lot ) maki...
2,Hye was waiting for her usual bus at the usual...,Hye was waiting for her usual bus at the usual...,503,"It was very cold that night , but it didn 't b..."
3,""" Remember , go straight to Grandma 's house ,...",""" Remember , go straight to Grandma 's house ,...",471,""" Who is it ? "" he called in a cackly voice . ..."
4,Leave a reply Tisha Porter didn 't think she w...,Leave a reply Tisha Porter didn 't think she w...,468,She wasn 't counting on the stress being too m...


## Generation

In [11]:
# Pick a single story (positional row 7) to walk through the remaining steps.
sample_row = data.iloc[7]
text, prompt = sample_row["text"], sample_row["prompt"]
prompt

'Overtime wore on late that day , and by the time Douno left the office , it was past nine at night . An accident had happened along the route home , closing off an entire lane and bringing traffic to a standstill . Douno did not get to his apartment until past ten . The asphalt in the parking lot still carried a damp smell from the heat wave during the day .'

In [None]:
# Use the stored "continuation" text of the sample row as the chapter, so the
# evaluation step can run without calling the generation model.
chapter = sample_row["continuation"]

## Evaluation

In [13]:
# Evaluate the chapter against its prompt, handing the NER pipeline to evaluate_text.
# NOTE(review): evaluate_text is imported from evaluate.evaluate and is not shown
# here -- confirm what it returns and what the numbers in the cell output mean.
evaluation = evaluate_text(prompt, chapter, NER_pipe)

evaluation

30.0 1.0
