In [None]:
import time
from langchain_ollama import ChatOllama

# Ollama model tag handed to ChatOllama below and echoed in the test report.
MODEL_NAME = "qwen3:1.7b"

# Upper bound on how many utterances go into the single test batch.
BATCH_SIZE = 10 
# Plain-text transcript; the loader in this cell treats each non-blank line as
# one utterance.
FILE_PATH = "solo_frasi.txt"

# Shared chat client used by annotate_batch().
# temperature=0.0 — presumably chosen to make the output as repeatable as
# possible; confirm against the Ollama sampling documentation.
chat = ChatOllama(
    model=MODEL_NAME,
    temperature=0.0
)


# System prompt for batched classification: lists the allowed labels, the
# rules, and the exact "Sentence:/Label:" output layout the model must follow.
# NOTE(review): the f-prefix is unnecessary — the template has no placeholders.
# If literal braces are ever added (e.g. a JSON example) they must be doubled
# or the prefix dropped.
BATCH_SYSTEM_PROMPT = f"""
You are a highly efficient dialogue act classifier.

Possible labels:
STATEMENT, QUESTION, ANSWER, ACKNOWLEDGEMENT, BACKCHANNEL,
DIRECTIVE, REQUEST, REPAIR, CLARIFICATION, EXPRESSIVE, EMOTIVE,
APOLOGY, GREETING, GOODBYE, OTHER.

RULES:
- You MUST classify ALL TARGET utterances provided in the user prompt.
- PREVIOUS is context, NOT classification target.
- Ignore all transcription symbols (e.g., ((laugh)), °°, [ ], :, ↑, ↓).

OUTPUT FORMAT:
Sentence: <target_1>
Label: <one_label_1>
Sentence: <target_2>
Label: <one_label_2>
... (and so on for every target in the batch)

Do NOT output any other text, markdown formatting, or comments.
"""

def create_batch_prompt(batch_data):
    """Render a batch of (previous, target) pairs as one user prompt.

    Args:
        batch_data: iterable of 2-tuples ``(previous_utterance, target)``.

    Returns:
        A single string: a fixed header, one numbered section per pair
        (numbering starts at 1), and a fixed closing instruction.
    """
    header = "Classify the following targets, using the preceding context:\n\n"
    footer = "\n--- END OF BATCH ---\n\nNow, provide the classification results using the specified OUTPUT FORMAT."

    sections = [
        f"--- Target {number} ---\nPrevious: {context}\nTarget: {utterance}\n"
        for number, (context, utterance) in enumerate(batch_data, start=1)
    ]
    return header + "".join(sections) + footer

def annotate_batch(batch_data):
    """Classify one batch with the shared chat client.

    Builds the user prompt from ``batch_data``, sends it together with the
    batch system prompt, and returns the reply's text content with
    surrounding whitespace removed.
    """
    user_text = create_batch_prompt(batch_data)
    messages = [
        {"role": "system", "content": BATCH_SYSTEM_PROMPT},
        {"role": "user", "content": user_text},
    ]
    reply = chat.invoke(messages)
    return reply.content.strip()

def annotate_test_batch(path, batch_size):
    """Run one timed classification batch over the start of a transcript file.

    Reads ``path``, keeps the non-blank lines (stripped), pairs each of the
    first ``batch_size`` lines with the line before it as context, sends the
    whole batch to the model in one request, and prints the raw model output
    followed by a timing summary.

    Args:
        path: text file with one utterance per line.
        batch_size: maximum number of utterances to classify.

    Returns:
        None. All results are printed. A missing file or an empty batch is
        reported and the function returns early.
    """
    try:
        # Explicit UTF-8: the transcripts contain non-ASCII Jefferson symbols
        # (e.g. ↑ and °), so the platform default encoding must not be relied on.
        with open(path, "r", encoding="utf-8") as f:
            stripped = (raw.strip() for raw in f)
            lines = [text for text in stripped if text]
    except FileNotFoundError:
        print(f"Error: The file '{path}' was not found.")
        return

    # Pair every target with its predecessor; the first target has no context.
    batch = lines[:batch_size]
    all_data = list(zip([""] + batch, batch))

    if not all_data:
        print("No sentences found to annotate.")
        return

    total_annotations = len(all_data)

    print("--- Running Test Batch ---")
    print(f"Sentences to classify: {total_annotations}")
    print(f"Model: {MODEL_NAME}")
    print("-" * 50)

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments during a long inference call.
    start_time = time.perf_counter()
    succeeded = True
    try:
        batch_results_text = annotate_batch(all_data)
    except Exception as e:
        # Best-effort smoke test: report the failure instead of crashing the cell.
        succeeded = False
        print("--- ERROR DURING INFERENCE ---")
        print(f"An error occurred: {e}")
    else:
        print("Model Output:")
        print(batch_results_text)
    total_time = time.perf_counter() - start_time

    print("-" * 50)
    # Do not claim completion (or report a per-sentence average) for a failed run.
    status = "Complete" if succeeded else "FAILED"
    print(f"Test Batch {status}. Total Time for {total_annotations} sentences: {total_time:.2f} seconds.")
    if succeeded:
        avg_time_per_sentence = total_time / total_annotations
        print(f"Average time per sentence: {avg_time_per_sentence:.3f} seconds.")


# Run the single-batch smoke test as soon as this cell executes, using the
# module-level FILE_PATH and BATCH_SIZE settings.
annotate_test_batch(FILE_PATH, BATCH_SIZE)

--- Running Test Batch ---
Sentences to classify: 10
Model: qwen3:1.7b
--------------------------------------------------
Model Output:
Sentence: we just ↑remi[NISCE]  
Label: STATEMENT  

Sentence: [remi]nisce  
Label: CLARIFICATION  

Sentence: [eh]  
Label: BACKCHANNEL  

Sentence: (a)[bout] a shared memory  
Label: REQUEST  

Sentence: 'kay  
Label: EXPRESSIVE  

Sentence: m::h hhh ((tsk))  
Label: OTHER  

Sentence: which memory would you like  
Label: QUESTION  

Sentence: but i'm tryin' to think which memory is (0.6) within five minutes because they're all short (.) cute moments  
Label: CLARIFICATION  

Sentence: mh=yea:h  
Label: EXPRESSIVE  

Sentence: °we- we'll probably spend more than five minutes finding a memory°  
Label: STATEMENT
--------------------------------------------------
Test Batch Complete. Total Time for 10 sentences: 72.91 seconds.
Average time per sentence: 7.291 seconds.


In [43]:
from langchain_ollama import ChatOllama
import json
from datetime import datetime

#https://ollama.com/models
#ollama pull qwen3:1.7b

# Closed set of dialogue-act labels the model is allowed to emit.
dialogue_labels = ['STATEMENT', 'QUESTION', 'ANSWER', 'ACKNOWLEDGEMENT', 'BACKCHANNEL',
'DIRECTIVE', 'REQUEST', 'REPAIR', 'CLARIFICATION', 'EXPRESSIVE', 'EMOTIVE',
'APOLOGY', 'GREETING', 'GOODBYE', 'OTHER']

# Few-shot demonstrations. Every OUTPUT must be spelled exactly as in
# dialogue_labels, otherwise the examples teach the model a label that the
# instructions forbid.
examples = [
    {'INPUT': 'we just ↑remi[NISCE]', 'OUTPUT': 'STATEMENT'},
    {'INPUT': '((laughs))', 'OUTPUT': 'OTHER'},
    {'INPUT': '[that- that is (.)] tr[ue]', 'OUTPUT': 'ACKNOWLEDGEMENT'},
    {'INPUT': '°yeah°', 'OUTPUT': 'BACKCHANNEL'},
    {'INPUT': '°that was a good one°', 'OUTPUT': 'EXPRESSIVE'},
]

class LLMAnnotator:
    """Annotate utterances one at a time with a label via an Ollama chat model.

    The prompt sent for each utterance is the instruction messages (built once
    by ``create_label_instruct``) followed by a sliding window of the most
    recent user utterances.
    """

    def __init__(self,  model="qwen3:1.7b", labels=dialogue_labels, examples=examples, max_context=3):
        """Create the chat client and build the instruction messages.

        Args:
            model: Ollama model tag.
            labels: allowed labels, listed verbatim in the instructions.
            examples: few-shot dicts with ``"INPUT"`` and ``"OUTPUT"`` keys.
            max_context: maximum number of recent utterances kept as history.
        """
        # NOTE(review): confirm that this ChatOllama version honours ``think``;
        # annotate() still strips a leading <think>...</think> block in case
        # the model emits one anyway.
        self._client = ChatOllama(
            model=model,
            think=False,
            temperature= 0.0,
            #num_predict = 20, ## max number of tokens to predict
        )
        self._max_context = max_context
        self._history = []
        self._instruct = []
        self.create_label_instruct(labels, examples)

    def create_label_instruct(self, labels=None, examples=None):
        """(Re)build ``self._instruct`` from the label set and few-shot examples.

        Args:
            labels: allowed labels; ``None`` is treated as an empty list.
            examples: few-shot dicts with ``"INPUT"``/``"OUTPUT"`` keys;
                ``None`` or empty means no demonstrations are added.
        """
        # None sentinels instead of mutable [] defaults shared across calls.
        labels = [] if labels is None else labels
        examples = [] if examples is None else examples

        system_lines = [
            "You are an expert annotator trained in Conversation Analysis and Dialogue Act tagging.",
            "You must assign exactly ONE dialogue act label to EACH utterance.",
            "Utterances use JEFFERSON TRANSCRIPTION. Use them as cues for emotion, repair, overlap, etc., but do NOT treat them as words.",
            "Only use one of the following labels:{}.".format(str(labels)),
            "Classify ONE UTTERANCE AT A TIME, use the preceding utterances as context.",
            "Output the most appropriate label in JSON format.",
            "Do not output anything else.",
        ]
        self._instruct = [{"role": "system", "content": line} for line in system_lines]

        if examples:
            self._instruct.append({"role": "system", "content": "Here are a few examples:"})
            for example in examples:
                self._instruct.append({"role": "user", "content": example["INPUT"]})
                # The demonstrated answer is a model turn, so it must carry the
                # assistant role; as a system message it reads as another
                # instruction rather than as an example reply.
                self._instruct.append({"role": "assistant", "content": example["OUTPUT"]})
        print("My instructions are:", self._instruct)

    def annotate_conversation(self, input=None):
        """Annotate every utterance in order and return the list of annotations.

        Args:
            input: sequence of utterance strings (name kept for keyword
                callers); ``None`` is treated as an empty conversation.

        Returns:
            One ``{"Input": ..., "Output": ...}`` dict per utterance, in order.
            Progress is printed after every 5 utterances.
        """
        utterances = [] if input is None else input
        report = 5
        annotations = []
        previous = datetime.now()
        print("Annotating a conversation with {} utterances".format(len(utterances)))
        for counter, text in enumerate(utterances, 1):
            annotations.append(self.annotate(text))
            if counter % report == 0:
                now = datetime.now()
                # total_seconds() covers the whole interval; .seconds drops the
                # days component of the timedelta.
                elapsed = int((now - previous).total_seconds())
                print('Processed', report, 'in', elapsed, 'seconds')
                print("Processed", counter, "turns in total out of", len(utterances))
                previous = now
        return annotations

    def annotate(self, utterance):
        """Classify one utterance and return ``{"Input": ..., "Output": ...}``.

        ``Output`` is the model's reply text with any leading
        ``<think>...</think>`` block and all newlines removed.
        """
        self._history.append({"role": "user", "content": "Input: {}".format(utterance)})

        ## if the history exceeds the maximum context length, we trim it by one
        if len(self._history) > self._max_context:
            self._history = self._history[1:]

        prompt = self._instruct + self._history
        response = self._client.invoke(prompt)

        ### We need to remove the <think></think> part from the output
        content = response.content
        closing_tag = "</think>"
        end_of_think = content.find(closing_tag)
        # Always return text: previously the raw response object leaked into
        # the annotation whenever the reply had no think block.
        answer = content if end_of_think < 0 else content[end_of_think + len(closing_tag):]
        answer = answer.replace("\n", "")
        return {"Input": utterance, "Output": answer}
    
#if __name__ == "__main__":
#    ekman_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
#    labels = ["positive", "negative", "neutral"]
#    input = ["I am very happy", "I am surprised", "What can I do about it?", "Now I am having real fun.", "It make me sad and depressed", "Sorry to hear that"]
#    examples = [{"Input": "I love dogs", "Output": "joy"}, {"Input": "I hate cats", "Output": "disgust"}]

In [30]:
# Load the transcript and split it into lines.
# Explicit UTF-8: the utterances contain non-ASCII transcription symbols
# (e.g. ↑ and °), so the platform default encoding must not be relied on.
with open('solo_frasi.txt', 'r', encoding='utf-8') as infilez:
    all_file = infilez.read()
# NOTE: split('\n') keeps blank lines and yields a trailing '' when the file
# ends with a newline, so the count below may include empty entries.
all_file = all_file.split('\n')
len(all_file)

196

In [29]:
# Build an annotator from the module-level label set and few-shot examples;
# max_context=3 is passed through to the annotator's history limit.
llm_annotator  = LLMAnnotator(labels=dialogue_labels, examples=examples, max_context=3)
# Annotate only the first 10 entries of the loaded file, then show each result.
annotations = llm_annotator.annotate_conversation(all_file[:10])
for annotation in annotations:
    print(annotation)

My instructions are: [{'role': 'system', 'content': 'You are an expert annotator trained in Conversation Analysis and Dialogue Act tagging.'}, {'role': 'system', 'content': 'You must assign exactly ONE dialogue act label to EACH utterance.'}, {'role': 'system', 'content': 'Utterances use JEFFERSON TRANSCRIPTION. Use them as cues for emotion, repair, overlap, etc., but do NOT treat them as words.'}, {'role': 'system', 'content': "Only use one of the following labels:['STATEMENT', 'QUESTION', 'ANSWER', 'ACKNOWLEDGEMENT', 'BACKCHANNEL', 'DIRECTIVE', 'REQUEST', 'REPAIR', 'CLARIFICATION', 'EXPRESSIVE', 'EMOTIVE', 'APOLOGY', 'GREETING', 'GOODBYE', 'OTHER']."}, {'role': 'system', 'content': 'Classify ONE UTTERANCE AT A TIME, use the preceding utterances as context.'}, {'role': 'system', 'content': 'Output the most appropriate label in JSON format.'}, {'role': 'system', 'content': 'Do not output anything else.'}, {'role': 'system', 'content': 'Here are a few examples:'}, {'role': 'user', 'cont

In [44]:
from langchain_ollama import ChatOllama
import json
from datetime import datetime
import re


# Closed set of dialogue-act labels, in the order they are shown to the model.
dialogue_labels = (
    "STATEMENT QUESTION ANSWER ACKNOWLEDGEMENT BACKCHANNEL "
    "DIRECTIVE REQUEST REPAIR CLARIFICATION EXPRESSIVE EMOTIVE "
    "APOLOGY GREETING GOODBYE OTHER"
).split()


# Few-shot demonstrations as (utterance, label) pairs, expanded into the
# INPUT/OUTPUT dict shape that LLMAnnotator consumes.
_few_shot_pairs = [
    ('we just ↑remi[NISCE]', 'STATEMENT'),
    ('((laughs))', 'OTHER'),
    ('[that- that is (.)] tr[ue]', 'ACKNOWLEDGEMENT'),
    ('°yeah°', 'BACKCHANNEL'),
    ('°that was a good one°', 'EXPRESSIVE'),
]
examples = [{'INPUT': utterance, 'OUTPUT': label} for utterance, label in _few_shot_pairs]


class LLMAnnotator:
    """Label utterances one at a time with a dialogue-act tag via an Ollama chat model.

    Every request consists of the instruction messages (system prompt plus
    few-shot user/assistant pairs) followed by the most recent
    ``max_context`` utterances; the newest one is the utterance to classify.
    The reply is reduced to ``{"label": <one of the configured labels>}``.
    """

    def __init__(self, model="qwen3:1.7b", labels=dialogue_labels, examples=examples, max_context=3):
        """Create the chat client and build the instruction messages.

        Args:
            model: Ollama model tag.
            labels: allowed labels.
            examples: few-shot dicts with ``"INPUT"`` and ``"OUTPUT"`` keys.
            max_context: maximum number of recent utterances sent per request.
        """
        self._client = ChatOllama(
            model=model,
            temperature=0.0,
        )
        self._max_context = max_context
        self._history = []
        # Keep the instance's own label set so reply parsing validates against
        # it instead of the module-level default list.
        self._labels = list(labels)
        self._instruct = self._build_instructions(labels, examples)

    def _build_instructions(self, labels, examples):
        """Return the system prompt followed by few-shot user/assistant pairs."""
        instruct = [
            {"role": "system", "content": (
                "You are an expert annotator trained in Conversation Analysis and Dialogue Act tagging.\n"
                "You must output EXACTLY ONE label for the current utterance.\n"
                "Allowed labels: " + ", ".join(labels) + "\n"
                "Output MUST be a JSON object of the form: {\"label\": \"...\"}\n"
                "Do NOT output multiple labels. Do NOT output multiple JSON objects. Do NOT explain.\n"
                "Only classify the *current* input.\n"
            )}
        ]

        # few-shot examples: proper format (user -> assistant)
        for ex in examples:
            instruct.append({"role": "user", "content": ex["INPUT"]})
            instruct.append({"role": "assistant", "content": json.dumps({"label": ex["OUTPUT"]})})

        return instruct

    def _match_label(self, value):
        """Return the configured label equal to ``value`` ignoring case, else None."""
        if not isinstance(value, str):
            return None
        wanted = value.strip().lower()
        for lbl in self._labels:
            if lbl.lower() == wanted:
                return lbl
        return None

    def _clean_response(self, text):
        """Reduce a raw model reply to ``{"label": <allowed label>}``.

        Strategy, in order:
        1. drop any ``<think>...</think>`` block;
        2. decode JSON objects left to right and accept the first whose
           ``"label"`` is one of the configured labels;
        3. otherwise take the first configured label mentioned anywhere in
           the text;
        4. otherwise fall back to ``OTHER``.
        """
        # remove <think> blocks if present
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

        # raw_decode parses exactly one JSON value starting at `pos`, so extra
        # objects or trailing chatter after it cannot break the parse (a
        # greedy "{.*}" match would span all of them and fail to load).
        decoder = json.JSONDecoder()
        pos = text.find("{")
        while pos != -1:
            try:
                parsed, _ = decoder.raw_decode(text, pos)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                label = self._match_label(parsed.get("label"))
                if label is not None:
                    return {"label": label}
            pos = text.find("{", pos + 1)

        # fallback: the model answered with a bare or malformed label
        lowered = text.lower()
        for lbl in self._labels:
            if lbl.lower() in lowered:
                return {"label": lbl}
        return {"label": "OTHER"}

    def annotate(self, utterance):
        """Classify one utterance and return ``{"Input": ..., "Output": {"label": ...}}``."""
        # Update short sliding context.
        # NOTE(review): only user utterances are stored — the model's replies
        # are never appended — so the model sees several consecutive user
        # messages and must infer that the last one is the target.
        self._history.append({"role": "user", "content": utterance})
        if len(self._history) > self._max_context:
            self._history = self._history[-self._max_context:]

        prompt = self._instruct + self._history
        response = self._client.invoke(prompt)
        cleaned = self._clean_response(response.content)

        return {"Input": utterance, "Output": cleaned}

    def annotate_conversation(self, utterances):
        """Annotate each utterance in order; print cumulative progress every 5 items."""
        annotations = []
        start = datetime.now()
        for idx, utt in enumerate(utterances, 1):
            ann = self.annotate(utt)
            annotations.append(ann)
            if idx % 5 == 0:
                # total_seconds() covers the whole interval; .seconds drops the
                # days component of the timedelta.
                elapsed = int((datetime.now() - start).total_seconds())
                print(f"Processed {idx}/{len(utterances)} in {elapsed} seconds")
        return annotations



# --- RUNNING ---
# Explicit UTF-8: the transcript contains non-ASCII transcription symbols
# (e.g. ↑ and °), so the platform default encoding must not be relied on.
with open('solo_frasi.txt', 'r', encoding='utf-8') as infile:
    lines = infile.read().split("\n")

# Annotate only the first 10 lines as a quick trial run.
llm = LLMAnnotator(labels=dialogue_labels, examples=examples)
annotations = llm.annotate_conversation(lines[:10])

for ann in annotations:
    print(ann)


Processed 5/10 in 160 seconds
Processed 10/10 in 255 seconds
{'Input': 'we just ↑remi[NISCE]', 'Output': {'label': 'STATEMENT'}}
{'Input': '[remi]nisce', 'Output': {'label': 'OTHER'}}
{'Input': '[eh]', 'Output': {'label': 'STATEMENT'}}
{'Input': '(a)[bout] a shared memory', 'Output': {'label': 'STATEMENT'}}
{'Input': "'kay", 'Output': {'label': 'BACKCHANNEL'}}
{'Input': 'm::h hhh ((tsk))', 'Output': {'label': 'STATEMENT'}}
{'Input': 'which memory would you like', 'Output': {'label': 'REQUEST'}}
{'Input': "but i'm tryin' to think which memory is (0.6) within five minutes because they're all short (.) cute moments", 'Output': {'label': 'QUESTION'}}
{'Input': 'mh=yea:h', 'Output': {'label': 'REQUEST'}}
{'Input': "°we- we'll probably spend more than five minutes finding a memory°", 'Output': {'label': 'STATEMENT'}}


In [45]:
from langchain_ollama import ChatOllama
import json
import re
from datetime import datetime


# Closed set of dialogue-act labels, in the order they are shown to the model.
dialogue_labels = list((
    'STATEMENT', 'QUESTION', 'ANSWER', 'ACKNOWLEDGEMENT', 'BACKCHANNEL',
    'DIRECTIVE', 'REQUEST', 'REPAIR', 'CLARIFICATION', 'EXPRESSIVE',
    'EMOTIVE', 'APOLOGY', 'GREETING', 'GOODBYE', 'OTHER',
))

# Few-shot demonstrations: parallel tuples of utterances and their labels,
# zipped into the INPUT/OUTPUT dict shape that LLMAnnotator consumes.
_example_inputs = (
    'we just ↑remi[NISCE]',
    '((laughs))',
    '[that- that is (.)] tr[ue]',
    '°yeah°',
    '°that was a good one°',
)
_example_outputs = ('STATEMENT', 'OTHER', 'ACKNOWLEDGEMENT', 'BACKCHANNEL', 'EXPRESSIVE')
examples = [
    {'INPUT': utterance, 'OUTPUT': label}
    for utterance, label in zip(_example_inputs, _example_outputs)
]


class LLMAnnotator:
    """Label utterances one at a time with a dialogue-act tag via an Ollama chat model.

    Every request consists of the instruction messages (system prompt plus
    few-shot user/assistant pairs) followed by the most recent
    ``max_context`` utterances; the newest one is the utterance to classify.
    The reply is reduced to ``{"label": <one of the configured labels>}``.
    """

    def __init__(self, model="qwen3:1.7b", labels=dialogue_labels, examples=examples, max_context=3):
        """Create the chat client and build the instruction messages.

        Args:
            model: Ollama model tag.
            labels: allowed labels.
            examples: few-shot dicts with ``"INPUT"`` and ``"OUTPUT"`` keys.
            max_context: maximum number of recent utterances sent per request.
        """
        self._client = ChatOllama(
            model=model,
            temperature=0.0,
        )
        self._max_context = max_context
        self._history = []
        # Keep the instance's own label set so reply parsing validates against
        # it instead of the module-level default list.
        self._labels = list(labels)
        self._instruct = self._build_instructions(labels, examples)

    def _build_instructions(self, labels, examples):
        """Return the system prompt followed by few-shot user/assistant pairs."""
        instruct = [
            {
                "role": "system",
                "content": (
                    "You are an expert annotator of dialogue acts.\n"
                    "You MUST output exactly one label for the current utterance.\n"
                    "Allowed labels: " + ", ".join(labels) + "\n"
                    "Output must be a JSON object: {\"label\": \"...\"}\n"
                    "Do NOT output anything else.\n"
                )
            }
        ]

        # Correct few-shot examples (user → assistant)
        for ex in examples:
            instruct.append({"role": "user", "content": ex["INPUT"]})
            instruct.append({"role": "assistant", "content": json.dumps({"label": ex["OUTPUT"]})})

        return instruct

    def _match_label(self, value):
        """Return the configured label equal to ``value`` ignoring case, else None."""
        if not isinstance(value, str):
            return None
        wanted = value.strip().lower()
        for lbl in self._labels:
            if lbl.lower() == wanted:
                return lbl
        return None

    def _extract_json(self, text):
        """Reduce a raw model reply to ``{"label": <allowed label>}``.

        Strategy, in order:
        1. drop any ``<think>...</think>`` block;
        2. decode JSON objects left to right and accept the first whose
           ``"label"`` is one of the configured labels;
        3. otherwise take the first configured label mentioned anywhere in
           the text;
        4. otherwise fall back to ``OTHER``.
        """
        # Strip possible <think> blocks
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

        # raw_decode parses exactly one JSON value starting at `pos`. Slicing
        # from the first "{" to the LAST "}" would instead swallow every
        # object in the reply and fail to load when there is more than one.
        decoder = json.JSONDecoder()
        pos = text.find("{")
        while pos != -1:
            try:
                parsed, _ = decoder.raw_decode(text, pos)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                label = self._match_label(parsed.get("label"))
                if label is not None:
                    return {"label": label}
            pos = text.find("{", pos + 1)

        # fallback if malformed: first configured label mentioned in the text
        lowered = text.lower()
        for lbl in self._labels:
            if lbl.lower() in lowered:
                return {"label": lbl}

        return {"label": "OTHER"}

    def annotate(self, utterance):
        """Classify one utterance and return ``{"Input": ..., "Output": {"label": ...}}``."""
        # Add utterance to context WITHOUT the "Input:" prefix.
        # NOTE(review): only user utterances are stored — the model's replies
        # are never appended — so the model sees several consecutive user
        # messages and must infer that the last one is the target.
        self._history.append({"role": "user", "content": utterance})

        # Trim context to the newest max_context utterances
        if len(self._history) > self._max_context:
            self._history = self._history[-self._max_context:]

        # Build prompt
        prompt = self._instruct + self._history

        # Query model
        response = self._client.invoke(prompt)
        cleaned = self._extract_json(response.content)

        return {"Input": utterance, "Output": cleaned}

    def annotate_conversation(self, utterances):
        """Annotate each utterance in order; print cumulative progress every 5 items."""
        annotations = []
        start = datetime.now()

        for idx, utt in enumerate(utterances, 1):
            ann = self.annotate(utt)
            annotations.append(ann)

            if idx % 5 == 0:
                # total_seconds() covers the whole interval; .seconds drops the
                # days component of the timedelta.
                elapsed = int((datetime.now() - start).total_seconds())
                print(f"Processed {idx}/{len(utterances)} in {elapsed} seconds")

        return annotations


# ============================
# RUN THE ANNOTATOR
# ============================

# Explicit UTF-8: the transcript contains non-ASCII transcription symbols
# (e.g. ↑ and °), so the platform default encoding must not be relied on.
with open('solo_frasi.txt', 'r', encoding='utf-8') as infile:
    lines = infile.read().split("\n")

# Annotate only the first 10 lines as a quick trial run.
llm = LLMAnnotator(labels=dialogue_labels, examples=examples, max_context=3)
annotations = llm.annotate_conversation(lines[:10])

for ann in annotations:
    print(ann)

Processed 5/10 in 162 seconds
Processed 10/10 in 323 seconds
{'Input': 'we just ↑remi[NISCE]', 'Output': {'label': 'DIRECTIVE'}}
{'Input': '[remi]nisce', 'Output': {'label': 'STATEMENT'}}
{'Input': '[eh]', 'Output': {'label': 'BACKCHANNEL'}}
{'Input': '(a)[bout] a shared memory', 'Output': {'label': 'STATEMENT'}}
{'Input': "'kay", 'Output': {'label': 'STATEMENT'}}
{'Input': 'm::h hhh ((tsk))', 'Output': {'label': 'ANSWER'}}
{'Input': 'which memory would you like', 'Output': {'label': 'OTHER'}}
{'Input': "but i'm tryin' to think which memory is (0.6) within five minutes because they're all short (.) cute moments", 'Output': {'label': 'REQUEST'}}
{'Input': 'mh=yea:h', 'Output': {'label': 'QUESTION'}}
{'Input': "°we- we'll probably spend more than five minutes finding a memory°", 'Output': {'label': 'STATEMENT'}}


In [None]:
from langchain_ollama import ChatOllama
import json
import re
from datetime import datetime


# Closed set of dialogue-act labels, grouped five per row for readability.
dialogue_labels = (
    ['STATEMENT', 'QUESTION', 'ANSWER', 'ACKNOWLEDGEMENT', 'BACKCHANNEL']
    + ['DIRECTIVE', 'REQUEST', 'REPAIR', 'CLARIFICATION', 'EXPRESSIVE']
    + ['EMOTIVE', 'APOLOGY', 'GREETING', 'GOODBYE', 'OTHER']
)

# Few-shot demonstrations in the INPUT/OUTPUT shape that LLMAnnotator consumes.
examples = [
    dict(INPUT='we just ↑remi[NISCE]', OUTPUT='STATEMENT'),
    dict(INPUT='((laughs))', OUTPUT='OTHER'),
    dict(INPUT='[that- that is (.)] tr[ue]', OUTPUT='ACKNOWLEDGEMENT'),
    dict(INPUT='°yeah°', OUTPUT='BACKCHANNEL'),
    dict(INPUT='°that was a good one°', OUTPUT='EXPRESSIVE'),
]


class LLMAnnotator:
    """Label utterances one at a time with a dialogue-act tag via an Ollama chat model.

    Every request consists of the instruction messages (system prompt plus
    few-shot user/assistant pairs) followed by the most recent
    ``max_context`` utterances; the newest one is the utterance to classify.
    The reply is reduced to ``{"label": <one of the configured labels>}``.
    """

    def __init__(self, model="qwen3:1.7b", labels=dialogue_labels, examples=examples, max_context=3):
        """Create the chat client and build the instruction messages.

        Args:
            model: Ollama model tag.
            labels: allowed labels.
            examples: few-shot dicts with ``"INPUT"`` and ``"OUTPUT"`` keys.
            max_context: maximum number of recent utterances sent per request.
        """
        self._client = ChatOllama(
            model=model,
            temperature=0.0,
        )
        self._max_context = max_context
        self._history = []
        # Keep the instance's own label set so reply parsing validates against
        # it instead of the module-level default list.
        self._labels = list(labels)
        self._instruct = self._build_instructions(labels, examples)

    def _build_instructions(self, labels, examples):
        """Return the system prompt followed by few-shot user/assistant pairs."""
        instruct = [
            {
                "role": "system",
                "content": (
                    "You are an expert annotator of dialogue acts.\n"
                    "You MUST output exactly one label for the current utterance.\n"
                    "Allowed labels: " + ", ".join(labels) + "\n"
                    "Output must be a JSON object: {\"label\": \"...\"}\n"
                    "Do NOT output anything else.\n"
                )
            }
        ]

        # Correct few-shot examples (user → assistant)
        for ex in examples:
            instruct.append({"role": "user", "content": ex["INPUT"]})
            instruct.append({"role": "assistant", "content": json.dumps({"label": ex["OUTPUT"]})})

        return instruct

    def _match_label(self, value):
        """Return the configured label equal to ``value`` ignoring case, else None."""
        if not isinstance(value, str):
            return None
        wanted = value.strip().lower()
        for lbl in self._labels:
            if lbl.lower() == wanted:
                return lbl
        return None

    def _extract_json(self, text):
        """Reduce a raw model reply to ``{"label": <allowed label>}``.

        Strategy, in order:
        1. drop any ``<think>...</think>`` block;
        2. decode JSON objects left to right and accept the first whose
           ``"label"`` is one of the configured labels;
        3. otherwise take the first configured label mentioned anywhere in
           the text;
        4. otherwise fall back to ``OTHER``.
        """
        # Strip possible <think> blocks
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

        # raw_decode parses exactly one JSON value starting at `pos`. Slicing
        # from the first "{" to the LAST "}" would instead swallow every
        # object in the reply and fail to load when there is more than one.
        decoder = json.JSONDecoder()
        pos = text.find("{")
        while pos != -1:
            try:
                parsed, _ = decoder.raw_decode(text, pos)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                label = self._match_label(parsed.get("label"))
                if label is not None:
                    return {"label": label}
            pos = text.find("{", pos + 1)

        # fallback if malformed: first configured label mentioned in the text
        lowered = text.lower()
        for lbl in self._labels:
            if lbl.lower() in lowered:
                return {"label": lbl}

        return {"label": "OTHER"}

    def annotate(self, utterance):
        """Classify one utterance and return ``{"Input": ..., "Output": {"label": ...}}``."""
        # Add utterance to context WITHOUT the "Input:" prefix.
        # NOTE(review): only user utterances are stored — the model's replies
        # are never appended — so the model sees several consecutive user
        # messages and must infer that the last one is the target.
        self._history.append({"role": "user", "content": utterance})

        # Trim context to the newest max_context utterances
        if len(self._history) > self._max_context:
            self._history = self._history[-self._max_context:]

        # Build prompt
        prompt = self._instruct + self._history

        # Query model
        response = self._client.invoke(prompt)
        cleaned = self._extract_json(response.content)

        return {"Input": utterance, "Output": cleaned}

    def annotate_conversation(self, utterances):
        """Annotate each utterance in order; print cumulative progress every 5 items."""
        annotations = []
        start = datetime.now()

        for idx, utt in enumerate(utterances, 1):
            ann = self.annotate(utt)
            annotations.append(ann)

            if idx % 5 == 0:
                # total_seconds() covers the whole interval; .seconds drops the
                # days component of the timedelta.
                elapsed = int((datetime.now() - start).total_seconds())
                print(f"Processed {idx}/{len(utterances)} in {elapsed} seconds")

        return annotations


# ============================
# RUN THE ANNOTATOR
# ============================

# Explicit UTF-8: the transcript contains non-ASCII transcription symbols
# (e.g. ↑ and °), so the platform default encoding must not be relied on.
with open('solo_frasi.txt', 'r', encoding='utf-8') as infile:
    lines = infile.read().split("\n")

# Annotate only the first 10 lines as a quick trial run.
llm = LLMAnnotator(labels=dialogue_labels, examples=examples, max_context=3)
annotations = llm.annotate_conversation(lines[:10])

for ann in annotations:
    print(ann)
