## Agreement Metrics

In [None]:
!pip install jsonlines
import jsonlines
from nltk.metrics import agreement

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [None]:
def load_annotations(file_path):
    """Read every record of a JSON Lines file into a list.

    Args:
        file_path: Path passed to ``jsonlines.open``.

    Returns:
        A list holding each object yielded by the reader, in file order.
    """
    with jsonlines.open(file_path) as reader:
        return [record for record in reader]

def prepare_data(annotations, annotator_id):
    """Turn annotation records into (annotator, item, label) triples.

    Args:
        annotations: Iterable of mappings that each have a 'text' and a
            'label' key.
        annotator_id: Identifier stored as the first element of every triple.

    Returns:
        A list of ``(annotator_id, str(text), str(label))`` tuples, one per
        record and in the same order as ``annotations``.
    """
    # str() keeps both fields hashable whatever type the record stored.
    return [
        (annotator_id, str(record['text']), str(record['label']))
        for record in annotations
    ]

In [None]:
# Load annotations from different annotators
file1 = 'control.jsonl'
file2 = 'EN_ann.jsonl'

annotations_annotator1 = load_annotations(file1)
annotations_annotator2 = load_annotations(file2)
# Repeat for other annotators as needed

# Prepare data: each annotator must contribute the records loaded from their
# own file. Passing the same list for both annotators compares a file with
# itself and makes every agreement measure trivially 1.0.
data_annotator1 = prepare_data(annotations_annotator1, 'annotator1')
data_annotator2 = prepare_data(annotations_annotator2, 'annotator2')
# Repeat for other annotators as needed

# Combine all the data
all_data = data_annotator1 + data_annotator2  # + other annotators' data

# Create the AnnotationTask from the (annotator, item, label) triples
task = agreement.AnnotationTask(all_data)

# Print various agreement measures
print(file1, ' vs ', file2, '\n')
print("Kappa:", task.kappa())
print("Fleiss' Kappa:", task.multi_kappa())
print("Pi:", task.pi())
print("Alpha:", task.alpha())


control.jsonl  vs  EN_ann.jsonl 

Kappa: 1.0
Fleiss' Kappa: 1.0
Pi: 1.0
Alpha: 1.0


In [None]:
!pip install python-docx
!pip install nltk


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


## Eero speaker counter

In [None]:
import os
import docx
import nltk
from nltk.tokenize import sent_tokenize

# Download the 'punkt' data package through nltk's downloader.
# NOTE(review): presumably this is what sent_tokenize (imported above) needs — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def count_marked_sentences_in_file(docx_file):
    """Count sentences that fall under a '§' or a '½' marker in one DOCX file.

    All paragraph texts are joined with single spaces and split into
    sentences with ``sent_tokenize``. A sentence containing '§' (checked
    first) or '½' switches the active marker; every sentence, including the
    one carrying the marker, is counted for the marker active at that point.
    Sentences before the first marker are not counted.

    Args:
        docx_file: Path passed to ``docx.Document``.

    Returns:
        A ``(section_count, half_count)`` tuple: the number of sentences
        attributed to '§' and to '½' respectively.
    """
    document = docx.Document(docx_file)
    joined = ' '.join(paragraph.text for paragraph in document.paragraphs)

    tallies = {'§': 0, '½': 0}
    active = None  # marker currently in force; None until the first one appears

    for sent in sent_tokenize(joined):
        # '§' takes precedence when a sentence contains both markers.
        if '§' in sent:
            active = '§'
        elif '½' in sent:
            active = '½'
        if active is not None:
            tallies[active] += 1

    return tallies['§'], tallies['½']

def count_marked_sentences_in_directory(directory):
    """Print the '§' and '½' sentence totals over all DOCX files in a folder.

    Only entries directly inside ``directory`` whose name ends with ".docx"
    are processed; each is handed to ``count_marked_sentences_in_file``.

    Args:
        directory: Folder to scan with ``os.listdir``.

    Returns:
        None. The two totals are printed.
    """
    section_total = 0
    half_total = 0

    docx_names = [name for name in os.listdir(directory) if name.endswith(".docx")]
    for name in docx_names:
        sections, halves = count_marked_sentences_in_file(os.path.join(directory, name))
        section_total += sections
        half_total += halves

    print(f"Total number of sentences marked with §: {section_total}")
    print(f"Total number of sentences marked with ½: {half_total}")

# Usage example: scan the .docx files directly inside /content/ and print both totals
count_marked_sentences_in_directory('/content/')


Total number of sentences marked with §: 3
Total number of sentences marked with ½: 98


## Eero Speaker to jsonl

In [None]:
import os
import json
import docx
import nltk
import re
from nltk.tokenize import sent_tokenize

# Download the 'punkt' data package through nltk's downloader.
# NOTE(review): presumably this is what sent_tokenize (imported above) needs — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def is_srt_timestamp(line):
    """Tell whether a line is SRT bookkeeping rather than spoken text.

    Args:
        line: The text line to test.

    Returns:
        True when the line is made only of digits (e.g. "2") or has the form
        "HH:MM:SS,mmm --> HH:MM:SS,mmm"; False otherwise.
    """
    srt_patterns = (
        r'^\d+$',
        r'^\d{2}:\d{2}:\d{2},\d{3}\s+-->\s+\d{2}:\d{2}:\d{2},\d{3}$',
    )
    return any(re.match(pattern, line) for pattern in srt_patterns)

def process_sentences_in_file(docx_file):
    """Build one span-annotated record (text plus speaker spans) from a DOCX file.

    Paragraphs are ignored until one whose stripped text ends with '.txt' has
    been seen (that paragraph is ignored as well). In the remaining
    paragraphs, lines recognised by ``is_srt_timestamp`` are dropped and the
    rest is joined with single spaces. The resulting text is split with
    ``sent_tokenize``; a sentence containing '§' switches the current speaker
    to 'interviewer', one containing '½' switches it to 'interviewee', and
    every sentence seen while a speaker is set yields a ``[start, end,
    speaker]`` span.

    Args:
        docx_file: Path passed to ``docx.Document``.

    Returns:
        A dict with keys 'text' (the cleaned text), 'cats' and 'Comments'
        (both empty lists) and 'label' (list of ``[start, end, speaker]``
        character offsets into 'text').
    """
    doc = docx.Document(docx_file)
    full_text = []
    skip = True
    for para in doc.paragraphs:
        text = para.text.strip()
        if text.endswith('.txt'):
            skip = False
            continue
        if skip:
            continue
        cleaned_lines = [line for line in text.split('\n') if not is_srt_timestamp(line)]
        cleaned_text = ' '.join(cleaned_lines).strip()
        if cleaned_text:
            full_text.append(cleaned_text)

    text = ' '.join(full_text)
    section_marker = None
    labels = []
    # Offset up to which sentences have already been located. Searching from
    # here keeps a repeated sentence (e.g. a second "Yeah.") from being mapped
    # onto the offsets of its first occurrence.
    cursor = 0

    for sentence in sent_tokenize(text):
        if '§' in sentence:
            section_marker = 'interviewer'
        elif '½' in sentence:
            section_marker = 'interviewee'

        start_idx = text.find(sentence, cursor)
        if start_idx == -1:
            # Not found after the cursor: fall back to a search from the start.
            start_idx = text.find(sentence)
        if start_idx == -1:
            # The sentence cannot be located at all, so no valid span exists.
            continue
        end_idx = start_idx + len(sentence)
        cursor = max(cursor, end_idx)

        if section_marker:
            labels.append([start_idx, end_idx, section_marker])

    return {'text': text, 'cats': [], 'Comments': [], 'label': labels}


def process_directory_and_generate_jsonl(directory, output_file):
    """Write one JSON line per DOCX file found directly inside ``directory``.

    Every entry whose name ends with ".docx" is passed to
    ``process_sentences_in_file``; the returned records are serialised with
    ``json.dumps`` and written to ``output_file`` (UTF-8, overwritten).

    Args:
        directory: Folder to scan with ``os.listdir``.
        output_file: Path of the JSONL file to create.

    Returns:
        None. A confirmation message is printed.
    """
    records = [
        process_sentences_in_file(os.path.join(directory, name))
        for name in os.listdir(directory)
        if name.endswith(".docx")
    ]

    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(json.dumps(record) + '\n' for record in records)

    print(f"Generated JSONL file: {output_file}")

# Usage example: convert the .docx files in /content/ into /content/speaker-Eero.jsonl
process_directory_and_generate_jsonl('/content/', '/content/speaker-Eero.jsonl')


Generated JSONL file: /content/speaker-Eero.jsonl


In [None]:
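###### NOTE: this cell seems NOT to be the latest version.
# It redefines is_srt_timestamp, process_sentences_in_file and
# process_directory_and_generate_jsonl from the previous cell and writes to the
# same output file (/content/speaker-Eero.jsonl), so running it replaces both.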

def is_srt_timestamp(line):
    """Report whether ``line`` is an SRT counter or an SRT timing line.

    Args:
        line: The text line to test.

    Returns:
        True for a digits-only line or a "start --> end" timing line,
        False for anything else.
    """
    # Cue counter, e.g. "2"
    if re.match(r'^\d+$', line):
        return True
    # Timing line, e.g. "00:00:14,100 --> 00:00:15,000"
    return re.match(r'^\d{2}:\d{2}:\d{2},\d{3}\s+-->\s+\d{2}:\d{2}:\d{2},\d{3}$', line) is not None

def process_sentences_in_file(docx_file):
    """Return the speaker-labelled sentences of one DOCX file.

    Paragraphs are ignored until one whose stripped text ends with '.txt'
    has been seen (that paragraph itself is ignored too). From the remaining
    paragraphs, lines recognised by ``is_srt_timestamp`` are removed and the
    rest is joined with spaces and sentence-tokenised. '§' switches the
    current speaker to 'interviewer', '½' to 'interviewee'; sentences seen
    before any marker are not emitted.

    Args:
        docx_file: Path passed to ``docx.Document``.

    Returns:
        A list of ``{'text': sentence, 'label': speaker}`` dicts.
    """
    document = docx.Document(docx_file)

    kept = []
    header_seen = False
    for paragraph in document.paragraphs:
        stripped = paragraph.text.strip()
        # A paragraph ending in '.txt' marks where the transcript starts.
        if stripped.endswith('.txt'):
            header_seen = True
            continue
        if not header_seen:
            continue
        # Drop SRT counters/timings but keep every other line of the paragraph.
        body = ' '.join(
            piece for piece in stripped.split('\n') if not is_srt_timestamp(piece)
        ).strip()
        if body:
            kept.append(body)

    labeled_sentences = []
    speaker = None
    for sent in sent_tokenize(' '.join(kept)):
        if '§' in sent:
            speaker = 'interviewer'
        elif '½' in sent:
            speaker = 'interviewee'

        if speaker:
            labeled_sentences.append({'text': sent, 'label': speaker})

    return labeled_sentences

def process_directory_and_generate_jsonl(directory, output_file):
    """Write one JSON line per labelled sentence across a folder of DOCX files.

    Every entry of ``directory`` whose name ends with ".docx" is passed to
    ``process_sentences_in_file``; all returned items are pooled and written
    to ``output_file`` (UTF-8, overwritten), one ``json.dumps`` per line.

    Args:
        directory: Folder to scan with ``os.listdir``.
        output_file: Path of the JSONL file to create.

    Returns:
        None. A confirmation message is printed.
    """
    collected = []
    for entry in os.listdir(directory):
        if not entry.endswith(".docx"):
            continue
        collected += process_sentences_in_file(os.path.join(directory, entry))

    # Write the results to a JSONL file
    with open(output_file, 'w', encoding='utf-8') as out:
        for record in collected:
            out.write(json.dumps(record) + '\n')

    print(f"Generated JSONL file: {output_file}")

# Usage example: writes to the same /content/speaker-Eero.jsonl path as the previous cell
process_directory_and_generate_jsonl('/content/', '/content/speaker-Eero.jsonl')


## Eero Emotion Extraction

In [None]:
!pip install python-docx

import re
import os
import csv
from docx import Document
from docx.oxml.ns import qn

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


In [None]:
def get_highlight_color(run):
    """Return the highlight colour of a run, or None when it has none.

    Args:
        run: Object exposing ``run.font.highlight_color``.

    Returns:
        The ``highlight_color`` value when it is truthy, otherwise None.
    """
    return run.font.highlight_color or None

def is_end_of_sentence(text):
    """Return True when ``text`` contains '.', '!' or '?' anywhere."""
    return bool(re.search(r'[.!?]', text))

def extract_highlighted_text(doc_path):
    """Collect highlighted text fragments and their colours from a DOCX file.

    Paragraphs are skipped until the first one containing '.txt'; that
    paragraph's text is replaced (in the loaded document object) by whatever
    follows the first '.txt'. From there on, consecutive runs sharing the
    same highlight colour are merged into one fragment, also across paragraph
    boundaries. Each fragment is then split after ',', '.', '!' or '?' when
    followed by spaces.

    Args:
        doc_path: Path passed to ``Document``.

    Returns:
        A list of ``{"text": piece, "color": colour}`` dicts, one per piece.
    """
    doc = Document(doc_path)
    spans = []
    buffer = ""
    buffer_color = None
    past_header = False

    for para in doc.paragraphs:
        if not past_header:
            if '.txt' not in para.text:
                continue
            past_header = True
            # Keep only what follows the first ".txt" in this paragraph.
            # NOTE(review): assigning para.text presumably rebuilds the
            # paragraph's runs — confirm.
            para.text = para.text.split('.txt', 1)[1]

        for run in para.runs:
            color = get_highlight_color(run)

            # Same colour as the open fragment: just extend it.
            if color and color == buffer_color:
                buffer += run.text
                continue

            # Otherwise close any open fragment.
            if buffer:
                spans.append({
                    "text": buffer.strip(),
                    "color": buffer_color
                })

            if color:
                # A run with a different colour opens a new fragment.
                buffer, buffer_color = run.text, color
            elif buffer:
                # An unhighlighted run resets the state only if a fragment was open.
                buffer, buffer_color = "", None

    if buffer:
        spans.append({
            "text": buffer.strip(),
            "color": buffer_color
        })

    return [
        {"text": piece.strip(), "color": span["color"]}
        for span in spans
        for piece in re.split(r'(?<=[,.!?]) +', span["text"])
    ]

# Documents whose highlights are exported below
doc_paths = [
    '/content/1_interview.docx',
    '/content/2_interview.docx',
    '/content/3_interview.docx',
    '/content/4_interview.docx',
]

# Alternative document set; not used by the code in this cell
doc_paths2 = [
    '/content/1.docx', '/content/A1_Hololens.docx',
    '/content/B1_camera.docx', '/content/C2_HoloLens.docx',
    '/content/2.docx', '/content/A2_Hololens.docx',
    '/content/B2_camera.docx', '/content/D1_camera.docx',
    '/content/3.docx', '/content/A3_Hololens.docx',
    '/content/C1_Hololens.docx', '/content/D2_phone.docx',
]  # Add your document paths here

# One record per highlighted piece, tagged with the file it came from
all_highlighted_data = [
    {
        "document": os.path.basename(doc_path),
        "highlighted_text": data["text"],
        "color": data["color"],
    }
    for doc_path in doc_paths
    for data in extract_highlighted_text(doc_path)
]

# Dump the records to a CSV file with a header row
csv_file = 'highlighted_text.csv'
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["Document", "Highlighted Text", "Color"])
    writer.writeheader()
    writer.writerows(
        {
            "Document": entry["document"],
            "Highlighted Text": entry["highlighted_text"],
            "Color": entry["color"],
        }
        for entry in all_highlighted_data
    )

print(f"Highlighted text and colors have been extracted and saved to {csv_file}")


Highlighted text and colors have been extracted and saved to highlighted_text.csv


### Counting

In [None]:
import csv
from collections import defaultdict

# Tally how often each colour appears in the exported CSV
input_csv_file = 'highlighted_text.csv'
color_counts = defaultdict(int)

with open(input_csv_file, mode='r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    for row_index, row in enumerate(reader):
        # The first 41 data rows are left out of the tally.
        # NOTE(review): the reason for 41 is not visible here — confirm.
        if row_index < 41:
            continue
        color_counts[row['Color']] += 1

# Store the tallies as (Color, Count) rows in a second CSV file
output_csv_file = 'color_counts.csv'
with open(output_csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Color", "Count"])
    writer.writerows([color, count] for color, count in color_counts.items())

print(f"Color counts have been extracted and saved to {output_csv_file}")


Color counts have been extracted and saved to color_counts.csv


### JsonL

In [None]:
import os
import json
from docx import Document
from docx.oxml.ns import qn

def is_srt_timestamp(line):
    """Return True for SRT cue numbers and SRT timing lines, else False.

    Args:
        line: The text line to test.
    """
    counter_match = re.match(r'^\d+$', line)
    timing_match = re.match(r'^\d{2}:\d{2}:\d{2},\d{3}\s+-->\s+\d{2}:\d{2}:\d{2},\d{3}$', line)
    return counter_match is not None or timing_match is not None

# Function to check if a run is highlighted and get its color
def get_highlight_color(run):
    """Return the ``w:val`` of the run's first ``w:highlight`` element.

    Args:
        run: Object exposing ``run.font.element.xpath``.

    Returns:
        The value of the ``w:val`` attribute of the first match, or None when
        the XPath query finds no ``w:highlight`` element.
    """
    matches = run.font.element.xpath('.//w:highlight')
    return matches[0].get(qn('w:val')) if matches else None

# Function to extract highlighted text and color from a document
def extract_highlighted_text_with_labels(doc_path, label_colors):
    """Extract a document's transcript text and its highlight-based label spans.

    Paragraphs are ignored until one whose stripped text ends with '.txt'
    has been seen (that paragraph is ignored too); paragraphs that are a
    single SRT counter or timing line are ignored as well. The text of the
    remaining paragraphs is accumulated run by run, with one space inserted
    between paragraphs, and every highlighted run yields a
    ``[start, end, label]`` span whose offsets index into the returned text.

    Args:
        doc_path: Path passed to ``Document``.
        label_colors: Mapping from highlight colour name to label. Colours
            missing from the mapping are labelled "unknown" and printed.

    Returns:
        A ``(text, annotations)`` tuple: the accumulated text and the list of
        ``[start, end, label]`` spans.
    """
    doc = Document(doc_path)
    text = ""
    annotations = []
    current_position = 0  # always equal to len(text)
    skip = True

    for para in doc.paragraphs:
        # Use a separate name for the paragraph text: the accumulated `text`
        # must survive from one paragraph to the next, otherwise the offsets
        # in `annotations` no longer refer to the returned string.
        para_text = para.text.strip()
        if para_text.endswith('.txt'):
            skip = False
            continue
        if skip or is_srt_timestamp(para_text):
            continue

        # Separate paragraphs with a single space so words do not run together.
        if text and not text[-1].isspace():
            text += ' '
            current_position += 1

        for run in para.runs:
            color = get_highlight_color(run)
            if color:
                label = label_colors.get(color, "unknown")
                if label == "unknown":
                    # Surface colours that are missing from label_colors.
                    print(color)
                annotations.append([current_position, current_position + len(run.text), label])
            text += run.text
            current_position += len(run.text)

    return text, annotations

# Highlight colour name -> emotion label
# NOTE(review): 'gray', 'teal' and 'pink' may not be values Word actually
# stores in w:highlight — confirm against the documents.
label_colors = {
    'red': 'Anger',
    'gray': 'Trust',
    'lightGray': 'Trust',
    'darkYellow': 'Anticipation',
    'yellow': 'Joy',
    'green': 'Disgust',
    'darkGreen': 'Disgust',
    'teal': 'Surprise',
    'darkCyan': 'Surprise',
    'pink': 'Fear',
    'magenta': 'Fear',
    'cyan': 'Sadness'
}

# Every .docx file directly inside the documents folder
doc_folder = '/content/'  # Change this to your documents folder path
doc_paths = [
    os.path.join(doc_folder, file)
    for file in os.listdir(doc_folder)
    if file.endswith('.docx')
]

# One record per document: file name, extracted text and its label spans
all_annotations_data = []
for doc_path in doc_paths:
    text, annotations = extract_highlighted_text_with_labels(doc_path, label_colors)
    all_annotations_data.append(
        dict(document=os.path.basename(doc_path), text=text, label=annotations)
    )

# Serialise the records as JSON Lines, one document per line
jsonl_file_with_labels = 'highlighted_annotations_with_labels.jsonl'
with open(jsonl_file_with_labels, mode='w', encoding='utf-8') as file:
    for entry in all_annotations_data:
        file.write(json.dumps(entry) + '\n')

print(f"Highlighted annotations with labels have been extracted and saved to {jsonl_file_with_labels}")


Highlighted annotations with labels have been extracted and saved to highlighted_annotations_with_labels.jsonl


## GPT Speaker Counter

In [None]:
import re
import os

def count_sentences_in_file(file_path, interviewer_label="interviewer"):
    """Count punctuation-delimited chunks per speaker group in a transcript file.

    Only lines that start with a speaker label are counted. A line starting
    with ``"<interviewer_label>:"`` counts towards the interviewer; any other
    line starting with a label made of letters, an optional space and
    optional digits followed by ':' (e.g. "Instructor:", "Speaker2:",
    "Speaker 2:") counts towards the others. Lines without a label are
    ignored.

    A "sentence" here is every chunk that starts at a word boundary and ends
    at the first ',', '.', '?' or '!' — so commas end a chunk as well, and
    the speaker label is part of the line's first chunk.

    Args:
        file_path: Path of the UTF-8 transcript file.
        interviewer_label: Label (without the colon) identifying the interviewer.

    Returns:
        An ``(interviewer_sentences, other_sentences)`` tuple of ints.
    """
    chunk_pattern = re.compile(r'\b[^,.?!]*[,.?!]')
    # Allow digits and an optional space in the label so that speakers such
    # as "Speaker2:" or "Speaker 2:" are recognised, not only "Instructor:".
    speaker_pattern = re.compile(r'^[A-Za-z]+ ?\d*:')
    interviewer_prefix = f"{interviewer_label}:"

    interviewer_sentences = 0
    other_sentences = 0

    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if line.startswith(interviewer_prefix):
                interviewer_sentences += len(chunk_pattern.findall(line))
            elif speaker_pattern.match(line):
                other_sentences += len(chunk_pattern.findall(line))

    return interviewer_sentences, other_sentences

def count_sentences_in_files(file_paths, interviewer_label="interviewer"):
    """Sum the per-file counts of ``count_sentences_in_file`` over several files.

    The counts of each file are printed as they are computed.

    Args:
        file_paths: Iterable of transcript paths.
        interviewer_label: Forwarded to ``count_sentences_in_file``.

    Returns:
        A ``(total_interviewer_sentences, total_other_sentences)`` tuple.
    """
    totals = [0, 0]

    for path in file_paths:
        interviewer_count, other_count = count_sentences_in_file(path, interviewer_label)
        totals[0] += interviewer_count
        totals[1] += other_count
        print(f"File: {path} -> Interviewer: {interviewer_count}, Others: {other_count}")

    return totals[0], totals[1]

# Example usage: count interviewer vs. other sentences over the .srt files in /content/
directory = '/content/'
interviewer_label = 'Speaker1'  # Change this if your interviewer label is different

# Collect every .srt file that sits directly inside the directory
file_paths = [
    os.path.join(directory, file)
    for file in os.listdir(directory)
    if file.endswith('.srt')
]

(total_interviewer_sentences,
 total_other_sentences) = count_sentences_in_files(file_paths, interviewer_label)
print(f"Total number of sentences spoken by the interviewer: {total_interviewer_sentences}")
print(f"Total number of sentences spoken by others: {total_other_sentences}")


File: /content/Maked_corrected_3.srt -> Interviewer: 35, Others: 29
File: /content/Maked_corrected_1.srt -> Interviewer: 48, Others: 38
File: /content/Maked_corrected_2.srt -> Interviewer: 18, Others: 2
Total number of sentences spoken by the interviewer: 101
Total number of sentences spoken by others: 69


## Tomi Annotation fixing

In [None]:
import json
import re
from collections import defaultdict

def count_sentences(text_segment):
    """Count the non-blank pieces left after splitting on runs of '.', '!' or '?'.

    Args:
        text_segment: The text to split.

    Returns:
        The number of pieces that still contain non-whitespace characters.
    """
    pieces = re.split(r'[.!?]+', text_segment)
    return sum(1 for piece in pieces if piece.strip())

def count_sentences_by_label(jsonl_file):
    """Sum, per label, the sentence counts of all labelled spans in a JSONL file.

    Each non-empty line is parsed as JSON; its "label" entry is expected to
    be a list of ``[start, end, label]`` triples whose offsets slice the
    line's "text". Every slice is measured with ``count_sentences``. Lines
    that are not valid JSON, or whose structure does not fit, are reported
    and skipped as a whole, so they contribute nothing to the totals.

    Args:
        jsonl_file: Path of the UTF-8 JSON Lines file.

    Returns:
        A ``defaultdict(int)`` mapping each label to its sentence count.
    """
    label_sentence_counts = defaultdict(int)

    # Explicit encoding, like the other readers and writers in this notebook.
    with open(jsonl_file, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()  # Remove any leading/trailing whitespace
            if not line:
                continue  # Skip empty lines

            try:
                annotation = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON line: {line}")
                continue

            try:
                text = annotation.get("text", "")
                labels = annotation.get("label", [])

                # Tally this line separately so that a failure halfway
                # through leaves no partial counts behind.
                line_counts = defaultdict(int)
                for start_idx, end_idx, label in labels:
                    line_counts[label] += count_sentences(text[start_idx:end_idx])
            except (AttributeError, TypeError, ValueError):
                # AttributeError: the JSON value is not an object;
                # TypeError: wrong types for the slice or the label;
                # ValueError: a label entry that is not a 3-item sequence.
                print(f"Skipping invalid data type in line: {line}")
                continue

            for label, count in line_counts.items():
                label_sentence_counts[label] += count

    return label_sentence_counts

# Example usage: tally the labelled sentences of admin.jsonl and print the
# resulting label -> count mapping
jsonl_file = 'admin.jsonl'
sentence_counts = count_sentences_by_label(jsonl_file)
print("Sentence counts by label:")
print(sentence_counts)


Sentence counts by label:
defaultdict(<class 'int'>, {'Speaker 1': 790, 'Surprise': 125, 'Disgust': 93, 'Speaker 2': 291, 'Fear': 85, 'Anticipation': 151, 'Joy': 86, 'Trust': 424, 'Sadness': 34, 'Anger': 76, 'Interviewer': 132})


## GPT speaker to jsonl

In [None]:
import re
import os
import json
from nltk.tokenize import sent_tokenize

def process_annotated_file(srt_file):
    """Convert a speaker-labelled transcript file into a text + span record.

    "Speaker1"/"Speaker2" are first normalised to "Speaker 1"/"Speaker 2".
    The content is sentence-tokenised and each sentence is split further
    wherever a "Speaker 1:", "Speaker 2:" or "Instructor:" label occurs. The
    label is removed from each piece; pieces without a label inherit the most
    recent one. The cleaned pieces are joined with single spaces to form the
    output text.

    Args:
        srt_file: Path of the UTF-8 transcript file.

    Returns:
        A dict with 'text' (the label-free text) and 'label', a list of
        ``[start, end, speaker]`` entries whose offsets index into 'text'.
        ``speaker`` is 'S1', 'S2' or 'Instructor'. Pieces seen before the
        first label, and pieces that are empty once the label is removed,
        get no entry.
    """
    with open(srt_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Normalizing the content for uniform label handling
    content = re.sub(r"Speaker1", "Speaker 1", content)
    content = re.sub(r"Speaker2", "Speaker 2", content)

    speaker_prefixes = ('Speaker 1:', 'Speaker 2:', 'Instructor:')

    # Use sent_tokenize for the initial split, then split further at speaker
    # labels so that every piece belongs to a single speaker.
    sentences = []
    for sentence in sent_tokenize(content):
        parts = re.split(r'(\bSpeaker 1:\s*|\bSpeaker 2:\s*|\bInstructor:\s*)', sentence)
        if len(parts) > 1:
            buffer = ""
            for part in parts:
                if part.startswith(speaker_prefixes):
                    # A new label closes the piece collected so far.
                    if buffer:
                        sentences.append(buffer.strip())
                    buffer = part
                else:
                    buffer += part
            if buffer:
                sentences.append(buffer.strip())
        else:
            sentences.append(sentence.strip())

    label_by_prefix = (
        ('Speaker 1:', 'S1'),
        ('Speaker 2:', 'S2'),
        ('Instructor:', 'Instructor'),
    )

    modified_sentences = []
    labels = []
    last_label = None
    # Offset of the next piece inside the joined output text. Offsets are
    # measured in the label-free text that is returned, not in `content`.
    position = 0

    for sentence in sentences:
        label = last_label
        clean_sentence = sentence
        for prefix, prefix_label in label_by_prefix:
            if sentence.startswith(prefix):
                label = prefix_label
                clean_sentence = sentence[len(prefix):].strip()
                break

        if label:
            last_label = label
            # A bare label with no words has nothing to annotate.
            if clean_sentence:
                labels.append([position, position + len(clean_sentence), label])

        modified_sentences.append(clean_sentence)
        position += len(clean_sentence) + 1  # +1 for the joining space

    text = ' '.join(modified_sentences)
    return {'text': text, 'label': labels}

def process_directory_and_generate_jsonl(directory, output_file):
    """Write one JSON line per .srt file found directly inside ``directory``.

    Every entry whose name ends with ".srt" is passed to
    ``process_annotated_file``; the resulting records are serialised with
    ``json.dumps`` and written to ``output_file`` (UTF-8, overwritten).

    Args:
        directory: Folder to scan with ``os.listdir``.
        output_file: Path of the JSONL file to create.

    Returns:
        None. A confirmation message is printed.
    """
    srt_names = filter(lambda name: name.endswith(".srt"), os.listdir(directory))
    records = [process_annotated_file(os.path.join(directory, name)) for name in srt_names]

    with open(output_file, 'w', encoding='utf-8') as out:
        for record in records:
            out.write(json.dumps(record) + '\n')

    print(f"Generated JSONL file: {output_file}")

# Usage example: convert the .srt files in /content/ into /content/output.jsonl
process_directory_and_generate_jsonl('/content/', '/content/output.jsonl')

Instructor: So to speak, yes.

Instructor: So, who will go first is up to you and then just go through it all. We'll pick you up in 10 minutes.

Speaker 1: Can you see me? 
Speaker 2: Yeah, I see you, good.

Speaker 1: Who starts? Am I supposed to perform?

Speaker 1: So...
Speaker 2: Well, I have this drawing here.

Speaker 2: My game is on mute, and I don't feel like replying. But I have pictures too. Should I go first? The same place doesn't matter.

Speaker 1: Can you see red color on your end?
Speaker 2: I see it.

Speaker 1: Ok.
Speaker 2: Like at home. 
Speaker 1: Yeah, yeah.

Speaker 2: Uhm... Do I have to perform or should I tell? I just...
Speaker 1: Over there.

Speaker 1: Alright, all of them are embarrassing. There's a penguin here and then... No well I said ok. I'll take a bit, I can start yeah, just wait.

Speaker 2: Draw something. 
Speaker 1: Yeah.

Speaker 2: I guessed it.

Speaker 1: Do we have the same slips?
Speaker 2: I don't know.

Speaker 1: More.

Speaker 1: Ar