# 基于SRL的忏悔识别

In [4]:
import spacy
import pandas as pd
import numpy as np
# Load spaCy's large English pipeline once; the functions in this notebook
# read it through this module-level `nlp` name.
nlp = spacy.load("en_core_web_lg")

In [8]:
# Read the sampled rows from disk (path is relative to the notebook's folder).
sample_df = pd.read_csv("../data/last_words_sampled_rows.csv")
# Pull the "last.statement" column into a plain Python list.
# NOTE: the displayed output contains `nan` entries, so not every element
# of this list is a string.
sample_statements = sample_df["last.statement"].tolist()
# Bare expression: shows the list as the cell output.
sample_statements

['I have come here today to die, not make speeches. Today is a good day for dying. My Honor Is My Life.',
 ' Tell Mama I love her.',
 nan,
 'I just want everyone to know that the prosecutor and Bill Scott are sorry sons of bitches. To his family he added that he loved them all.',
 nan,
 nan,
 " I'm innocent. I had nothing to do with my family's murders. I want to thank everyone who has supported me. I hope they continue to fight. You know who you are. That's all. Thank you, Warden.  ",
 "  Yes sir. To Edie, Tom, and and Carma - I love all you all.  I appreciate all your support.  I love you Margherita, Father Guido, and Father Angelo.  I appreciate your spiritual support and all those that were in prayer for me.  I will be O.K.  I am at peace with all of this and I won't have to wake up in prison any more.  I love you all.  I totally surrender to the Lord.  I am ready, Warden. ",
 ' Into your hands Oh Lord, I commence my spirit.  Amen.  ',
 "I would like to apologize to the Nix family 

## 找到文本中的受害者

In [None]:
# Kinship vocabulary searched for near an entity mention.
_FAMILY_TERMS = frozenset({
    "family", "mother", "father", "mom", "dad", "brother", "sister",
    "son", "daughter", "child", "children", "kids", "wife", "husband",
    "spouse", "grandma", "grandpa", "grandmother", "grandfather",
    "aunt", "uncle", "cousin", "niece", "nephew", "parents",
})


def _find_entity_spans(doc, entity_text):
    """
    Locate every mention of an entity inside a parsed sentence.

    Args:
        doc (spacy.tokens.doc.Doc): The parsed sentence
        entity_text (str): Surface text of the entity to look for

    Returns:
        list: Sorted (start, end) token-index pairs, end exclusive
    """
    target = entity_text.lower()
    # A single token whose text equals the whole entity text.
    spans = {(token.i, token.i + 1) for token in doc if token.text.lower() == target}

    # A multi-word entity ("Bill Scott") can never equal one token, so split
    # it with the tokenizer only (no tagging/parsing) and look for the same
    # run of tokens in the sentence. Whitespace tokens are skipped on both
    # sides so doubled spaces in the text do not break the match.
    pieces = [t.text.lower() for t in nlp.make_doc(entity_text) if not t.is_space]
    if len(pieces) > 1:
        words = [t for t in doc if not t.is_space]
        width = len(pieces)
        for offset in range(len(words) - width + 1):
            candidate = words[offset:offset + width]
            if all(tok.text.lower() == piece for tok, piece in zip(candidate, pieces)):
                spans.add((candidate[0].i, candidate[-1].i + 1))
    return sorted(spans)


def is_family_member(sentence, entity):
    """
    Determine if an entity in a sentence is a family member based on contextual clues.

    Every mention of the entity in the sentence is examined together with the
    5 tokens on either side of it. The entity is reported as a family member
    as soon as one mention has, inside that window (the mention itself
    included), a family term or a possessive "my"/"our", or is directly
    preceded by "my". Matching is case-insensitive, and multi-word entities
    are matched as a run of consecutive tokens.

    Args:
        sentence (str): The sentence containing the entity
        entity (spacy.tokens.span.Span): The entity to check; only its
            ``.text`` attribute is read

    Returns:
        bool: True if the entity is likely a family member, False otherwise
              (including when the entity does not occur in the sentence, or
              when its text is exactly "God")
    """
    if entity.text == "God":
        return False

    # Parse the sentence so that dependency labels are available below.
    doc = nlp(sentence)

    for start, end in _find_entity_spans(doc, entity.text):
        # 5 tokens of context on each side of the mention, clipped to the sentence.
        window = doc[max(0, start - 5):min(len(doc), end + 5)]

        # A family term anywhere in the window.
        if any(w.text.lower() in _FAMILY_TERMS for w in window):
            return True

        # Possessive pattern "my [entity]".
        if start > 0 and doc[start - 1].text.lower() == "my":
            return True

        # Any first-person possessive modifier in the window.
        if any(w.dep_ == "poss" and w.text.lower() in ("my", "our") for w in window):
            return True

    return False

In [70]:
# Display the statement at index 11 for manual inspection.
sample_statements[11]

"Tell my son I love him very much. God bless everybody. Continue to walk with God. Go Cowboys! Love ya'll man. Don't forget the T-ball. Ms. Mary, thank you for everything that you've done. You too, Brad, thank you. I can feel it, taste it, not bad."

In [None]:
# Lemmas that count as an expression of apology or regret.
appologize_tokens = [
    'sorry',
    'apologize',
    'forgive',
    'regret',
    'remorse',
]


def regret_dector(statement):
    """
    Detect a first-person apology or admission of responsibility in a statement.

    The statement is split into sentences and each one is checked in turn.
    A sentence triggers a positive result when all of the following hold:

    1. one of its subjects (dependency label "nsubj") is "i", "me" or "myself";
    2. none of its direct objects (dependency label "dobj") is one of
       "family", "kids", "children", "mom", "dad", "parents";
    3. it contains a token whose lemma is in ``appologize_tokens``, or one of
       the words "responsibility", "accountable", "blame".

    Args:
        statement (str): The statement to analyse. Non-string values (for
            example the NaN that pandas uses for a missing cell, or None)
            are treated as "no statement".

    Returns:
        int: 1 if any sentence satisfies the conditions above, otherwise 0
    """
    # Missing statements show up as NaN/None rather than text; the spaCy
    # pipeline only accepts strings, so report "no regret" instead of crashing.
    if not isinstance(statement, str):
        return 0

    doc = nlp(statement)
    # Walk through the statement sentence by sentence.
    for sentence in doc.sents:
        # Each sentence is parsed again on its own, so the labels used below
        # come from a sentence-local parse.
        sen = nlp(sentence.text)

        # The apology must come from the speaker, not from somebody else.
        if not any(
            token.text.lower() in ("i", "me", "myself")
            for token in sen
            if token.dep_ == "nsubj"
        ):
            continue

        # Skip sentences whose direct object is the speaker's own family.
        if any(
            token.text.lower() in ("family", "kids", "children", "mom", "dad", "parents")
            for token in sen
            if token.dep_ == "dobj"
        ):
            continue

        # An apology-related lemma; lower-cased so a capitalised lemma
        # (e.g. "Sorry") still matches the lower-case list.
        if any(token.lemma_.lower() in appologize_tokens for token in sen):
            return 1

        # Wording that accepts responsibility also counts.
        if any(token.text.lower() in ("responsibility", "accountable", "blame") for token in sen):
            return 1
    return 0


In [None]:
# Run the detector on the statement at index 12; the cell output is its 0/1 result.
regret_dector(sample_statements[12])

1