In [1]:
import re

def extract_quotes_with_speaker_regex(text):
    # Define the regular expression pattern to match speaker and quote pairs
    pattern = r'(?P<speaker>[\w\s]+): "(?P<quote>.*?)"'
    
    # Find all matches using the regular expression pattern
    matches = re.finditer(pattern, text)
    
    quotes_with_speaker = []
    for match in matches:
        speaker = match.group('speaker').strip()
        quote = match.group('quote').strip()
        quotes_with_speaker.append((speaker, quote))
    
    return quotes_with_speaker

# Test with your dataset
text = """
Justin Trudeau: "The people are revolting against carbon taxes."
Conservatives: "The planet burns!"
"""

quotes_with_speaker_regex = extract_quotes_with_speaker_regex(text)

# Output the speaker and quote pairs
for speaker, quote in quotes_with_speaker_regex:
    print(f"Speaker: {speaker.strip()}, Quote: {quote.strip()}")


Speaker: Justin Trudeau, Quote: The people are revolting against carbon taxes.
Speaker: Conservatives, Quote: The planet burns!


In [2]:
import spacy
import os

# Load English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm")

def extract_indirect_quotes_from_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    
    # Process the text with spaCy
    doc = nlp(text)
    
    # Initialize a list to store indirect quotes
    indirect_quotes = []
    
    # Iterate through the tokens in the document
    for token in doc:
        # Check for verbs indicating communication
        if token.pos_ == "VERB" and token.dep_ in ("ccomp", "xcomp", "advcl"):
            # Check if the verb is in reported speech
            if any(child.text == "that" and child.dep_ == "mark" for child in token.children):
                # Extract the direct object as the reported speech
                direct_object = [child.text for child in token.children if child.dep_ in ("dobj", "attr", "ccomp", "xcomp", "advcl")]
                direct_quote = " ".join(direct_object)
                if direct_quote:
                    indirect_quotes.append(direct_quote.strip())
    
    return indirect_quotes if indirect_quotes else None

# Directory containing the text files
directory = "C:\\Users\\User\\SDA250Mywork\\A1_data"

# Iterate over each text file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        file_path = os.path.join(directory, filename)
        quotes = extract_indirect_quotes_from_file(file_path)
        print(f"Indirect quotes from {filename}:")
        if quotes:
            for quote in quotes:
                print(f"- \"{quote}\"")
        else:
            print("No indirect quotes found.")
        print()


Indirect quotes from 5c1452701e67d78e276ee126.txt:
- "children"
- "450,000"

Indirect quotes from 5c146e42795bd2fcce2ea8e5.txt:
- "group noting"

Indirect quotes from 5c149ffc1e67d78e276fbd44.txt:
- "less"
- "way"
- "make"
- "explode"

Indirect quotes from 5c15488f1e67d78e277161d7.txt:
- "hunger act"

Indirect quotes from 5c1548a31e67d78e2771624f.txt:
- "stays society million"

