In [1]:
import docx
import re
import pandas as pd

# Load the document
def load_docx_text(path):
    doc = docx.Document(path)
    full_text = [para.text for para in doc.paragraphs]
    return full_text

# Extract in-text citations
def extract_intext_citations(text_lines):
    pattern = r"\(([^)]+?,\s*\d{4})\)"
    matches = []
    for line in text_lines:
        found = re.findall(pattern, line)
        matches.extend(found)
    return list(set(matches))

# Extract references starting from "References"
def extract_reference_section(text_lines):
    try:
        start_idx = [i for i, line in enumerate(text_lines) if line.strip().lower() in ["references", "bibliography"]][0]
        return text_lines[start_idx + 1:]
    except IndexError:
        return []

def extract_reference_authors(ref_lines):
    authors = []
    for line in ref_lines:
        match = re.match(r"^([A-Z][a-zA-Z\-]+(?:,?\s[A-Z]\.)*)", line)
        if match:
            authors.append(match.group(1))
    return list(set(authors))

# Main
def reconcile_apa(path):
    text = load_docx_text(path)
    intext_cites = extract_intext_citations(text)
    references = extract_reference_section(text)
    ref_authors = extract_reference_authors(references)

    # Prepare data
    intext_df = pd.DataFrame(intext_cites, columns=["In-Text Citation"])
    ref_df = pd.DataFrame(ref_authors, columns=["Reference Author"])

    # Clean author names for matching
    intext_df["Author Last Name"] = intext_df["In-Text Citation"].apply(lambda x: x.split(',')[0])
    ref_df["Author Last Name"] = ref_df["Reference Author"].apply(lambda x: x.split(',')[0])

    # Merge and find issues
    merged = intext_df.merge(ref_df, on="Author Last Name", how="outer", indicator=True)

    missing_refs = merged[merged["_merge"] == "left_only"]
    uncited_refs = merged[merged["_merge"] == "right_only"]

    return missing_refs[["In-Text Citation"]], uncited_refs[["Reference Author"]]

# Run
doc_path = "Nuwan_Samarasinghe_23548431_INFO9012501_Research_v9.docx"  # Change to your file path
missing, uncited = reconcile_apa(doc_path)

print("🔍 In-text citations with missing references:")
print(missing.to_string(index=False))

print("\n📚 References not cited in text:")
print(uncited.to_string(index=False))


🔍 In-text citations with missing references:
                                           In-Text Citation
                                "Energy and industry”, 2022
   Alkhayat & Mehmood ,2021; Chen ,2022; Zhang et al., 2022
                                   Alkhayat & Mehmood ,2021
              Alkhayat and Mehmood, 2021; Wang et al., 2021
                                    Almazrouee et al. ,2020
                                    Chaturvedi et al., 2022
                                     Di Grande et al., 2024
                                                   EMI,2024
                                         Ewees et al., 2022
Fotis et al., 2023; Shohan et al., 2022; Islam et al., 2023
                       Fotis et al., 2023; Zhu et al., 2023
                     Fotis et al., 2023; Islam et al., 2023
                    Fotis et al., 2023; Shohan et al., 2022
                                         Fotis et al., 2023
                     Fotis et al., 2023; Shafi et al., 

In [3]:
import docx
import re
import pandas as pd

def load_docx_text(path):
    doc = docx.Document(path)
    return [para.text.strip() for para in doc.paragraphs if para.text.strip()]

def extract_intext_citations(text_lines):
    intext_citations = []
    citation_pattern = re.compile(r"\(([^()]+?,\s*\d{4}(?:;[^()]+?,\s*\d{4})*)\)")
    for line in text_lines:
        matches = citation_pattern.findall(line)
        for match in matches:
            parts = [p.strip() for p in match.split(';')]
            intext_citations.extend(parts)
    return list(set(intext_citations))

def extract_author_year(citation):
    match = re.match(r"([A-Za-z\-]+) et al\.,\s*(\d{4})", citation)
    if match:
        return match.group(1), match.group(2)
    match = re.match(r"([A-Za-z\-]+),\s*(\d{4})", citation)
    if match:
        return match.group(1), match.group(2)
    return None, None

def extract_reference_entries(ref_lines):
    references = []
    for line in ref_lines:
        match = re.match(r"([A-Z][a-zA-Z\-]+)(?:, [A-Z]\.)?(?: et al\.)?\s*\((\d{4})\)", line)
        if match:
            references.append((match.group(1), match.group(2)))
    return references

def reconcile_apa_references(docx_path):
    lines = load_docx_text(docx_path)
    
    # Split main text and reference section
    try:
        ref_start = [i for i, l in enumerate(lines) if l.lower() in ["references", "bibliography"]][0]
        main_text, ref_lines = lines[:ref_start], lines[ref_start+1:]
    except IndexError:
        print("⚠️ Reference section not found.")
        return

    intext_raw = extract_intext_citations(main_text)
    intext_pairs = [extract_author_year(c) for c in intext_raw if extract_author_year(c) != (None, None)]
    ref_pairs = extract_reference_entries(ref_lines)

    intext_df = pd.DataFrame(intext_pairs, columns=["Author", "Year"]).drop_duplicates()
    ref_df = pd.DataFrame(ref_pairs, columns=["Author", "Year"]).drop_duplicates()

    merged = pd.merge(intext_df, ref_df, on=["Author", "Year"], how='outer', indicator=True)

    missing_refs = merged[merged['_merge'] == 'left_only']
    uncited_refs = merged[merged['_merge'] == 'right_only']

    print("🔍 In-text citations with no matching reference entry:")
    print(missing_refs[["Author", "Year"]].to_string(index=False))

    print("\n📚 Reference list entries not cited in text:")
    print(uncited_refs[["Author", "Year"]].to_string(index=False))

# Run the script
#doc_path = "your_report.docx"  # <-- Replace with your actual Word file path
doc_path = "Nuwan_Samarasinghe_23548431_INFO9012501_Research_v9.docx"
reconcile_apa_references(doc_path)


🔍 In-text citations with no matching reference entry:
    Author Year
Chaturvedi 2022
       EMI 2024
     Ewees 2022
     Fotis 2023
    Gasore 2023
     Islam 2023
    Mishra 2020
     Moher 2009
Obahoundje 2024
    Rampal 2022
     Shafi 2023
    Shohan 2022
      Wang 2021
       Wen 2021
     Zhang 2022
     Zhang 2023
       Zhu 2023

📚 Reference list entries not cited in text:
 Author Year
   Chen 2022
Poletti 2021


In [3]:
pip install python-docx pandas regex

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2
Note: you may need to restart the kernel to use updated packages.


In [5]:
import docx
import re

def load_docx_text(path):
    doc = docx.Document(path)
    return [para.text.strip() for para in doc.paragraphs if para.text.strip()]

# === 1. Extract In-Text Citations ===
def extract_all_intext_citations(text_lines):
    all_citations = []

    # Patterns: parenthetical + narrative citations
    parenthetical_pattern = r"\(([^()]+?,\s*\d{4}(?:;[^()]+?,\s*\d{4})*)\)"
    narrative_pattern = r"\b([A-Z][a-zA-Z\-]+(?: et al\.| & [A-Z][a-zA-Z\-]+)?),?\s*\(?\b(\d{4})\)?"

    for line in text_lines:
        # Handle citations like (Author et al., 2022; Author2, 2023)
        paren_matches = re.findall(parenthetical_pattern, line)
        for match in paren_matches:
            for part in match.split(';'):
                all_citations.append(part.strip())

        # Handle narrative forms like: "Waqas & Humphries (2024) show..."
        narrative_matches = re.findall(narrative_pattern, line)
        for author, year in narrative_matches:
            all_citations.append(f"{author}, {year}")

    return sorted(set(all_citations))

# === 2. Extract Reference Section ===
def extract_reference_entries(text_lines):
    try:
        start = next(i for i, line in enumerate(text_lines) if line.lower() in ["references", "bibliography"])
        return text_lines[start + 1:]
    except StopIteration:
        return []

# === MAIN RUN ===
def run_extraction(doc_path):
    lines = load_docx_text(doc_path)
    
    # Extract citations
    citations = extract_all_intext_citations(lines)

    # Extract references
    references = extract_reference_entries(lines)

    # Show both lists
    print("\n🔍 In-Text Citations Found:")
    for c in citations:
        print(f"- {c}")

    print("\n📚 Reference Entries (Raw Lines):")
    for r in references:
        print(f"- {r}")

# ====== CHANGE FILE PATH HERE ======
run_extraction("Nuwan_Samarasinghe_23548431_INFO9012501_Research_v12.docx")



🔍 In-Text Citations Found:
- "Energy and industry”, 2022
- Abdulai et al., 2023
- Alhazrouee et al., 2020
- Alkhayat & Mehmood ,2021
- Alkhayat & Mehmood, 2021
- Alkhayat and Mehmood, 2021
- Allal et al., 2024
- Almazrouee et al. ,2020
- Almazrouee et al., 2020
- Asghar et al., 2024
- Chaturvedi et al., 2022
- Chen ,2022
- Chen, 2022
- Di Grande et al., 2024
- EMI, 2024
- EMI,2024
- Ewees et al., 2022
- Farah et al., 2022
- February, 2025
- Fotis et al. ,2023
- Fotis et al., 2023
- Gasore et al. ,2023
- Gasore et al., 2023
- Grande et al., 2024
- Gu et al., 2024
- Guardian, 2021
- Houran et al., 2023
- Islam et al. ,2023
- Islam et al., 2023
- Jan, 2025
- Jang et al., 2024
- Jang, 2024
- January, 2000
- January, 2015
- January, 2025
- Jebli et al., 2021
- Liao et al., 2021
- Liu et al., 2020
- Liu, 2021
- March, 2015
- Mayer, 2022
- Mehmood, 2021
- Mishra et al., 2020
- Moher et al., 2009
- Obahoundje et al. ,2024
- Obahoundje et al., 2024
- Ortiz-Lopez et al. ,2023
- Ortiz-Lopez et a