In [None]:
# Copyright 2025 The Contributors
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
# 0. Install and import libraries in Google Colab
!pip install python-docx lxml

import csv, zipfile, io
import warnings

from docx import Document
from google.colab import files
from lxml import etree as ET   # robust XML parsing

In [None]:
# 1. Define helper function
def _xml_comment_text(comment_el, p_tag, t_tag):
    """
    Return the text of a single ``w:comment`` element.

    Text runs (``w:t``) that belong to the same paragraph (``w:p``) are
    concatenated without a separator, because one word may be split across
    several runs. The resulting paragraphs are joined with a single space.
    """
    # The leading empty chunk collects any text that appears before the
    # first paragraph element, so nothing is dropped.
    chunks = [[]]
    for node in comment_el.iter():          # document order, single pass
        if node.tag == p_tag:
            chunks.append([])
        elif node.tag == t_tag and node.text:
            chunks[-1].append(node.text)

    paragraphs = ("".join(chunk) for chunk in chunks)
    return " ".join(p for p in paragraphs if p).strip()


def extract_comments(docx_path):
    """
    Return a list of [author, comment_text] pairs from a .docx.

    python-docx is tried first; if it raises or yields no comments, the
    ``word/comments.xml`` part is read straight from the zip archive.

    Args:
        docx_path: the .docx to read; it is passed unchanged to
            ``Document(...)`` and ``zipfile.ZipFile(...)``.

    Returns:
        A list of ``[author, text]`` lists in the order encountered; an
        empty list when no comments are found. A missing author becomes "".

    Raises:
        zipfile.BadZipFile: when the fallback runs and the file is not a
            zip archive.
        Whatever ``ET.fromstring`` raises when comments.xml is malformed.
    """
    # ----- Method A: python-docx (works when the comments part
    # is properly linked to the main document) ----------------
    try:
        doc = Document(docx_path)
        comments_part = getattr(doc.part, "comments_part", None)
        if comments_part is not None:
            # Build into a separate list: if iteration fails half-way, the
            # partial result is discarded and the fallback still runs.
            via_docx = [
                [c.author or "", (c.text or "").strip()]
                for c in comments_part.comments
            ]
            if via_docx:
                return via_docx
    except Exception as exc:
        # Best effort only: report the problem instead of hiding it, then
        # continue with the raw-XML fallback below.
        warnings.warn(
            f"python-docx could not read comments ({exc!r}); "
            "falling back to raw XML parsing",
            RuntimeWarning,
            stacklevel=2,
        )

    # ----- Method B: raw XML fallback -----------------------------------
    w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    comment_tag = "{%s}comment" % w_ns
    author_attr = "{%s}author" % w_ns
    p_tag = "{%s}p" % w_ns
    t_tag = "{%s}t" % w_ns

    # Only comments.xml carries w:comment elements. commentsExtended.xml
    # holds w15:commentEx metadata without any comment text, so it is not
    # scanned.
    comments_part_name = "word/comments.xml"

    comments = []
    with zipfile.ZipFile(docx_path) as z:
        if comments_part_name in z.namelist():
            root = ET.fromstring(z.read(comments_part_name))
            for comm in root.iter(comment_tag):
                author = comm.get(author_attr, "")
                comments.append([author, _xml_comment_text(comm, p_tag, t_tag)])

    return comments

In [None]:
# 2. Upload your .docx file(s)
# Calls google.colab's files.upload() and keeps the returned mapping.
# The processing cell iterates its keys and passes each key to
# extract_comments() as the path of the .docx to open.
uploaded = files.upload()

In [None]:
# 3. Process each upload and save to CSV
# For every uploaded file name: extract its comments, write them to
# "<name>_comments.csv" with an Author/Comment header, and offer the CSV
# for download. Files without comments are reported and skipped.
for fn in uploaded:
    # An unreadable upload (not a zip, malformed XML, missing file) must
    # not abort the remaining files, so report it and move on.
    try:
        comments = extract_comments(fn)
    except (OSError, zipfile.BadZipFile, ET.LxmlError) as exc:
        print(f"❌ Could not read {fn}: {exc}")
        continue

    if not comments:
        print(f"⚠️  No comments detected in {fn}")
        continue

    # Strip only the last extension: "report.v2.docx" -> "report.v2".
    csv_name = f"{fn.rsplit('.',1)[0]}_comments.csv"
    # newline="" leaves line-ending handling to the csv module, which also
    # quotes comment text containing commas or line breaks.
    with open(csv_name, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Author", "Comment"])
        writer.writerows(comments)

    print(f"✅ {len(comments)} comments found. Saved → {csv_name}")
    files.download(csv_name)  # trigger browser download