In [None]:
# Copyright 2025 The Contributors
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Accepts an uploaded .pptx (or one mounted from Drive) and returns,
for every comment and threaded reply:
   • slide_number (1‑based, as shown in Slide Sorter)
   • slide_id     (the presentation‑level r:id for that slide)
   • comment_id   (top‑level comment’s ID; replies repeat this)
   • author       (display name)
   • text         (comment / reply text)
   • created_time (timestamp if stored; else None)

 Outputs a pandas DataFrame for quick inspection and a CSV file at /content/pptx_comments_with_slides.csv.

In [None]:
# 0. Install and import libraries in Google Colab
!pip install -q python-pptx lxml pandas
import io, zipfile, re, glob, os, posixpath, xml.etree.ElementTree as ET
from datetime import datetime
import pandas as pd
from google.colab import files, drive

In [None]:
# 1. Pick latest *.pptx or prompt upload
def _get_pptx():
    """Return ``(file_name, raw_bytes)`` for the deck to analyse.

    Searches /content recursively for PowerPoint files and reads the most
    recently modified one. When nothing is found on disk, the Colab upload
    widget is shown and the first uploaded file ending in ``.pptx`` is used.

    Returns:
        tuple: base file name and the file's raw bytes.

    Raises:
        RuntimeError: if neither the disk scan nor the upload yields a .pptx.
    """
    # Match the extension case-insensitively so the disk scan accepts the same
    # names as the upload branch below (which compares fn.lower()).
    pattern = '/content/**/*.[pP][pP][tT][xX]'
    candidates = [
        path for path in glob.glob(pattern, recursive=True)
        # '~$<name>.pptx' is the owner/lock file Office leaves next to an open
        # document; it is not a zip package and, being freshly written, would
        # otherwise tend to sort first by modification time.
        if os.path.isfile(path) and not os.path.basename(path).startswith('~$')
    ]
    if candidates:
        newest = max(candidates, key=os.path.getmtime)
        with open(newest, 'rb') as fh:
            return os.path.basename(newest), fh.read()

    # Nothing on disk: fall back to an interactive upload.
    uploaded = files.upload()
    for fn, data in uploaded.items():
        if fn.lower().endswith('.pptx'):
            return fn, data
    raise RuntimeError("No .pptx provided")

In [None]:
# 2 Helper for timestamp
_TS_ATTRS = {'dt', 'created', 'createdtime', 'date', 'datetime'}
def _timestamp(el):
    """Return first attribute whose local‑name is in _TS_ATTRS, else None."""
    for k, v in el.attrib.items():
        if k.split('}')[-1].lower() in _TS_ATTRS:
            return v
    return None

In [None]:
# 3. Define core extractor function
_COMMENT_COLUMNS = ['slide_number', 'slide_id', 'comment_id',
                    'author', 'text', 'created_time']


def _local_name(qname):
    """Return an ElementTree tag/attribute name without its '{namespace}' prefix."""
    return qname.rsplit('}', 1)[-1]


def _resolve_target(base_dir, target):
    """Turn a relationship Target into a package part name (no leading '/')."""
    if target.startswith('/'):
        # Package-absolute target: it is already rooted, only drop the slash
        # so the result compares equal to the names in ZipFile.namelist().
        return posixpath.normpath(target.lstrip('/'))
    return posixpath.normpath(posixpath.join(base_dir, target))


def _deck_positions(z):
    """Map slide part name -> (1-based deck position, r:id) from presentation.xml."""
    members = set(z.namelist())
    pres_part = 'ppt/presentation.xml'
    rels_part = 'ppt/_rels/presentation.xml.rels'
    if pres_part not in members or rels_part not in members:
        return {}

    rid_to_part = {}
    for rel in ET.fromstring(z.read(rels_part)).findall('.//{*}Relationship'):
        rid, target = rel.get('Id'), rel.get('Target')
        if rid and target:
            rid_to_part[rid] = _resolve_target('ppt', target)

    positions = {}
    slide_ids = ET.fromstring(z.read(pres_part)).findall('.//{*}sldIdLst/{*}sldId')
    for position, sld in enumerate(slide_ids, start=1):
        # <sldId> carries two ids: the plain numeric 'id' and the namespaced
        # relationship id (r:id). Only the namespaced one resolves to a part.
        rid = next((value for key, value in sld.attrib.items()
                    if key.startswith('{') and _local_name(key) == 'id'), None)
        part = rid_to_part.get(rid)
        if part:
            positions[part] = (position, rid)
    return positions


def _comment_text(el):
    """Yield the text pieces of one comment in document order, skipping replies."""
    for child in el:
        name = _local_name(child.tag)
        if name == 'reply':
            continue  # nested replies are reported as rows of their own
        if name in ('t', 'text'):
            # 't' = DrawingML text run (modern comments),
            # 'text' = plain-text body of a legacy comment.
            yield child.text or ''
        else:
            yield from _comment_text(child)


def _thread_rows(cm, slide_no, slide_id, authors):
    """Build the row for one comment element plus one row per nested reply."""
    # Legacy replies are separate <cm> elements that point at their parent
    # through an extension element <parentCm idx="...">.
    parent_ref = cm.find('.//{*}parentCm')
    thread_id = (cm.get('parentId')
                 or (parent_ref.get('idx') if parent_ref is not None else None)
                 or cm.get('id') or cm.get('idx'))
    return [
        {
            'slide_number': slide_no,
            'slide_id':     slide_id,
            'comment_id':   thread_id,
            # Only authorId identifies the author; 'idx' is the comment's own
            # index and would look up an unrelated person.
            'author':       authors.get(el.get('authorId')),
            'text':         ''.join(_comment_text(el)),
            'created_time': _timestamp(el),
        }
        for el in [cm] + cm.findall('.//{*}reply')
    ]


def extract_comments(raw: bytes) -> pd.DataFrame:
    """Extract every comment and reply from a .pptx package.

    Args:
        raw: the bytes of a .pptx file (an OPC zip package).

    Returns:
        A DataFrame with one row per comment or reply and the columns
        ``slide_number``, ``slide_id``, ``comment_id``, ``author``, ``text``
        and ``created_time``. The columns are present even when the deck
        contains no comments.

        * ``slide_number`` is the slide's 1-based position in
          ``ppt/presentation.xml``'s ``<sldIdLst>``; when the slide cannot be
          resolved there, the number in the slide/comment file name is used
          (None if there is none).
        * ``slide_id`` is the ``r:id`` of that slide in presentation.xml, or
          None when unresolved.
        * ``comment_id`` is the top-level comment's id; replies repeat it.
        * ``author`` is the display name looked up by ``authorId`` (None when
          unknown).
        * ``text`` is the comment's own text; reply text is not merged into
          the parent row.
        * ``created_time`` is the raw timestamp attribute, or None.

    Raises:
        zipfile.BadZipFile: if ``raw`` is not a zip archive.
        xml.etree.ElementTree.ParseError: if a relationship, presentation or
            comment part is not well-formed XML.
    """
    authors, cm2slide, rows = {}, {}, []
    with zipfile.ZipFile(io.BytesIO(raw)) as z:
        parts = z.namelist()
        positions = _deck_positions(z)

        # 3.1 authors: scan every XML part for <author>/<cmAuthor> entries.
        for part in parts:
            if not part.lower().endswith('.xml'):
                continue
            try:
                root = ET.fromstring(z.read(part))
            except ET.ParseError:
                # This pass is only a best-effort lookup over the whole
                # package; an unrelated malformed part must not abort it.
                continue
            for a in root.findall('.//{*}author') + root.findall('.//{*}cmAuthor'):
                key = a.get('id') or a.get('idx') or a.get('authorId')
                name = a.get('name')
                if key and name:
                    authors.setdefault(key, name)

        # 3.2 comment part -> (slide number, slide r:id) via each slide's rels.
        for rel_part in parts:
            m = re.match(r'ppt/slides/_rels/slide(\d+)\.xml\.rels', rel_part, re.I)
            if not m:
                continue
            slide_dir = posixpath.dirname(rel_part).replace('/_rels', '')
            slide_part = posixpath.join(
                slide_dir, posixpath.basename(rel_part)[:-len('.rels')])
            # File numbering (slideN.xml) drifts from deck order once slides
            # are moved or deleted, so prefer the position from <sldIdLst>.
            slide_ref = positions.get(slide_part, (int(m.group(1)), None))
            rel_root = ET.fromstring(z.read(rel_part))
            for rel in rel_root.findall('.//{*}Relationship'):
                target = rel.get('Target')
                if not target or 'comment' not in rel.get('Type', '').lower():
                    continue
                cm2slide[_resolve_target(slide_dir, target)] = slide_ref

        # 3.3 collect rows from every comment part.
        for part in parts:
            lp = part.lower()
            if ('comment' not in lp or not lp.endswith('.xml')
                    or 'commentauthors' in lp or '_rels' in lp):
                continue
            slide_no, slide_id = cm2slide.get(part, (None, None))
            if slide_no is None:
                # Part not referenced by any slide: guess from its name.
                m = re.search(r'slide(\d+)', part, re.I)
                slide_no = int(m.group(1)) if m else None

            root = ET.fromstring(z.read(part))
            for el in root.findall('.//{*}comment') + root.findall('.//{*}cm'):
                rows.extend(_thread_rows(el, slide_no, slide_id, authors))
    return pd.DataFrame(rows, columns=_COMMENT_COLUMNS)

In [None]:
# 4. Run extractor
# _get_pptx() hands back a (file name, raw bytes) pair; only the bytes are
# passed on to extract_comments(), which returns the comments as a DataFrame.
fname, raw = _get_pptx()
df = extract_comments(raw)

In [None]:
# 5. View output
# Raise pandas' per-cell display width to 140 characters before rendering the
# frame. set_option changes the option for the rest of the kernel session,
# not just for this cell.
pd.set_option('display.max_colwidth', 140)
display(df)

In [None]:
# 6. Export to CSV
# index=False leaves the DataFrame's row index out of the written file.
csv = '/content/pptx_comments_with_slides.csv'
df.to_csv(csv, index=False)
print(f"\n✔ Saved to {csv}")