In [4]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive so that
# later cells can read files from it with ordinary path operations.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
import os
# Print the top-level entries of the mounted Drive (a quick check that the mount
# worked and that the expected folders are present).
# NOTE(review): the captured output lists personal file names; clear this cell's
# output before sharing or committing the notebook.
print(os.listdir('/content/drive/MyDrive'))

['Colab Notebooks', 'Untitled spreadsheet.gsheet', 'Get Together Options.gform', 'ME.jpg', 'FORM16_01_FY 2021-22_103746.PDF', "Copy of Mahi's Styling Business.gdoc", 'Me.jpeg', 'Resume_Nandkumar_Program Manager.pdf', 'nxtJob-resume.pdf', 'The NxtJob Artillery- Sonali.gsheet', 'Copy of 01 - Character Worksheet.gsheet', 'Copy of 02 - Personal Strategy.gsheet', 'Copy of ILH - AI Content Strategy (1).gsheet', 'Copy of ILH - AI Content Strategy.gsheet', 'Prompts.gdoc', 'Copy of 02 - MVP.gsheet', 'Copy of 03 - Website Content.gsheet', 'For the above case study, I have decided to alloc....gsheet', 'Copy of 3 - Niche Influence Codex.gdoc', 'Copy of Interpersonal Style Inventory.gform', 'Business Training', 'Business Training (1)']


In [11]:
# ---------- RUN THIS CELL (exactly) ----------
# Full Account Dossier generator (Option C) with your BASE preset.

# Install required packages (Colab only; may take ~1 min)
!pip -q install python-docx python-pptx openpyxl pdfminer.six pandas tika

# Python logic (do not edit below unless you know what you're doing)
import os, re, csv, io, sys, shutil
from pathlib import Path
from datetime import datetime
from collections import Counter
import pandas as pd

from pdfminer.high_level import extract_text as extract_pdf_text
from docx import Document as DocxDocument
from pptx import Presentation
import openpyxl

# try tika (optional)
# tika is only used as a last-resort extractor in safe_read_text. If the import
# fails for any reason, TIKA_AVAILABLE is set to False so that fallback is skipped
# instead of the whole cell aborting.
try:
    from tika import parser as tika_parser
    TIKA_AVAILABLE = True
except Exception:
    TIKA_AVAILABLE = False

# ---------- CONFIG: set BASE to your Business Training folder ----------
# BASE: root folder that is scanned; each immediate subfolder is processed as one
# "account" and gets its own dossier file.
BASE = "/content/drive/MyDrive/Business Training"   # <- your folder (exact)
# OUT: directory that receives the per-folder *_dossier.md files and accounts_index.csv.
# NOTE(review): this path is outside /content/drive, so presumably it does not
# persist once the Colab runtime is recycled — confirm and copy results if needed.
OUT = "/content/account_dossiers_output"
os.makedirs(OUT, exist_ok=True)

# Lower-cased file extensions for which text extraction is attempted; files with any
# other extension are still indexed but with empty text. (A set, despite the name.)
EXT_MAP_READABLE = {'.pdf', '.docx', '.pptx', '.ppt', '.xlsx', '.xls', '.txt', '.md', '.csv'}

# ---------- helper functions ----------
def safe_read_text(path: Path):
    """Best-effort plain-text extraction from a single file.

    The extractor is chosen from the lower-cased file extension. Extraction never
    raises: any failure in a format-specific extractor is swallowed and, for the
    pdf/docx/pptx/xlsx branches, control falls through to the optional tika
    fallback at the end.

    Args:
        path: Location of the file to read.

    Returns:
        The extracted text, or "" when nothing could be read. Text obtained via
        the tika fallback is truncated to 20000 characters.
    """
    ext = path.suffix.lower()
    if ext == ".pdf":
        try:
            return extract_pdf_text(str(path)) or ""
        except Exception:
            pass  # fall through to the tika fallback
    if ext == ".docx":
        try:
            doc = DocxDocument(str(path))
            return "\n".join(p.text for p in doc.paragraphs if p.text) or ""
        except Exception:
            pass  # fall through to the tika fallback
    if ext in (".pptx", ".ppt"):
        # NOTE(review): legacy .ppt is presumably rejected by python-pptx; in that
        # case the exception below sends the file to the tika fallback.
        try:
            prs = Presentation(str(path))
            texts = [
                shape.text
                for slide in prs.slides
                for shape in slide.shapes
                if hasattr(shape, "text") and shape.text
            ]
            return "\n".join(texts) or ""
        except Exception:
            pass  # fall through to the tika fallback
    if ext in (".xlsx", ".xls"):
        # NOTE(review): legacy .xls is presumably rejected by openpyxl; in that
        # case the exception below sends the file to the tika fallback.
        try:
            wb = openpyxl.load_workbook(str(path), read_only=True, data_only=True)
            try:
                # Keep 0 / False cells: only truly empty cells are skipped.
                texts = [
                    str(cell)
                    for ws in wb.worksheets
                    for row in ws.iter_rows(values_only=True)
                    for cell in row
                    if cell is not None and cell != ""
                ]
            finally:
                # Read-only workbooks keep the file handle open until closed.
                wb.close()
            return "\n".join(texts) or ""
        except Exception:
            pass  # fall through to the tika fallback
    if ext in (".txt", ".md", ".csv"):
        try:
            # Context manager guarantees the handle is released even on error.
            with open(path, "r", encoding="utf-8", errors="ignore") as fh:
                return fh.read() or ""
        except Exception:
            return ""
    # fallback: try tika if available
    if TIKA_AVAILABLE:
        try:
            raw = tika_parser.from_file(str(path))
            return (raw.get("content") or "")[:20000]
        except Exception:
            return ""
    return ""

def short_summary(text, max_sentences=3):
    """Return a crude extractive summary of ``text``.

    The text is split into sentences after ``.``, ``!`` or ``?``. If there are at
    most ``max_sentences`` sentences they are all returned; otherwise each
    sentence is scored by the summed whole-text frequency of its words and the
    highest-scoring ones are returned, best first. Newlines are replaced by
    spaces and the result is capped at 2000 characters.

    Args:
        text: Source text; falsy input yields "".
        max_sentences: Maximum number of sentences to keep.

    Returns:
        The summary string (possibly empty).
    """
    if not text:
        return ""

    def _flatten(parts):
        # Single-line output, hard-capped in length.
        return " ".join(parts).replace("\n", " ")[:2000]

    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    if len(sentences) <= max_sentences:
        return _flatten(sentences)

    word_freq = Counter(re.findall(r'\w+', text.lower()))
    ranked = sorted(
        (
            (sum(word_freq.get(tok, 0) for tok in re.findall(r'\w+', sent.lower())), sent)
            for sent in sentences
        ),
        reverse=True,  # ties on score are broken by the sentence text itself
    )
    return _flatten(sent for _, sent in ranked[:max_sentences])

def infer_status(text, filename):
    """Guess a coarse document status from keywords in its text and filename.

    The text and filename are combined and lower-cased as a whole, so keywords
    match regardless of case in either part. Underscores in the filename are
    treated as word separators so that names such as ``report_final.pdf`` match
    the whole-word patterns.

    Rules are checked in order and the first match wins:
    Draft, Final, Finance/Invoice, Needs Attention, Archived.

    Args:
        text: Extracted document text; may be empty or None.
        filename: Name of the file; may be empty or None.

    Returns:
        One of the status labels above, or "Unknown" when nothing matches.
    """
    # "_" counts as a word character for \b, so split filename words explicitly.
    name_part = (filename or "").replace("_", " ")
    haystack = ((text or "") + " " + name_part).lower()
    rules = (
        (r"\b(draft|v\d|version|rdx|rev)\b", "Draft"),
        (r"\b(final|signed|approved|accepted|complete)\b", "Final"),
        (r"\b(invoice|bill|due|payment|paid|receipt)\b", "Finance/Invoice"),
        (r"\b(pending|todo|action|follow up|follow-up|awaiting)\b", "Needs Attention"),
        (r"\b(archive|archived|old)\b", "Archived"),
    )
    for pattern, label in rules:
        if re.search(pattern, haystack):
            return label
    return "Unknown"

# Words excluded from keyword extraction. ("the" can never occur because only
# words of four or more characters are considered; kept for parity with the list.)
_KEYWORD_STOPWORDS = frozenset(
    ["the", "this", "that", "when", "where", "which", "with", "your", "from", "have", "will", "project"]
)


def keywords(text, topn=6):
    """Return the ``topn`` most frequent words of ``text``.

    Only runs of four or more word characters are counted, case-insensitively,
    and a small stop-word list is excluded. Stop words are filtered in a single
    pass over the words.

    Args:
        text: Source text; falsy input yields an empty list.
        topn: Maximum number of keywords to return.

    Returns:
        Lower-cased keywords, most frequent first.
    """
    if not text:
        return []
    candidates = (w.lower() for w in re.findall(r'\w{4,}', text))
    counts = Counter(w for w in candidates if w not in _KEYWORD_STOPWORDS)
    return [w for w, _ in counts.most_common(topn)]

# ---------- Main processing ----------
from datetime import timezone  # local import: the import cell only brings in `datetime`


def _collect_account_files(acct_dir, base_dir):
    """Walk one account folder and return one metadata/summary record per file.

    Each record is a dict with the keys file (path relative to ``base_dir``),
    path, size, modified (ISO timestamp or ""), status, summary and keywords
    (";"-joined). Text is only extracted for extensions in EXT_MAP_READABLE.
    """
    records = []
    for root, _dirs, files in os.walk(acct_dir):
        for fn in files:
            p = Path(root) / fn
            try:
                stat = p.stat()
            except (OSError, ValueError):
                stat = None  # unreadable entry: still index it, without size/mtime
            text = safe_read_text(p) if p.suffix.lower() in EXT_MAP_READABLE else ""
            records.append({
                "file": str(p.relative_to(base_dir)),
                "path": str(p),
                "size": stat.st_size if stat else 0,
                "modified": datetime.fromtimestamp(stat.st_mtime).isoformat() if stat else "",
                "status": infer_status(text, fn),
                "summary": short_summary(text, max_sentences=2),
                "keywords": ";".join(keywords(text, topn=6)),
            })
    return records


def _render_dossier(acct_name, acct_files):
    """Build the Markdown dossier text for one account from its file records."""
    # Aware UTC clock (utcnow() is deprecated); tzinfo is dropped so the stamp
    # keeps its "<iso>Z" form.
    generated = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    lines = [f"# {acct_name} — Dossier\n", f"_Generated: {generated}Z_\n"]

    bigtext = " ".join(f["summary"] for f in acct_files if f["summary"])
    context = short_summary(bigtext, max_sentences=3) if bigtext else "No readable text detected. Consider OCR for scanned PDFs."
    lines.append("## Quick context (auto-generated)\n")
    lines.append(context + "\n")

    lines.append("## Top recent files\n")
    # ISO timestamps sort chronologically as strings; "" (unknown) sorts last.
    top_recent = sorted(acct_files, key=lambda x: x["modified"], reverse=True)[:25]
    for it in top_recent:
        lines.append(f"- **{Path(it['file']).name}** — {it['status']} — {it['modified']}\n  - Summary: {it['summary'][:300]}\n  - Keywords: {it['keywords']}\n")

    lines.append("\n## Suggested next actions (auto-generated)\n")
    actions = []
    statuses = [it["status"] for it in acct_files]
    if "Needs Attention" in statuses or "Unknown" in statuses:
        actions.append("- Review files marked 'Needs Attention' or 'Unknown' and flag follow-ups.")
    # Use the inferred status as well as the keywords: a file classified as
    # Finance/Invoice need not have "invoice" among its top keywords.
    if any(
        it["status"] == "Finance/Invoice" or "invoice" in (it["keywords"] or "").lower()
        for it in acct_files
    ):
        actions.append("- Check finance/invoice items with Accounts/Finance.")
    actions.append("- Consolidate final deliverables and assign owners.")
    lines.extend(a + "\n" for a in actions)
    return "\n".join(lines)


base = Path(BASE)
if not base.exists():
    raise SystemExit(f"ERROR: BASE path not found: {BASE}\nPlease verify the path and that Drive is mounted.")
index_rows = []
accounts = [p for p in base.iterdir() if p.is_dir()]
if not accounts:
    raise SystemExit(f"No subfolders found under {BASE}. Ensure your Business Training folder contains subfolders per topic/module.")

# One dossier per immediate subfolder of BASE, processed in sorted order.
for acct in sorted(accounts):
    acct_name = acct.name
    print("Processing:", acct_name)
    acct_files = _collect_account_files(acct, base)
    out_md = Path(OUT) / f"{acct_name}_dossier.md"
    out_md.write_text(_render_dossier(acct_name, acct_files), encoding="utf-8")
    print(" Wrote:", out_md)
    index_rows.extend({"account": acct_name, **it} for it in acct_files)

# write index CSV
df = pd.DataFrame(index_rows)
idx_csv = Path(OUT) / "accounts_index.csv"
df.to_csv(idx_csv, index=False)
print("Wrote index:", idx_csv)
print("All output is in:", OUT)
# ---------- END ----------

Processing: ARM2


  dossier.append(f"_Generated: {datetime.utcnow().isoformat()}Z_\n")


 Wrote: /content/account_dossiers_output/ARM2_dossier.md
Processing: Amazon
 Wrote: /content/account_dossiers_output/Amazon_dossier.md
Processing: Anik Singal
 Wrote: /content/account_dossiers_output/Anik Singal_dossier.md
Processing: AskMethod
 Wrote: /content/account_dossiers_output/AskMethod_dossier.md
Processing: Copy Blogger
 Wrote: /content/account_dossiers_output/Copy Blogger_dossier.md
Processing: Darren Hardy
 Wrote: /content/account_dossiers_output/Darren Hardy_dossier.md
Processing: Deepak Kanakaraju
 Wrote: /content/account_dossiers_output/Deepak Kanakaraju_dossier.md
Processing: E-mail Marketing


2025-12-07 12:54:44,640 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/3.1.0/tika-server-standard-3.1.0.jar to /tmp/tika-server.jar.
INFO:tika.tika:Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/3.1.0/tika-server-standard-3.1.0.jar to /tmp/tika-server.jar.
2025-12-07 12:54:46,567 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/3.1.0/tika-server-standard-3.1.0.jar.md5 to /tmp/tika-server.jar.md5.
INFO:tika.tika:Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/3.1.0/tika-server-standard-3.1.0.jar.md5 to /tmp/tika-server.jar.md5.
2025-12-07 12:54:47,747 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2025-12-07 12:54:52,750 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
  dossier.append(f"_Generated: {datetime

 Wrote: /content/account_dossiers_output/E-mail Marketing_dossier.md
Processing: Facebook




 Wrote: /content/account_dossiers_output/Facebook_dossier.md
Processing: Free Marketing
 Wrote: /content/account_dossiers_output/Free Marketing_dossier.md
Processing: GetResponse
 Wrote: /content/account_dossiers_output/GetResponse_dossier.md
Processing: Gopal Krishnan
 Wrote: /content/account_dossiers_output/Gopal Krishnan_dossier.md
Processing: Misc




 Wrote: /content/account_dossiers_output/Misc_dossier.md
Processing: Must Read
 Wrote: /content/account_dossiers_output/Must Read_dossier.md
Processing: Neil Patel
 Wrote: /content/account_dossiers_output/Neil Patel_dossier.md
Processing: Online Course
 Wrote: /content/account_dossiers_output/Online Course_dossier.md
Processing: Ramit Sethi




 Wrote: /content/account_dossiers_output/Ramit Sethi_dossier.md
Processing: Ray Higdon
 Wrote: /content/account_dossiers_output/Ray Higdon_dossier.md
Processing: Resources
 Wrote: /content/account_dossiers_output/Resources_dossier.md
Processing: Robin Sharma
 Wrote: /content/account_dossiers_output/Robin Sharma_dossier.md
Processing: SEO




 Wrote: /content/account_dossiers_output/SEO_dossier.md
Processing: ShoutMELoud
 Wrote: /content/account_dossiers_output/ShoutMELoud_dossier.md
Processing: Siddharth Rajsekhar


  warn(f"Print area cannot be set to Defined name: {defn.value}.")
  warn(msg)
  warn(msg)


 Wrote: /content/account_dossiers_output/Siddharth Rajsekhar_dossier.md
Processing: Swipes




 Wrote: /content/account_dossiers_output/Swipes_dossier.md
Processing: Thinkific
 Wrote: /content/account_dossiers_output/Thinkific_dossier.md
Processing: Traffic
 Wrote: /content/account_dossiers_output/Traffic_dossier.md
Processing: Unsolved
 Wrote: /content/account_dossiers_output/Unsolved_dossier.md
Processing: Webinar
 Wrote: /content/account_dossiers_output/Webinar_dossier.md
Processing: Zach Trainings
 Wrote: /content/account_dossiers_output/Zach Trainings_dossier.md
Processing: client-report-done-for-you




 Wrote: /content/account_dossiers_output/client-report-done-for-you_dossier.md
Processing: eBook
 Wrote: /content/account_dossiers_output/eBook_dossier.md
Processing: freesqueeze
 Wrote: /content/account_dossiers_output/freesqueeze_dossier.md
Wrote index: /content/account_dossiers_output/accounts_index.csv
All output is in: /content/account_dossiers_output
