In [15]:
import os
import pandas as pd
from pathlib import Path
from transformers import pipeline
import re

In [16]:
pd.set_option('display.max_rows', 1000)

In [17]:
# # Path to Downloads (Windows example)
# downloads_path = Path.home() / "Downloads"

# # Collect file info
# data = []
# for file in downloads_path.iterdir():
#     if file.is_file():
#         stat = file.stat()
#         data.append({
#             "name": file.name,
#             "size_bytes": stat.st_size,
#             "suffix": file.suffix,
#             "date_modified": pd.to_datetime(stat.st_mtime, unit="s").strftime("%Y-%m-%d"),
#             "date_created": pd.to_datetime(stat.st_birthtime, unit="s").strftime("%Y-%m-%d"),
#         })

# # Convert to DataFrame
# df = pd.DataFrame(data)

# print(df.head())


In [18]:
df = pd.read_csv("/home/kennym1/download-analysis/downloads_info.csv")
print(df.head())

                                      name  size_bytes suffix date_modified  \
0  +----------------------------------.txt        3112   .txt    2025-07-10   
1                                .Rhistory        1297    NaN    2025-12-22   
2                            ._ipp2-source         178    NaN    2022-09-07   
3      01 PCProcessorMicroarchitecture.pdf      170784   .pdf    2025-03-13   
4               01-probability-review.docx       19858  .docx    2025-12-09   

  date_created  
0   2025-07-10  
1   2025-12-22  
2   2025-11-04  
3   2025-03-13  
4   2025-12-09  


In [19]:
# df.to_csv("downloads_info.csv", index=False)

In [20]:
print(df.suffix.value_counts())

suffix
.pdf                                        570
.docx                                        68
.pptx                                        67
.png                                         45
.html                                        37
.txt                                         26
.csv                                         22
.sql                                         17
.jpg                                         14
.jfif                                        11
.xlsx                                        10
.bmpr                                         9
.jpeg                                         8
.json                                         6
.ans                                          6
.in                                           6
.py                                           6
.v                                            5
.java                                         5
.ipynb                                        4
.log                             

In [21]:
import re
from pathlib import Path

# --- Stage 1: Rule-based by extension (instant, no model needed) ---
EXTENSION_RULES = {
    # Software
    ".exe": "Software Installer", ".msi": "Software Installer",
    ".iso": "Software Installer", ".dmg": "Software Installer",
    ".apk": "Software Installer", ".deb": "Software Installer",
    ".msix": "Software Installer", ".appinstaller": "Software Installer",
    ".whl": "Software Installer",
    # Media
    ".mp4": "Media or Entertainment", ".mkv": "Media or Entertainment",
    ".avi": "Media or Entertainment", ".mp3": "Media or Entertainment",
    ".wav": "Media or Entertainment", ".flac": "Media or Entertainment",
    ".ogg": "Media or Entertainment",
    # Images
    ".jpg": "Photo or Image", ".jpeg": "Photo or Image",
    ".jfif": "Photo or Image",
    ".png": "Photo or Image",  ".gif": "Photo or Image",
    ".svg": "Photo or Image",  ".psd": "Creative Project",
    # Code / Data
    ".py": "Dataset or Code", ".js": "Dataset or Code",
    ".csv": "Dataset or Code", ".json": "Dataset or Code",
    ".ipynb": "Dataset or Code", ".sql": "Dataset or Code",
    ".c": "Dataset or Code", ".cpp": "Dataset or Code",
    ".java": "Dataset or Code", ".v": "Dataset or Code",
    ".tsv": "Dataset or Code",
    # Archives
    ".zip": "Archive or Backup", ".rar": "Archive or Backup",
    ".tar": "Archive or Backup", ".gz": "Archive or Backup",
    ".7z": "Archive or Backup", ".xz": "Archive or Backup",
    ".bak": "Archive or Backup", ".bac": "Archive or Backup",
    ".tmp": "Archive or Backup",
}

# --- Stage 2: Keyword rules applied to the cleaned filename stem ---
KEYWORD_RULES = {
    "Schoolwork": [
        "lecture", "homework", "assignment", "exam", "quiz", "lab",
        "midterm", "final", "chapter", "tutorial", "worksheet", "syllabus",
        "notes", "slide", "textbook", "problem set", "pset",
        "probability", "statistics", "algorithm", "microarchitecture",
        "hypothesis", "estimation", "likelihood", "regression", "prediction",
        "processor", "architecture", "operating system", "compiler",
        "database", "network", "machine learning", "data structure",
        "sorting", "search", "graph", "tree", "recursion",
        "prior", "posterior", "bayesian",
    ],
    "Research Paper": [
        "survey", "review", "study", "analysis", "evaluation",
        "proceedings", "conference", "acm", "ieee", "arxiv",
        "journal", "abstract", "methodology", "experiment",
    ],
    "Financial": [
        "invoice", "receipt", "bank", "statement", "tax", "budget",
        "payroll", "expense", "payment", "billing", "account",
    ],
    "Personal Documentation": [
        "resume", "cv", "cover letter", "passport", "license",
        "certificate", "transcript", "contract", "agreement", "lease",
        "insurance", "medical", "prescription",
    ],
}

# --- Stage 3: NLP zero-shot for files that pass through keyword stage ---
AMBIGUOUS_EXTENSIONS = {".pdf", ".docx", ".doc", ".txt", ".pptx", ".xlsx",
                        ".xls", ".html", ".rtf", ".tex", ".latex", ".md"}

NLP_LABELS = [
    "Personal Documentation",
    "Schoolwork or Course Material",
    "Financial Record",
    "Research Paper or Academic Article",
    "Creative Project",
    "Dataset or Code",
    "Archive or Backup",
]


def clean_for_nlp(name: str) -> str:
    """
    Produce a human-readable description from a raw filename so the
    zero-shot model has something meaningful to classify.
    Always returns a non-empty string.
    """
    stem = Path(name).stem
    original_stem = stem  # keep as fallback

    # Strip leading numeric prefix: "01 ", "02-", "03. ", etc.
    stripped = re.sub(r'^\d+[\s.\-_]+', '', stem)
    # Only accept the strip if it left something meaningful
    stem = stripped if stripped.strip() else original_stem

    # Strip UUID-like tokens (they add noise, not signal)
    stem = re.sub(
        r'\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b',
        '', stem, flags=re.I,
    )

    # Split CamelCase: "PCProcessorMicroarchitecture" → "PC Processor Microarchitecture"
    stem = re.sub(r'([a-z])([A-Z])', r'\1 \2', stem)
    stem = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', stem)

    # Replace common separators with spaces
    stem = stem.replace('_', ' ').replace('-', ' ').replace('.', ' ')

    # Collapse multiple spaces
    stem = re.sub(r'\s+', ' ', stem).strip()

    # Ultimate fallback: if all processing left nothing, use the raw name
    return stem if stem else name


def keyword_classify(cleaned_stem: str) -> str | None:
    """Return a label if any keyword matches, else None (needs NLP)."""
    lower = cleaned_stem.lower()
    for label, keywords in KEYWORD_RULES.items():
        if any(kw in lower for kw in keywords):
            return label
    return None


In [22]:
classifier = pipeline(
        "zero-shot-classification",
        model="cross-encoder/nli-deberta-v3-small",
        device=0,  # remove this line if you don't have a GPU
)

Loading weights: 100%|██████████| 106/106 [00:00<00:00, 619.34it/s, Materializing param=pooler.dense.weight]                                     
[1mDebertaV2ForSequenceClassification LOAD REPORT[0m from: cross-encoder/nli-deberta-v3-small
Key                             | Status     |  | 
--------------------------------+------------+--+-
deberta.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [23]:
def classify_with_rules(row) -> tuple[str | None, str]:
    """Returns (category, cleaned_name). category is None if NLP is needed."""
    ext = str(row["suffix"]).lower() if pd.notna(row["suffix"]) else ""
    name = str(row["name"])
    cleaned = clean_for_nlp(name)

    # Stage 1: hard extension rules
    if ext in EXTENSION_RULES:
        return EXTENSION_RULES[ext], cleaned

    # Stage 2: keyword match on cleaned stem (fast, no model)
    if ext in AMBIGUOUS_EXTENSIONS:
        kw_label = keyword_classify(cleaned)
        if kw_label:
            return kw_label, cleaned
        return None, cleaned  # falls through to NLP

    return "Other", cleaned

results_tuple = df.apply(classify_with_rules, axis=1, result_type="expand")
df["category"] = results_tuple[0]
df["cleaned_name"] = results_tuple[1]

needs_nlp = df["category"].isna()
print(f"Extension rules : {(df['category'].notna() & df['category'].ne('Other')).sum()} files")
print(f"Keyword rules   : {needs_nlp.sum()} still need NLP")
print(f"Total needing NLP: {needs_nlp.sum()} / {len(df)}")

if needs_nlp.any():
    texts = df.loc[needs_nlp, "cleaned_name"].tolist()

    nlp_results = classifier(
        texts,
        candidate_labels=NLP_LABELS,
        # More natural framing for document filenames — avoids the generic
        # "This example is X" default which inflates Archive/Backup scores
        hypothesis_template="This file is a {}.",
        batch_size=16,
    )

    df.loc[needs_nlp, "category"] = [r["labels"][0] for r in nlp_results]

print("\nSample results:")
print(df[["name", "suffix", "cleaned_name", "category"]].head(25).to_string(index=True))


Extension rules : 407 files
Keyword rules   : 549 still need NLP
Total needing NLP: 549 / 1012

Sample results:
                                         name suffix                               cleaned_name                       category
0     +----------------------------------.txt   .txt                                          +                Dataset or Code
1                                   .Rhistory    NaN                                   Rhistory                          Other
2                               ._ipp2-source    NaN                                ipp2 source                          Other
3         01 PCProcessorMicroarchitecture.pdf   .pdf             PC Processor Microarchitecture                     Schoolwork
4                  01-probability-review.docx  .docx                         probability review                     Schoolwork
5                01. WPS  PQR SMAW E6013.json  .json                         WPS PQR SMAW E6013                Dataset or Code

In [27]:
df[df.category=="Schoolwork or Course Material"]

Unnamed: 0,name,size_bytes,suffix,date_modified,date_created,category,cleaned_name
11,04 BFS and DFS Variations.pdf,788476,.pdf,2025-09-23,2025-09-23,Schoolwork or Course Material,BFS and DFS Variations
23,1-3_FIrstExercise.pptx,15323934,.pptx,2025-03-19,2025-03-19,Schoolwork or Course Material,3 F Irst Exercise
84,5-2_MemoryManagement.pptx,31880847,.pptx,2025-04-14,2025-04-14,Schoolwork or Course Material,2 Memory Management
150,Blue Prince.txt,1136,.txt,2025-10-06,2025-10-06,Schoolwork or Course Material,Blue Prince
151,Book.pdf,27973,.pdf,2025-05-19,2025-05-19,Schoolwork or Course Material,Book
185,cheatsheet.pdf,137200,.pdf,2024-11-05,2024-11-05,Schoolwork or Course Material,cheatsheet
186,Cherry-Picking.pdf,157351,.pdf,2025-11-20,2025-11-20,Schoolwork or Course Material,Cherry Picking
188,Clear and concise sentences activity.pdf,197523,.pdf,2024-12-20,2024-12-20,Schoolwork or Course Material,Clear and concise sentences activity
191,cmd starsector.txt,1436,.txt,2025-07-26,2025-07-19,Schoolwork or Course Material,cmd starsector
197,CON.pdf,475662,.pdf,2025-05-02,2025-05-02,Schoolwork or Course Material,CON


In [28]:
df[df.category=="Dataset or Code"]

Unnamed: 0,name,size_bytes,suffix,date_modified,date_created,category,cleaned_name
0,+----------------------------------.txt,3112,.txt,2025-07-10,2025-07-10,Dataset or Code,+
5,01. WPS PQR SMAW E6013.json,1645,.json,2025-07-02,2025-07-02,Dataset or Code,WPS PQR SMAW E6013
6,02 RealCloudComputer.pdf,2709157,.pdf,2025-03-20,2025-03-18,Dataset or Code,Real Cloud Computer
7,02-statistical-process.docx,23934,.docx,2025-12-18,2025-12-18,Dataset or Code,statistical process
8,03 How TI Adopted VLIW in DSP.pdf,1644275,.pdf,2025-03-27,2025-03-27,Dataset or Code,How TI Adopted VLIW in DSP
14,05.06.25.pdf,46835,.pdf,2025-05-12,2025-05-12,Dataset or Code,06 25
15,05.08.25.pdf,67374,.pdf,2025-05-12,2025-05-12,Dataset or Code,08 25
18,08 Knuth-Morris-Pratt String Matching.pdf,137716,.pdf,2025-10-07,2025-10-07,Dataset or Code,Knuth Morris Pratt String Matching
20,"09-09-25, 1538 Microsoft Lens.pdf",390242,.pdf,2025-09-09,2025-09-09,Dataset or Code,"09 25, 1538 Microsoft Lens"
21,09-mc-integration.docx,22896,.docx,2026-01-13,2026-01-13,Dataset or Code,mc integration


In [32]:
df[df.category=="Schoolwork"]

Unnamed: 0,name,size_bytes,suffix,date_modified,date_created,category,cleaned_name
3,01 PCProcessorMicroarchitecture.pdf,170784,.pdf,2025-03-13,2025-03-13,Schoolwork,PC Processor Microarchitecture
4,01-probability-review.docx,19858,.docx,2025-12-09,2025-12-09,Schoolwork,probability review
9,03-likelihood.docx,21781,.docx,2025-12-18,2025-12-18,Schoolwork,likelihood
12,04-updating-priors.docx,19198,.docx,2025-12-18,2025-12-18,Schoolwork,updating priors
13,05-estimation.docx,24823,.docx,2025-12-18,2025-12-18,Schoolwork,estimation
16,06-prediction.docx,19859,.docx,2025-12-18,2025-12-18,Schoolwork,prediction
17,07-hypothesis-testing.docx,20173,.docx,2025-12-18,2025-12-18,Schoolwork,hypothesis testing
19,08-constructing-priors.docx,19643,.docx,2025-12-18,2025-12-18,Schoolwork,constructing priors
34,12 Fenwick Trees.pdf,90066,.pdf,2025-10-22,2025-10-22,Schoolwork,Fenwick Trees
45,2-1_ExampleAndSciMethodAndLogistics.pptx,9871282,.pptx,2025-03-19,2025-03-19,Schoolwork,1 Example And Sci Method And Logistics


In [33]:
df.to_csv("downloads_info_analyzed.csv", index=False)