In [1]:
import os
import pandas as pd
from pathlib import Path
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pd.set_option('display.max_rows', 1000)

In [None]:
# # Path to Downloads (Windows example)
# downloads_path = Path.home() / "Downloads"

# # Collect file info
# data = []
# for file in downloads_path.iterdir():
#     if file.is_file():
#         stat = file.stat()
#         data.append({
#             "name": file.name,
#             "size_bytes": stat.st_size,
#             "suffix": file.suffix,
#             "date_modified": pd.to_datetime(stat.st_mtime, unit="s").strftime("%Y-%m-%d"),
#             "date_created": pd.to_datetime(stat.st_birthtime, unit="s").strftime("%Y-%m-%d"),
#         })

# # Convert to DataFrame
# df = pd.DataFrame(data)

# print(df.head())


                                      name  size_bytes suffix date_modified  \
0  +----------------------------------.txt        3112   .txt    2025-07-10   
1                                .Rhistory        1297           2025-12-22   
2                            ._ipp2-source         178           2022-09-07   
3      01 PCProcessorMicroarchitecture.pdf      170784   .pdf    2025-03-13   
4               01-probability-review.docx       19858  .docx    2025-12-09   

  date_created  
0   2025-07-10  
1   2025-12-22  
2   2025-11-04  
3   2025-03-13  
4   2025-12-09  


In [7]:
df = pd.read_csv("/home/kennym1/download_analysis/downloads_info.csv")
print(df.head())

                                      name  size_bytes suffix date_modified  \
0  +----------------------------------.txt        3112   .txt    2025-07-10   
1                                .Rhistory        1297    NaN    2025-12-22   
2                            ._ipp2-source         178    NaN    2022-09-07   
3      01 PCProcessorMicroarchitecture.pdf      170784   .pdf    2025-03-13   
4               01-probability-review.docx       19858  .docx    2025-12-09   

  date_created  
0   2025-07-10  
1   2025-12-22  
2   2025-11-04  
3   2025-03-13  
4   2025-12-09  


In [None]:
# df.to_csv("downloads_info.csv", index=False)

In [8]:
print(df.suffix.value_counts())

suffix
.pdf                                        570
.docx                                        68
.pptx                                        67
.png                                         45
.html                                        37
.txt                                         26
.csv                                         22
.sql                                         17
.jpg                                         14
.jfif                                        11
.xlsx                                        10
.bmpr                                         9
.jpeg                                         8
.json                                         6
.ans                                          6
.in                                           6
.py                                           6
.v                                            5
.java                                         5
.ipynb                                        4
.log                             

In [9]:
# --- Stage 1: Rule-based by extension (instant, no model needed) ---
EXTENSION_RULES = {
    # Software
    ".exe": "Software Installer", ".msi": "Software Installer",
    ".iso": "Software Installer", ".dmg": "Software Installer",
    ".apk": "Software Installer", ".deb": "Software Installer",
    # Media
    ".mp4": "Media or Entertainment", ".mkv": "Media or Entertainment",
    ".avi": "Media or Entertainment", ".mp3": "Media or Entertainment",
    ".wav": "Media or Entertainment", ".flac": "Media or Entertainment",
    # Images
    ".jpg": "Photo or Image", ".jpeg": "Photo or Image",
    ".png": "Photo or Image",  ".gif": "Photo or Image",
    ".svg": "Photo or Image",  ".psd": "Creative Project",
    # Code / Data
    ".py": "Dataset or Code", ".js": "Dataset or Code",
    ".csv": "Dataset or Code", ".json": "Dataset or Code",
    ".ipynb": "Dataset or Code", ".sql": "Dataset or Code",
    # Archives
    ".zip": "Archive or Backup", ".rar": "Archive or Backup",
    ".tar": "Archive or Backup", ".gz": "Archive or Backup",
    ".7z": "Archive or Backup",
}

# --- Stage 2: NLP zero-shot for ambiguous extensions ---
AMBIGUOUS_EXTENSIONS = {".pdf", ".docx", ".doc", ".txt", ".pptx", ".xlsx", ".xls"}

NLP_LABELS = [
    "Personal Documentation",
    "Schoolwork",
    "Financial",
    "Research Paper",
    "Creative Project",
    "Dataset or Code",
    "Archive or Backup",
]

In [10]:
classifier = pipeline(
        "zero-shot-classification",
        model="cross-encoder/nli-deberta-v3-small",
        device=0,  # remove this line if you don't have a GPU
)

Loading weights: 100%|██████████| 106/106 [00:00<00:00, 368.18it/s, Materializing param=pooler.dense.weight]                                     
[1mDebertaV2ForSequenceClassification LOAD REPORT[0m from: cross-encoder/nli-deberta-v3-small
Key                             | Status     |  | 
--------------------------------+------------+--+-
deberta.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [12]:
def classify_with_rules(row) -> str:
    ext = row["suffix"].lower() if pd.notna(row["suffix"]) else ""
    if ext in EXTENSION_RULES:
        return EXTENSION_RULES[ext]
    if ext in AMBIGUOUS_EXTENSIONS:
        return None  # needs NLP
    return "Other"

df["category"] = df.apply(classify_with_rules, axis=1)

needs_nlp = df["category"].isna()
print(f"Rule-based: {(~needs_nlp).sum()} files | NLP needed: {needs_nlp.sum()} files")

# Only load the model if there are ambiguous files
if needs_nlp.any():
    ambiguous_names = df.loc[needs_nlp, "name"].tolist()

    # Batch inference — much faster than calling one at a time
    cleaned = [n.replace("_", " ").replace("-", " ").replace(".", " ") for n in ambiguous_names]
    results = classifier(cleaned, candidate_labels=NLP_LABELS, batch_size=16)

    df.loc[needs_nlp, "category"] = [r["labels"][0] for r in results]

print(df[["name", "suffix", "category"]].head(20))


Rule-based: 269 files | NLP needed: 743 files
                                         name suffix           category
0     +----------------------------------.txt   .txt  Archive or Backup
1                                   .Rhistory    NaN              Other
2                               ._ipp2-source    NaN              Other
3         01 PCProcessorMicroarchitecture.pdf   .pdf  Archive or Backup
4                  01-probability-review.docx  .docx    Dataset or Code
5                01. WPS  PQR SMAW E6013.json  .json    Dataset or Code
6                    02 RealCloudComputer.pdf   .pdf    Dataset or Code
7                 02-statistical-process.docx  .docx    Dataset or Code
8           03 How TI Adopted VLIW in DSP.pdf   .pdf    Dataset or Code
9                          03-likelihood.docx  .docx    Dataset or Code
10  03485f00-95ad-4b43-9900-5dae10eda43f.jfif  .jfif              Other
11              04 BFS and DFS Variations.pdf   .pdf     Research Paper
12                