In [23]:
import json
from pathlib import Path
import dateutil.parser
import pwd
!PYTHONUSERBASE="$(pwd)/.ipython" pip install --user git+https://github.com/tqdm/tqdm.git@master#egg=tqdm
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import pandas as pd

def load_entry(path):
    """Parse one JSON module-usage log file into a flat record.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the log file. Its JSON object must provide the keys
        "timestamp", "sys.executable", "hostname" and "modules".

    Returns
    -------
    dict or None
        A dict with the keys "timestamp", "environment", "hostname",
        "modules" and "username", or None when the file cannot be opened
        or does not contain valid JSON.

    Raises
    ------
    OSError
        If the file cannot be stat'ed.
    KeyError
        If the owning uid has no passwd entry, or an expected key is
        missing from the JSON object.
    """
    # The owner of the file on disk is recorded as the user of the log entry.
    user_id = Path(path).stat().st_uid
    username = pwd.getpwuid(user_id).pw_name
    try:
        # Context manager guarantees the handle is closed; this function is
        # called for many thousands of files per worker process.
        with open(path) as handle:
            raw = json.load(handle)
    except (OSError, ValueError):
        # Unreadable path (e.g. a directory) or malformed/undecodable JSON:
        # skip the entry.  json.JSONDecodeError and UnicodeDecodeError are
        # both ValueError subclasses.  KeyboardInterrupt/SystemExit are
        # deliberately NOT swallowed here.
        return None
    return {
        "timestamp": dateutil.parser.parse(raw["timestamp"]),
        # Two directory levels above the interpreter path
        # (for ".../<prefix>/bin/python" this yields "<prefix>").
        "environment": Path(raw["sys.executable"]).parent.parent.as_posix(),
        "hostname": raw["hostname"],
        "modules": extract_modules(raw["modules"]),
        "username": username,
    }

def extract_modules(raw_dict):
    """Collect the top-level package names of the logged modules.

    Parameters
    ----------
    raw_dict : dict
        Maps a dotted module name to its filesystem path (which may be
        empty or None).

    Returns
    -------
    set of str
        First dotted component of every module that is kept.  A module is
        dropped when it has a non-empty path that does not contain
        "site-packages", or when its full name is exactly "sitecustomize".
    """
    top_level = set()

    for dotted_name, location in raw_dict.items():
        root = dotted_name.partition('.')[0]
        # Already recorded through another (sub)module: nothing to decide.
        if root in top_level:
            continue
        # Modules without a path are kept; modules with a path must live
        # under a site-packages directory.
        outside_site_packages = bool(location) and "site-packages" not in location
        if outside_site_packages or dotted_name == "sitecustomize":
            continue
        top_level.add(root)

    return top_level

Collecting tqdm
  Cloning https://github.com/tqdm/tqdm.git (to revision master) to /tmp/pip-install-r5sb8h94/tqdm_4e45b356a157484fb0869c447c93a010
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[?25h

In [24]:
def list_to_dataframe(logs_list):
    """Flatten parsed log entries into one DataFrame row per module.

    Parameters
    ----------
    logs_list : iterable of (dict or None)
        Entries as returned by ``load_entry``.  Falsy entries (None, empty
        dict) are skipped.  Every other entry must have a "modules" key
        holding an iterable of module names.

    Returns
    -------
    pandas.DataFrame
        One row per (entry, module) pair: a "module" column followed by
        the entry's remaining keys.  The input dicts are left unmodified,
        so the function can safely be called again on the same list.
    """
    data = []
    for log in logs_list:
        if not log:
            continue
        # Copy the per-entry fields instead of pop()-ing "modules" out of
        # the caller's dict, which would make a second call fail.
        shared = {key: value for key, value in log.items() if key != "modules"}
        for module in log["modules"]:
            data.append({"module": module, **shared})
    return pd.DataFrame(data)

def crawl_logs(top_dir, *patterns, max_workers=8, chunksize=100):
    """Load every log file matching the glob patterns into a DataFrame.

    Parameters
    ----------
    top_dir : str or pathlib.Path
        Directory the patterns are resolved against.
    *patterns : str
        Glob patterns relative to ``top_dir``.
    max_workers : int, optional
        Number of worker processes used for parsing (default 8).
    chunksize : int, optional
        Number of files handed to a worker per task (default 100).

    Returns
    -------
    pandas.DataFrame
        Result of ``list_to_dataframe`` applied to all parsed entries.
    """
    top_dir = Path(top_dir)
    files_list = []
    for pattern in patterns:
        files_list.extend(top_dir.glob(pattern))
    n_files = len(files_list)
    print(f"Processing {n_files} files...")
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        entries = pool.map(load_entry, files_list, chunksize=chunksize)
        # pool.map returns a lazy iterator with no length, so tqdm needs the
        # total passed explicitly to show a percentage and an ETA.
        results = list(tqdm(entries, total=n_files))
    return list_to_dataframe(results)

In [25]:
# Root directory of the module-usage logs; the glob patterns given to
# crawl_logs are resolved relative to it.
top = '/lus/theta-fs0/logs/pythonlogging/module_usage'
# Parse every file matching 2020/12/04/* into a DataFrame
# (presumably a YYYY/MM/DD directory layout, i.e. one day of logs — confirm).
df = crawl_logs(top, "2020/12/04/*")

Processing 17664 files...


17664it [00:03, 5706.14it/s]


In [26]:
# Preview the first rows; each row pairs one module with the fields of the
# log entry it was found in.
df.head()

Unnamed: 0,module,timestamp,environment,hostname,username
0,jsonschema,2020-12-04 03:00:33.551531,/soft/datascience/conda/miniconda3/latest,nid02568,berres
1,six,2020-12-04 03:00:33.551531,/soft/datascience/conda/miniconda3/latest,nid02568,berres
2,importlib_metadata,2020-12-04 03:00:33.551531,/soft/datascience/conda/miniconda3/latest,nid02568,berres
3,pvectorc,2020-12-04 03:00:33.551531,/soft/datascience/conda/miniconda3/latest,nid02568,berres
4,pandas,2020-12-04 03:00:33.551531,/soft/datascience/conda/miniconda3/latest,nid02568,berres


In [27]:
# Count the rows per module (one row per log entry that listed the module)
# and show the 20 most frequent.
df.value_counts("module").head(20)

module
google                17651
mpl_toolkits          17579
six                   17023
numpy                 16886
dateutil              16834
pytz                  16829
pandas                16819
idna                  16810
mkl                   16755
jsonschema            16636
zipp                  16636
importlib_metadata    16636
pvectorc              16636
pyrsistent            16636
attr                  16636
radical                 445
colorama                391
msgpack                 355
zmq                     334
netifaces               333
dtype: int64