In [None]:
# default_exp utils

# Utilities
> Contains helpful functions used throughout `transfertab`.

In [None]:
#export
import bson
import json
from pathlib import Path

In [None]:
import pandas as pd

In [None]:
#export
def getcatdict(df, catcols=None, add_na=False):
    """Map categorical columns of `df` to the list of their distinct values.

    Args:
        df: DataFrame to inspect.
        catcols: Iterable of column names to use. When `None` (the default),
            every column whose dtype compares equal to `'object'` is used.
        add_na: When true, the string `"nan"` is prepended to every value list.

    Returns:
        dict of column name -> list of that column's unique values, in the
        order `Series.unique()` yields them.
    """
    # Identity test rather than `== None`: an array-like `catcols` (for example
    # a pandas Index) compares element-wise, which makes the `if` raise
    # "truth value ... is ambiguous".
    if catcols is None:
        catcols = [col for col, dtype in zip(df.columns, df.dtypes) if dtype == 'object']
    prefix = ["nan"] if add_na else []
    return {cat: prefix + list(df[cat].unique()) for cat in catcols}

In [None]:
# Example frame: one integer column (cat1) and two string columns (cat2, cat3).
df = pd.DataFrame({"cat1": [1, 2, 3, 4, 5], "cat2": ['a', 'b', 'c', 'b', 'a'], "cat3": ['A', 'B', 'C', 'D', 'A']})
df

Unnamed: 0,cat1,cat2,cat3
0,1,a,A
1,2,b,B
2,3,c,C
3,4,b,D
4,5,a,A


In [None]:
# Default call (no `catcols`): columns are picked automatically by dtype.
# cat2 should map to its distinct values in order of first appearance.
catdict = getcatdict(df)
catdict['cat2'] == ['a', 'b', 'c']

True

In [None]:
#export
def store_bson(path, data):
    """Serialize `data` with `bson.dumps` and write the bytes to `path`.

    The payload is encoded before the file is opened, so a serialization
    error leaves any existing file at `path` untouched.
    """
    payload = bson.dumps(data)
    with open(path, "wb") as out_file:
        out_file.write(payload)

def load_bson(path):
    """Read the file at `path` as bytes and decode it with `bson.loads`."""
    with open(path, "rb") as in_file:
        raw = in_file.read()
    decoded = bson.loads(raw)
    return decoded

In [None]:
#export
def _load_extractembeds_from_dir(path):
    path = Path(path)
    json_file = {}
    if path.is_dir():
        for json_f in path.glob("*.json"):
            f = open(json_f, 'rb')
            file = json.load(f)
            for i in file.keys():
                json_file[i] = file[i]
        for bson_f in path.glob("*.bson"):
            file = load_bson(bson_f)
            for i in file.keys():
                json_file[i] = file[i]
    return json_file

In [None]:
#export
def _check_embdict_validity(embdict):
    for catcol, info in embdict.items():
        if set(list(info.keys())) == set(('classes', 'embeddings')) :
            class_len = len(info['classes'])
            embedding_len =  len(info['embeddings'])
            assert class_len == embedding_len, f"Class Size of {class_len} and Embedding size of {embedding_len} does not match for {catcol}"

In [None]:
#export
def generate_files_embedprojector_for_comparision(model1_path, model2_path, dir_path):
    """Write per-column TSV files combining the embeddings of two models.

    Both inputs are read with `load_bson`; each top-level entry is expected to
    provide `'classes'` and `'embeddings'`. For every column two files are
    written into `dir_path`:

    * `<catcol>_meta.tsv`: one label per line, model 1's classes suffixed
      with `_Model1` followed by model 2's classes suffixed with `_Model2`.
    * `<catcol>_embeds.tsv`: the embedding rows, tab-separated, in the same
      order as the labels.

    Args:
        model1_path: Path of the first model's BSON file.
        model2_path: Path of the second model's BSON file.
        dir_path: Output directory (str or path-like). It is not created
            here, so it must already exist.
    """
    # Imported locally: the module's import cell does not bring in `csv`,
    # so the bare name was undefined when this function ran.
    import csv

    out_dir = Path(dir_path)
    model1_embedsdict = load_bson(model1_path)
    model2_embedsdict = load_bson(model2_path)
    # NOTE(review): columns are paired by position and file names use model 1's
    # column name; this assumes both files list the same columns in the same
    # order -- confirm against how the files are produced.
    for (catcol, info1), (_, info2) in zip(model1_embedsdict.items(), model2_embedsdict.items()):
        # f-strings so non-string class values (e.g. ints) can be labelled too.
        model1_data = [[f"{clas}_Model1"] for clas in info1['classes']]
        model2_data = [[f"{clas}_Model2"] for clas in info2['classes']]
        # newline='' lets the csv writer control line endings itself, as the
        # csv module requires for files it writes to.
        with open(out_dir / f"{catcol}_meta.tsv", 'w', newline='') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\n')
            writer.writerows(model1_data)
            writer.writerows(model2_data)
        with open(out_dir / f"{catcol}_embeds.tsv", 'w', newline='') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            writer.writerows(info1['embeddings'])
            writer.writerows(info2['embeddings'])

## Export

In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_extract.ipynb.
Converted 02_transfer.ipynb.
Converted 03_load_tests.ipynb.
Converted index.ipynb.
