In [None]:
from mpcontribs.client import Client
import gzip, json, os
import numpy as np
from pandas import DataFrame
from collections import defaultdict
from tqdm.notebook import tqdm
from unflatten import unflatten

In [None]:
# Name of the MPContribs project that the cells below read from and write to.
name = 'carrier_transport'
# API client constructed with default arguments.
# NOTE(review): presumably credentials/host are picked up from the environment — confirm.
client = Client()

**Retrieve and update project info**

In [None]:
# Optional one-off update of the project's long title; intentionally left disabled.
# NOTE(review): `project` is not defined anywhere in this notebook — presumably
# `name` is intended as the primary key; confirm before enabling.
# client.projects.update_entry(pk=project, project={
#     'long_title': 'Electronic Transport Properties'
# }).result()
# Look up the project called `name` and show its `.pretty()` rendering as the cell output.
client.get_project(name).pretty()

**Create contributions**

In [None]:
# Directory scanned for the transport-data files that become contributions.
input_dir = '/project/projectdirs/matgen/fricci/transport_data/coarse'

# Quantities summarised in each contribution's data section:
# key in the input file, display name, and unit appended to every value.
variables = [
    dict(key=source_key, name=symbol, unit=unit)
    for source_key, symbol, unit in (
        ('cond_eff_mass', 'mₑᶜᵒⁿᵈ', 'mₑ'),
        ('seebeck_doping', 'S', 'µV/K'),
        ('cond_doping', 'σ', '1/Ω/m/s'),
    )
]

# Labels for the individual eigenvalues; the last label is used for their mean.
eigs_keys = [
    'ε₁',
    'ε₂',
    'ε₃',
    'ε̄',
]

# Quantities exported as tables: key in the input file -> [label, unit].
props = dict(
    seebeck_doping=['S', 'µV/K'],
    cond_doping=['σ', '1/Ω/m/s'],
    kappa_doping=['κₑ', 'W/K/m/s'],
)

# Key under which the power-factor values are stored.
pfkey = '⟨S²σ⟩'

In [None]:
# Keep only the regular files found directly inside `input_dir`
# (sub-directories and other entries are dropped), then show how many there are.
files = list(filter(lambda entry: entry.is_file(), os.scandir(input_dir)))
len(files)

In [None]:
# Build one contribution (data summary + tables) per transport-data file in `files`.
contributions = []
total = len(files)

# Iterating over the tqdm wrapper advances the bar for every file, including
# the ones skipped below because their identifier is not valid.
for obj in tqdm(files, total=total):
    # identifier = part of the file name before the first '.', after the last '_'
    identifier = obj.name.split('.', 1)[0].rsplit('_', 1)[-1]

    if not identifier.startswith(('mp-', 'mvc-')):
        print(identifier, 'not valid')
        continue

    # Read the whole document first so the file is closed before processing.
    with gzip.open(obj.path, 'rb') as input_file:
        data = json.loads(input_file.read())

    # The first (key, value) pair of `data['gap']` gives the task type and gap.
    task_type, gap = next(iter(data['gap'].items()))

    cdata = {
        'task': next(iter(data['task_id'].values())),
        'type': task_type,
        'metal': 'Yes' if gap < 0.1 else 'No',
        'T': '300 K',
        'doplvl': '1e18 cm⁻³',
        'ΔE': ' '.join([str(gap), 'eV']),
    }

    # Eigenvalue summary at T='300' and doping='1e+18' for every variable.
    # The squared Seebeck tensor is stored per doping type so that each power
    # factor is computed from the Seebeck tensor of the *same* doping type.
    # This relies on 'seebeck_doping' preceding 'cond_doping' in `variables`.
    seebeck_sq = {}
    for v in variables:
        for doping_type in ['p', 'n']:
            d = data[task_type][v['key']].get(doping_type, {}).get('300', {}).get('1e+18', {})
            if not d:
                continue

            # entries are either a plain list of eigenvalues or a dict holding 'eigs'
            eigs = d if isinstance(d, list) else d['eigs']
            key = '.'.join([v['name'], doping_type])  # use unflatten later
            cdata[key] = dict(
                (eigs_keys[neig], ' '.join([str(eig), v['unit']]))
                for neig, eig in enumerate(eigs)
            )
            # the last eigs_keys label holds the mean of the eigenvalues
            cdata[key][eigs_keys[-1]] = ' '.join([str(np.mean(eigs)), v['unit']])

            if v['key'] == 'seebeck_doping':
                seebeck_sq[doping_type] = np.dot(d['tensor'], d['tensor'])
            elif v['key'] == 'cond_doping' and doping_type in seebeck_sq:
                # mean eigenvalue of S·S·σ, scaled by 1e-8
                # NOTE(review): the 1e-8 factor is presumably the conversion to
                # µW/cm/K²/s — confirm.
                pf = np.mean(
                    np.linalg.eigh(np.dot(seebeck_sq[doping_type], d['tensor']))[0]
                ) * 1e-8
                cdata.setdefault(pfkey, {})[doping_type] = ' '.join([str(pf), 'µW/cm/K²/s'])

    # build data and extreme values for seebeck, conductivity and kappa
    tables = []
    for prop_name, (label, unit) in props.items():
        for doping_type in ['p', 'n']:
            prop = data[task_type][prop_name][doping_type]
            prop_averages, dopings, columns = [], None, ['T [K]']
            temps = sorted(map(int, prop.keys()))
            for temp in temps:
                row = [temp]
                if dopings is None:
                    # doping levels (and thus the column headers) are taken
                    # from the lowest temperature and reused for all rows
                    dopings = sorted(map(float, prop[str(temp)].keys()))
                    columns += [f'{doping:.0e} cm⁻³ [{unit}]' for doping in dopings]
                for doping in dopings:
                    doping_str = f'{doping:.0e}'  # must match the key format in the file
                    eigs = prop[str(temp)][doping_str]['eigs']
                    row.append(np.mean(eigs))
                prop_averages.append(row)

            # one row per temperature: [T, mean(eigs) for each doping level]
            np_prop_averages = np.array(prop_averages)
            df = DataFrame(np_prop_averages, columns=columns)
            df.index.name = f'{label}({doping_type})'
            tables.append(df)

            # Extreme value over all (temperature, doping) pairs: the minimum
            # for kappa and for n-type Seebeck, the maximum otherwise.
            # NOTE(review): presumably because n-type Seebeck is negative and
            # low thermal conductivity is the quantity of interest — confirm.
            arr_prop_avg = np_prop_averages[:, 1:]
            if prop_name[0] == 'k' or (prop_name[0] == 's' and doping_type == 'n'):
                extreme = np.min(arr_prop_avg)
            else:
                extreme = np.max(arr_prop_avg)
            # (row, column) of the first occurrence -> (temperature, doping)
            arg_extreme = np.argwhere(arr_prop_avg == extreme)[0]

            elabel = label + 'ᵉ'
            edoping_type = 'ⁿ' if doping_type == 'n' else 'ᵖ'
            # Merge into any existing entry: the keys differ per doping type,
            # so the 'n' pass must not replace what the 'p' pass stored.
            cdata.setdefault(elabel, {}).update({
                doping_type: ' '.join([str(extreme), unit]),
                f'T{edoping_type}': ' '.join([str(temps[arg_extreme[0]]), 'K']),
                f'c{edoping_type}': ' '.join([str(dopings[arg_extreme[1]]), 'cm⁻³']),
            })

    contrib = {'project': name, 'identifier': identifier, 'is_public': True}
    contrib["data"] = unflatten(cdata)
    contrib["tables"] = tables
    contributions.append(contrib)

# make sure that contributions with all columns come first
contributions = sorted(
    contributions, key=lambda x: len(x["data"]), reverse=True
)
len(contributions)

**Submit contributions**

In [None]:
# Disabled by default; NOTE(review): presumably removes the project's existing
# contributions before a fresh upload — confirm before uncommenting.
# client.delete_contributions(name)
# Only the first contribution (`contributions[:1]`) is passed on here.
# NOTE(review): looks like a trial submission — pass the full list to upload everything; confirm intent.
client.submit_contributions(contributions[:1])

**Query contributions**

In [None]:
# Request a fixed set of fields for the contributions of project `name`.
# Uncomment any of the filters below to narrow the result.
query = dict(
    project=name,
    # data__formula__contains="Li",
    # data__type="GGA+U",
    # data__metal="Yes",
    _fields=["id", "identifier", "formula", "data.mₑᶜᵒⁿᵈ.p.ε̄.value", "tables"],
)
client.contributions.get_entries(**query).result()