In [None]:
from mpcontribs.client import Client
import gzip, json, os
import numpy as np
from pandas import DataFrame
from collections import defaultdict
from tqdm.notebook import tqdm
from unflatten import unflatten

In [None]:
# MPContribs client used by every cell below.
client = Client()
# Project that all contributions in this notebook are read from / written to.
name = "carrier_transport"

**Retrieve and update project info**

In [None]:
# client.projects.update_entry(pk=name, project={
#     'description': 'Ab-initio electronic transport database for inorganic \
# materials. Here are reported the average of the eigenvalues of \
# conductivity effective mass (mₑᶜᵒⁿᵈ), \
# the Seebeck coefficient (S), the conductivity (σ), the electronic thermal \
# conductivity (κₑ), and the Power Factor (PF) \
# at a doping level of 10¹⁸ cm⁻³ and at a temperature of 300 K for n- and p-type. \
# Also, the maximum values for S, σ, PF, and the minimum value for κₑ chosen among the temperatures \
# [100, 1300] K, the doping levels [10¹⁶, 10²¹] cm⁻³, and doping types are reported. \
# The properties that depend on the relaxation time are reported divided by the constant value 10⁻¹⁴. \
# The average of the eigenvalues for all the properties at all the temperatures, \
# doping levels, and doping types are reported in the tables for each entry. \
# A legend of the columns of the table is provided in the following.',
#     'other': {
#         'ΔE': 'Band gap',
#         'V' : 'Volume',
#         'mₑᶜᵒⁿᵈ': 'Eigenvalues (ε₁, ε₂, ε₃) of the conductivity effective mass and their average (ε̄ ) for n and p type at a doping level of 10¹⁸ cm⁻³ and at a temperature of 300 K.',
#         'S': 'Average of the eigenvalues of the Seebeck coefficient for n and p type at a doping level of 10¹⁸ cm⁻³ and at a temperature of 300 K.',
#         'σ' : 'Average of the eigenvalues of the conductivity for n and p type at a doping level of 10¹⁸ cm⁻³ and at a temperature of 300 K.',
#         'PF': 'Average of the eigenvalues of the Power Factor for n and p type at a doping level of 10¹⁸ cm⁻³ and at a temperature of 300 K.',
#         #'T': 'Temperature at which the averaged properties are reported.',
#         #'doplvl': 'Doping level at which the averaged properties are reported.',
#         'Sᵉ': 'Maximum value of the average of the eigenvalues of the Seebeck coefficient chosen among all the temperatures, doping levels, and doping type.',
#         'σᵉ': 'Maximum value of the average of the eigenvalues of the conductivity chosen among all the temperatures, doping levels, and doping type.',
#         'κₑᵉ': 'Minimum value of the average of the eigenvalues of the electronic thermal conductivity chosen among all the temperatures, doping levels, and doping type.',
#         'Tⁿᵖ': 'Temperature corresponding to the extreme value of the property. n or p superscript refer to the doping type.',
#         'cⁿᵖ': 'Doping level corresponding to the extreme value of the property. n or p superscript refer to the doping type',
#     }
# }).result()

In [None]:
# Fetch the project's current info and show its pretty() form as the cell output.
project_info = client.get_project(name)
project_info.pretty()

**Create and submit contributions**

In [None]:
# Directory scanned for the gzipped JSON input files.
input_dir = '/project/projectdirs/matgen/fricci/transport_data/coarse'

# Labels for the three eigenvalues followed by the label for their average.
eigs_keys = ['ε₁', 'ε₂', 'ε₃', 'ε̄']

# Top-level data key under which the power factor is stored.
pfkey = 'PF'

# Properties tabulated per temperature/doping level: input key -> [label, unit].
# The order matters to the submission cell: 'pf' must directly follow
# 'cond_doping', which in turn must follow 'seebeck_doping'.
props = {
    'seebeck_doping': ['S', 'µV/K'],
    'cond_doping': ['σ', '1/Ω/m/s'],
    'pf': ['PF','µW/cm/K²/s'],
    'kappa_doping': ['κₑ', 'W/K/m/s'],
}

# Quantities summarised at 300 K / 1e18 cm⁻³: the effective mass plus every
# entry of `props` that is read directly from the input (i.e. all but 'pf').
variables = [{'key': 'cond_eff_mass', 'name': 'mₑᶜᵒⁿᵈ', 'unit': 'mₑ'}] + [
    {'key': key, 'name': label, 'unit': unit}
    for key, (label, unit) in props.items()
    if key != 'pf'
]

In [None]:
# Collect the regular files directly inside input_dir (sub-directories are skipped).
with os.scandir(input_dir) as dir_entries:
    files = [entry for entry in dir_entries if entry.is_file()]
len(files)

In [None]:
def chunks(lst, n=1000):
    """Yield consecutive slices of `lst`, each holding at most `n` items.

    `n` is clamped to a minimum of 1, so zero or negative sizes produce
    single-item slices. The final slice may be shorter than `n`.
    """
    step = max(1, n)
    yield from (lst[start:start + step] for start in range(0, len(lst), step))

In [None]:
def _with_unit(value, unit):
    """Return '<value> <unit>', the string form used for every quantity in the contribution data."""
    return ' '.join([str(value), unit])


def _power_factor(seebeck_squared, cond_tensor):
    """Return the mean eigenvalue of dot(seebeck_squared, cond_tensor), scaled by 1e-8.

    NOTE(review): eigh assumes a symmetric matrix -- confirm the product of the
    two tensors qualifies. The 1e-8 factor is presumably the conversion to
    µW/cm/K²/s -- TODO confirm.
    """
    return np.mean(np.linalg.eigh(np.dot(seebeck_squared, cond_tensor))[0]) * 1e-8


# Order in which the top-level data keys are submitted (sets the column order).
ordered_keys = ['task', 'type', 'metal', 'ΔE', 'V',
                'mₑᶜᵒⁿᵈ', 'S', 'σ', 'κₑ',
                'PF', 'Sᵉ', 'σᵉ', 'κₑᵉ', 'PFᵉ']

# NOTE: destructive -- removes the project's existing contributions before resubmitting.
client.delete_contributions(name, per_page=100)
total = len(files)
chunk_size = 1002

with tqdm(total=total) as pbar:
    for chunk in chunks(files, n=chunk_size):
        contributions = []

        for obj in chunk:
            # the identifier is the last '_'-separated token of the file name stem
            identifier = obj.name.split('.', 1)[0].rsplit('_', 1)[-1]

            if not identifier.startswith(('mp-', 'mvc-')):
                print(identifier, 'not valid')
                pbar.update(1)  # skipped files still count towards the progress total
                continue

            with gzip.open(obj.path, 'rb') as input_file:
                data = json.loads(input_file.read())

            # prefer the GGA+U results whenever the file provides them
            task_type = 'GGA+U' if 'GGA+U' in data['gap'] else 'GGA'
            gap = data['gap'][task_type]

            cdata = {
                'task': data['task_id'][task_type],
                'type': task_type,
                'metal': 'Yes' if gap < 0.1 else 'No',
                'ΔE': _with_unit(gap, 'eV'),
                'V': _with_unit(data['volume'], 'Å³'),
            }

            # --- summary values at 300 K and a doping level of 1e+18 -------------
            for doping_type in ['p', 'n']:
                # reset per doping type so the power factor never combines a stale
                # (other doping type) or missing Seebeck tensor with this conductivity
                seebeck_squared = None
                for var in variables:
                    entry = data[task_type][var['key']].get(doping_type, {}).get('300', {}).get('1e+18', {})
                    if not entry:
                        continue

                    # the effective mass is stored as a bare list of eigenvalues,
                    # the other quantities as dicts with an 'eigs' list
                    eigs = entry if isinstance(entry, list) else entry['eigs']

                    if var['key'] == 'cond_eff_mass':
                        # Nest under the plain name (not a dotted 'name.doping' key)
                        # so the `ordered_keys` lookup below finds it; the unflattened
                        # result is the same nested structure.
                        mass = {eigs_keys[-1]: _with_unit(np.mean(eigs), var['unit'])}
                        for neig, eig in enumerate(eigs):
                            mass[eigs_keys[neig]] = _with_unit(eig, var['unit'])
                        cdata.setdefault(var['name'], {})[doping_type] = mass
                        continue

                    cdata.setdefault(var['name'], {})[doping_type] = _with_unit(np.mean(eigs), var['unit'])
                    if var['key'] == 'seebeck_doping':
                        seebeck_squared = np.dot(entry['tensor'], entry['tensor'])
                    elif var['key'] == 'cond_doping' and seebeck_squared is not None:
                        pf = _power_factor(seebeck_squared, entry['tensor'])
                        cdata.setdefault(pfkey, {})[doping_type] = _with_unit(pf, 'µW/cm/K²/s')

            # --- per-temperature/doping tables and extreme values -----------------
            tables = []
            for doping_type in ['p', 'n']:
                seebeck_grid = {}  # (temperature, doping string) -> dot(S, S)
                pf_rows = []
                temps, dopings = [], []

                # relies on the order of `props`: 'pf' reuses the rows, temperatures
                # and doping levels gathered while processing 'cond_doping', which in
                # turn needs the Seebeck tensors collected before it
                for prop_name, (label, unit) in props.items():
                    if prop_name == 'pf':
                        rows = pf_rows
                    else:
                        prop = data[task_type][prop_name][doping_type]
                        temps = sorted(map(int, prop.keys()))
                        # doping levels are taken from the first temperature
                        dopings = sorted(map(float, prop[str(temps[0])].keys())) if temps else []
                        rows = []
                        if prop_name == 'cond_doping':
                            pf_rows = []

                        for temp in temps:
                            row, pf_row = [temp], [temp]
                            for doping in dopings:
                                doping_str = f'{doping:.0e}'
                                entry = prop[str(temp)][doping_str]
                                row.append(np.mean(entry['eigs']))
                                if prop_name == 'seebeck_doping':
                                    seebeck_grid[temp, doping_str] = np.dot(entry['tensor'], entry['tensor'])
                                elif prop_name == 'cond_doping':
                                    pf_row.append(_power_factor(seebeck_grid[temp, doping_str], entry['tensor']))
                            rows.append(row)
                            if prop_name == 'cond_doping':
                                pf_rows.append(pf_row)

                    # build the headers per property so the PF table carries the PF
                    # unit instead of inheriting the conductivity columns
                    columns = ['T [K]'] + [f'{doping:.0e} cm⁻³ [{unit}]' for doping in dopings]
                    averages = np.array(rows)
                    df = DataFrame(averages, columns=columns).astype(str)
                    df.index.name = f'{label}({doping_type})'
                    tables.append(df)

                    # extreme value over all temperatures and doping levels: the
                    # minimum for keys starting with 'k' and for n-type keys starting
                    # with 's', the maximum otherwise
                    values = averages[:, 1:]
                    use_min = prop_name[0] == 'k' or (prop_name[0] == 's' and doping_type == 'n')
                    extreme = np.min(values) if use_min else np.max(values)
                    itemp, idoping = np.argwhere(values == extreme)[0]

                    elabel = label + 'ᵉ'
                    edoping_type = 'ⁿ' if doping_type == 'n' else 'ᵖ'
                    # merge rather than assign: a plain assignment let the 'n' pass
                    # discard the 'p' extremes recorded just before it
                    cdata.setdefault(elabel, {}).update({
                        doping_type: _with_unit(extreme, unit),
                        f'T{edoping_type}': _with_unit(temps[itemp], 'K'),
                        f'c{edoping_type}': _with_unit(dopings[idoping], 'cm⁻³'),
                    })

            # keep only the known keys, in display order
            ordered = {key: cdata[key] for key in ordered_keys if key in cdata}
            contributions.append({
                'project': name,
                'identifier': identifier,
                'is_public': True,
                'data': unflatten(ordered),
                'tables': tables,
            })

            pbar.update(1)

        # make sure that contributions with all columns come first
        contributions.sort(key=lambda x: len(x["data"]), reverse=True)

        client.submit_contributions(contributions, per_page=25)
        break # TODO remove to keep going until all ~48k submitted

**Query contributions**

In [None]:
# Fields returned for each matching contribution.
fields = ["id", "identifier", "formula", "data.mₑᶜᵒⁿᵈ.p.ε̄.value"]

# Filters: GGA+U entries flagged as metal whose p-type average effective mass
# is at most 1, ordered by that mass.
query = {
    "project": name,
#     "formula_contains": "Zn",
#     "identifier__in": ["mp-10695", "mp-760381"], # ZnS, CuS
    "data__type__contains": "GGA+U",
    "data__metal__contains": "Yes",
    "data__mₑᶜᵒⁿᵈ__p__ε̄__value__lte": 1,
    "_order_by": "data__mₑᶜᵒⁿᵈ__p__ε̄__value",
    "_fields": fields,
}
entries_request = client.contributions.get_entries(**query)
entries_request.result()