In [1]:
%matplotlib notebook
import os
import os.path as path
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
from faker import Factory
from scipy import interpolate
from sklearn.preprocessing import MinMaxScaler
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import math 
import itertools 
from ipywidgets import interact, interactive, fixed, interact_manual, widgets
from ipywidgets import HBox, VBox
from ipywidgets import IntSlider, Output
from IPython.display import clear_output
import h5py
import re

# Local module and scripts
from pyccapt.calibration.calibration_tools import tools, data_tools, variables, calibration, data_loadcrop
from pyccapt.calibration.calibration_tools import widgets as wd
from pyccapt.calibration.calibration_tools import dataset_path_qt

In [2]:
# Disable showing logging in Jupyter notebook
import logging, sys
logging.disable(sys.maxsize)
save_fig=False
plot_fig=True 

In [3]:
button = widgets.Button(
    description='load dataset',
)

@button.on_click
def open_file_on_click(b):
    global dataset_path
    dataset_path = dataset_path_qt.gui_fname().decode('ASCII')
button

Button(description='load dataset', style=ButtonStyle())

In [4]:
# number_sample = 300
# save_fig=False
# plot_fig=True 

In [5]:
tdc, pulse_mode, flightPathLength_d, t0_d, max_tof = wd.dataset_tdc_selection()
display(tdc, pulse_mode)

Dropdown(description='TDC model:', options=('surface_concept', 'roentdec'), value='surface_concept')

Dropdown(description='Pulse mode:', options=('voltage', 'laser'), value='voltage')

$$\textbf{You can specify which dataset to use in below block}$$

In [6]:
tdc_model = tdc.value
pulse_mode_ini = pulse_mode.value


dataset_main_path = os.path.dirname(dataset_path)
dataset_name_with_extention = os.path.basename(dataset_path)
dataset_name = os.path.splitext(dataset_name_with_extention)[0]


variables.init()
# variables.path = os.path.join(p, 'tests//data')
variables.result_path = os.path.dirname(dataset_main_path) + '/ion_type_selection/'
if not os.path.isdir(variables.result_path):
    os.makedirs(variables.result_path, mode=0o777, exist_ok=True)
        
filename = dataset_path

head, tail = os.path.split(filename)
figname = os.path.splitext(tail)[0]

data = data_tools.read_hdf5_through_pandas(filename)

In [7]:
# tdc_model = tdc.value
# pulse_mode_ini = pulse_mode.value

# dataset_name = dataset.value

# flightPathLength = float(flightPathLength_d.value) # mm 
# # The initial value for t_0
# t0 = float(t0_d.value) # ns

# p = path.abspath(path.join("", "../../../.."))

# variables.init()

# path_main = os.path.join(p, 'tests//data')
# filename_main = path_main + '//' + dataset_name + '.h5'

# # variables.path = os.path.join(p, 'tests//results//mc_vol_bowl_calibratin')
# variables.path = os.path.join(p, 'tests//results//tof_calibration')
# variables.result_path = os.path.join(p, 'tests/results/ion_type_selection/' + dataset_name)
# if not os.path.isdir(variables.result_path):
#         os.makedirs(variables.result_path, mode=0o777, exist_ok=True)
        
# filename = variables.path + '//' + dataset_name + '//' + dataset_name + '.h5'



# head, tail = os.path.split(filename)
# figname = os.path.splitext(tail)[0]

# data = data_tools.read_hdf5_through_pandas(filename)


In [8]:
data

Unnamed: 0,high_voltage (V),pulse (V),start_counter,t (ns),mc (Da),x (mm),y (mm),pulse_pi,ion_pp),mc_c (Da)
0,5253.633789,5253.633789,10124.0,642.724902,27.441679,8.404898,27.411429,0.0,1.0,26.681165
1,5253.633789,5253.633789,10364.0,635.544576,28.032878,-11.429388,-10.665306,240.0,1.0,27.418730
2,5253.633789,5253.633789,10503.0,622.582956,27.324326,0.700408,2.928980,139.0,1.0,26.646873
3,5253.633789,5253.633789,11048.0,463.916268,14.008961,4.743673,14.262857,545.0,1.0,13.555568
4,5253.633789,5253.633789,11060.0,635.812038,27.200967,21.521633,13.053061,12.0,1.0,26.755766
...,...,...,...,...,...,...,...,...,...,...
1897669,5710.397949,5710.397949,25553.0,599.416632,27.361342,-0.413878,0.127347,254.0,1.0,26.688989
1897670,5710.397949,5710.397949,25612.0,622.150902,28.206176,-0.350204,25.119184,59.0,1.0,27.271514
1897671,5710.397949,5710.397949,25863.0,618.913926,27.800315,-25.533061,4.361633,251.0,1.0,26.785741
1897672,5710.397949,5710.397949,25900.0,609.443028,26.954508,20.821224,-14.231020,37.0,1.0,26.763381


In [9]:
variables.mc_calib = data['mc (Da)'].to_numpy()

In [None]:
def find_nearest(a, a0, num):
    "Element in nd array `a` closest to the scalar value `a0`"
    idx = []
    for i in range(num):
        idx.append(np.abs(a - a0).argmin())
        a[idx] = -200 #some dmmy negative value
    return idx

def find_close_element(target_elem, num_c, mode='element', aboundance_threshold=1, charge=4):
    if mode=='element'
        isotopeTableFile = '../../../files/isotopeTable.h5'
    elif:
        pass
    dataframe = data_tools.read_hdf5_through_pandas(isotopeTableFile)

    elements = dataframe['element'].to_numpy()
    isotope_number = dataframe['isotope'].to_numpy()
    abundance = dataframe['abundance'].to_numpy()
    element_abundance = np.repeat(abundance, charge)

    element_wights = np.zeros((len(elements), charge))
    elements_w = dataframe['weight'].to_numpy()
    for i in range(charge):
        element_wights[:,i] = elements_w/(i+1)
    element_wights = element_wights.flatten()

    elem = np.core.defchararray.add(elements.astype('U'), isotope_number.astype('U'))
    element_list = np.zeros(len(elem)*charge).astype('U')
    for i in range(len(elem)):
        for j in range(charge):
            element_list[i+j+((charge-1)*i)] = elem[i] + '+'*(j+1)

    idxs = find_nearest(np.copy(element_wights), target_elem, num_c)

    element_c = element_list[idxs]
    element_wights_c = element_wights[idxs]
    abundance_c = element_abundance[idxs]

    index_sort = np.argsort(abundance_c)
    index_sort = np.flip(index_sort)

    element_c = element_c[index_sort]
    element_wights_c = element_wights_c[index_sort]
    abundance_c = abundance_c[index_sort] 

    if aboundance_threshold < 1.0:
        element_c = element_c[abundance_c < aboundance_threshold]
        element_wights_c = element_wights_c[abundance_c < aboundance_threshold]
        abundance_c = abundance_c[abundance_c < aboundance_threshold] 
    
    df = pd.DataFrame({'element': element_c, 'weight': element_wights_c, 'abundance': abundance_c})
    return df

In [374]:
isotopeTableFile = '../../../files/isotopeTable.h5'
dataframe = data_tools.read_hdf5_through_pandas(isotopeTableFile)

molecule_isotop_list(dataframe,'CuO2')

Unnamed: 0,molecule,weight,abundance
0,Cu(63)O(16)2,94.92,0.4739103
1,Cu(63)O(17)2,96.93,9.97631e-15
2,Cu(63)O(18)2,98.93,7.655182e-12
3,Cu(65)O(16)2,96.92,0.09414725
4,Cu(65)O(17)2,98.93,1.981899e-15
5,Cu(65)O(18)2,100.93,1.520782e-12


In [376]:
dataframe

Unnamed: 0,atomicNumber,element,isotope,weight,abundance,atomDensity
0,1,H,1,1.01,99.98500,52.7
1,1,H,2,2.01,0.01500,52.7
2,2,He,3,3.02,0.00014,28.7
3,2,He,4,4.00,99.99990,28.7
4,3,Li,6,6.02,7.50000,46.3
...,...,...,...,...,...,...
303,99,Es,252,252.08,0.00000,21.1
304,100,Fm,257,257.10,0.00000,0.0
305,101,Md,258,258.10,0.00000,0.0
306,102,No,259,259.10,0.00000,0.0


In [None]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
def molecule_isotop_list(dataframe, target_element):
    target_element = fix_prantesis(target_element)
        
    elements = dataframe['element'].to_numpy()
    isotope_number = dataframe['isotope'].to_numpy()
    abundance = dataframe['abundance'].to_numpy()
    weight = dataframe['weight'].to_numpy()

    molecule_formula = re.findall('(\d+|[A-Za-z]+)', target_element)
    molecule_formula = [re.split('(?<=.)(?=[A-Z])', item) for item in molecule_formula]
    molecule_formula = list(itertools.chain(*molecule_formula))
    
    elem_wights = []
    elem_aboundance = []
    elem_compo = []

    for i in range(len(molecule_formula)):
        if not molecule_formula[i].isnumeric():
            idx_element = np.where(elements == molecule_formula[i])
            elem_compo_temp = []
            elem_wights_tmp = []
            elem_aboundance_tmp = []
            for j in range(len(idx_element[0])):
                if i+1 < len(molecule_formula):
                    if molecule_formula[i+1].isnumeric():
                        number_of_elem = int(molecule_formula[i+1])
                        elem_compo_temp.append(elements[idx_element[0][j]] + '('+str(isotope_number[idx_element[0][j]])+')'+str(number_of_elem))
                    else:
                        number_of_elem = 1
                        elem_compo_temp.append(elements[idx_element[0][j]] + '('+str(isotope_number[idx_element[0][j]])+')')
                else:
                    number_of_elem = 1
                    elem_compo_temp.append(elements[idx_element[0][j]] + '('+str(isotope_number[idx_element[0][j]])+')')
                
                
                elem_wights_tmp.append(weight[idx_element[0][j]]*number_of_elem)
                aboundance_i = abundance[idx_element[0][j]]/100
                for k in range(number_of_elem):
                    aboundance_i = aboundance_i * aboundance_i
                elem_aboundance_tmp.append(aboundance_i)

            elem_compo.append(elem_compo_temp)
            elem_wights.append(elem_wights_tmp)
            elem_aboundance.append(elem_aboundance_tmp)

    list_elem_compo = list(itertools.product(*elem_compo))
    list_elem_wights = list(itertools.product(*elem_wights))
    list_elem_aboundance = list(itertools.product(*elem_aboundance))


    list_elem_compo = [''.join(item) for item in list_elem_compo]
    list_elem_wights = [sum(item) for item in list_elem_wights]
    list_elem_aboundance = [math.prod(item) for item in list_elem_aboundance]

    df = pd.DataFrame({'molecule': list_elem_compo, 'weight': list_elem_wights, 'abundance': list_elem_aboundance})
    return df

In [11]:
bin_size=widgets.FloatText(value=0.1, description='bin size:')
prominence=widgets.IntText(value=60, description='peak prominance:')
distance=widgets.IntText(value=50, description='peak distance:')
lim_tof=widgets.IntText(value=150, description='lim tof/mc:')
percent=widgets.IntText(value=50, description='percent MRP:')

def hist_plot(figname, plot):
    peaks_sides_p = np.zeros(0)
    with out:
        clear_output(True)

        bin_size_p = bin_size.value
        prominence_p = prominence.value
        distance_p = distance.value
        lim_tof_p = lim_tof.value
        percent_p = percent.value
        peaks_ini, peaks_y_ini, peak_widths_p_ini = tools.hist_plot(variables.mc_calib[variables.mc_calib < lim_tof_p], bin_size_p, distance=distance_p, percent=percent_p, prominence=prominence_p, selector='peak', plot=plot, label='mc', fig_name=figname)
        index_max_ini = np.argmax(peaks_y_ini)
        variables.max_peak = peaks_ini[index_max_ini]
        variables.peak = peaks_ini
        mrp = (peaks_ini[index_max_ini] / (peak_widths_p_ini[index_max_ini][2] - peak_widths_p_ini[index_max_ini][1]))
        print('Mass resolving power for the highest peak (MRP --> m/m_2-m_1):', mrp)
        for i in range(len(peaks_ini)):
            print('Peaks ', i, 'is at location and height: ({:.2f}, {:.2f})'.format(peaks_ini[i], peaks_y_ini[i]), 'peak window sides (half-maximum) are: ({:.2f}, {:.2f})'.format(peak_widths_p_ini[i][1], peak_widths_p_ini[i][2]))

In [31]:
peak_val = widgets.FloatText(value=1.1, description='peak value:')
charge = widgets.Dropdown(
    options=[('1', 1), ('2', 2), ('3', 3), ('4', 4)],
    value=3,
    description='charge:'
)
aboundance_threshold = widgets.FloatText(value=1, description='aboundance threshold:')
num_element = widgets.IntText(value=1, description='num element:')

num_molecule = widgets.IntText(value=1, description='num molecule:')


save_b = widgets.Dropdown(
    options=[('False', False), ('True', True)],
    description='save fig:'
)
def element_finder():
    with out:
        peak_val_s = peak_val.value
        charge_s = charge.value
        num_element_s = num_element.value
        num_molecule_s = num_molecule.value
        all_element_s = all_element.value
        element_threshold_s = element_threshold.value
        aboundance_threshold_s = aboundance_threshold.value
        clear_output(True)
        df = find_close_element(peak_val_s, num_element_s, aboundance_threshold_s, charge=charge_s)
        print(df)

In [368]:
isotope_formula = widgets.Text(
    value='',
    placeholder='Type a formula',
    description='Isotope formula:',
    disabled=False
)

def manual_formula_calculator():
    isotopeTableFile = '../../../files/isotopeTable.h5'
    dataframe = data_tools.read_hdf5_through_pandas(isotopeTableFile)
    df = molecule_isotop_list(dataframe, isotope_formula.value)
    with out:
        df = molecule_isotop_list(dataframe, isotope_formula.value)
        clear_output(True)
        print(df)

In [373]:
plot_button = widgets.Button(
    description='plot hist',
)

find_button = widgets.Button(
    description='find element',
)

formula_find_button = widgets.Button(
    description='find molecule',
)

peak_lable = widgets.Text(
    value='',
    placeholder='Type peak element',
    description='peak elem:',
    disabled=False
)
    
@plot_button.on_click
def plot_on_click(b, figname=figname, plot=True):
    hist_plot(figname, plot)

    
@find_button.on_click
def vol_on_click(b,):
    element_finder()
    
@formula_find_button.on_click
def manual_formula(b,):
    manual_formula_calculator()
    
tab1 = VBox(children=[bin_size, prominence, distance, lim_tof, percent])
tab2 = HBox(children=[VBox(children=[peak_val, num_element, num_molecule, charge, aboundance_threshold, all_element, element_threshold, 
                      find_button]),HBox(children=[isotope_formula, formula_find_button])])


tab = widgets.Tab(children=[tab1, tab2])
tab.set_title(0, 'mc plot')
tab.set_title(1, 'element finder')



display(VBox(children=[tab,HBox(children=[plot_button, peak_lable])]))
out = Output()
display(out)

VBox(children=(Tab(children=(VBox(children=(FloatText(value=0.01, description='bin size:'), IntText(value=60, …

Output()

In [47]:
peak_lable.value

'H+, Al++, Al+'

In [None]:
peaks_chos = []
for i in range(len(variables.peaks_idx)):
    peaks_chos.append(variables.peak[variables.peaks_idx[i]])

In [324]:

def fix_prantesis(c):
    index = []
    for i in range(len(c)):
        if c[i]== '(':
            index.append(i+1)
        if c[i]== ')':
            index.append(i)
            index.append(int(c[i+1]))
    index = list(chunks(index, 3))
    list_parantesis = []
    for i in range(len(index)):
        tmp = c[index[i][0]:index[i][1]]
        tmp = re.findall('[A-Z][^A-Z]*', tmp)
        for j in range(len(tmp)):
            if tmp[j].isalpha():
                tmp[j] = tmp[j] + str(index[i][-1])
            elif not tmp[j].isalpha():
                dd = int(re.findall(r'\d+', tmp[j])[0])*index[i][-1]
                tmp[j] = ''.join([p for p in tmp[j] if not p.isdigit()]) + str(dd)
        list_parantesis.append("".join(tmp))

    for i in range(len(list_parantesis)):
        gg = list_parantesis[i]
        c = list(c)
        c[index[i][0]-1-(2*i):index[i][1]+2] = list_parantesis[i]
    
    return ''.join(c)
    


In [360]:
isotopeTableFile = '../../../files/isotopeTable.h5'
dataframe = data_tools.read_hdf5_through_pandas(isotopeTableFile)
molecul_list_file = '../../../files/list_of_chemical.csv'
molecule_dataframe = pd.read_csv(molecul_list_file, encoding= 'utf-8', header = 0) 
molecule_dataframe= molecule_dataframe[molecule_dataframe['Chemical formula'].notna()]
molecule_dataframe = molecule_dataframe.reset_index(drop=True)
ff = molecule_dataframe['Chemical formula']
for i in range(len(ff)):
#     print(i, ff[i])
    if i == 0:
        df2 = molecule_isotop_list(dataframe, ff[i])
    else:
        df3 = molecule_isotop_list(dataframe, ff[i])
        df2 = pd.concat([df2, df3], ignore_index=True)


print(len(df2))

69331


In [None]:
# GMM number of componnents
from sklearn import mixture
tt = np.copy(mc[mc<100])
tt = np.expand_dims(tt, axis=1)
tt = np.float32(tt)

n_components = np.arange(2, 6)
models = [mixture.GaussianMixture(n, covariance_type='full', random_state=0).fit(tt)
          for n in n_components]

fig1, ax1 = plt.subplots(figsize=(4, 3))
bic_test = [m.bic(tt) for m in models]
aic_test = [m.aic(tt) for m in models]
plt.plot(n_components, bic_test, label='BIC')
plt.plot(n_components, aic_test, label='AIC')
plt.legend(loc='best')
plt.xlabel('n_components')
plt.show()

In [None]:
n_components = min(np.argmin(bic_test), np.argmin(aic_test))
print(n_components)
# n_components = 5

In [None]:
tt = np.copy(mc[mc<100])
tt = np.expand_dims(tt, axis=1)
tt = np.float32(tt)


bins = 0.1
bins = np.linspace(np.min(tt), np.max(tt), round(np.max(tt) / bins))

gmm = mixture.GaussianMixture(n_components=n_components, covariance_type='full').fit(tt)
labels = gmm.predict(tt)


fig1, ax1 = plt.subplots(figsize=(6, 4))
for i in range(n_components):
    A = tt[labels==i]
    plt.hist(A, bins, log=True, histtype='step')
plt.show()

In [None]:
from sklearn import cluster
from scipy.spatial import distance
import sklearn.datasets
from sklearn.preprocessing import StandardScaler
import numpy as np

def compute_bic(kmeans,X):
    """
    Computes the BIC metric for a given clusters

    Parameters:
    -----------------------------------------
    kmeans:  List of clustering object from scikit learn

    X     :  multidimension np array of data points

    Returns:
    -----------------------------------------
    BIC value
    """
    # assign centers and labels
    centers = [kmeans.cluster_centers_]
    labels  = kmeans.labels_
    #number of clusters
    m = kmeans.n_clusters
    # size of the clusters
    n = np.bincount(labels)
    #size of data set
    N, d = X.shape

    #compute variance for all clusters beforehand
    cl_var = (1.0 / (N - m) / d) * sum([sum(distance.cdist(X[np.where(labels == i)], [centers[0][i]], 
             'euclidean')**2) for i in range(m)])

    const_term = 0.5 * m * np.log(N) * (d+1)

    BIC = np.sum([n[i] * np.log(n[i]) -
               n[i] * np.log(N) -
             ((n[i] * d) / 2) * np.log(2*np.pi*cl_var) -
             ((n[i] - 1) * d/ 2) for i in range(m)]) - const_term

    return(BIC)





# IRIS DATA
iris = sklearn.datasets.load_iris()
X = iris.data[:, :4]  # extract only the features
#Xs = StandardScaler().fit_transform(X)
Y = iris.target

ks = range(1,10)

# run 9 times kmeans and save each result in the KMeans object
KMeans = [cluster.KMeans(n_clusters = i, init="k-means++").fit(X) for i in ks]

# now run for each cluster the BIC computation
BIC = [compute_bic(kmeansi,X) for kmeansi in KMeans]
plt.plot(ks, BIC, 'r-o')
plt.title("iris data  (cluster vs BIC)")
plt.xlabel("# clusters")
plt.ylabel("# BIC")
plt.show()

In [None]:
# K-means

tt = np.copy(dld_t[dld_t < 1500])
tt = np.expand_dims(tt, axis=1)
tt = np.float32(tt)
bins = 0.1
num_k = 4
bins = np.linspace(np.min(tt), np.max(tt), round(np.max(tt) / bins))
fig1, ax1 = plt.subplots(figsize=(8, 6))
plt.hist(tt, bins, log=True)
plt.show()


# Define criteria = ( type, max_iter = 10 , epsilon = 1.0 )
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
# Set flags (Just to avoid line break in the code)
flags = cv2.KMEANS_RANDOM_CENTERS
# Apply KMeans
compactness,labels,centers = cv2.kmeans(tt , num_k, None, criteria,10,flags)

fig1, ax1 = plt.subplots(figsize=(8, 6))
for i in range(num_k):
    A = tt[labels==i]
    plt.hist(A, bins, log=True)
plt.hist(centers,bins, color='black')
plt.show()
print(centers)

Unnamed: 0,molecule,weight,abundance
0,Ba(130)Al(27)O(16)2,188.88,1.112941e-06
1,Ba(130)Al(27)O(17)2,190.89,2.3428589999999998e-20
2,Ba(130)Al(27)O(18)2,192.89,1.7977600000000002e-17
3,Ba(132)Al(27)O(16)2,190.88,1.010423e-06
4,Ba(132)Al(27)O(17)2,192.89,2.127047e-20
5,Ba(132)Al(27)O(18)2,194.89,1.63216e-17
6,Ba(134)Al(27)O(16)2,192.87,0.0005786472
7,Ba(134)Al(27)O(17)2,194.88,1.2181130000000001e-17
8,Ba(134)Al(27)O(18)2,196.88,9.347022e-15
9,Ba(135)Al(27)O(16)2,193.87,0.004304225


In [48]:
# isotopeTableFile = '../../../files/isotopeTable.h5'
# dataframe = data_tools.read_hdf5_through_pandas(isotopeTableFile)
# elementsList = dataframe['element']
# elementIsotopeList = dataframe['isotope']
# elementMassList =  dataframe['weight']
# abundanceList = dataframe['abundance']

# elements = list(zip(elementsList, elementIsotopeList, elementMassList, abundanceList))
# dropdownList = []
# for element in elements:
#     tupleElement = ("{} ({}) ({:.2f})".format(element[0],element[1],element[3]), "{}{}".format(element[0],element[1]))
#     dropdownList.append(tupleElement)

# chargeList = [(1,1,),(2,2,),(3,3,),(4,4,)]
# dropdown = wd.dropdownWidget(dropdownList, "Elements")
# dropdown.observe(wd.on_change_ions_selection)


# chargeDropdown = wd.dropdownWidget(chargeList, "Charge")
# chargeDropdown.observe(wd.on_change_charge_ions_selection)

# wd.compute_element_isotope_values_according_to_selected_charge(mode='ions_selection')

# buttonAdd = wd.buttonWidget("ADD")
# buttonDelete = wd.buttonWidget("DELETE")
# buttonReset = wd.buttonWidget("RESET")

# display(dropdown)
# display(chargeDropdown)
# display(buttonAdd)
# display(buttonDelete)
# display(buttonReset)
# listMaterial = buttonAdd.on_click(wd.onClickAdd)
# buttonDelete.on_click(wd.onClickDelete)
# buttonReset.on_click(wd.onClickReset)

Dropdown(description='Elements', options=(('H (1) (99.98)', 'H1'), ('H (2) (0.01)', 'H2'), ('He (3) (0.00)', '…

Dropdown(description='Charge', options=((1, 1), (2, 2), (3, 3), (4, 4)), value=1)

Button(description='ADD', icon='check', style=ButtonStyle(), tooltip='ADD')

Button(description='DELETE', icon='check', style=ButtonStyle(), tooltip='DELETE')

Button(description='RESET', icon='check', style=ButtonStyle(), tooltip='RESET')

Updated List :  ['O16(1+)', 'Ne21(1+)']                                          

In [212]:
molecule_dataframe

Unnamed: 0,Chemical formula,Synonyms,CAS number
0,Ac2O3,actinium(III) oxide,
1,AgBF4,silver tetrafluoroborate,14104-20-2
2,AgBr,silver bromide,7785-23-1
3,AgBrO3,silver bromate,7783-89-3
4,AgCl,silver chloride,7783-90-6
...,...,...,...
2074,ZrP2,zirconium phosphide,12037-80-8
2075,ZrS2,zirconium sulfide,12039-15-5
2076,ZrSi2,zirconium silicide,12039-90-6
2077,ZrSiO4,zirconium orthosilicate,10101-52-7


In [None]:
fake = Factory.create()
color = np.zeros(len(mc), dtype='object' )
element = np.zeros(len(mc), dtype='object')

for index, elemen in enumerate(variables.listMaterial):
    mask = np.logical_and((peaks_sides_p[index,2] < mc), (mc < peaks_sides_p[index,3])).squeeze()
    print(mask[mask==True].shape)

    index_true = np.where(mask==True)
    index_get_to_false = np.random.choice(index_true[0], size=int(index_true[0].shape[0] - index_true[0].shape[0]), replace=False)
    mask[index_get_to_false] = False
    print('ploted ions', mask[mask==True].shape)
    
    color[mask] = fake.hex_color()
    element[mask] = elemen



In [355]:
molecul_list_file = '../../../files/list_of_chemical.csv'
pd.read_csv(molecul_list_file, encoding= 'utf-8', header = 0) 

Unnamed: 0,Chemical formula,Synonyms,CAS number
0,(BiO)2CO3,bismuth oxycarbonate,10/4/5892
1,(C2H5)2NH,diethylamine,
2,(C6H5)4Ge,tetraphenylgermane,
3,(CH)3COH,t-butyl alcohol,
4,(CH3)2C2O4,dimethyl oxalate,
...,...,...,...
1885,ZrO32−,zirconate ion,
1886,ZrP2,zirconium phosphide,12037-80-8
1887,ZrS2,zirconium sulfide,12039-15-5
1888,ZrSi2,zirconium silicide,12039-90-6


In [None]:
data['element'] = element
data['color'] = color

In [None]:
data

In [None]:
data.dtypes

In [None]:
# save the new data
name_save_file = variables.result_path + '//' + dataset_name + '.h5'
data_tools.store_df_to_hdf(name_save_file, data, 'df')

In [None]:
# save data in csv format
data_tools.store_df_to_csv(data, variables.result_path + dataset_name + '.csv')