In [1]:
%load_ext autoreload
%autoreload

import os
import re
import _pickle as cPickle
from collections import OrderedDict, defaultdict, Counter
import argparse

import numpy as np
import pandas as pd

from data_structure import Instance

import matplotlib.pyplot as plt
%matplotlib inline

# configure

In [2]:
# Notebook configuration: every option is a string path with a default value.
parser = argparse.ArgumentParser()
for flag, default_path in (
    ('-data_path', 'data/ti/df_pair_label_6F.pickle'),
    ('-output_path', 'data/ti/instances.pkl'),
    ('-large_output_path', 'data/ti/instances_large.pkl'),
):
    parser.add_argument(flag, type=str, default=default_path)

# Parse an empty argument list so that only the defaults above are used.
config = parser.parse_args([])

# load source

In [3]:
def prepare_instances(bows):
    """Wrap every bag-of-words entry of ``bows`` in an ``Instance``.

    Args:
        bows: iterable of bag-of-words entries (one per instance).

    Returns:
        A list of ``Instance`` objects in input order; each has ``idx`` set to
        its position in ``bows`` and ``bow`` set to the corresponding entry.
    """
    def _build(position, bag):
        # Instance is created without arguments and populated attribute by attribute.
        built = Instance()
        built.idx = position
        built.bow = bag
        return built

    return [_build(position, bag) for position, bag in enumerate(bows)]

# group by category

In [21]:
# Load the source table from the pickle file at config.data_path.
# NOTE(review): unpickling can execute arbitrary code; only read files from a trusted source.
data_df = pd.read_pickle(config.data_path)

In [31]:
# Inspect the loaded table (the rendered output elides the middle rows with '...').
data_df

Unnamed: 0,material,use,category,vector
0,poly(tetrafluoroethylene)_membrane,selective_phase_separator,ptfe,"[-0.18909302, -0.4380685, -0.23772866, 0.15662..."
1,poly(vinylidene_fluoride),pyroelectric_transducer,pvdf,"[-0.49744186, 0.58450216, -0.67864054, 0.09927..."
2,porous_poly-(tetrafluoroethylene)_diaphragm_me...,sensor,ptfe,"[0.349454, 0.5534546, -1.4320804, -0.24467996,..."
3,perfluoropolyalkylether,potential_high_temperature_liquid_lubricants,pfpe,"[-0.81132436, 0.39675534, -0.5084254, 0.105702..."
4,perfluoropolyalkylether,gas_turbine_engines,pfpe,"[-0.8650705, -0.27476096, -0.2663428, 0.243225..."
...,...,...,...,...
86740,eptfe,endovascular_stent-graft_technology,ptfe,"[0.27829906, -1.0336405, -0.56483704, 0.352723..."
86741,eptfe,endovascular_stent-graft_technology,ptfe,"[0.27829906, -1.0336405, -0.56483704, 0.352723..."
86742,eptfe_samples,endovascular_stent-graft_technology,ptfe,"[0.27829906, -1.0336405, -0.56483704, 0.352723..."
86743,eptfe,endovascular_stent-graft_technology,ptfe,"[0.27829906, -1.0336405, -0.56483704, 0.352723..."


In [22]:
# Vocabulary of distinct `use` strings: keep the first row seen for each one.
use_df = data_df.drop_duplicates(subset='use')
# A disabled experiment here dropped every use occurring more than 300 times in data_df.

# Vocabulary index follows row order in use_df; idx_to_word is the inverse mapping.
word_to_idx = dict(zip(use_df['use'], range(len(use_df))))
idx_to_word = dict(zip(word_to_idx.values(), word_to_idx.keys()))

# One embedding per vocabulary entry, taken from the `vector` column of the kept row.
embeddings = np.array(use_df['vector'].values.tolist())
assert len(word_to_idx) == len(idx_to_word) == len(embeddings)

In [23]:
# One row per category, holding a Counter of the `use` strings seen for it.
category_df = data_df.groupby('category').agg(
    {'use': lambda grouped_uses: Counter(grouped_uses)}
)

# Vocabulary in idx_to_word order; every bag-of-words vector follows this order.
category_vocabulary = list(idx_to_word.values())


def _category_bow(use_cnt):
    """Count vector over the vocabulary for one category (0 for uses it lacks)."""
    return np.array([use_cnt.get(use, 0) for use in category_vocabulary])


instances_df = category_df['use'].apply(_category_bow)
bows = np.array(instances_df.values.tolist())
categories = instances_df.index
instances = prepare_instances(bows)

In [24]:
print('saving preprocessed instances...')
# Persist (instances, categories, word_to_idx, idx_to_word, embeddings) as one pickled tuple.
# The context manager guarantees the file is flushed and closed even if dump() raises;
# the previous bare open(...) left the handle open.
with open(config.output_path, 'wb') as output_file:
    cPickle.dump((instances, categories, word_to_idx, idx_to_word, embeddings), output_file)

saving preprocessed instances...


In [25]:
# Distinct uses per category, then a pairwise overlap table: the column named after
# category X holds, for every row category, the number of uses shared with X.
category_df['use_set'] = category_df['use'].apply(lambda use_cnts: set(use_cnts))
category_use_sets = category_df['use_set']
for category in categories:
    reference_uses = category_use_sets[category]
    category_df[category] = category_use_sets.apply(
        lambda use_set: len(use_set & reference_uses)
    )
category_df

Unnamed: 0_level_0,use,use_set,etfe,fep,pctfe,pfa,pfpe,ptfe,pvdf
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
etfe,"{'navigation_instrument_backplane': 1, 'applic...","{high-temperature_service_conductors, commerci...",313,23,4,6,10,79,55
fep,"{'glass_battery_jar': 1, 'pipe': 1, 'flexible_...",{organic_white_paint_whereas_optical_solar_ref...,23,252,2,4,13,124,47
pctfe,"{'molecular_sensing_probes': 1, 'polymer_trans...","{""_at-risk_""_systems, coated_fabrics, molecula...",4,2,78,0,3,28,11
pfa,"{'transformer_oil': 1, 'ftfe_diaphragms': 2, '...","{cation_exchange_membranes, light_reflector, a...",6,4,0,21,0,13,4
pfpe,{'potential_high_temperature_liquid_lubricants...,"{top_mobile_lubricant_layer, excellent_lubrica...",10,13,3,0,592,95,40
ptfe,"{'selective_phase_separator': 1, 'sensor': 52,...","{gtr_treatment, permanent_vascular_accesses, a...",79,124,28,13,95,8075,555
pvdf,"{'pyroelectric_transducer': 3, 'hydrophones': ...","{stirred_flat_membrane_cell, superhydrophobic_...",55,47,11,4,40,555,6430


In [30]:
# Thirty most frequent `use` values among the rows whose category is 'pvdf'.
is_pvdf = data_df['category'] == 'pvdf'
data_df.loc[is_pvdf, 'use'].value_counts().head(30)

binder                                  610
sensor                                  577
sensors                                 462
electrodes                              393
hollow_fiber_membranes                  323
actuators                               302
electrode                               248
lithium-ion_batteries                   225
ultrafiltration                         218
ultrafiltration_membranes               211
separator                               189
membrane_distillation                   185
hollow_fiber_membrane                   178
pvdf_sensor                             150
pvdf_sensors                            149
electrolyte                             149
cathode                                 144
actuator                                138
microfiltration                         137
ferro-actuator                          125
direct_contact_membrane_distillation    124
cell                                    123
mf                              

# group by material

In [19]:
# Keep only the materials that appear in more than 100 rows of data_df.
# value_counts() is computed once; the list keeps its most-frequent-first order.
material_counts = data_df['material'].value_counts()
materials = material_counts[material_counts > 100].index.tolist()

# Series.isin is a vectorised membership test; it replaces a per-row Python
# `material in materials` scan of the list inside .apply and selects the same rows.
filtered_data_df = data_df[data_df['material'].isin(materials)]
len(materials)

43

In [14]:
# Vocabulary of distinct `use` strings within the filtered rows (first occurrence kept).
use_df = filtered_data_df.drop_duplicates(subset='use')

# Rebinds word_to_idx / idx_to_word / embeddings to the material-level vocabulary.
word_to_idx = dict(zip(use_df['use'], range(len(use_df))))
idx_to_word = dict(zip(word_to_idx.values(), word_to_idx.keys()))
embeddings = np.array(use_df['vector'].values.tolist())
assert len(word_to_idx) == len(idx_to_word) == len(embeddings)

In [15]:
# One row per material, holding a Counter of the `use` strings seen for it.
material_df = filtered_data_df.groupby('material').agg(
    {'use': lambda grouped_uses: Counter(grouped_uses)}
)

# Vocabulary in idx_to_word order; every bag-of-words vector follows this order.
material_vocabulary = list(idx_to_word.values())


def _material_bow(use_cnt):
    """Count vector over the vocabulary for one material (0 for uses it lacks)."""
    return np.array([use_cnt.get(use, 0) for use in material_vocabulary])


large_instances_df = material_df['use'].apply(_material_bow)
large_bows = np.array(large_instances_df.values.tolist())
# `materials` is rebound here to the index of the grouped frame.
materials = large_instances_df.index
large_instances = prepare_instances(large_bows)

In [16]:
# Distinct uses per material, then a pairwise overlap table: the column named after
# material X holds, for every row material, the number of uses shared with X.
material_df['use_set'] = material_df['use'].apply(lambda use_cnts: set(use_cnts))
material_use_sets = material_df['use_set']
for material in materials:
    reference_uses = material_use_sets[material]
    material_df[material] = material_use_sets.apply(
        lambda use_set: len(use_set & reference_uses)
    )
material_df

Unnamed: 0_level_0,use,use_set,eptfe,eptfe_graft,eptfe_grafts,eptfe_group,eptfe_membrane,etfe,expanded_polytetrafluoroethylene,fluorinated_ethylene_propylene,...,pvdf_binder,pvdf_fibers,pvdf_film,pvdf_films,pvdf_matrix,pvdf_membrane,pvdf_membranes,pvdf_nanofibers,pvdf_sensor,pvdf_sensors
material,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
eptfe,"{'transcatheter_closure': 1, 'autogenous_vein_...","{small_nerve_defects, interposition_prostheses...",1108,93,147,37,65,6,210,6,...,3,2,5,7,6,8,7,3,3,2
eptfe_graft,"{'vascular_graft': 5, 'palliative_treatment': ...","{vascular_access, ideal_venous_graft, femoral_...",93,102,29,0,0,0,20,0,...,0,0,0,0,0,0,0,0,0,0
eptfe_grafts,"{'artifical_heart_structures': 1, 'eptfe_vascu...","{bilateral_renal_artery_circulation, vascular_...",147,29,167,10,0,0,30,0,...,0,0,0,0,0,0,0,0,0,0
eptfe_group,"{'prosthetic_substitutes': 1, 'svc_substitute'...","{exclusion_barrier, limb_bypass_surgery, lumba...",37,0,10,42,7,0,15,0,...,0,0,0,0,0,0,0,0,0,0
eptfe_membrane,"{'gingival_flap_surgery': 1, 'regenerative_pro...","{gynecologic_surgery, molded_eptfe_valve, bone...",65,0,0,7,70,0,20,0,...,1,0,2,1,1,2,1,1,1,0
etfe,"{'applications': 1, 'wiring': 1, 'insulation_m...","{high-temperature_service_conductors, canine_c...",6,0,0,0,0,234,0,11,...,2,1,6,8,1,6,4,4,1,0
expanded_polytetrafluoroethylene,"{'vascular_grafts': 31, 'femoropopliteal_posit...","{vascular_access, frontalis_suspension_surgery...",210,20,30,15,20,0,339,0,...,0,0,0,0,0,1,1,0,0,0
fluorinated_ethylene_propylene,"{'pipe': 1, 'thickeners': 2, 'target_grease': ...","{spacecraft, high-temperature_service_conducto...",6,0,0,0,0,11,0,116,...,0,0,0,1,1,1,1,2,0,0
pctfe,"{'polymer_transfer_film': 3, 'hplc_supports': ...","{""_at-risk_""_systems, optical_viewports, valve...",0,0,0,0,0,2,0,2,...,0,1,0,1,0,1,1,1,0,0
perfluoropolyether,"{'acrylic_uv-curable_systems': 1, 'conventiona...","{top_mobile_lubricant_layer, liquid_lubricant,...",5,0,1,0,0,6,0,4,...,1,0,1,0,1,2,1,2,1,0


In [17]:
print('saving preprocessed instances...')
# Persist (large_instances, materials, word_to_idx, idx_to_word, embeddings) as one pickled tuple.
# The context manager guarantees the file is flushed and closed even if dump() raises;
# the previous bare open(...) left the handle open.
with open(config.large_output_path, 'wb') as large_output_file:
    cPickle.dump((large_instances, materials, word_to_idx, idx_to_word, embeddings), large_output_file)

saving preprocessed instances...


In [18]:
# Number of entries in the vocabulary currently bound to word_to_idx.
len(word_to_idx)

11684

In [20]:
# Show the material labels currently bound to `materials`.
# NOTE(review): the recorded output is a plain list (the value built by the count
# filter), not the groupby index assigned afterwards, so the cells were run out
# of order. Confirm which value is intended under Restart & Run All.
materials

['pvdf',
 'ptfe',
 'polytetrafluoroethylene',
 'polyvinylidene_fluoride',
 'eptfe',
 'poly(vinylidene_fluoride)',
 'ptfe_grafts',
 'pfpe',
 'pvdf_membrane',
 'pvdf_film',
 'expanded_polytetrafluoroethylene',
 'etfe',
 'pvdf_membranes',
 'perfluoropolyether',
 'eptfe_grafts',
 'polyvinylidene_difluoride',
 'pvdf_sensor',
 'pvdf_sensors',
 'pvdf_films',
 'ptfe_graft',
 'poly(tetrafluoroethylene)',
 'ptfe_membrane',
 'pvdf_/',
 'polytetrafluoroethylene_grafts',
 'polytetrafluoroethylene_graft',
 'pvdf-hfp',
 'pvdf_matrix',
 'eptfe_graft',
 'poly_(vinylidene_fluoride)',
 'ptfe_group',
 'eptfe_membrane',
 'pvdf_binder',
 'polyvinylidenefluoride',
 'fluorinated_ethylene_propylene',
 'piezoelectric_polyvinylidene_fluoride',
 'pctfe',
 'pvdf_nanofibers',
 'pfpes',
 'ptfe_particles',
 'ptfe_membranes',
 'eptfe_group',
 'porous_pvdf_membrane',
 'pvdf_fibers']