In [1]:
import numpy as np
import pandas as pd
import os
import sys
from glob import glob
sys.path.append('..')
from lib.line_notif import send_message
from lib.utils import reduce_mem_usage, current_time, unpickle, to_pickle
from tqdm import tqdm_notebook as tqdm
import multiprocessing as mp

def func(data):
    """Build one node-feature DataFrame per pickled graph.

    Parameters
    ----------
    data : dict
        Must contain the key ``"graph_list"``: a sequence of paths to pickled
        graph objects. Each file name (minus the ``.pickle`` suffix) is used
        as the molecule name to look up in ``structure``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per graph, in input order, with the columns
        ``molecule_name``, ``atom_index`` and ``node_0 .. node_{k-1}``, where
        ``k`` is the width of the concatenated node array of that graph.

    Notes
    -----
    Reads the module-level ``structure`` DataFrame (its ``molecule_name`` and
    ``atom_index`` columns), so ``structure`` must exist before this function
    is called or handed to a worker pool.
    """
    graph_list = data["graph_list"]

    # Row positions of every molecule, computed once per call. The previous
    # per-graph boolean mask rescanned the whole `structure` table for every
    # single molecule.
    rows_by_molecule = structure.groupby("molecule_name", sort=False).indices
    no_rows = np.array([], dtype=np.intp)

    node_list = []
    for graph_path in tqdm(graph_list):
        # basename() copes with whatever separator glob produced on this OS.
        graph_name = os.path.basename(graph_path)
        if graph_name.endswith(".pickle"):
            graph_name = graph_name[:-len(".pickle")]

        # NOTE(review): unpickling executes arbitrary code; only load graph
        # files produced by this project's own pipeline.
        g = unpickle(graph_path)

        # A molecule missing from `structure` yields an empty atom frame,
        # exactly as the original boolean mask did.
        atom_rows = rows_by_molecule.get(graph_name, no_rows)
        atoms = structure.iloc[atom_rows][["molecule_name", "atom_index"]].reset_index(drop=True)

        # presumably g.node is a sequence of per-atom 2-D arrays that share
        # their first axis — TODO confirm against the graph builder.
        node_features = np.concatenate(g.node, -1)
        feature_names = [f"node_{i}" for i in range(node_features.shape[1])]
        features = pd.DataFrame(node_features, columns=feature_names)

        node_list.append(pd.concat([atoms, features], axis=1))
    return node_list


# Atom table for all molecules; func() reads this module-level frame, so it
# must be defined before the worker pool is created.
structure = pd.read_csv("../input/structures.csv")
graph_list = glob("../input/graph/*.pickle")
if not graph_list:
    # Fail with a clear message instead of an opaque error further down.
    raise FileNotFoundError("no graph pickles found matching ../input/graph/*.pickle")

# Cut the path list into contiguous chunks of `unit` paths, at most one chunk
# per CPU. Stepping by `unit` never produces the empty trailing chunks that
# the fixed range(n_split) indexer created when len(graph_list) was small.
n_split = mp.cpu_count()
unit = int(np.ceil(len(graph_list) / n_split))
indexer = [[start, start + unit] for start in range(0, len(graph_list), unit)]

split_graph_list = [graph_list[lo:hi] for lo, hi in indexer]
mp_data = [{"graph_list": m} for m in split_graph_list]

# No point in starting more workers than there are chunks.
num_workers = min(mp.cpu_count(), len(mp_data))
with mp.Pool(num_workers) as executor:
    features_chunk = executor.map(func, mp_data)

# Every worker returns a *list* of DataFrames, so `features_chunk` is a list
# of lists. pd.concat only accepts Series/DataFrame objects and raises
# TypeError on the nested lists, hence the flattening.
node_df = pd.concat([frame for chunk in features_chunk for frame in chunk], axis=0)
to_pickle("../processed/v003/node_df.pkl", node_df)



































































































ModuleNotFoundError: No module named 'lib.utility'

In [None]:
[]

In [3]:

# Reload the node-feature table written by the build cell above.
node_df = unpickle("../processed/v003/node_df.pkl")

In [4]:
# Sanity check: (rows, columns) of the reloaded node table.
node_df.shape

(2358657, 15)

In [5]:
# Peek at the first rows of the node table.
node_df.head()

Unnamed: 0,molecule_name,atom_index,node_0,node_1,node_2,node_3,node_4,node_5,node_6,node_7,node_8,node_9,node_10,node_11,node_12
0,dsgdb9nsd_113811,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,6.0
1,dsgdb9nsd_113811,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0
2,dsgdb9nsd_113811,2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0
3,dsgdb9nsd_113811,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0
4,dsgdb9nsd_113811,4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0


In [7]:
# Column labels of the node table; map_node treats everything after the first
# two columns as node features.
node_df.columns

Index(['molecule_name', 'atom_index', 'node_0', 'node_1', 'node_2', 'node_3',
       'node_4', 'node_5', 'node_6', 'node_7', 'node_8', 'node_9', 'node_10',
       'node_11', 'node_12'],
      dtype='object')

In [10]:


def map_node(df, node_df, atom_idx):
    """Attach the node features of one atom of each row to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``molecule_name`` column and an
        ``atom_index_{atom_idx}`` column.
    node_df : pandas.DataFrame
        Node table whose first two columns are ``molecule_name`` and
        ``atom_index``; every later column is treated as a feature.
    atom_idx : int or str
        Which atom column of ``df`` to join on; also used as the suffix
        appended to every feature column name.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` left-joined with the features, each feature
        column renamed to ``{feature}_{atom_idx}``. Rows without a matching
        atom get NaN features. ``df`` itself is not modified.
    """
    atom_key = f'atom_index_{atom_idx}'
    suffixed = {name: f"{name}_{atom_idx}" for name in node_df.columns[2:]}

    merged = df.merge(
        node_df,
        how='left',
        left_on=['molecule_name', atom_key],
        right_on=['molecule_name', 'atom_index'],
    )
    # The right-hand join key duplicates `atom_key`, so it is discarded.
    return merged.drop(columns='atom_index').rename(columns=suffixed)

In [9]:
# Load the training table; map_node joins on its molecule_name and
# atom_index_{n} columns.
train = pd.read_csv("../input/train.csv")

In [11]:

# NOTE(review): the label says "acsf" although this cell maps node features —
# presumably copied from another notebook; confirm before relying on the log.
print("acsf_train 0")
# Attach the node features of atom 0 to every training row.
# Caveat: `train` is overwritten with its own merge, so re-running this cell
# without reloading train.csv appends a second set of node_*_0 columns.
train = map_node(train, node_df, 0)

acsf_train 0


In [12]:
# Check that the node_*_0 feature columns were appended to the training table.
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,node_0_0,node_1_0,node_2_0,node_3_0,node_4_0,node_5_0,node_6_0,node_7_0,node_8_0,node_9_0,node_10_0,node_11_0,node_12_0
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
