In [None]:
%matplotlib inline
import copy
import gc
import math
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

In [None]:
# Element symbol -> atomic number; used to encode the `atom` column as a small integer.
ATOMIC_NUMBERS = {
    'H': 1,
    'C': 6,
    'N': 7,
    'O': 8,
    'F': 9,
}

In [None]:
# Narrow dtypes keep the frame small; the same mapping is reused when reading test.csv.
train_dtypes = {
    'molecule_name': 'category',
    'atom_index_0': 'int8',
    'atom_index_1': 'int8',
    'type': 'category',
    'scalar_coupling_constant': 'float32',
}
train = pd.read_csv('../input/train.csv', index_col='id', dtype=train_dtypes)

# Strip the constant name prefix so each molecule is identified by a plain int32.
train['molecule_index'] = train['molecule_name'].str.replace('dsgdb9nsd_', '').astype('int32')

# Keep only the columns used downstream and turn the `id` index back into a column.
train = train[
    ['molecule_index', 'atom_index_0', 'atom_index_1', 'type', 'scalar_coupling_constant']
].reset_index()

In [None]:
# Read the test pairs with the same dtype mapping that was used for train.
test = pd.read_csv('../input/test.csv', index_col='id', dtype=train_dtypes)

# Strip the constant name prefix so each molecule is identified by a plain int32.
test['molecule_index'] = test.molecule_name.str.replace('dsgdb9nsd_', '').astype('int32')

# Keep only the columns used downstream and turn the `id` index back into a column.
test = test[['molecule_index', 'atom_index_0', 'atom_index_1', 'type']].reset_index()

In [None]:
# Narrow dtypes keep the per-atom coordinate table small.
structures_dtypes = {
    'molecule_name': 'category',
    'atom_index': 'int8',
    'atom': 'category',
    'x': 'float32',
    'y': 'float32',
    'z': 'float32',
}
structures = pd.read_csv('../input/structures.csv', dtype=structures_dtypes)

# Strip the constant name prefix so each molecule is identified by a plain int32.
structures['molecule_index'] = structures.molecule_name.str.replace('dsgdb9nsd_', '').astype('int32')
structures = structures[['molecule_index', 'atom_index', 'atom', 'x', 'y', 'z']]

# Encode element symbols as atomic numbers. `.map` is used instead of `.replace`
# because replacing the categories of a categorical column through `replace` is
# deprecated in pandas (the warning is hidden here by warnings.filterwarnings).
structures['atom'] = structures['atom'].map(ATOMIC_NUMBERS).astype('int8')

In [None]:
def build_type_dataframes(base, structures, coupling_type):
    """Select the rows of one coupling type and the structures they need.

    Parameters
    ----------
    base : DataFrame
        Coupling pairs with a `type` and a `molecule_index` column. `id` may be
        either a regular column or the index.
    structures : DataFrame
        Per-atom table with a `molecule_index` column.
    coupling_type : str
        Value of `type` to keep.

    Returns
    -------
    (DataFrame, DataFrame)
        The filtered pairs (without `type`, with `id` as an int32 column and a
        fresh 0..n-1 index) and the structures rows of the molecules that occur
        in those pairs.
    """
    base = base[base['type'] == coupling_type].drop('type', axis=1).copy()
    # Promote the index to a column only when it is what carries `id`. When `id`
    # is already a column, a plain reset_index() would add a meaningless `index`
    # column that downstream code would pivot into `index_<k>` columns.
    base = base.reset_index(drop=('id' in base.columns))
    base['id'] = base['id'].astype('int32')
    structures = structures[structures['molecule_index'].isin(base['molecule_index'])]

    return base, structures


def add_coordinates(base, structures, index):
    """Attach element and position of one atom of every pair.

    `atom_index_{index}` of `base` is matched against `atom_index` of
    `structures` within the same `molecule_index` (inner join). The joined
    `atom`, `x`, `y`, `z` columns are returned with the suffix `_{index}`;
    the helper `atom_index` column is removed.
    """
    pair_atom_col = f'atom_index_{index}'
    joined = base.merge(
        structures,
        how='inner',
        left_on=['molecule_index', pair_atom_col],
        right_on=['molecule_index', 'atom_index'],
    )
    joined = joined.drop(columns=['atom_index'])

    suffixed = {name: f'{name}_{index}' for name in ('atom', 'x', 'y', 'z')}
    return joined.rename(columns=suffixed)


def add_atoms(base, atoms):
    """Inner-join the per-pair `atoms` columns onto `base`.

    Rows are matched on (`molecule_index`, `atom_index_0`, `atom_index_1`);
    pairs without a match in `atoms` are dropped.
    """
    pair_keys = ['molecule_index', 'atom_index_0', 'atom_index_1']
    return base.merge(atoms, how='inner', on=pair_keys)


def merge_all_atoms(base, structures):
    """Pair every row of `base` with the other atoms of its molecule.

    A left join on `molecule_index` produces one row per (pair, atom); rows
    whose `atom_index` equals `atom_index_0` or `atom_index_1` are removed.
    """
    expanded = base.merge(structures, how='left', on=['molecule_index'])

    is_first = expanded['atom_index_0'] == expanded['atom_index']
    is_second = expanded['atom_index_1'] == expanded['atom_index']
    return expanded[~(is_first | is_second)]


def add_center(df):
    """Add `x_c`, `y_c`, `z_c` to `df` in place.

    Each new column is the mean of the matching `_0` and `_1` coordinate
    columns, i.e. the midpoint between the two atoms of the pair.
    Returns None.
    """
    half = np.float32(0.5)
    for axis in ('x', 'y', 'z'):
        df[f'{axis}_c'] = (df[f'{axis}_1'] + df[f'{axis}_0']) * half

    
def add_distance_to_center(df):
    """Add `d_c` to `df` in place: Euclidean distance from (x, y, z) to (x_c, y_c, z_c).

    Returns None.
    """
    square = np.float32(2)
    root = np.float32(0.5)

    dx2 = (df['x_c'] - df['x']) ** square
    dy2 = (df['y_c'] - df['y']) ** square
    dz2 = (df['z_c'] - df['z']) ** square
    df['d_c'] = (dx2 + dy2 + dz2) ** root
    
    
def add_distance_between(df, suffix1, suffix2):
    """Add `d_{suffix1}_{suffix2}` to `df` in place.

    The value is the Euclidean distance between the points stored in the
    `x_/y_/z_{suffix1}` and `x_/y_/z_{suffix2}` columns. Returns None.
    """
    square = np.float32(2)
    root = np.float32(0.5)

    dx2 = (df[f'x_{suffix1}'] - df[f'x_{suffix2}']) ** square
    dy2 = (df[f'y_{suffix1}'] - df[f'y_{suffix2}']) ** square
    dz2 = (df[f'z_{suffix1}'] - df[f'z_{suffix2}']) ** square
    df[f'd_{suffix1}_{suffix2}'] = (dx2 + dy2 + dz2) ** root
    
    
def add_distances(df):
    """Add pairwise distance columns `d_i_j` to `df` in place.

    The number of atoms is taken from the highest numeric suffix among the
    `x_<k>` columns. For every atom i >= 1 the distance to each atom j with
    j < min(i, 4) is added, so atoms beyond the fourth are only related to
    atoms 0..3.
    """
    x_suffixes = (int(name.split('_')[1]) for name in df.columns if name.startswith('x_'))
    n_atoms = max(x_suffixes) + 1

    pairs = [(i, vi) for i in range(1, n_atoms) for vi in range(min(4, i))]
    for i, vi in pairs:
        add_distance_between(df, i, vi)
            
            
def add_n_atoms(base, structures):
    """Return `base` with an `n_atoms` column.

    `n_atoms` is the number of rows `structures` holds for the row's
    `molecule_index`.
    """
    per_molecule = structures['molecule_index'].value_counts()
    counts = per_molecule.rename('n_atoms').to_frame()

    return base.merge(counts, left_on='molecule_index', right_index=True)


def build_couple_dataframe(some_csv, structures_csv, coupling_type, n_atoms=10):
    """Build the geometric feature table for one coupling type.

    Parameters
    ----------
    some_csv : DataFrame
        Coupling pairs (train or test); `scalar_coupling_constant` is optional.
    structures_csv : DataFrame
        Per-atom table with `molecule_index`, `atom_index`, `atom`, `x`, `y`, `z`.
    coupling_type : str
        Coupling type to select from `some_csv`.
    n_atoms : int
        Total number of atoms described per pair: the two coupled atoms plus
        the `n_atoms - 2` atoms closest to the midpoint of the pair.

    Returns
    -------
    DataFrame
        One row per pair, sorted by `id`, holding the pair coordinates
        (`x_0` .. `z_1`), the neighbour columns `atom_k`, `x_k`, `y_k`, `z_k`
        for k >= 2 and the distance columns produced by `add_distances`.
    """
    pair_keys = ['molecule_index', 'atom_index_0', 'atom_index_1']

    base, structures = build_type_dataframes(some_csv, structures_csv, coupling_type)
    for endpoint in (0, 1):
        base = add_coordinates(base, structures, endpoint)
    base = base.drop(columns=['atom_0', 'atom_1'])

    # Separate working copy for the neighbour search; id and target are not needed there.
    atoms = base.drop(columns='id').copy()
    if 'scalar_coupling_constant' in some_csv:
        atoms = atoms.drop(columns=['scalar_coupling_constant'])

    # Replace the two endpoint positions by their midpoint.
    add_center(atoms)
    atoms = atoms.drop(columns=['x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1'])

    # One row per (pair, other atom of the molecule) with its distance to the midpoint.
    atoms = merge_all_atoms(atoms, structures)
    add_distance_to_center(atoms)
    atoms = atoms.drop(columns=['x_c', 'y_c', 'z_c', 'atom_index'])

    # Rank the other atoms of each pair by that distance. Numbering starts at 2
    # because the suffixes 0 and 1 are already used by the coupled atoms.
    atoms = atoms.sort_values(pair_keys + ['d_c'])
    atoms['num'] = atoms.groupby(pair_keys).cumcount() + 2
    atoms = atoms.drop(columns=['d_c'])
    atoms = atoms[atoms['num'] < n_atoms]

    # Pivot to one row per pair with columns like atom_2, x_2, y_2, z_2, atom_3, ...
    atoms = atoms.set_index(pair_keys + ['num']).unstack()
    atoms.columns = [f'{feature}_{rank}' for feature, rank in atoms.columns]
    atoms = atoms.reset_index()

    # downcast back to int8 (pairs with fewer neighbours get 0 as filler)
    for col in atoms.columns:
        if col.startswith('atom_'):
            atoms[col] = atoms[col].fillna(0).astype('int8')

    atoms['molecule_index'] = atoms['molecule_index'].astype('int32')

    full = add_atoms(base, atoms)
    add_distances(full)

    return full.sort_values('id')


def take_n_atoms(df, n_atoms, four_start=4):
    """Select the feature columns describing the first `n_atoms` atoms.

    The result holds `id`, `atom_index_0`, `atom_index_1`, the element columns
    `atom_2` .. `atom_{n_atoms-1}`, the distance columns `d_i_j` and, when
    present in `df`, `scalar_coupling_constant` as the last column. For atom
    `i` the distances to j = 0 .. min(i, 4) - 1 are taken while `i < four_start`,
    and to j = 0 .. 3 otherwise.
    """
    labels = ['id', 'atom_index_0', 'atom_index_1']
    labels += [f'atom_{i}' for i in range(2, n_atoms)]

    for i in range(n_atoms):
        width = 4 if i >= four_start else min(i, 4)
        labels += [f'd_{i}_{j}' for j in range(width)]

    if 'scalar_coupling_constant' in df:
        labels.append('scalar_coupling_constant')

    return df[labels]

In [None]:
# n_atoms used per coupling type: the coupled pair plus the nearest other atoms.
model_params = {'1JHN': 7, '2JHN': 7, '3JHN': 10, '1JHC': 10, '2JHH': 12, '2JHC': 12, '3JHH': 10, '3JHC': 13}

# to_csv does not create missing directories, so make sure the target exists
# before the expensive feature build rather than failing at write time.
geometry_dir = './feature_output/each_type_geometry'
os.makedirs(geometry_dir, exist_ok=True)

for type_ in ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']:
    n_atoms_for_type = model_params[type_]

    full_train = build_couple_dataframe(train, structures, type_, n_atoms=n_atoms_for_type)
    full_test = build_couple_dataframe(test, structures, type_, n_atoms=n_atoms_for_type)

    df_train = take_n_atoms(full_train, n_atoms_for_type)
    df_test = take_n_atoms(full_test, n_atoms_for_type)
    # The exported train features do not include the target.
    df_train = df_train.drop('scalar_coupling_constant', axis=1)

    df_train.to_csv(f'{geometry_dir}/train_geo_{type_}.csv', index=False)
    df_test.to_csv(f'{geometry_dir}/test_geo_{type_}.csv', index=False)

In [None]:
def build_type_dataframes(base, structures):
    """Prepare the pair table and the structures it needs, for all coupling types.

    No filtering by coupling type happens here; the `type` column is simply
    dropped.

    Parameters
    ----------
    base : DataFrame
        Coupling pairs with a `type` and a `molecule_index` column. `id` may be
        either a regular column or the index.
    structures : DataFrame
        Per-atom table with a `molecule_index` column.

    Returns
    -------
    (DataFrame, DataFrame)
        The pairs (without `type`, with `id` as an int32 column and a fresh
        0..n-1 index) and the structures rows of the molecules that occur in
        those pairs.
    """
    base = base.drop('type', axis=1).copy()
    # Promote the index to a column only when it is what carries `id`. When `id`
    # is already a column, a plain reset_index() would add a meaningless `index`
    # column that downstream code would pivot into `index_<k>` columns.
    base = base.reset_index(drop=('id' in base.columns))
    base['id'] = base['id'].astype('int32')
    structures = structures[structures['molecule_index'].isin(base['molecule_index'])]

    return base, structures


def add_coordinates(base, structures, index):
    """Join element and coordinates of atom `atom_index_{index}` onto `base`.

    The match is an inner join on `molecule_index` and the atom index. The
    joined `atom`, `x`, `y`, `z` columns receive the suffix `_{index}`, and the
    `atom_index` column coming from `structures` is dropped.
    """
    left_keys = ['molecule_index', f'atom_index_{index}']
    right_keys = ['molecule_index', 'atom_index']

    joined = base.merge(structures, how='inner', left_on=left_keys, right_on=right_keys)
    joined = joined.drop(columns=['atom_index'])

    new_names = {name: f'{name}_{index}' for name in ('atom', 'x', 'y', 'z')}
    return joined.rename(columns=new_names)


def add_atoms(base, atoms):
    """Inner-join the per-pair `atoms` columns onto `base`.

    Rows are matched on (`molecule_index`, `atom_index_0`, `atom_index_1`);
    pairs without a match in `atoms` are dropped.
    """
    join_keys = ['molecule_index', 'atom_index_0', 'atom_index_1']
    merged = base.merge(atoms, how='inner', on=join_keys)
    return merged


def merge_all_atoms(base, structures):
    """Pair every row of `base` with the other atoms of its molecule.

    A left join on `molecule_index` produces one row per (pair, atom); rows
    whose `atom_index` equals `atom_index_0` or `atom_index_1` are removed.
    """
    candidates = base.merge(structures, how='left', on=['molecule_index'])

    same_as_first = candidates['atom_index'] == candidates['atom_index_0']
    same_as_second = candidates['atom_index'] == candidates['atom_index_1']
    return candidates[~(same_as_first | same_as_second)]


def add_center(df):
    """Add the pair midpoint to `df` in place as `x_c`, `y_c`, `z_c`.

    Each value is half the sum of the matching `_1` and `_0` coordinate.
    Returns None.
    """
    half = np.float32(0.5)
    for axis in 'xyz':
        endpoint_sum = df[f'{axis}_1'] + df[f'{axis}_0']
        df[f'{axis}_c'] = endpoint_sum * half

    
def add_distance_to_center(df):
    """Add `d_c` to `df` in place: Euclidean distance from (x, y, z) to (x_c, y_c, z_c).

    Returns None.
    """
    square = np.float32(2)
    root = np.float32(0.5)

    dx2 = (df['x_c'] - df['x']) ** square
    dy2 = (df['y_c'] - df['y']) ** square
    dz2 = (df['z_c'] - df['z']) ** square
    df['d_c'] = (dx2 + dy2 + dz2) ** root
    
    
def add_distance_between(df, suffix1, suffix2):
    """Add `d_{suffix1}_{suffix2}` to `df` in place.

    The value is the Euclidean distance between the points stored in the
    `x_/y_/z_{suffix1}` and `x_/y_/z_{suffix2}` columns. Returns None.
    """
    square = np.float32(2)
    root = np.float32(0.5)

    dx2 = (df[f'x_{suffix1}'] - df[f'x_{suffix2}']) ** square
    dy2 = (df[f'y_{suffix1}'] - df[f'y_{suffix2}']) ** square
    dz2 = (df[f'z_{suffix1}'] - df[f'z_{suffix2}']) ** square
    df[f'd_{suffix1}_{suffix2}'] = (dx2 + dy2 + dz2) ** root
    
    
def add_distances(df):
    """Add pairwise distance columns `d_i_j` to `df` in place.

    The number of atoms is taken from the highest numeric suffix among the
    `x_<k>` columns. For every atom i >= 1 the distance to each atom j with
    j < min(i, 4) is added, so atoms beyond the fourth are only related to
    atoms 0..3.
    """
    x_suffixes = (int(name.split('_')[1]) for name in df.columns if name.startswith('x_'))
    n_atoms = max(x_suffixes) + 1

    pairs = [(i, vi) for i in range(1, n_atoms) for vi in range(min(4, i))]
    for i, vi in pairs:
        add_distance_between(df, i, vi)
            
            
def add_n_atoms(base, structures):
    """Return `base` with an `n_atoms` column.

    `n_atoms` is the number of rows `structures` holds for the row's
    `molecule_index`.
    """
    atom_counts = structures['molecule_index'].value_counts().rename('n_atoms')
    lookup = atom_counts.to_frame()

    return base.merge(lookup, left_on='molecule_index', right_index=True)


def build_couple_dataframe(some_csv, structures_csv, n_atoms=10):
    """Build the geometric feature table for all coupling pairs at once.

    Parameters
    ----------
    some_csv : DataFrame
        Coupling pairs (train or test); `scalar_coupling_constant` is optional.
    structures_csv : DataFrame
        Per-atom table with `molecule_index`, `atom_index`, `atom`, `x`, `y`, `z`.
    n_atoms : int
        Total number of atoms described per pair: the two coupled atoms plus
        the `n_atoms - 2` atoms closest to the midpoint of the pair.

    Returns
    -------
    DataFrame
        One row per pair, sorted by `id`, holding the pair coordinates
        (`x_0` .. `z_1`), the neighbour columns `atom_k`, `x_k`, `y_k`, `z_k`
        for k >= 2 and the distance columns produced by `add_distances`.
    """
    pair_keys = ['molecule_index', 'atom_index_0', 'atom_index_1']

    base, structures = build_type_dataframes(some_csv, structures_csv)
    for endpoint in (0, 1):
        base = add_coordinates(base, structures, endpoint)
    base = base.drop(columns=['atom_0', 'atom_1'])

    # Separate working copy for the neighbour search; id and target are not needed there.
    atoms = base.drop(columns='id').copy()
    if 'scalar_coupling_constant' in some_csv:
        atoms = atoms.drop(columns=['scalar_coupling_constant'])

    # Replace the two endpoint positions by their midpoint.
    add_center(atoms)
    atoms = atoms.drop(columns=['x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1'])

    # One row per (pair, other atom of the molecule) with its distance to the midpoint.
    atoms = merge_all_atoms(atoms, structures)
    add_distance_to_center(atoms)
    atoms = atoms.drop(columns=['x_c', 'y_c', 'z_c', 'atom_index'])

    # Rank the other atoms of each pair by that distance. Numbering starts at 2
    # because the suffixes 0 and 1 are already used by the coupled atoms.
    atoms = atoms.sort_values(pair_keys + ['d_c'])
    atoms['num'] = atoms.groupby(pair_keys).cumcount() + 2
    atoms = atoms.drop(columns=['d_c'])
    atoms = atoms[atoms['num'] < n_atoms]

    # Pivot to one row per pair with columns like atom_2, x_2, y_2, z_2, atom_3, ...
    atoms = atoms.set_index(pair_keys + ['num']).unstack()
    atoms.columns = [f'{feature}_{rank}' for feature, rank in atoms.columns]
    atoms = atoms.reset_index()

    # downcast back to int8 (pairs with fewer neighbours get 0 as filler)
    for col in atoms.columns:
        if col.startswith('atom_'):
            atoms[col] = atoms[col].fillna(0).astype('int8')

    atoms['molecule_index'] = atoms['molecule_index'].astype('int32')

    full = add_atoms(base, atoms)
    add_distances(full)

    return full.sort_values('id')


def take_n_atoms(df, n_atoms, four_start=4):
    """Select the feature columns describing the first `n_atoms` atoms.

    The result holds `id`, `atom_index_0`, `atom_index_1`, the element columns
    `atom_2` .. `atom_{n_atoms-1}`, the distance columns `d_i_j` and, when
    present in `df`, `scalar_coupling_constant` as the last column. For atom
    `i` the distances to j = 0 .. min(i, 4) - 1 are taken while `i < four_start`,
    and to j = 0 .. 3 otherwise.
    """
    id_labels = ['id', 'atom_index_0', 'atom_index_1']
    element_labels = [f'atom_{i}' for i in range(2, n_atoms)]

    distance_labels = []
    for i in range(n_atoms):
        width = 4 if i >= four_start else min(i, 4)
        distance_labels.extend(f'd_{i}_{j}' for j in range(width))

    labels = id_labels + element_labels + distance_labels
    if 'scalar_coupling_constant' in df:
        labels.append('scalar_coupling_constant')

    return df[labels]

In [None]:
# Number of atoms described per pair (the coupled pair plus nearest other atoms).
n_atoms = 10

# to_pickle / to_csv do not create missing directories, so make sure the target
# exists before the expensive feature build rather than failing at write time.
output_dir = './feature_output'
os.makedirs(output_dir, exist_ok=True)

full_train = build_couple_dataframe(train, structures, n_atoms=n_atoms)
full_test = build_couple_dataframe(test, structures, n_atoms=n_atoms)

df_train = take_n_atoms(full_train, n_atoms=n_atoms)
df_test = take_n_atoms(full_test, n_atoms=n_atoms)
# The exported train features do not include the target.
df_train = df_train.drop('scalar_coupling_constant', axis=1)

# Each feature table is written both as pickle and as CSV.
df_train.to_pickle(f'{output_dir}/train_geo.pkl')
df_test.to_pickle(f'{output_dir}/test_geo.pkl')
df_train.to_csv(f'{output_dir}/train_geo.csv', index=False)
df_test.to_csv(f'{output_dir}/test_geo.csv', index=False)