In [1]:
%%html
<style type="text/css">
.CodeMirror pre, .output pre { font-family: Consolas, Monaco, monospace; }
.text_cell_render { font-family: Consolas, Monaco, monospace; }
</style>

In [None]:
# https://www.kaggle.com/marcelotamashiro/lgb-public-kernels-plus-more-features

In [28]:
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection._split import check_cv
from sklearn.base import clone, is_classifier
from scipy.stats import kurtosis, skew

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
# import lightgbm as lgb
import gc
from time import time
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import sys
sys.path.append('..')
from lib.line_notif import send_message
from lib.utils import matrics_rotate
from lib.utils import reduce_mem_usage, current_time, unpickle, to_pickle



class ClassifierTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, estimator=None, n_classes=2, cv=3):
        self.estimator = estimator
        self.n_classes = n_classes
        self.cv = cv
    
    def _get_labels(self, y):
        y_labels = np.zeros(len(y))
        y_us = np.sort(np.unique(y))
        step = int(len(y_us) / self.n_classes)
        
        for i_class in range(self.n_classes):
            if i_class + 1 == self.n_classes:
                y_labels[y >= y_us[i_class * step]] = i_class
            else:
                y_labels[
                    np.logical_and(
                        y >= y_us[i_class * step],
                        y < y_us[(i_class + 1) * step]
                    )
                ] = i_class
        return y_labels
        
    def fit(self, X, y):
        X = X.replace([np.inf,-np.inf], np.nan)
        X = X.fillna(0)
        y_labels = self._get_labels(y)
        cv = check_cv(self.cv, y_labels, classifier=is_classifier(self.estimator))
        self.estimators_ = []
        
        for train, _ in cv.split(X, y_labels):
            X = np.array(X)
            self.estimators_.append(
                clone(self.estimator).fit(X[train], y_labels[train])
            )
        return self
    
    def transform(self, X, y=None):
        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
        X = X.replace([np.inf,-np.inf], np.nan)
        X = X.fillna(0)
        X = np.array(X)
        X_prob = np.zeros((X.shape[0], self.n_classes))
        X_pred = np.zeros(X.shape[0])
        
        for estimator, (_, test) in zip(self.estimators_, cv.split(X)):
            X_prob[test] = estimator.predict_proba(X[test])
            X_pred[test] = estimator.predict(X[test])
        return np.hstack([X_prob, np.array([X_pred]).T])

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


class MoreStructureProperties(TransformerMixin, BaseEstimator):
    
    def __init__(self,atomic_radius,electronegativity):
        self.atomic_radius = atomic_radius
        self.electronegativity = electronegativity
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        atom_rad = [self.atomic_radius[x] for x in X['atom'].values]
        X['rad'] = atom_rad
        position = X[['x','y','z']].values
        p_temp = position
        molec_name = X['molecule_name'].values
        m_temp = molec_name
        radius = X['rad'].values
        r_temp = radius
        bond = 0
        dist_keep = 0
        dist_bond = 0 
        no_bond = 0
        dist_no_bond = 0
        dist_matrix = np.zeros((X.shape[0],2*29))
        dist_matrix_bond = np.zeros((X.shape[0],2*29))
        dist_matrix_no_bond = np.zeros((X.shape[0],2*29))
        
        for i in range(29):
            p_temp = np.roll(p_temp,-1,axis=0)
            m_temp = np.roll(m_temp,-1,axis=0)
            r_temp = np.roll(r_temp,-1,axis=0)
            mask = (m_temp==molec_name)
            dist = np.linalg.norm(position-p_temp,axis=1) * mask            
            dist_temp = np.roll(np.linalg.norm(position-p_temp,axis=1)*mask,i+1,axis=0)
            diff_radius_dist = (dist-(radius+r_temp)) * (dist<(radius+r_temp)) * mask
            diff_radius_dist_temp = np.roll(diff_radius_dist,i+1,axis=0)
            bond += (dist<(radius+r_temp)) * mask
            bond_temp = np.roll((dist<(radius+r_temp)) * mask,i+1,axis=0)
            no_bond += (dist>=(radius+r_temp)) * mask
            no_bond_temp = np.roll((dist>=(radius+r_temp)) * mask,i+1,axis=0)
            bond += bond_temp
            no_bond += no_bond_temp
            dist_keep += dist * mask
            dist_matrix[:,2*i] = dist
            dist_matrix[:,2*i+1] = dist_temp
            dist_matrix_bond[:,2*i] = dist * (dist<(radius+r_temp)) * mask
            dist_matrix_bond[:,2*i+1] = dist_temp * bond_temp
            dist_matrix_no_bond[:,2*i] = dist * (dist>(radius+r_temp)) * mask
            dist_matrix_no_bond[:,2*i+1] = dist_temp * no_bond_temp
        X['n_bonds'] = bond
        X['n_no_bonds'] = no_bond
        X['dist_mean'] = np.nanmean(np.where(dist_matrix==0,np.nan,dist_matrix), axis=1)
        X['dist_median'] = np.nanmedian(np.where(dist_matrix==0,np.nan,dist_matrix), axis=1)
        X['dist_std_bond'] = np.nanstd(np.where(dist_matrix_bond==0,np.nan,dist_matrix), axis=1)
        X['dist_mean_bond'] = np.nanmean(np.where(dist_matrix_bond==0,np.nan,dist_matrix), axis=1)
        X['dist_median_bond'] = np.nanmedian(np.where(dist_matrix_bond==0,np.nan,dist_matrix), axis=1)
        X['dist_mean_no_bond'] = np.nanmean(np.where(dist_matrix_no_bond==0,np.nan,dist_matrix), axis=1)
        X['dist_std_no_bond'] = np.nanstd(np.where(dist_matrix_no_bond==0,np.nan,dist_matrix), axis=1)
        X['dist_median_no_bond'] = np.nanmedian(np.where(dist_matrix_no_bond==0,np.nan,dist_matrix), axis=1)
        X['dist_std'] = np.nanstd(np.where(dist_matrix==0,np.nan,dist_matrix), axis=1)
        X['dist_min'] = np.nanmin(np.where(dist_matrix==0,np.nan,dist_matrix), axis=1)
        X['dist_max'] = np.nanmax(np.where(dist_matrix==0,np.nan,dist_matrix), axis=1)
        X['range_dist'] = np.absolute(X['dist_max']-X['dist_min'])
        X['dist_bond_min'] = np.nanmin(np.where(dist_matrix_bond==0,np.nan,dist_matrix), axis=1)
        X['dist_bond_max'] = np.nanmax(np.where(dist_matrix_bond==0,np.nan,dist_matrix), axis=1)
        X['range_dist_bond'] = np.absolute(X['dist_bond_max']-X['dist_bond_min'])
        X['dist_no_bond_min'] = np.nanmin(np.where(dist_matrix_no_bond==0,np.nan,dist_matrix), axis=1)
        X['dist_no_bond_max'] = np.nanmax(np.where(dist_matrix_no_bond==0,np.nan,dist_matrix), axis=1)
        X['range_dist_no_bond'] = np.absolute(X['dist_no_bond_max']-X['dist_no_bond_min'])
        X['n_diff'] = pd.DataFrame(np.around(dist_matrix_bond,5)).nunique(axis=1).values  #5
        X = reduce_mem_usage(X,verbose=False)
        return X
        
    
class MakeMoreFeatures(TransformerMixin, BaseEstimator):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X['distance'] = np.linalg.norm(X[['x_x','y_x','z_x']].values - X[['x_y','y_y','z_y']].values ,axis=1)
        X['x_dist'] = X['x_x'] - X['x_y']
        X['y_dist'] = X['y_x'] - X['y_y']
        X['z_dist'] = X['z_x'] - X['z_y']
        X['x_dist_abs'] = np.absolute(X['x_dist'])
        X['y_dist_abs'] = np.absolute(X['y_dist'])
        X['z_dist_abs'] = np.absolute(X['z_dist'])
        X['inv_distance3'] = 1/(X['distance']**3)
        X['dimension_x'] = np.absolute(X.groupby(['molecule_name'])['x_x'].transform('max') - X.groupby(['molecule_name'])['x_x'].transform('min'))
        X['dimension_y'] = np.absolute(X.groupby(['molecule_name'])['y_x'].transform('max') - X.groupby(['molecule_name'])['y_x'].transform('min'))
        X['dimension_z'] = np.absolute(X.groupby(['molecule_name'])['z_x'].transform('max') - X.groupby(['molecule_name'])['z_x'].transform('min'))
        X['molecule_dist_mean_x'] = X.groupby(['molecule_name'])['dist_mean_x'].transform('mean')
        X['molecule_dist_mean_y'] = X.groupby(['molecule_name'])['dist_mean_y'].transform('mean')
        X['molecule_dist_mean_bond_x'] = X.groupby(['molecule_name'])['dist_mean_bond_x'].transform('mean')
        X['molecule_dist_mean_bond_y'] = X.groupby(['molecule_name'])['dist_mean_bond_y'].transform('mean')
        X['molecule_dist_range_x'] = X.groupby(['molecule_name'])['dist_mean_x'].transform('max') - X.groupby(['molecule_name'])['dist_mean_x'].transform('min')
        X['molecule_dist_range_y'] = X.groupby(['molecule_name'])['dist_mean_y'].transform('max') - X.groupby(['molecule_name'])['dist_mean_y'].transform('min')
        X['molecule_dist_std_x'] = X.groupby(['molecule_name'])['dist_mean_x'].transform('std')
        X['molecule_dist_std_y'] = X.groupby(['molecule_name'])['dist_mean_y'].transform('std')
        X['molecule_atom_0_dist_mean'] = X.groupby(['molecule_name','atom_x'])['distance'].transform('mean')
        X['molecule_atom_1_dist_mean'] = X.groupby(['molecule_name','atom_y'])['distance'].transform('mean')
        X['molecule_atom_0_dist_std_diff'] = X.groupby(['molecule_name', 'atom_x'])['distance'].transform('std') - X['distance']
        X['molecule_atom_1_dist_std_diff'] = X.groupby(['molecule_name', 'atom_y'])['distance'].transform('std') - X['distance']
        X['molecule_type_dist_min'] = X.groupby(['molecule_name','type'])['distance'].transform('min') 
        X['molecule_type_dist_max'] = X.groupby(['molecule_name','type'])['distance'].transform('max') 
        X['molecule_dist_mean_no_bond_x'] = X.groupby(['molecule_name'])['dist_mean_no_bond_x'].transform('mean')
        X['molecule_dist_mean_no_bond_y'] = X.groupby(['molecule_name'])['dist_mean_no_bond_y'].transform('mean')
        X['molecule_atom_index_0_dist_min'] = X.groupby(['molecule_name', 'atom_index_0'])['distance'].transform('min') #new variable - dont include
        X['molecule_atom_index_0_dist_std'] = X.groupby(['molecule_name', 'atom_index_0'])['distance'].transform('std') #new variable - dont include
        X['molecule_atom_index_0_dist_min_div'] = X['molecule_atom_index_0_dist_min']/X['distance'] #new variable - include
        X['molecule_atom_index_0_dist_std_div'] = X['molecule_atom_index_0_dist_std']/X['distance'] #new variable - include
        X['molecule_atom_index_0_dist_mean'] = X.groupby(['molecule_name', 'atom_index_0'])['distance'].transform('mean') #new variable - include
        X['molecule_atom_index_0_dist_max'] = X.groupby(['molecule_name', 'atom_index_0'])['distance'].transform('max') #new variable - include
        X['molecule_atom_index_0_dist_mean_diff'] = X['molecule_atom_index_0_dist_mean'] - X['distance'] #new variable - include
        X['molecule_atom_index_1_dist_mean'] = X.groupby(['molecule_name', 'atom_index_1'])['distance'].transform('mean') #new variable - include
        X['molecule_atom_index_1_dist_max'] = X.groupby(['molecule_name', 'atom_index_1'])['distance'].transform('max') #new variable - include
        X['molecule_atom_index_1_dist_min'] = X.groupby(['molecule_name', 'atom_index_1'])['distance'].transform('min') #new variable - include
        X['molecule_atom_index_1_dist_std'] = X.groupby(['molecule_name', 'atom_index_1'])['distance'].transform('std') #new variable - dont include
        X['molecule_atom_index_1_dist_min_div'] = X['molecule_atom_index_1_dist_min']/X['distance'] #new variable - include
        X['molecule_atom_index_1_dist_std_diff'] = X['molecule_atom_index_1_dist_std'] - X['distance'] #new variable - include
        X['molecule_atom_index_1_dist_mean_div'] = X['molecule_atom_index_1_dist_mean']/X['distance'] #new variable - include
        X['molecule_atom_index_1_dist_min_diff'] = X['molecule_atom_index_1_dist_min_div'] - X['distance'] #new variable - include
        le = LabelEncoder()
        for feat in ['atom_x','atom_y']:
            le.fit(X[feat])
            X[feat] = le.transform(X[feat])
        X = reduce_mem_usage(X,verbose=False)
        return X


def map_atom_info(df_1, df_2, atom_idx):
    df = pd.merge(df_1, df_2, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    return df

    
def find_dist(df):
    df_p_0 = df[['x_0', 'y_0', 'z_0']].values
    df_p_1 = df[['x_1', 'y_1', 'z_1']].values
    
    df['dist'] = np.linalg.norm(df_p_0 - df_p_1, axis=1)
    df['dist_inv2'] = 1/df['dist']**2
    df['dist_x'] = (df['x_0'] - df['x_1']) ** 2
    df['dist_y'] = (df['y_0'] - df['y_1']) ** 2
    df['dist_z'] = (df['z_0'] - df['z_1']) ** 2
    return df

def find_closest_atom(df):    
    df_temp = df.loc[:,["molecule_name",
                      "atom_index_0","atom_index_1",
                      "dist","x_0","y_0","z_0","x_1","y_1","z_1"]].copy()
    df_temp_ = df_temp.copy()
    df_temp_ = df_temp_.rename(columns={'atom_index_0': 'atom_index_1',
                                       'atom_index_1': 'atom_index_0',
                                       'x_0': 'x_1',
                                       'y_0': 'y_1',
                                       'z_0': 'z_1',
                                       'x_1': 'x_0',
                                       'y_1': 'y_0',
                                       'z_1': 'z_0'})
    df_temp_all = pd.concat((df_temp,df_temp_),axis=0)

    df_temp_all["min_distance"]=df_temp_all.groupby(['molecule_name', 
                                                     'atom_index_0'])['dist'].transform('min')
    df_temp_all["max_distance"]=df_temp_all.groupby(['molecule_name', 
                                                     'atom_index_0'])['dist'].transform('max')
    
    df_temp = df_temp_all[df_temp_all["min_distance"]==df_temp_all["dist"]].copy()
    df_temp = df_temp.drop(['x_0','y_0','z_0','min_distance'], axis=1)
    df_temp = df_temp.rename(columns={'atom_index_0': 'atom_index',
                                         'atom_index_1': 'atom_index_closest',
                                         'dist': 'distance_closest',
                                         'x_1': 'x_closest',
                                         'y_1': 'y_closest',
                                         'z_1': 'z_closest'})
    df_temp = df_temp.drop_duplicates(subset=['molecule_name', 'atom_index'])
    
    for atom_idx in [0,1]:
        df = map_atom_info(df,df_temp, atom_idx)
        df = df.rename(columns={'atom_index_closest': f'atom_index_closest_{atom_idx}',
                                        'distance_closest': f'distance_closest_{atom_idx}',
                                        'x_closest': f'x_closest_{atom_idx}',
                                        'y_closest': f'y_closest_{atom_idx}',
                                        'z_closest': f'z_closest_{atom_idx}'})
        
    df_temp= df_temp_all[df_temp_all["max_distance"]==df_temp_all["dist"]].copy()
    df_temp = df_temp.drop(['x_0','y_0','z_0','max_distance'], axis=1)
    df_temp= df_temp.rename(columns={'atom_index_0': 'atom_index',
                                         'atom_index_1': 'atom_index_farthest',
                                         'dist': 'distance_farthest',
                                         'x_1': 'x_farthest',
                                         'y_1': 'y_farthest',
                                         'z_1': 'z_farthest'})
    df_temp = df_temp.drop_duplicates(subset=['molecule_name', 'atom_index'])
        
    for atom_idx in [0,1]:
        df = map_atom_info(df,df_temp, atom_idx)
        df = df.rename(columns={'atom_index_farthest': f'atom_index_farthest_{atom_idx}',
                                        'distance_farthest': f'distance_farthest_{atom_idx}',
                                        'x_farthest': f'x_farthest_{atom_idx}',
                                        'y_farthest': f'y_farthest_{atom_idx}',
                                        'z_farthest': f'z_farthest_{atom_idx}'})
    return df


def add_cos_features(df):
    
    df["distance_center0"] = np.sqrt((df['x_0']-df['c_x'])**2 \
                                   + (df['y_0']-df['c_y'])**2 \
                                   + (df['z_0']-df['c_z'])**2)
    df["distance_center1"] = np.sqrt((df['x_1']-df['c_x'])**2 \
                                   + (df['y_1']-df['c_y'])**2 \
                                   + (df['z_1']-df['c_z'])**2)
    
    df['distance_c0'] = np.sqrt((df['x_0']-df['x_closest_0'])**2 + \
                                (df['y_0']-df['y_closest_0'])**2 + \
                                (df['z_0']-df['z_closest_0'])**2)
    df['distance_c1'] = np.sqrt((df['x_1']-df['x_closest_1'])**2 + \
                                (df['y_1']-df['y_closest_1'])**2 + \
                                (df['z_1']-df['z_closest_1'])**2)
    
    df["distance_f0"] = np.sqrt((df['x_0']-df['x_farthest_0'])**2 + \
                                (df['y_0']-df['y_farthest_0'])**2 + \
                                (df['z_0']-df['z_farthest_0'])**2)
    df["distance_f1"] = np.sqrt((df['x_1']-df['x_farthest_1'])**2 + \
                                (df['y_1']-df['y_farthest_1'])**2 + \
                                (df['z_1']-df['z_farthest_1'])**2)
    
    vec_center0_x = (df['x_0']-df['c_x'])/(df["distance_center0"]+1e-10)
    vec_center0_y = (df['y_0']-df['c_y'])/(df["distance_center0"]+1e-10)
    vec_center0_z = (df['z_0']-df['c_z'])/(df["distance_center0"]+1e-10)
    
    vec_center1_x = (df['x_1']-df['c_x'])/(df["distance_center1"]+1e-10)
    vec_center1_y = (df['y_1']-df['c_y'])/(df["distance_center1"]+1e-10)
    vec_center1_z = (df['z_1']-df['c_z'])/(df["distance_center1"]+1e-10)
    
    vec_c0_x = (df['x_0']-df['x_closest_0'])/(df["distance_c0"]+1e-10)
    vec_c0_y = (df['y_0']-df['y_closest_0'])/(df["distance_c0"]+1e-10)
    vec_c0_z = (df['z_0']-df['z_closest_0'])/(df["distance_c0"]+1e-10)
    
    vec_c1_x = (df['x_1']-df['x_closest_1'])/(df["distance_c1"]+1e-10)
    vec_c1_y = (df['y_1']-df['y_closest_1'])/(df["distance_c1"]+1e-10)
    vec_c1_z = (df['z_1']-df['z_closest_1'])/(df["distance_c1"]+1e-10)
    
    vec_f0_x = (df['x_0']-df['x_farthest_0'])/(df["distance_f0"]+1e-10)
    vec_f0_y = (df['y_0']-df['y_farthest_0'])/(df["distance_f0"]+1e-10)
    vec_f0_z = (df['z_0']-df['z_farthest_0'])/(df["distance_f0"]+1e-10)
    
    vec_f1_x = (df['x_1']-df['x_farthest_1'])/(df["distance_f1"]+1e-10)
    vec_f1_y = (df['y_1']-df['y_farthest_1'])/(df["distance_f1"]+1e-10)
    vec_f1_z = (df['z_1']-df['z_farthest_1'])/(df["distance_f1"]+1e-10)
    
    vec_x = (df['x_1']-df['x_0'])/df['dist']
    vec_y = (df['y_1']-df['y_0'])/df['dist']
    vec_z = (df['z_1']-df['z_0'])/df['dist']
    
    df["cos_c0_c1"] = vec_c0_x*vec_c1_x + vec_c0_y*vec_c1_y + vec_c0_z*vec_c1_z
    df["cos_f0_f1"] = vec_f0_x*vec_f1_x + vec_f0_y*vec_f1_y + vec_f0_z*vec_f1_z
    
    df["cos_c0_f0"] = vec_c0_x*vec_f0_x + vec_c0_y*vec_f0_y + vec_c0_z*vec_f0_z
    df["cos_c1_f1"] = vec_c1_x*vec_f1_x + vec_c1_y*vec_f1_y + vec_c1_z*vec_f1_z
    
    df["cos_center0_center1"] = vec_center0_x*vec_center1_x \
                              + vec_center0_y*vec_center1_y \
                              + vec_center0_z*vec_center1_z
    
    df["cos_c0"] = vec_c0_x*vec_x + vec_c0_y*vec_y + vec_c0_z*vec_z
    df["cos_c1"] = vec_c1_x*vec_x + vec_c1_y*vec_y + vec_c1_z*vec_z
    
    df["cos_f0"] = vec_f0_x*vec_x + vec_f0_y*vec_y + vec_f0_z*vec_z
    df["cos_f1"] = vec_f1_x*vec_x + vec_f1_y*vec_y + vec_f1_z*vec_z
    
    df["cos_center0"] = vec_center0_x*vec_x + vec_center0_y*vec_y + vec_center0_z*vec_z
    df["cos_center1"] = vec_center1_x*vec_x + vec_center1_y*vec_y + vec_center1_z*vec_z

    return df

def dummies(df, list_cols):
    for col in list_cols:
        df_dummies = pd.get_dummies(df[col], drop_first=True, 
                                    prefix=(str(col)))
        df = pd.concat([df, df_dummies], axis=1)
    return df


def add_qm9_features(df):
    data_qm9 = pd.read_pickle('../input/data.covs.pickle')
    to_drop = ['type', 
               'linear', 
               'atom_index_0', 
               'atom_index_1', 
               'scalar_coupling_constant', 
               'U', 'G', 'H', 
               'mulliken_mean', 'r2', 'U0']
    data_qm9 = data_qm9.drop(columns = to_drop, axis=1)
    data_qm9 = reduce_mem_usage(data_qm9,verbose=False)
    df = pd.merge(df, data_qm9, how='left', on=['molecule_name','id'])
    del data_qm9
    
    df = dummies(df, ['type', 'atom_1'])
    return df

def get_features(df, struct):
    for atom_idx in [0,1]:
        df = map_atom_info(df, struct, atom_idx)
        df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
        struct['c_x'] = struct.groupby('molecule_name')['x'].transform('mean')
        struct['c_y'] = struct.groupby('molecule_name')['y'].transform('mean')
        struct['c_z'] = struct.groupby('molecule_name')['z'].transform('mean')

    df = find_dist(df)
    df = find_closest_atom(df)
    df = add_cos_features(df)
    df = add_qm9_features(df)
    return df

def comp_score (y_true, y_pred, jtype):
    df = pd.DataFrame()
    df['y_true'] , df['y_pred'], df['jtype'] = y_true , y_pred, jtype
    score = 0 
    for t in jtype.unique():
        score_jtype = np.log(mean_absolute_error(df[df.jtype==t]['y_true'],df[df.jtype==t]['y_pred']))
        score += score_jtype
        print(f'{t} : {score_jtype}')
    score /= len(jtype.unique())
    return score

def feat_from_structures(df, st):
    df = pd.merge(df,st,how='left',left_on=['molecule_name','atom_index_0'], right_on=['molecule_name','atom_index'])
    df = pd.merge(df,st,how='left',left_on=['molecule_name','atom_index_1'], right_on=['molecule_name','atom_index'])
    n_atoms = st.groupby(['molecule_name','atom'])['atom'].size().to_frame(name = 'count').reset_index()
    n_atoms_df = n_atoms.pivot_table('count',['molecule_name'], 'atom')
    n_atoms_df.fillna(0,inplace=True)
    df = pd.merge(df,n_atoms_df,on=['molecule_name'],how='left')
    del n_atoms
    gc.collect()
    return df    

In [18]:
all_features = ['type',  'atom_x', 'x_x', 'y_x','z_x', 'n_bonds_x', 'atom_y', 'x_y', 'y_y',
       'z_y', 'n_bonds_y', 'C', 'F', 'H', 'N', 'O', 'distance', 'dist_mean_x','dist_mean_y',
       'x_dist', 'y_dist', 'z_dist', 'x_dist_abs', 'y_dist_abs', 'z_dist_abs','inv_distance3']
cat_features = ['type','atom_x','atom_y']
atomic_radius = {'H': 0.43, 'C': 0.82, 'N': 0.8, 'O': 0.78, 'F': 0.76}
electronegativity = {'H': 2.2, 'C': 2.55, 'N': 3.04, 'O': 3.44, 'F': 3.98}

In [19]:
# kaggle datasets download kenmatsu4/yukawa_interaction

In [20]:
t0 = time()
struct = pd.read_csv('../input/structures.csv')
pipeline_model1 = make_pipeline(MoreStructureProperties(atomic_radius,electronegativity))
pipeline_model2 = make_pipeline(MakeMoreFeatures())
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
struct = pd.read_csv('../input/structures.csv')
structures_yukawa = pd.read_csv('../input/structures_yukawa.csv')
struct = pd.concat([struct, structures_yukawa], axis=1)
del structures_yukawa
struct = reduce_mem_usage(struct,verbose=False)
gc.collect()
train = get_features(train, struct.copy())
test = get_features(test, struct.copy())
y = train['scalar_coupling_constant']
del struct
gc.collect()

struct = pd.read_csv('../input/structures.csv')
struct = pipeline_model1.fit_transform(struct)
train = feat_from_structures(train,struct)
train = pipeline_model2.fit_transform(train.drop(['scalar_coupling_constant'],axis=1), train['scalar_coupling_constant'])
test = feat_from_structures(test,struct)
test = pipeline_model2.transform(test)
train = reduce_mem_usage(train,verbose=False)
test = reduce_mem_usage(test,verbose=False)


  keepdims=keepdims)
  r = func(a, **kwargs)


In [21]:
# kaggle datasets download scaomath/giba-molecular-features

In [23]:
giba_columns = ['inv_dist0', 'inv_dist1', 'inv_distP', 'inv_dist0R', 'inv_dist1R', 'inv_distPR', 'inv_dist0E', 'inv_dist1E', 'inv_distPE', 'linkM0',
         'linkM1', 'min_molecule_atom_0_dist_xyz', 'mean_molecule_atom_0_dist_xyz', 'max_molecule_atom_0_dist_xyz', 'sd_molecule_atom_0_dist_xyz', 'min_molecule_atom_1_dist_xyz',
         'mean_molecule_atom_1_dist_xyz', 'max_molecule_atom_1_dist_xyz', 'sd_molecule_atom_1_dist_xyz', 'coulomb_C.x', 'coulomb_F.x', 'coulomb_H.x', 'coulomb_N.x',
         'coulomb_O.x', 'yukawa_C.x', 'yukawa_F.x', 'yukawa_H.x', 'yukawa_N.x', 'yukawa_O.x', 'vander_C.x', 'vander_F.x', 'vander_H.x', 'vander_N.x', 'vander_O.x',
         'coulomb_C.y', 'coulomb_F.y', 'coulomb_H.y', 'coulomb_N.y', 'coulomb_O.y', 'yukawa_C.y', 'yukawa_F.y', 'yukawa_H.y', 'yukawa_N.y', 'yukawa_O.y', 'vander_C.y',
         'vander_F.y', 'vander_H.y', 'vander_N.y', 'vander_O.y', 'distC0', 'distH0', 'distN0', 'distC1', 'distH1', 'distN1', 'adH1', 'adH2', 'adH3', 'adH4', 'adC1',
         'adC2', 'adC3', 'adC4', 'adN1', 'adN2', 'adN3', 'adN4', 'NC', 'NH', 'NN', 'NF', 'NO']

train_giba_t = pd.read_csv('../input/train_giba.csv.gz', header=0,  usecols=giba_columns)
test_giba_t = pd.read_csv('../input/test_giba.csv.gz', header=0,  usecols=giba_columns)
train_giba_t = reduce_mem_usage(train_giba_t, verbose=False)
test_giba_t = reduce_mem_usage(test_giba_t, verbose=False)

train = pd.concat((train,train_giba_t),axis=1)
test = pd.concat((test,test_giba_t),axis=1)

In [24]:
all_features = ['type',   'x_x', 'y_x','z_x', 'atom_y', 'x_y', 'y_y',
       'z_y', 'n_bonds_y', 'C', 'F', 'H', 'N', 'O', 'distance', 'dist_mean_x','dist_mean_y',
        'x_dist_abs', 'y_dist_abs', 'z_dist_abs','inv_distance3',
       'molecule_atom_1_dist_std_diff','molecule_dist_mean_x',
       'molecule_dist_mean_y','molecule_dist_std_x','molecule_dist_std_y','molecule_atom_0_dist_mean',
       'molecule_atom_1_dist_mean','dist_mean_bond_y',
       'n_no_bonds_x','n_no_bonds_y', 'dist_std_x', 'dist_std_y','dist_min_x','dist_min_y','dist_max_x', 'dist_max_y',
       'molecule_dist_range_x','molecule_dist_range_y', 'dimension_x', 'dimension_y','dimension_z','molecule_dist_mean_bond_x',
       'molecule_dist_mean_bond_x','dist_mean_no_bond_x','dist_mean_no_bond_y',
       'dist_std_bond_y','dist_bond_min_y','dist_bond_max_y',
       'range_dist_bond_y','dist_std_no_bond_x','dist_std_no_bond_y', 'dist_no_bond_min_x','dist_no_bond_min_y','dist_no_bond_max_x',
       'dist_no_bond_max_y', 'range_dist_no_bond_x','range_dist_no_bond_y','dist_median_bond_y','dist_median_x',
       'dist_median_y','dist_median_no_bond_x','dist_median_no_bond_y','molecule_type_dist_min','molecule_type_dist_max',
       'molecule_dist_mean_no_bond_x','molecule_dist_mean_no_bond_y', 'n_diff_y','molecule_atom_index_0_dist_min_div','molecule_atom_index_0_dist_std_div',
        'molecule_atom_index_0_dist_mean','molecule_atom_index_0_dist_max','molecule_atom_index_1_dist_mean','molecule_atom_index_1_dist_max',
       'molecule_atom_index_1_dist_min','molecule_atom_index_1_dist_min_div','molecule_atom_index_1_dist_std_diff','molecule_atom_index_0_dist_mean_diff',
        'molecule_atom_index_1_dist_mean_div','molecule_atom_index_1_dist_min_diff', 'rc_A', 'rc_B', 'rc_C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'zpve', 'Cv',
         'freqs_min', 'freqs_max', 'freqs_mean', 'mulliken_min', 'mulliken_max', 'mulliken_atom_0', 'mulliken_atom_1',
         'dist_C_0_x', 'dist_C_1_x', 'dist_C_2_x', 'dist_C_3_x', 'dist_C_4_x', 'dist_F_0_x', 'dist_F_1_x', 'dist_F_2_x', 'dist_H_0_x',
         'dist_H_1_x', 'dist_H_2_x', 'dist_H_3_x', 'dist_H_4_x', 'dist_N_0_x', 'dist_N_1_x', 'dist_N_2_x', 'dist_N_3_x', 'dist_N_4_x', 'dist_O_0_x', 'dist_O_1_x',
         'dist_O_2_x', 'dist_O_3_x', 'dist_O_4_x', 'dist_C_0_y', 'dist_C_1_y', 'dist_C_2_y', 'dist_C_3_y', 'dist_C_4_y', 'dist_F_0_y', 'dist_F_1_y', 'dist_F_2_y',
         'dist_F_3_y', 'dist_F_4_y', 'dist_H_0_y', 'dist_H_1_y', 'dist_H_2_y', 'dist_H_3_y', 'dist_H_4_y', 'dist_N_0_y', 'dist_N_1_y', 'dist_N_2_y', 'dist_N_3_y',
         'dist_N_4_y', 'dist_O_0_y', 'dist_O_1_y', 'dist_O_2_y', 'dist_O_3_y', 'dist_O_4_y','distance_closest_0', 'distance_closest_1', 'distance_farthest_0',
         'distance_farthest_1','cos_c0_c1', 'cos_f0_f1','cos_c0_f0', 'cos_c1_f1', 'cos_center0_center1', 'cos_c0', 'cos_c1', 'cos_f0', 'cos_f1',
         'cos_center0', 'cos_center1'] + giba_columns

cat_features = ['atom_y']


In [35]:

#カラム内の文字数。デフォルトは50
pd.set_option("display.max_colwidth", 100)

#行数
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
#

In [66]:


to_pickle("../processed/v003/train_kernel_plus_more_feats.pkl", train[[c for c in all_features if c not in c2_list+["type"]]])
to_pickle("../processed/v003/test_kernel_plus_more_feats.pkl", test[[c for c in all_features if c not in c2_list+["type"]]])


In [65]:
ll ../processed/v003/train_kernel_plus_more_feats.pkl

-rw-rw-r-- 1 kenichi.matsui 2026313456 Aug  8 15:12 ../processed/v003/train_kernel_plus_more_feats.pkl


-rw-rw-r-- 1 kenichi.matsui 2194007425 Aug  8 14:58 ../processed/v003/train_kernel_plus_more_feats.pkl


In [64]:
ll ../processed/v003/test_kernel_plus_more_feats.pkl

-rw-rw-r-- 1 kenichi.matsui 1142540652 Aug  8 15:12 ../processed/v003/test_kernel_plus_more_feats.pkl


In [36]:
train[all_features].head()

Unnamed: 0,type,x_x,y_x,z_x,atom_y,x_y,y_y,z_y,n_bonds_y,C,F,H,N,O,distance,dist_mean_x,dist_mean_y,x_dist_abs,y_dist_abs,z_dist_abs,inv_distance3,molecule_atom_1_dist_std_diff,molecule_dist_mean_x,molecule_dist_mean_y,molecule_dist_std_x,molecule_dist_std_y,molecule_atom_0_dist_mean,molecule_atom_1_dist_mean,dist_mean_bond_y,n_no_bonds_x,n_no_bonds_y,dist_std_x,dist_std_y,dist_min_x,dist_min_y,dist_max_x,dist_max_y,molecule_dist_range_x,molecule_dist_range_y,dimension_x,dimension_y,dimension_z,molecule_dist_mean_bond_x,molecule_dist_mean_bond_x.1,dist_mean_no_bond_x,dist_mean_no_bond_y,dist_std_bond_y,dist_bond_min_y,dist_bond_max_y,range_dist_bond_y,dist_std_no_bond_x,dist_std_no_bond_y,dist_no_bond_min_x,dist_no_bond_min_y,dist_no_bond_max_x,dist_no_bond_max_y,range_dist_no_bond_x,range_dist_no_bond_y,dist_median_bond_y,dist_median_x,dist_median_y,dist_median_no_bond_x,dist_median_no_bond_y,molecule_type_dist_min,molecule_type_dist_max,molecule_dist_mean_no_bond_x,molecule_dist_mean_no_bond_y,n_diff_y,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std_div,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_max,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std_diff,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_min_diff,rc_A,rc_B,rc_C,mu,alpha,homo,lumo,gap,zpve,Cv,freqs_min,freqs_max,freqs_mean,mulliken_min,mulliken_max,mulliken_atom_0,mulliken_atom_1,dist_C_0_x,dist_C_1_x,dist_C_2_x,dist_C_3_x,dist_C_4_x,dist_F_0_x,dist_F_1_x,dist_F_2_x,dist_H_0_x,dist_H_1_x,dist_H_2_x,dist_H_3_x,dist_H_4_x,dist_N_0_x,dist_N_1_x,dist_N_2_x,dist_N_3_x,dist_N_4_x,dist_O_0_x,dist_O_1_x,dist_O_2_x,dist_O_3_x,dist_O_4_x,dist_C_0_y,dist_C_1_y,dist_C_2_y,dist_C_3_y,dist_C_4_y,dist_F_0_y,dist_F_1_y,dist_F_2_y,dist_F_3_y,dist_F_4_y,dist_H_0_y,dist_H_1_y,dist_H_2_y,dist_H_3_y,dist_H_4_y,dist_N_0_y,dist_N_1_y,dist_N_2_y,dist_N_3_y,dist_N_4_y,dist_O_0_y,dist_O_1_y,dist_O_2_y,dist_O_3_y,dist_O_4_y,distance_closest_0,distance_closest_1,distance_farthest_0,distance_farthest_1,cos_c0_c1,cos_f0_f1,cos_c0_f0,cos_c1_f1,cos_center0_center1,cos_c0,cos_c1,cos_f0,cos_f1,cos_center0,cos_center1,inv_dist0,inv_dist1,inv_distP,inv_dist0R,inv_dist1R,inv_distPR,inv_dist0E,inv_dist1E,inv_distPE,linkM0,linkM1,min_molecule_atom_0_dist_xyz,mean_molecule_atom_0_dist_xyz,max_molecule_atom_0_dist_xyz,sd_molecule_atom_0_dist_xyz,min_molecule_atom_1_dist_xyz,mean_molecule_atom_1_dist_xyz,max_molecule_atom_1_dist_xyz,sd_molecule_atom_1_dist_xyz,coulomb_C.x,coulomb_F.x,coulomb_H.x,coulomb_N.x,coulomb_O.x,yukawa_C.x,yukawa_F.x,yukawa_H.x,yukawa_N.x,yukawa_O.x,vander_C.x,vander_F.x,vander_H.x,vander_N.x,vander_O.x,coulomb_C.y,coulomb_F.y,coulomb_H.y,coulomb_N.y,coulomb_O.y,yukawa_C.y,yukawa_F.y,yukawa_H.y,yukawa_N.y,yukawa_O.y,vander_C.y,vander_F.y,vander_H.y,vander_N.y,vander_O.y,distC0,distH0,distN0,distC1,distH1,distN1,adH1,adH2,adH3,adH4,adC1,adC2,adC3,adC4,adN1,adN2,adN3,adN4,NC,NH,NN,NF,NO
0,1JHC,0.00215,-0.006031,0.001976,0,-0.012695,1.085938,0.008003,4,1.0,0.0,4.0,0.0,0.0,1.091797,1.610352,1.091797,0.014847,1.091797,0.006027,0.768066,-1.091797,1.610352,1.40332,0.0,0.267822,1.506836,1.091797,1.091797,3,0,0.299316,3e-06,1.091797,1.091797,1.783203,1.091797,0.0,0.518555,1.552734,1.469727,1.783203,1.091797,1.091797,1.783203,,3e-06,1.091797,1.091797,7e-06,1.6e-05,,1.783203,,1.783203,,3.7e-05,,1.091797,1.783203,1.091797,1.783203,,1.091797,1.091797,1.783203,1.783203,2,1.0,0.316406,1.610352,1.783203,1.091797,1.091797,1.091797,1.0,-1.091797,0.518555,1.0,-0.091797,157.711807,157.75,157.75,0.0,13.210938,-0.387695,0.117126,0.504883,0.044739,6.46875,1341.0,3152.0,2182.0,-0.535645,0.133911,0.133911,-0.535645,1.614258,,,,,,,,0.075928,0.075928,0.075867,,,,,,,,,,,,,,,,,,,,,,,0.743652,0.743652,0.743652,0.743652,,,,,,,,,,,,1.091797,1.091797,1.783203,1.091797,-1.0,-0.816406,0.816406,1.0,,-1.0,1.0,-0.816406,1.0,-1.0,,0.770996,0.325439,0.228882,0.003338,0.000843,0.000673,0.107544,0.052856,0.035431,0.0,0.0,1.091797,1.610352,1.783203,0.345703,1.091797,1.091797,1.091797,3e-06,0.916016,0.0,1.682617,0.0,0.0,0.335449,0.0,0.504395,0.0,0.0,0.768066,0.0,0.529297,0.0,0.0,0.0,0.0,3.664062,0.0,0.0,0.0,0.0,1.341797,0.0,0.0,0.0,0.0,3.072266,0.0,0.0,1.091797,,,0.272949,,,1.783203,1.783203,1.783203,,,,,,,,,,1.0,4,,,
1,2JHH,0.00215,-0.006031,0.001976,1,1.011719,1.463867,0.000277,1,1.0,0.0,4.0,0.0,0.0,1.783203,1.610352,1.610352,1.009766,1.469727,0.001699,0.17627,-1.783203,1.610352,1.40332,0.0,0.267822,1.506836,1.783203,1.091797,3,3,0.299316,0.299316,1.091797,1.091797,1.783203,1.783203,0.0,0.518555,1.552734,1.469727,1.783203,1.091797,1.091797,1.783203,1.783203,0.0,1.091797,1.091797,0.0,1.6e-05,1.6e-05,1.783203,1.783203,1.783203,1.783203,3.7e-05,3.8e-05,1.091797,1.783203,1.783203,1.783203,1.783203,1.782227,1.783203,1.783203,1.783203,2,0.612305,0.193726,1.610352,1.783203,1.783203,1.783203,1.783203,1.0,,-0.172852,1.0,-0.783203,157.711807,157.75,157.75,0.0,13.210938,-0.387695,0.117126,0.504883,0.044739,6.46875,1341.0,3152.0,2182.0,-0.535645,0.133911,0.133911,0.133911,1.614258,,,,,,,,0.075928,0.075928,0.075867,,,,,,,,,,,,,1.614258,,,,,,,,,,0.075928,0.075928,0.075867,,,,,,,,,,,,,1.091797,1.091797,1.783203,1.783203,-0.333496,-0.500488,0.816406,0.816895,-0.333496,-0.816406,0.817383,-1.0,0.500488,-0.816406,0.817383,0.770996,5.667969,0.678711,0.003338,1.046875,0.003326,0.107544,0.656738,0.092407,0.0,-0.070435,1.091797,1.610352,1.783203,0.345703,1.783203,1.783203,1.783203,,0.916016,0.0,1.682617,0.0,0.0,0.335449,0.0,0.504395,0.0,0.0,0.768066,0.0,0.529297,0.0,0.0,0.916016,0.0,1.682617,0.0,0.0,0.335449,0.0,0.504395,0.0,0.0,0.768066,0.0,0.529297,0.0,0.0,,1.783203,,,5.347656,,1.783203,1.783203,,,1.091797,,,,,,,,1.0,4,,,
2,2JHH,0.00215,-0.006031,0.001976,1,-0.541016,1.447266,-0.876465,1,1.0,0.0,4.0,0.0,0.0,1.782227,1.610352,1.610352,0.542969,1.453125,0.878418,0.176636,-1.78125,1.610352,1.40332,0.0,0.267822,1.506836,1.783203,1.091797,3,3,0.299316,0.299316,1.091797,1.091797,1.783203,1.783203,0.0,0.518555,1.552734,1.469727,1.783203,1.091797,1.091797,1.783203,1.783203,0.0,1.091797,1.091797,0.0,1.6e-05,5e-06,1.783203,1.783203,1.783203,1.783203,3.7e-05,1e-05,1.091797,1.783203,1.783203,1.783203,1.783203,1.782227,1.783203,1.783203,1.783203,2,0.612793,0.193848,1.610352,1.783203,1.783203,1.783203,1.782227,1.0,-1.78125,-0.171875,1.000977,-0.782227,157.711807,157.75,157.75,0.0,13.210938,-0.387695,0.117126,0.504883,0.044739,6.46875,1341.0,3152.0,2182.0,-0.535645,0.133911,0.133911,0.133911,1.614258,,,,,,,,0.075928,0.075928,0.075867,,,,,,,,,,,,,1.614258,,,,,,,,,,0.075928,0.075928,0.075867,,,,,,,,,,,,,1.091797,1.091797,1.783203,1.783203,-0.333252,-4.1e-05,0.816406,0.816406,-0.333252,-0.816406,0.816895,-0.499756,0.500488,-0.816895,0.816895,0.770996,2.833984,0.605957,0.003338,0.523438,0.003317,0.107544,0.328369,0.081055,0.0,-0.048798,1.091797,1.610352,1.783203,0.345703,1.783203,1.783203,1.783203,7e-06,0.916016,0.0,1.682617,0.0,0.0,0.335449,0.0,0.504395,0.0,0.0,0.768066,0.0,0.529297,0.0,0.0,0.916016,0.0,1.682617,0.0,0.0,0.335449,0.0,0.504395,0.0,0.0,0.768066,0.0,0.529297,0.0,0.0,,1.783203,,,2.673828,,1.783203,1.783203,,,1.091797,,,,,,,,1.0,4,,,
3,2JHH,0.00215,-0.006031,0.001976,1,-0.523926,1.4375,0.90625,1,1.0,0.0,4.0,0.0,0.0,1.783203,1.610352,1.610352,0.525879,1.443359,0.904297,0.17627,-1.783203,1.610352,1.40332,0.0,0.267822,1.506836,1.783203,1.091797,3,3,0.299316,0.299316,1.091797,1.091797,1.783203,1.783203,0.0,0.518555,1.552734,1.469727,1.783203,1.091797,1.091797,1.783203,1.783203,0.0,1.091797,1.091797,0.0,1.6e-05,4e-06,1.783203,1.783203,1.783203,1.783203,3.7e-05,9e-06,1.091797,1.783203,1.783203,1.783203,1.783203,1.782227,1.783203,1.783203,1.783203,2,0.612305,0.193726,1.610352,1.783203,1.783203,1.783203,1.782227,0.999512,-1.782227,-0.172852,1.0,-0.783691,157.711807,157.75,157.75,0.0,13.210938,-0.387695,0.117126,0.504883,0.044739,6.46875,1341.0,3152.0,2182.0,-0.535645,0.133911,0.133911,0.133911,1.614258,,,,,,,,0.075928,0.075928,0.075867,,,,,,,,,,,,,1.614258,,,,,,,,,,0.075928,0.075928,0.075867,,,,,,,,,,,,,1.091797,1.091797,1.783203,1.783203,-0.333008,-0.5,0.816406,0.816406,-0.333008,-0.816406,0.816406,-0.5,0.999512,-0.816406,0.816406,0.770996,1.889648,0.547363,0.003338,0.348877,0.003305,0.107544,0.218994,0.072144,0.0,0.0,1.091797,1.610352,1.783203,0.345703,1.783203,1.783203,1.783203,5e-06,0.916016,0.0,1.682617,0.0,0.0,0.335449,0.0,0.504395,0.0,0.0,0.768066,0.0,0.529297,0.0,0.0,0.916016,0.0,1.682617,0.0,0.0,0.335449,0.0,0.504395,0.0,0.0,0.768066,0.0,0.529297,0.0,0.0,,1.783203,,,1.783203,,1.783203,1.783203,,,1.091797,,,,,,,,1.0,4,,,
4,1JHC,1.011719,1.463867,0.000277,0,-0.012695,1.085938,0.008003,4,1.0,0.0,4.0,0.0,0.0,1.091797,1.610352,1.091797,1.024414,0.37793,0.007729,0.768066,-1.091797,1.610352,1.40332,0.0,0.267822,1.506836,1.091797,1.091797,3,0,0.299316,3e-06,1.091797,1.091797,1.783203,1.091797,0.0,0.518555,1.552734,1.469727,1.783203,1.091797,1.091797,1.783203,,3e-06,1.091797,1.091797,7e-06,1.6e-05,,1.783203,,1.783203,,3.8e-05,,1.091797,1.783203,1.091797,1.783203,,1.091797,1.091797,1.783203,1.783203,2,1.0,0.365479,1.552734,1.783203,1.091797,1.091797,1.091797,1.0,-1.091797,0.460938,1.0,-0.091797,157.711807,157.75,157.75,0.0,13.210938,-0.387695,0.117126,0.504883,0.044739,6.46875,1341.0,3152.0,2182.0,-0.535645,0.133911,0.133911,-0.535645,1.614258,,,,,,,,0.075928,0.075928,0.075867,,,,,,,,,,,,,,,,,,,,,,,0.743652,0.743652,0.743652,0.743652,,,,,,,,,,,,1.091797,1.091797,1.783203,1.091797,0.333496,0.000189,0.816895,1.0,,-1.000977,-0.333496,-0.816895,-0.333496,-1.000977,,0.89209,0.325439,0.238525,0.003347,0.000843,0.000673,0.128662,0.052856,0.037445,0.0,0.0,1.091797,1.552734,1.783203,0.39917,1.091797,1.091797,1.091797,3e-06,0.916016,0.0,1.682617,0.0,0.0,0.335449,0.0,0.504395,0.0,0.0,0.768066,0.0,0.529297,0.0,0.0,0.0,0.0,3.664062,0.0,0.0,0.0,0.0,1.341797,0.0,0.0,0.0,0.0,3.072266,0.0,0.0,1.091797,,,0.272949,,,1.783203,1.783203,,,,,,,,,,,1.0,4,,,


In [31]:
train[all_features].columns.tolist()

['type',
 'x_x',
 'y_x',
 'z_x',
 'atom_y',
 'x_y',
 'y_y',
 'z_y',
 'n_bonds_y',
 'C',
 'F',
 'H',
 'N',
 'O',
 'distance',
 'dist_mean_x',
 'dist_mean_y',
 'x_dist_abs',
 'y_dist_abs',
 'z_dist_abs',
 'inv_distance3',
 'molecule_atom_1_dist_std_diff',
 'molecule_dist_mean_x',
 'molecule_dist_mean_y',
 'molecule_dist_std_x',
 'molecule_dist_std_y',
 'molecule_atom_0_dist_mean',
 'molecule_atom_1_dist_mean',
 'dist_mean_bond_y',
 'n_no_bonds_x',
 'n_no_bonds_y',
 'dist_std_x',
 'dist_std_y',
 'dist_min_x',
 'dist_min_y',
 'dist_max_x',
 'dist_max_y',
 'molecule_dist_range_x',
 'molecule_dist_range_y',
 'dimension_x',
 'dimension_y',
 'dimension_z',
 'molecule_dist_mean_bond_x',
 'molecule_dist_mean_bond_x',
 'dist_mean_no_bond_x',
 'dist_mean_no_bond_y',
 'dist_std_bond_y',
 'dist_bond_min_y',
 'dist_bond_max_y',
 'range_dist_bond_y',
 'dist_std_no_bond_x',
 'dist_std_no_bond_y',
 'dist_no_bond_min_x',
 'dist_no_bond_min_y',
 'dist_no_bond_max_x',
 'dist_no_bond_max_y',
 'range_dist_no

In [33]:
!ls -ltr ../processed/v003/v003_078/

total 8154752
-rw-rw-r-- 1 kenichi.matsui kenichi.matsui 5455898831 Jul 26 18:39 train_compact_v003_078_yiemon_123J_HnJ_H123J.pkl
-rw-rw-r-- 1 kenichi.matsui kenichi.matsui 2894553802 Jul 26 18:40 test_compact_v003_078_yiemon_123J_HnJ_H123J.pkl


In [68]:
train.head()[[c for c in all_features if c not in c2_list+["type"]]].columns.tolist()

['x_x',
 'y_x',
 'z_x',
 'x_y',
 'y_y',
 'z_y',
 'C',
 'F',
 'H',
 'N',
 'O',
 'dist_mean_x',
 'dist_mean_y',
 'x_dist_abs',
 'y_dist_abs',
 'z_dist_abs',
 'inv_distance3',
 'molecule_dist_mean_x',
 'molecule_dist_mean_y',
 'molecule_dist_std_x',
 'molecule_dist_std_y',
 'molecule_atom_0_dist_mean',
 'molecule_atom_1_dist_mean',
 'dist_mean_bond_y',
 'n_no_bonds_x',
 'n_no_bonds_y',
 'dist_std_x',
 'dist_std_y',
 'dist_min_y',
 'dist_max_x',
 'dist_max_y',
 'molecule_dist_range_x',
 'molecule_dist_range_y',
 'dimension_x',
 'dimension_y',
 'dimension_z',
 'dist_mean_no_bond_x',
 'dist_mean_no_bond_y',
 'dist_std_bond_y',
 'dist_bond_min_y',
 'dist_bond_max_y',
 'range_dist_bond_y',
 'dist_std_no_bond_x',
 'dist_std_no_bond_y',
 'dist_no_bond_min_y',
 'dist_no_bond_max_x',
 'dist_no_bond_max_y',
 'range_dist_no_bond_x',
 'range_dist_no_bond_y',
 'dist_median_bond_y',
 'dist_median_x',
 'dist_median_y',
 'dist_median_no_bond_x',
 'dist_median_no_bond_y',
 'molecule_dist_mean_no_bond_x',


In [34]:
train_all = unpickle("../processed/v003/v003_078/train_compact_v003_078_yiemon_123J_HnJ_H123J.pkl")

In [44]:
train_all_sampled.head()

Unnamed: 0,1J1st_AveSmallestBondAngle_diff,1J1st_AverageBondAngle,1J1st_IsAxial,1J1st_IsPolarHydrogen,1J1st_MemberOfRingCount,1J1st_MemberOfRingSize,1J1st_SmallestBondAngle,1J_ex1_Angle_0_1_max,1J_ex1_Angle_0_1_mean,1J_ex1_Angle_0_1_min,1J_ex1_Angle_0_1_std,1J_ex1_AverageBondAngle_max,1J_ex1_SmallestBondAngle_max,1J_ex1_angle_fromEx1_max,1J_ex1_angle_fromEx1_max_min_diff,1J_ex1_angle_fromEx1_mean,1J_ex1_angle_fromEx1_min,1J_ex1_angle_fromEx1_std,1J_ex1_cos2T_F_L_EX1_mean,1J_ex1_cos2T_F_L_EX1_std,1J_ex1_dist_0_max_min_diff,1J_ex1_dist_0_mean,1J_ex1_dist_0_min,1J_ex1_dist_0_std,1J_ex1_dist_1_max_min_diff,1J_ex1_dist_1_mean,1J_ex1_dist_1_min,1J_ex1_dist_1_std,1J_ex1_dist_from_first_max,1J_ex1_dist_from_first_max_min_diff,1J_ex1_dist_from_first_mean,1J_ex1_dist_from_first_min,1J_ex1_dist_from_first_std,1Jlast_GetHeteroValence,1Jlast_GetPartialCharge,2J2nd_AverageBondAngle,2J2nd_MemberOfRingSize,2J2nd_SmallestBondAngle,2JExplicitHydrogenCount,2JGetHeteroValence,2JGetPartialCharge,2JHasAlphaBetaUnsat,2Jangle_from2nd_max,2Jangle_from2nd_mean,2Jangle_from2nd_min,2Jd_idx0_2nd,2Jd_idx1_2nd,2Jdist_from2nd_min,2Jdist_from2nd_var,3J2nd_MemberOfRingSize,3J2nd_SmallestBondAngle,3J3rd_AverageBondAngle,3J3rd_MemberOfRingSize,3J3rd_SmallestBondAngle,3JExplicitHydrogenCount,3JGetPartialCharge,3Jangle_from2nd_max,3Jangle_from2nd_mean,3Jd_idx1_2nd,3Jdist_from2nd_mean,3Jdist_from2nd_var,3Jlast_AverageBondAngle,3Jlast_SmallestBondAngle,Angle,a0_nb_inring5,a1_degree,a1_hybridization,a1_inring3,a1_inring4,a1_nb_o,cos2T,cosT,d_O_from1st_0,d_O_from1st_ratio_0,dist,dist_C_0_x,dist_C_0_y,dist_C_1_x,dist_C_1_y,dist_C_2_x,dist_C_2_y,dist_C_3_x,dist_C_3_y,dist_C_4_x,dist_C_4_y,dist_H_0_x,dist_H_0_y,dist_H_1_x,dist_H_1_y,dist_H_2_x,dist_H_2_y,dist_H_3_y,dist_H_4_y,dist_N_0_x,dist_N_0_y,dist_O_0_x,dist_O_0_y,dist_O_1_x,dist_O_1_y,dist_to_type_mean,eem2015ba_1,eem2015bm_0,eem2015bm_1,eem2015bn_1,eem2015ha_1,eem2015hm_1,eem_0,eem_1,f006:dist_from_origin_1,gasteiger_0,gasteiger_1,interBond_EquibLength,interBond_Length,mean_angle_C_from2nd,mean_angle_O_from2nd,mean_dist_C_from1st,mean_dist_C_from2nd,mean_dist_ratio_C_from2nd,mean_dist_ratio_O_from1st,mmff94_0,mmff94_1,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std_diff,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std_diff,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std_diff,molecule_type_0_dist_std_diff,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std_diff,pca_exp_3,qeq_1,qtpie_1,sum_circle_size,tda_max_radius,type,scalar_coupling_constant,fc,molecule_name
2018361,,,,,,,,117.354234,115.454584,113.564212,1.547284,103.329465,60.243186,122.109469,62.596198,103.029488,59.513272,25.002846,0.453793,0.727104,0.017454,2.193193,2.183876,0.007174,0.029978,1.49266,1.477086,0.012266,3.525028,2.436048,2.292019,1.088979,0.839578,,,107.667733,3.0,59.851326,1.0,1.0,0.054829,0.0,117.354234,115.454584,113.564212,1.088979,1.493832,1.088979,0.030668,,,,,,,,,,,,,,,115.445304,0,4,4,1,0,0,,,,,2.194374,0.843258,0.448122,0.209674,0.435307,0.207672,0.434388,0.139358,0.140516,0.136653,0.075331,0.180952,0.230728,0.177073,0.215464,0.164748,0.21513,0.213781,0.213512,0.206362,0.455189,0.08841,0.102764,0.0,0.0,1.001816,0.226199,0.192209,0.143033,0.227433,0.270393,0.224857,0.120723,0.251042,1.471788,0.111633,-0.041196,,,113.564212,,2.183876,1.507064,1.383923,,0.1,-0.032,-1.110603,0.493886,-1.411701,3.525028,1.330654,1.606393,1.111031,1.088979,-1.105395,0.49626,-1.48228,2.194374,1.0,2.081851,-0.112524,0.948722,-2.161606,-2.002502,2.253001,1.893454,-2.109758,0.995786,-2.92602,2.780774,6,0.348445,2,0.620025,0.694536,dsgdb9nsd_063628
736101,11.770002,119.999697,0.0,0.0,1.0,5.0,108.229695,134.634992,125.884697,117.134402,8.750295,112.103732,106.309562,129.676753,25.468501,114.098626,104.208252,11.148215,0.999954,5.3e-05,0.155753,2.16874,2.090864,0.077877,0.011806,1.362497,1.356593,0.005903,3.298918,2.223401,2.339039,1.075517,1.033831,1.0,0.115077,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,3,3,0,0,1,,,2.090864,1.944055,1.075517,0.864502,0.543376,0.198126,0.217268,0.105499,0.149626,0.104485,0.108185,0.10068,0.055866,0.091575,0.864502,0.080174,0.103406,0.061447,0.100401,0.093446,0.087202,0.091888,0.20067,0.228743,0.53404,0.132632,0.099923,0.984095,0.36652,0.214527,0.047959,0.001858,0.458467,0.117794,0.168143,0.067513,1.302852,0.200236,0.082166,,,134.634992,117.134402,2.246617,1.356593,1.261341,1.944055,0.15,-0.01,0.0,1.0,-0.238874,3.298918,2.223401,3.067287,2.39258,1.075517,0.0,1.0,-0.142539,3.386387,3.148614,1.075517,0.0,1.0,0.002224,-1.06739,1.097168,1.075517,-1.06739,0.374686,-0.207481,0.180242,8,0.749941,0,142.45,141.009,dsgdb9nsd_024154
34478,,,,,,,,109.495447,108.278769,106.368607,1.367472,72.955393,190.383316,113.115572,5.506844,110.369371,107.608729,1.838133,0.327868,0.755366,0.398206,2.020405,1.757735,0.185768,0.426962,1.383063,1.099068,0.200816,3.393559,2.297015,2.236695,1.096544,0.848115,,,109.43424,0.0,106.368607,2.0,0.0,-0.001434,0.0,109.495447,108.278769,106.368607,1.096544,1.52603,1.096544,0.045638,,,,,,,,,,,,,,,109.495447,0,4,4,0,0,1,,,,,2.155941,0.831664,0.429412,0.216829,0.426829,0.215142,0.154157,0.131888,0.0,0.0,0.0,0.323663,0.830936,0.161853,0.261545,0.155443,0.215142,0.214447,0.213961,0.0,0.0,0.146419,0.484859,0.086834,0.170198,0.98427,0.303388,0.22744,-0.042185,0.059516,0.377946,0.050408,0.16823,0.074659,0.606113,0.049925,0.020367,,,108.972252,,2.14754,1.52409,1.389904,,0.0,0.28,-1.060351,0.508172,-1.516519,2.753577,0.597636,1.277204,0.991676,1.096544,-1.059397,0.508615,-1.519018,2.800362,1.298905,1.097024,-1.058917,0.508838,-1.655637,-1.994182,2.171334,1.906584,-2.058499,0.546429,-0.923394,0.920034,0,0.198328,2,1.66054,1.52336,dsgdb9nsd_001581
4223141,,,,,,,,109.149608,108.965397,108.610363,0.251106,111.382022,109.665265,115.470514,7.495544,111.51273,107.97497,2.648591,0.307527,0.714445,0.096885,2.130258,2.066914,0.044817,0.103662,1.499248,1.431628,0.04785,3.072382,1.969992,2.231972,1.10239,0.764448,,,,,,,,,,,,,,,,,0.0,107.34119,109.455825,0.0,107.564512,1.0,0.064096,109.149608,108.965397,2.168947,1.400034,0.031248,0.0,360.0,,0,1,1,0,0,0,0.999865,-0.999966,2.066914,1.874939,3.072382,0.822867,0.834637,0.214323,0.21257,0.213583,0.130482,0.153145,0.080465,0.132268,0.05513,0.188146,0.317808,0.16085,0.316904,0.157836,0.145022,0.114608,0.105937,0.097587,0.043547,0.234075,0.134997,0.081289,0.03768,1.137026,0.071789,0.197983,0.228075,0.236794,0.057562,0.232001,0.112411,0.180544,2.940701,0.09864,0.034832,1.52,1.530827,109.142914,108.610363,2.16193,1.533058,1.390668,1.874939,0.0,0.0,-1.323593,0.569197,-2.547861,3.072382,0.0,1.0,0.795435,1.10239,-1.969992,0.358806,-2.509388,3.072382,1.0,1.773854,-1.298528,0.577355,-2.323405,-2.770443,3.079241,2.478097,-2.791973,0.507826,-0.821836,0.723888,0,0.490086,6,10.2263,10.9611,dsgdb9nsd_118195
3971515,,,,,,,,118.513196,116.730192,115.028963,1.423607,72.720098,159.931661,131.972192,72.216174,102.501815,59.756018,30.488196,0.395167,0.750678,0.401744,2.091797,1.829722,0.185449,0.419711,1.363088,1.084714,0.196848,3.561094,2.476677,2.201298,1.084417,0.883915,,,107.647018,3.0,60.205017,2.0,0.0,-0.032567,0.0,118.513196,116.730192,115.028963,1.084417,1.504426,1.084417,0.043623,,,,,,,,,,,,,,,116.648416,0,4,4,1,1,0,,,,,2.214203,0.850368,0.445511,0.20397,0.441833,0.200826,0.440365,0.119516,0.430706,0.078856,0.145767,0.298696,0.20397,0.155466,0.20352,0.102539,0.20284,0.201891,0.200571,0.0,0.0,0.096012,0.227063,0.064838,0.157968,1.010868,-0.083771,0.2367,-0.082288,-0.02282,-0.084,-0.0941,0.178962,-0.042062,0.910799,0.055605,0.131285,,,118.513196,,2.231466,1.500125,1.383346,,0.1,-0.076,-1.129823,0.489738,-1.408287,3.561094,1.346892,1.608297,1.082446,1.084417,-1.129785,0.489755,-1.294363,3.034117,1.370298,2.214203,0.0,1.0,-1.877787,-2.044902,2.232882,1.926431,-2.129906,0.76432,1.060685,-0.966911,7,0.536217,2,-1.18808,-1.22537,dsgdb9nsd_112215


In [38]:
train_all_sampled = train_all.sample(frac=0.05)

In [46]:
train_target = train[all_features].loc[train_all_sampled.index]

In [58]:
c1_list = []
c2_list = []
for c1 in train_all_sampled.columns:
    for c2 in train_target.columns:
        if c2 in c2_list: continue
        corr = pd.concat([train_all_sampled[c1], train_target[c2]], axis=1).corr()
        if corr.shape[0]!=2: continue
        corr_val = corr.iloc[1,0]
        if corr_val >=0.99:
            print(c1, c2)
            c1_list += [c1]
            c2_list += [c2]
            break

1J1st_IsPolarHydrogen atom_y
1J_ex1_Angle_0_1_max dist_O_4_x
1J_ex1_dist_0_min dist_no_bond_min_x
1J_ex1_dist_from_first_min dist_min_x
a1_degree n_bonds_y
dist distance
dist_to_type_mean dist_O_4_y
molecule_atom_1_dist_std_diff molecule_atom_1_dist_std_diff
molecule_atom_index_0_dist_max molecule_atom_index_0_dist_max
molecule_atom_index_0_dist_min distance_closest_0
molecule_atom_index_0_dist_min_div molecule_atom_index_0_dist_min_div
molecule_atom_index_1_dist_max molecule_atom_index_1_dist_max
molecule_atom_index_1_dist_min molecule_atom_index_1_dist_min
molecule_atom_index_1_dist_min_div molecule_atom_index_1_dist_min_div
molecule_atom_index_1_dist_std_diff molecule_atom_index_1_dist_std_diff
molecule_type_dist_max molecule_type_dist_max
molecule_type_dist_min molecule_type_dist_min
molecule_name molecule_dist_mean_bond_x


In [59]:
for c1, c2 in zip(c1_list, c2_list):
    corr = pd.concat([train_all_sampled[c1], train_target[c2]], axis=1).corr()
    corr_val = corr.iloc[1,0]
    print(c1, c2, corr_val)

1J1st_IsPolarHydrogen atom_y 1.0
1J_ex1_Angle_0_1_max dist_O_4_x 1.0000000000000002
1J_ex1_dist_0_min dist_no_bond_min_x 0.9994878373364879
1J_ex1_dist_from_first_min dist_min_x 0.9998797562993752
a1_degree n_bonds_y 0.9998930508796093
dist distance 0.9999991883437129
dist_to_type_mean dist_O_4_y 0.9995328415199541
molecule_atom_1_dist_std_diff molecule_atom_1_dist_std_diff 0.9999990198779521
molecule_atom_index_0_dist_max molecule_atom_index_0_dist_max 0.9999895722370652
molecule_atom_index_0_dist_min distance_closest_0 0.999987764707625
molecule_atom_index_0_dist_min_div molecule_atom_index_0_dist_min_div 0.9999993163300608
molecule_atom_index_1_dist_max molecule_atom_index_1_dist_max 0.9999980235271952
molecule_atom_index_1_dist_min molecule_atom_index_1_dist_min 0.9999991059660387
molecule_atom_index_1_dist_min_div molecule_atom_index_1_dist_min_div 0.9999990540474628
molecule_atom_index_1_dist_std_diff molecule_atom_index_1_dist_std_diff 0.9999990341548115
molecule_type_dist_max m

In [61]:
c2_list

['atom_y',
 'dist_O_4_x',
 'dist_no_bond_min_x',
 'dist_min_x',
 'n_bonds_y',
 'distance',
 'dist_O_4_y',
 'molecule_atom_1_dist_std_diff',
 'molecule_atom_index_0_dist_max',
 'distance_closest_0',
 'molecule_atom_index_0_dist_min_div',
 'molecule_atom_index_1_dist_max',
 'molecule_atom_index_1_dist_min',
 'molecule_atom_index_1_dist_min_div',
 'molecule_atom_index_1_dist_std_diff',
 'molecule_type_dist_max',
 'molecule_type_dist_min',
 'molecule_dist_mean_bond_x']

In [48]:
corr

Unnamed: 0,1J1st_AveSmallestBondAngle_diff
1J1st_AveSmallestBondAngle_diff,1.0


In [43]:
pd.concat([train_all_sampled[c1], train_target[c2]], axis=1).head()

Unnamed: 0,1J1st_AveSmallestBondAngle_diff,type
20,,
40,,
50,2.49908,
56,1.747527,
70,0.0,


In [13]:
data_qm9 = pd.read_pickle('../input/data.covs.pickle')

In [14]:
data_qm9.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,rc_A,rc_B,rc_C,mu,...,Cv,freqs_min,freqs_max,freqs_mean,linear,mulliken_min,mulliken_max,mulliken_mean,mulliken_atom_0,mulliken_atom_1
0,5174511,dsgdb9nsd_033805,11,7,2JHC,,3.54257,1.50643,1.34544,4.4029,...,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.075457,-0.088328
1,5174510,dsgdb9nsd_033805,11,6,3JHC,,3.54257,1.50643,1.34544,4.4029,...,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.075457,-0.101114
2,5174509,dsgdb9nsd_033805,11,5,2JHC,,3.54257,1.50643,1.34544,4.4029,...,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.075457,-0.088328
3,5174508,dsgdb9nsd_033805,11,4,1JHC,,3.54257,1.50643,1.34544,4.4029,...,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.075457,-0.139609
4,5174507,dsgdb9nsd_033805,11,3,2JHC,,3.54257,1.50643,1.34544,4.4029,...,27.528,155.249,3252.0483,1324.421867,1.0,-0.342191,0.15667,-6.25e-08,0.075457,-0.221362


In [15]:
data_qm9.shape


(7163689, 30)

In [16]:
4658147 + 2505542

7163689