## Libraries

In [1]:
import numpy as np
import pandas as pd
import os
import sys

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
# pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn import metrics
from sklearn import linear_model
import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from IPython.display import HTML
import json
# import altair as alt

import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

# alt.renderers.enable('notebook')

sys.path.append('..')
from lib.line_notif import send_message
from lib.utils import reduce_mem_usage, current_time, unpickle, to_pickle

## All function used in this kernel

In [2]:
import os
import time
import datetime
import json
import gc
from numba import jit

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn import metrics

from itertools import product

# import altair as alt
# from altair.vega import v3
from IPython.display import HTML

# # using ideas from this kernel: https://www.kaggle.com/notslush/altair-visualization-2018-stackoverflow-survey
# def prepare_altair():
#     """
#     Helper function to prepare altair for working.
#     """

#     vega_url = 'https://cdn.jsdelivr.net/npm/vega@' + v3.SCHEMA_VERSION
#     vega_lib_url = 'https://cdn.jsdelivr.net/npm/vega-lib'
#     vega_lite_url = 'https://cdn.jsdelivr.net/npm/vega-lite@' + alt.SCHEMA_VERSION
#     vega_embed_url = 'https://cdn.jsdelivr.net/npm/vega-embed@3'
#     noext = "?noext"
    
#     paths = {
#         'vega': vega_url + noext,
#         'vega-lib': vega_lib_url + noext,
#         'vega-lite': vega_lite_url + noext,
#         'vega-embed': vega_embed_url + noext
#     }
    
#     workaround = f"""    requirejs.config({{
#         baseUrl: 'https://cdn.jsdelivr.net/npm/',
#         paths: {paths}
#     }});
#     """
    
#     return workaround
    

def add_autoincrement(render_func):
    # Keep track of unique <div/> IDs
    cache = {}
    def wrapped(chart, id="vega-chart", autoincrement=True):
        if autoincrement:
            if id in cache:
                counter = 1 + cache[id]
                cache[id] = counter
            else:
                cache[id] = 0
            actual_id = id if cache[id] == 0 else id + '-' + str(cache[id])
        else:
            if id not in cache:
                cache[id] = 0
            actual_id = id
        return render_func(chart, id=actual_id)
    # Cache will stay outside and 
    return wrapped
           

# @add_autoincrement
# def render(chart, id="vega-chart"):
#     """
#     Helper function to plot altair visualizations.
#     """
#     chart_str = """
#     <div id="{id}"></div><script>
#     require(["vega-embed"], function(vg_embed) {{
#         const spec = {chart};     
#         vg_embed("#{id}", spec, {{defaultStyle: true}}).catch(console.warn);
#         console.log("anything?");
#     }});
#     console.log("really...anything?");
#     </script>
#     """
#     return HTML(
#         chart_str.format(
#             id=id,
#             chart=json.dumps(chart) if isinstance(chart, dict) else chart.to_json(indent=None)
#         )
#     )
    

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
    

@jit
def fast_auc(y_true, y_prob):
    """
    fast roc_auc computation: https://www.kaggle.com/c/microsoft-malware-prediction/discussion/76013
    """
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    nfalse = 0
    auc = 0
    n = len(y_true)
    for i in range(n):
        y_i = y_true[i]
        nfalse += (1 - y_i)
        auc += y_i * nfalse
    auc /= (nfalse * (n - nfalse))
    return auc


def eval_auc(y_true, y_pred):
    """
    Fast auc eval function for lgb.
    """
    return 'auc', fast_auc(y_true, y_pred), True


def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """
    Fast metric computation for this competition: https://www.kaggle.com/c/champs-scalar-coupling
    Code is from this kernel: https://www.kaggle.com/uberkinder/efficient-metric
    """
    maes = (y_true-y_pred).abs().groupby(types).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()
    

def train_model_regression(X, X_test, y, params, folds, model_type='lgb', eval_metric='mae', columns=None, plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000):
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.
    
    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    
    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]
    
    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'sklearn_scoring_function': metrics.mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                        'catboost_metric_name': 'MSE',
                        'sklearn_scoring_function': metrics.mean_squared_error}
                    }

    
    result_dict = {}
    
    # out-of-fold predictions on train data
    oof = np.zeros(len(X))
    
    # averaged predictions on train data
    prediction = np.zeros(len(X_test))
    
    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    
    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            
        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators = n_estimators, n_jobs = -1)
            model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                    verbose=verbose, early_stopping_rounds=early_stopping_rounds)
            
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            
        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
        
        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            
            y_pred_valid = model.predict(X_valid).reshape(-1,)
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            
            y_pred = model.predict(X_test).reshape(-1,)
        
        if model_type == 'cat':
            model = CatBoostRegressor(iterations=20000,  eval_metric=metrics_dict[eval_metric]['catboost_metric_name'], **params,
                                      loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)
        
        oof[valid_index] = y_pred_valid.reshape(-1,)
        if eval_metric != 'group_mae':
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
        else:
            scores.append(metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred    
        
        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= folds.n_splits
    
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores
    
    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= folds.n_splits
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
            
            result_dict['feature_importance'] = feature_importance
        
    return result_dict
    


def train_model_classification(X, X_test, y, params, folds, model_type='lgb', eval_metric='auc', columns=None, plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000):
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.
    
    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    
    """
    columns = X.columns if columns == None else columns
    X_test = X_test[columns]
    
    # to set up scoring parameters
    metrics_dict = {'auc': {'lgb_metric_name': eval_auc,
                        'catboost_metric_name': 'AUC',
                        'sklearn_scoring_function': metrics.roc_auc_score},
                    }
    
    result_dict = {}
    
    # out-of-fold predictions on train data
    oof = np.zeros((len(X), len(set(y.values))))
    
    # averaged predictions on train data
    prediction = np.zeros((len(X_test), oof.shape[1]))
    
    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    
    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            
        if model_type == 'lgb':
            model = lgb.LGBMClassifier(**params, n_estimators=n_estimators, n_jobs = -1)
            model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                    verbose=verbose, early_stopping_rounds=early_stopping_rounds)
            
            y_pred_valid = model.predict_proba(X_valid)
            y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)
            
        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=n_estimators, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
        
        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            
            y_pred_valid = model.predict(X_valid).reshape(-1,)
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            
            y_pred = model.predict_proba(X_test)
        
        if model_type == 'cat':
            model = CatBoostClassifier(iterations=n_estimators, eval_metric=metrics_dict[eval_metric]['catboost_metric_name'], **params,
                                      loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)
        
        oof[valid_index] = y_pred_valid
        scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid[:, 1]))

        prediction += y_pred    
        
        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= folds.n_splits
    
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores
    
    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= folds.n_splits
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
            
            result_dict['feature_importance'] = feature_importance
        
    return result_dict

# # setting up altair
# workaround = prepare_altair()
# HTML("".join((
#     "<script>",
#     workaround,
#     "</script>",
# )))

In [3]:

def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

def angle_feat(df):
    df_feat = pd.DataFrame({"id":df.id.values}, index=df.index.values)
    for axis in ["x", "y", "z"]:
        df_feat[f"{axis}_diff"] = df[f"{axis}_0"] - df[f"{axis}_1"]

    df_feat["diff_norm"] = (df_feat.x_diff**2 + df_feat.y_diff**2 + df_feat.z_diff**2)**0.5
    df_feat["zero_norm"] = (df.x_0**2 + df.y_0**2 + df.z_0**2)**0.5

    for axis in ["x", "y", "z"]:
        df_feat[f"{axis}_diff"] = df_feat[f"{axis}_diff"].values / df_feat["diff_norm"].values
        df_feat[f"{axis}_0"] = df[f"{axis}_0"].values / df_feat["zero_norm"].values

    df_feat["f004:angle"] = df_feat.x_diff*df_feat.x_0 + df_feat.x_diff*df_feat.y_0 + df_feat.x_diff*df_feat.z_0
    df_feat["f004:angle_abs"] = np.abs(df_feat["f004:angle"])
    return df_feat[["id", "f004:angle", "f004:angle_abs"]]


def angle_feature_conv():
    train_ = pd.read_csv('../input/train.csv')
    test_ = pd.read_csv('../input/test.csv')
    train_ = map_atom_info(train_, 0)
    train_ = map_atom_info(train_, 1)

    test_ = map_atom_info(test_, 0)
    test_ = map_atom_info(test_, 1)
    angle_df_train = angle_feat(train_)
    angle_df_test  = angle_feat(test_)
    return angle_df_train, angle_df_test

def create_features(df):
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')
    
    df[f'molecule_atom_index_0_x_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['x_1'].transform('std')
    df[f'molecule_atom_index_0_y_1_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('mean')
    df[f'molecule_atom_index_0_y_1_mean_diff'] = df[f'molecule_atom_index_0_y_1_mean'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_mean_div'] = df[f'molecule_atom_index_0_y_1_mean'] / df['y_1']
    df[f'molecule_atom_index_0_y_1_max'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('max')
    df[f'molecule_atom_index_0_y_1_max_diff'] = df[f'molecule_atom_index_0_y_1_max'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('std')
    df[f'molecule_atom_index_0_z_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['z_1'].transform('std')
    df[f'molecule_atom_index_0_dist_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('mean')
    df[f'molecule_atom_index_0_dist_mean_diff'] = df[f'molecule_atom_index_0_dist_mean'] - df['dist']
    df[f'molecule_atom_index_0_dist_mean_div'] = df[f'molecule_atom_index_0_dist_mean'] / df['dist']
    df[f'molecule_atom_index_0_dist_max'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('max')
    df[f'molecule_atom_index_0_dist_max_diff'] = df[f'molecule_atom_index_0_dist_max'] - df['dist']
    df[f'molecule_atom_index_0_dist_max_div'] = df[f'molecule_atom_index_0_dist_max'] / df['dist']
    df[f'molecule_atom_index_0_dist_min'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df[f'molecule_atom_index_0_dist_min_diff'] = df[f'molecule_atom_index_0_dist_min'] - df['dist']
    df[f'molecule_atom_index_0_dist_min_div'] = df[f'molecule_atom_index_0_dist_min'] / df['dist']
    df[f'molecule_atom_index_0_dist_std'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('std')
    df[f'molecule_atom_index_0_dist_std_diff'] = df[f'molecule_atom_index_0_dist_std'] - df['dist']
    df[f'molecule_atom_index_0_dist_std_div'] = df[f'molecule_atom_index_0_dist_std'] / df['dist']
    df[f'molecule_atom_index_1_dist_mean'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('mean')
    df[f'molecule_atom_index_1_dist_mean_diff'] = df[f'molecule_atom_index_1_dist_mean'] - df['dist']
    df[f'molecule_atom_index_1_dist_mean_div'] = df[f'molecule_atom_index_1_dist_mean'] / df['dist']
    df[f'molecule_atom_index_1_dist_max'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('max')
    df[f'molecule_atom_index_1_dist_max_diff'] = df[f'molecule_atom_index_1_dist_max'] - df['dist']
    df[f'molecule_atom_index_1_dist_max_div'] = df[f'molecule_atom_index_1_dist_max'] / df['dist']
    df[f'molecule_atom_index_1_dist_min'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('min')
    df[f'molecule_atom_index_1_dist_min_diff'] = df[f'molecule_atom_index_1_dist_min'] - df['dist']
    df[f'molecule_atom_index_1_dist_min_div'] = df[f'molecule_atom_index_1_dist_min'] / df['dist']
    df[f'molecule_atom_index_1_dist_std'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('std')
    df[f'molecule_atom_index_1_dist_std_diff'] = df[f'molecule_atom_index_1_dist_std'] - df['dist']
    df[f'molecule_atom_index_1_dist_std_div'] = df[f'molecule_atom_index_1_dist_std'] / df['dist']
    df[f'molecule_atom_1_dist_mean'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('mean')
    df[f'molecule_atom_1_dist_min'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('min')
    df[f'molecule_atom_1_dist_min_diff'] = df[f'molecule_atom_1_dist_min'] - df['dist']
    df[f'molecule_atom_1_dist_min_div'] = df[f'molecule_atom_1_dist_min'] / df['dist']
    df[f'molecule_atom_1_dist_std'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('std')
    df[f'molecule_atom_1_dist_std_diff'] = df[f'molecule_atom_1_dist_std'] - df['dist']
    df[f'molecule_type_0_dist_std'] = df.groupby(['molecule_name', 'type_0'])['dist'].transform('std')
    df[f'molecule_type_0_dist_std_diff'] = df[f'molecule_type_0_dist_std'] - df['dist']
    df[f'molecule_type_dist_mean'] = df.groupby(['molecule_name', 'type'])['dist'].transform('mean')
    df[f'molecule_type_dist_mean_diff'] = df[f'molecule_type_dist_mean'] - df['dist']
    df[f'molecule_type_dist_mean_div'] = df[f'molecule_type_dist_mean'] / df['dist']
    df[f'molecule_type_dist_max'] = df.groupby(['molecule_name', 'type'])['dist'].transform('max')
    df[f'molecule_type_dist_min'] = df.groupby(['molecule_name', 'type'])['dist'].transform('min')
    df[f'molecule_type_dist_std'] = df.groupby(['molecule_name', 'type'])['dist'].transform('std')
    df[f'molecule_type_dist_std_diff'] = df[f'molecule_type_dist_std'] - df['dist']

    df = reduce_mem_usage(df)
    return df


In [27]:

good_columns = [
'molecule_atom_index_0_dist_min',
'molecule_atom_index_0_dist_max',
'molecule_atom_index_1_dist_min',
'molecule_atom_index_0_dist_mean',
'molecule_atom_index_0_dist_std',
'dist',
'molecule_atom_index_1_dist_std',
'molecule_atom_index_1_dist_max',
'molecule_atom_index_1_dist_mean',
'molecule_atom_index_0_dist_max_diff',
'molecule_atom_index_0_dist_max_div',
'molecule_atom_index_0_dist_std_diff',
'molecule_atom_index_0_dist_std_div',
'atom_0_couples_count',
'molecule_atom_index_0_dist_min_div',
'molecule_atom_index_1_dist_std_diff',
'molecule_atom_index_0_dist_mean_div',
'atom_1_couples_count',
'molecule_atom_index_0_dist_mean_diff',
'molecule_couples',
'atom_index_1',
'molecule_dist_mean',
'molecule_atom_index_1_dist_max_diff',
'molecule_atom_index_0_y_1_std',
'molecule_atom_index_1_dist_mean_diff',
'molecule_atom_index_1_dist_std_div',
'molecule_atom_index_1_dist_mean_div',
'molecule_atom_index_1_dist_min_diff',
'molecule_atom_index_1_dist_min_div',
'molecule_atom_index_1_dist_max_div',
'molecule_atom_index_0_z_1_std',
'y_0',
'molecule_type_dist_std_diff',
'molecule_atom_1_dist_min_diff',
'molecule_atom_index_0_x_1_std',
'molecule_dist_min',
'molecule_atom_index_0_dist_min_diff',
'molecule_atom_index_0_y_1_mean_diff',
'molecule_type_dist_min',
'molecule_atom_1_dist_min_div',
'atom_index_0',
'molecule_dist_max',
'molecule_atom_1_dist_std_diff',
'molecule_type_dist_max',
'molecule_atom_index_0_y_1_max_diff',
'molecule_type_0_dist_std_diff',
'molecule_type_dist_mean_diff',
'molecule_atom_1_dist_mean',
'molecule_atom_index_0_y_1_mean_div',
'molecule_type_dist_mean_div',
'type', "f004:angle", "f004:angle_abs",
"f003:cos_0_1", "f003:cos_1",]


## Data loading and overview

In [5]:
ls ../input

[0m[01;31mdipole_moments.csv.zip[0m                 [01;32mstructures.csv[0m*
[01;31mmagnetic_shielding_tensors.csv.zip[0m     [01;31mstructures.csv.zip[0m
[01;31mmulliken_charges.csv.zip[0m               [01;31mstructures.zip[0m
[01;31mpotential_energy.csv.zip[0m               [01;32mtest.csv[0m*
[01;32msample_submission.csv[0m*                 [01;31mtest.csv.zip[0m
[01;31msample_submission.csv.zip[0m              [01;32mtrain.csv[0m*
[01;32mscalar_coupling_contributions.csv[0m*     [01;31mtrain.csv.zip[0m
[01;31mscalar_coupling_contributions.csv.zip[0m


In [6]:
file_folder = '../input'
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
structures = pd.read_csv(f'{file_folder}/structures.csv')
scalar_coupling_contributions = pd.read_csv(f'{file_folder}/scalar_coupling_contributions.csv')

train = pd.merge(train, scalar_coupling_contributions, how = 'left',
                  left_on  = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'],
                  right_on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])


In [7]:
# potential_energy = pd.read_csv(f'{file_folder}/potential_energy.csv')
# mulliken_charges = pd.read_csv(f'{file_folder}/mulliken_charges.csv')
# magnetic_shielding_tensors = pd.read_csv(f'{file_folder}/magnetic_shielding_tensors.csv')
# dipole_moments = pd.read_csv(f'{file_folder}/dipole_moments.csv')

# fig, ax = plt.subplots(figsize = (20, 10))
# for i, t in enumerate(train['type'].unique()):
#     plt.subplot(2, 4, i + 1);
#     plt.scatter(train.loc[train['type'] == t, 'fc'], train.loc[train['type'] == t, 'scalar_coupling_constant'], label=t);
#     plt.title(f'fc vs target \n for {t} type');

## Feature generation

In the hidden cell below I generate features as in my kernel: https://www.kaggle.com/artgor/brute-force-feature-engineering

In [8]:
train = map_atom_info(train, 0)
train = map_atom_info(train, 1)

test = map_atom_info(test, 0)
test = map_atom_info(test, 1)

train_p_0 = train[['x_0', 'y_0', 'z_0']].values
train_p_1 = train[['x_1', 'y_1', 'z_1']].values
test_p_0 = test[['x_0', 'y_0', 'z_0']].values
test_p_1 = test[['x_1', 'y_1', 'z_1']].values

train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
train['dist_x'] = (train['x_0'] - train['x_1']) ** 2
test['dist_x'] = (test['x_0'] - test['x_1']) ** 2
train['dist_y'] = (train['y_0'] - train['y_1']) ** 2
test['dist_y'] = (test['y_0'] - test['y_1']) ** 2
train['dist_z'] = (train['z_0'] - train['z_1']) ** 2
test['dist_z'] = (test['z_0'] - test['z_1']) ** 2

train['type_0'] = train['type'].apply(lambda x: x[0])
test['type_0'] = test['type'].apply(lambda x: x[0])

train = create_features(train)
test = create_features(test)

angle_df_train, angle_df_test = angle_feature_conv()
train = train.merge(angle_df_train, on="id", how="left")
test = test.merge(angle_df_test, on="id", how="left")


Mem. usage decreased to 861.82 Mb (68.5% reduction)
Mem. usage decreased to 430.10 Mb (68.8% reduction)


In [17]:
train_cos = unpickle("../processed/v001/train_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
test_cos = unpickle("../processed/v001/test_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]

In [18]:
train.shape, test.shape

((4658147, 78), (2505542, 73))

In [19]:
train_cos.shape, test_cos.shape

((4658147, 3), (2505542, 3))

In [20]:
train_cos.head()

Unnamed: 0,id,f003:cos_0_1,f003:cos_1
0,0,0.333335,-0.333335
1,1,-0.333287,0.816482
2,2,-0.333335,0.816496
3,3,-0.333347,0.8165
4,4,0.333352,-0.333352


In [21]:
test_cos.head()

Unnamed: 0,id,f003:cos_0_1,f003:cos_1
0,4658147,1.0,-1.0
1,4658148,-1.0,1.0
2,4658149,-1.0,1.0
3,4658150,-1.0,1.0
4,4658151,1.0,-1.0


In [22]:
train = train.merge(train_cos, on="id", how="left")
test = test.merge(test_cos, on="id", how="left")

In [23]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

for f in ['atom_1', 'type_0', 'type']:
    if f in good_columns:
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

Mem. usage decreased to 866.26 Mb (8.9% reduction)
Mem. usage decreased to 432.49 Mb (9.5% reduction)


In [24]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso,...,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff,f004:angle,f004:angle_abs,f003:cos_0_1,f003:cos_1
0,0,dsgdb9nsd_000001,1,0,0,84.8125,83.0,0.254639,1.258789,0.271973,...,-3e-06,1.0,1.091797,1.091797,3e-06,-1.091797,-0.003864,0.003864,0.333252,-0.333252
1,1,dsgdb9nsd_000001,1,2,3,-11.257812,-11.03125,0.353027,2.857422,-3.433594,...,2.7e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203,0.160889,0.160889,-0.333252,0.816406
2,2,dsgdb9nsd_000001,1,3,3,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,-1e-06,1.0,1.783203,1.783203,1.4e-05,-1.783203,-0.086548,0.086548,-0.333252,0.816406
3,3,dsgdb9nsd_000001,1,4,3,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,-1e-05,1.0,1.783203,1.783203,1.4e-05,-1.783203,-0.083862,0.083862,-0.333252,0.816406
4,4,dsgdb9nsd_000001,2,0,0,84.8125,83.0,0.254639,1.258789,0.271973,...,-2e-06,1.0,1.091797,1.091797,3e-06,-1.091797,1.305664,1.305664,0.333252,-0.333252


In [25]:
test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,...,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff,f004:angle,f004:angle_abs,f003:cos_0_1,f003:cos_1
0,4658147,dsgdb9nsd_000004,2,0,2,H,-1.662109,0.0,1.0,C,...,0.0,1.0,2.261719,2.261719,0.0,-2.261719,0.341064,0.341064,1.0,-1.0
1,4658148,dsgdb9nsd_000004,2,1,0,H,-1.662109,0.0,1.0,C,...,0.0,1.0,1.0625,1.0625,0.0,-1.0625,0.341064,0.341064,-1.0,1.0
2,4658149,dsgdb9nsd_000004,2,3,6,H,-1.662109,0.0,1.0,H,...,0.0,1.0,3.324219,3.324219,,,0.341064,0.341064,-1.0,1.0
3,4658150,dsgdb9nsd_000004,3,0,0,H,1.662109,0.0,1.0,C,...,0.0,1.0,1.0625,1.0625,0.0,-1.0625,1.37207,1.37207,-1.0,1.0
4,4658151,dsgdb9nsd_000004,3,1,2,H,1.662109,0.0,1.0,C,...,0.0,1.0,2.261719,2.261719,0.0,-2.261719,1.37207,1.37207,1.0,-1.0


In [29]:
#mkdir ../processed/v003/

In [38]:
to_pickle("../processed/v003/train_v003.pkl", train)
to_pickle("../processed/v003/test_v003.pkl", test)

In [31]:
X = train[good_columns].copy()
y = train['scalar_coupling_constant']
y_fc = train['fc']
X_test = test[good_columns].copy()

In [32]:
# del train, test
# gc.collect()

## Basic model


In [33]:
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)

In [34]:
# params = {'num_leaves': 128,
#           'min_child_samples': 79,
#           'objective': 'regression',
#           'max_depth': 9,
#           'learning_rate': 0.2,
#           "boosting_type": "gbdt",
#           "subsample_freq": 1,
#           "subsample": 0.9,
#           "bagging_seed": 11,
#           "metric": 'mae',
#           "verbosity": -1,
#           'reg_alpha': 0.1,
#           'reg_lambda': 0.3,
#           'colsample_bytree': 1.0
#          }
# result_dict_lgb = train_model_regression(X=X, X_test=X_test, y=y, params=params, folds=folds, model_type='lgb', eval_metric='group_mae', plot_feature_importance=True,
#                                                       verbose=500, early_stopping_rounds=200, n_estimators=1500)

## Create out of fold feature

In [None]:
params = {'num_leaves': 128,
          'min_child_samples': 79,
          'objective': 'regression',
          'max_depth': 9,
          'learning_rate': 0.2,
          "boosting_type": "gbdt",
          "subsample_freq": 1,
          "subsample": 0.9,
          "bagging_seed": 11,
          "metric": 'mae',
          "verbosity": -1,
          'reg_alpha': 0.1,
          'reg_lambda': 0.3,
          'colsample_bytree': 1.0,
          'num_threads' : -1,
         }
result_dict_lgb1 = train_model_regression(X=X, 
                                          X_test=X_test, 
                                          y=y_fc, 
                                          params=params, 
                                          folds=folds, 
                                          model_type='lgb', 
                                          eval_metric='group_mae', 
                                          plot_feature_importance=False,
                                          verbose=500, 
                                          early_stopping_rounds=200, 
                                          n_estimators=5000)



In [39]:
X['oof_fc'] = result_dict_lgb1['oof']
X_test['oof_fc'] = result_dict_lgb1['prediction']

In [40]:
to_pickle("../processed/v003/train_oof_fc.pkl", X['oof_fc'])
to_pickle("../processed/v003/test_oof_fc.pkl", X_test['oof_fc'])

## Training separate models for each type

In [41]:
X_short = pd.DataFrame({'ind': list(X.index), 'type': X['type'].values, 'oof': [0] * len(X), 'target': y.values})
X_short_test = pd.DataFrame({'ind': list(X_test.index), 'type': X_test['type'].values, 'prediction': [0] * len(X_test)})
for t in X['type'].unique():
    print(f'Training of type {t}')
    X_t = X.loc[X['type'] == t]
    X_test_t = X_test.loc[X_test['type'] == t]
    y_t = X_short.loc[X_short['type'] == t, 'target']
    
    result_dict_lgb3 = train_model_regression(X=X_t, 
                                              X_test=X_test_t, 
                                              y=y_t, 
                                              params=params, 
                                              folds=folds, 
                                              model_type='lgb', 
                                              eval_metric='group_mae', 
                                              plot_feature_importance=False,
                                              verbose=500, 
                                              early_stopping_rounds=200, 
                                              n_estimators=10000)
    X_short.loc[X_short['type'] == t, 'oof'] = result_dict_lgb3['oof']
    X_short_test.loc[X_short_test['type'] == t, 'prediction'] = result_dict_lgb3['prediction']
    
sub['scalar_coupling_constant'] = X_short_test['prediction']
sub.to_csv('submission_t.csv', index=False)
sub.head()

Training of type 0
Fold 1 started at Sun Jun  9 14:08:18 2019
Training until validation scores don't improve for 200 rounds.
[500]	training's l1: 1.3439	valid_1's l1: 1.56969
[1000]	training's l1: 1.1373	valid_1's l1: 1.5436
[1500]	training's l1: 0.972497	valid_1's l1: 1.52225
[2000]	training's l1: 0.840186	valid_1's l1: 1.50481
[2500]	training's l1: 0.731405	valid_1's l1: 1.49131
[3000]	training's l1: 0.639926	valid_1's l1: 1.48156
[3500]	training's l1: 0.563047	valid_1's l1: 1.47215
[4000]	training's l1: 0.497852	valid_1's l1: 1.4652
[4500]	training's l1: 0.442307	valid_1's l1: 1.45987
[5000]	training's l1: 0.393986	valid_1's l1: 1.45527
[5500]	training's l1: 0.351996	valid_1's l1: 1.45104
[6000]	training's l1: 0.315039	valid_1's l1: 1.44829
[6500]	training's l1: 0.282614	valid_1's l1: 1.44573
[7000]	training's l1: 0.254175	valid_1's l1: 1.44337
[7500]	training's l1: 0.228931	valid_1's l1: 1.44134
[8000]	training's l1: 0.206604	valid_1's l1: 1.43969
[8500]	training's l1: 0.186769	val

[4500]	training's l1: 0.0448155	valid_1's l1: 0.308275
[5000]	training's l1: 0.0377257	valid_1's l1: 0.307806
[5500]	training's l1: 0.0319015	valid_1's l1: 0.307427
[6000]	training's l1: 0.0270154	valid_1's l1: 0.307131
[6500]	training's l1: 0.0229938	valid_1's l1: 0.306911
[7000]	training's l1: 0.0196614	valid_1's l1: 0.306722
[7500]	training's l1: 0.0168808	valid_1's l1: 0.306595
[8000]	training's l1: 0.014508	valid_1's l1: 0.306489
[8500]	training's l1: 0.0125456	valid_1's l1: 0.306391
[9000]	training's l1: 0.0109062	valid_1's l1: 0.306319
[9500]	training's l1: 0.00949863	valid_1's l1: 0.306268
[10000]	training's l1: 0.00834099	valid_1's l1: 0.306209
Did not meet early stopping. Best iteration is:
[10000]	training's l1: 0.00834099	valid_1's l1: 0.306209
Fold 3 started at Sun Jun  9 14:44:08 2019
Training until validation scores don't improve for 200 rounds.
[500]	training's l1: 0.265349	valid_1's l1: 0.357548
[1000]	training's l1: 0.191935	valid_1's l1: 0.333481
[1500]	training's l1

[6000]	training's l1: 0.00170905	valid_1's l1: 0.774939
[6500]	training's l1: 0.00157441	valid_1's l1: 0.774933
[7000]	training's l1: 0.00147252	valid_1's l1: 0.774927
[7500]	training's l1: 0.00139234	valid_1's l1: 0.774921
Early stopping, best iteration is:
[7506]	training's l1: 0.00139158	valid_1's l1: 0.774921
CV mean score: -0.2647, std: 0.0104.
Training of type 4
Fold 1 started at Sun Jun  9 15:03:01 2019
Training until validation scores don't improve for 200 rounds.
[500]	training's l1: 0.151242	valid_1's l1: 0.300783
[1000]	training's l1: 0.0852687	valid_1's l1: 0.288796
[1500]	training's l1: 0.0515278	valid_1's l1: 0.28476
[2000]	training's l1: 0.0317767	valid_1's l1: 0.282879
[2500]	training's l1: 0.0201597	valid_1's l1: 0.282115
[3000]	training's l1: 0.0131542	valid_1's l1: 0.281571
[3500]	training's l1: 0.00895598	valid_1's l1: 0.281342
[4000]	training's l1: 0.00632401	valid_1's l1: 0.28122
[4500]	training's l1: 0.00473522	valid_1's l1: 0.281162
[5000]	training's l1: 0.00368

[7000]	training's l1: 0.147198	valid_1's l1: 0.535459
[7500]	training's l1: 0.136107	valid_1's l1: 0.534111
[8000]	training's l1: 0.126059	valid_1's l1: 0.53288
[8500]	training's l1: 0.116847	valid_1's l1: 0.53182
[9000]	training's l1: 0.108439	valid_1's l1: 0.530922
[9500]	training's l1: 0.100719	valid_1's l1: 0.530092
[10000]	training's l1: 0.0936075	valid_1's l1: 0.529335
Did not meet early stopping. Best iteration is:
[10000]	training's l1: 0.0936075	valid_1's l1: 0.529335
Fold 3 started at Sun Jun  9 15:26:07 2019
Training until validation scores don't improve for 200 rounds.
[500]	training's l1: 0.572911	valid_1's l1: 0.649735
[1000]	training's l1: 0.488002	valid_1's l1: 0.618528
[1500]	training's l1: 0.423814	valid_1's l1: 0.597557
[2000]	training's l1: 0.373397	valid_1's l1: 0.58326
[2500]	training's l1: 0.333175	valid_1's l1: 0.572775
[3000]	training's l1: 0.29912	valid_1's l1: 0.564557
[3500]	training's l1: 0.270255	valid_1's l1: 0.55837
[4000]	training's l1: 0.245371	valid_1

[4500]	training's l1: 0.06734	valid_1's l1: 0.278761
[5000]	training's l1: 0.0591526	valid_1's l1: 0.277845
[5500]	training's l1: 0.0521366	valid_1's l1: 0.277107
[6000]	training's l1: 0.0460583	valid_1's l1: 0.276559
[6500]	training's l1: 0.0408439	valid_1's l1: 0.276133
[7000]	training's l1: 0.036367	valid_1's l1: 0.275768
[7500]	training's l1: 0.0324444	valid_1's l1: 0.27551
[8000]	training's l1: 0.0289656	valid_1's l1: 0.275257
[8500]	training's l1: 0.025915	valid_1's l1: 0.27506
[9000]	training's l1: 0.0232508	valid_1's l1: 0.274882
[9500]	training's l1: 0.020905	valid_1's l1: 0.274715
[10000]	training's l1: 0.0188335	valid_1's l1: 0.274592
Did not meet early stopping. Best iteration is:
[10000]	training's l1: 0.0188335	valid_1's l1: 0.274592
Fold 5 started at Sun Jun  9 16:04:48 2019
Training until validation scores don't improve for 200 rounds.
[500]	training's l1: 0.273996	valid_1's l1: 0.337295
[1000]	training's l1: 0.209368	valid_1's l1: 0.310392
[1500]	training's l1: 0.16915

[8500]	training's l1: 0.15921	valid_1's l1: 0.517846
[9000]	training's l1: 0.149783	valid_1's l1: 0.516482
[9500]	training's l1: 0.141163	valid_1's l1: 0.515402
[10000]	training's l1: 0.132999	valid_1's l1: 0.514307
Did not meet early stopping. Best iteration is:
[10000]	training's l1: 0.132999	valid_1's l1: 0.514307
CV mean score: -0.6653, std: 0.0015.
Training of type 7
Fold 1 started at Sun Jun  9 16:47:37 2019
Training until validation scores don't improve for 200 rounds.
[500]	training's l1: 0.123994	valid_1's l1: 0.225977
[1000]	training's l1: 0.0733401	valid_1's l1: 0.215576
[1500]	training's l1: 0.0468084	valid_1's l1: 0.211943
[2000]	training's l1: 0.0312781	valid_1's l1: 0.210149
[2500]	training's l1: 0.0213837	valid_1's l1: 0.209178
[3000]	training's l1: 0.0148414	valid_1's l1: 0.208638
[3500]	training's l1: 0.0105434	valid_1's l1: 0.20832
[4000]	training's l1: 0.00768349	valid_1's l1: 0.208135
[4500]	training's l1: 0.00576789	valid_1's l1: 0.208026
[5000]	training's l1: 0.0

Unnamed: 0,id,scalar_coupling_constant
0,4658147,11.224702
1,4658148,192.232365
2,4658149,7.763173
3,4658150,190.803494
4,4658151,9.660514


In [42]:
ll

total 66356
-rw-rw-r-- 1 kenichi.matsui    11215 Jun  6 01:31 feature_eng_001.ipynb
-rw-rw-r-- 1 kenichi.matsui   273097 Jun  7 12:03 predict_v001_005.ipynb
-rw-rw-r-- 1 kenichi.matsui 67136431 Jun  9 17:02 submission_t.csv
-rw-rw-r-- 1 kenichi.matsui   387558 Jun  6 01:31 train_v001_001.ipynb
-rw-r--r-- 1 kenichi.matsui   133413 Jun  9 17:01 Using meta-features to improve model.ipynb


In [None]:
print(1)

In [None]:
# 

# sandbox

## Model with oof feature

In [None]:
# params = {'num_leaves': 128,
#           'min_child_samples': 79,
#           'objective': 'regression',
#           'max_depth': 9,
#           'learning_rate': 0.2,
#           "boosting_type": "gbdt",
#           "subsample_freq": 1,
#           "subsample": 0.9,
#           "bagging_seed": 11,
#           "metric": 'mae',
#           "verbosity": -1,
#           'reg_alpha': 0.1,
#           'reg_lambda': 0.3,
#           'colsample_bytree': 1.0
#          }
# result_dict_lgb2 = train_model_regression(X=X, X_test=X_test, y=y, params=params, folds=folds, model_type='lgb', eval_metric='group_mae', plot_feature_importance=True,
#                                                       verbose=500, early_stopping_rounds=200, n_estimators=1500)

## Deeper model

In [None]:
# params = {'num_leaves': 128,
#           'min_child_samples': 79,
#           'objective': 'regression',
#           'max_depth': 9,
#           'learning_rate': 0.2,
#           "boosting_type": "gbdt",
#           "subsample_freq": 1,
#           "subsample": 0.9,
#           "bagging_seed": 11,
#           "metric": 'mae',
#           "verbosity": -1,
#           'reg_alpha': 0.1,
#           'reg_lambda': 0.3,
#           'colsample_bytree': 1.0
#          }
# result_dict_lgb2 = train_model_regression(X=X, X_test=X_test, y=y, params=params, folds=folds, model_type='lgb', eval_metric='group_mae', plot_feature_importance=True,
#                                                       verbose=500, early_stopping_rounds=200, n_estimators=3000)

In [None]:
# sub['scalar_coupling_constant'] = result_dict_lgb2['prediction']
# sub.to_csv('submission.csv', index=False)
# sub.head()

## Plot oof predictions vs target

In [None]:
plot_data = pd.DataFrame(y)
plot_data.index.name = 'id'
plot_data['yhat'] = result_dict_lgb3['oof']
plot_data['type'] = lbl.inverse_transform(X['type'])

def plot_oof_preds(ctype, llim, ulim):
        plt.figure(figsize=(6,6))
        sns.scatterplot(x='scalar_coupling_constant',y='yhat',
                        data=plot_data.loc[plot_data['type']==ctype,
                        ['scalar_coupling_constant', 'yhat']]);
        plt.xlim((llim, ulim))
        plt.ylim((llim, ulim))
        plt.plot([llim, ulim], [llim, ulim])
        plt.xlabel('scalar_coupling_constant')
        plt.ylabel('predicted')
        plt.title(f'{ctype}', fontsize=18)
        plt.show()

plot_oof_preds('1JHC', 0, 250)
plot_oof_preds('1JHN', 0, 100)
plot_oof_preds('2JHC', -50, 50)
plot_oof_preds('2JHH', -50, 50)
plot_oof_preds('2JHN', -25, 25)
plot_oof_preds('3JHC', -25, 100)
plot_oof_preds('3JHH', -20, 20)
plot_oof_preds('3JHN', -15, 15)