In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm_notebook as tqdm
import gc
import time

from sklearn.model_selection import KFold, GroupKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

In [None]:
# --- Run configuration -------------------------------------------------------
type_ = '1JHC'       # coupling type handled by this run; selects the input/output file names
fc_predict = False   # when True, a first-stage model on train['fc'] is fitted before the main model

# --- Inputs ------------------------------------------------------------------
train = pd.read_csv(f'./feature_engineering/feature_output/each_type/train_{type_}.csv')
test = pd.read_csv(f'./feature_engineering/feature_output/each_type/test_{type_}.csv')
train_geo = pd.read_csv(f'./feature_engineering/feature_output/each_type_geo/train_geo_{type_}.csv')
test_geo = pd.read_csv(f'./feature_engineering/feature_output/each_type_geo/test_geo_{type_}.csv')

# Keep the row ids now; the prediction columns are attached to these frames at the end.
submission = pd.DataFrame(test['id'])
oof_submission = pd.DataFrame(train['id'])

# Previously computed predictions, added as an extra feature.
# NOTE(review): column assignment aligns on the index, so this assumes the prediction
# files list rows in exactly the same order as the train/test files -- confirm.
fc_train = pd.read_csv(f'./fc_predict/tuned/oof_prediction_{type_}.csv')
fc_test = pd.read_csv(f'./fc_predict/tuned/submission_{type_}.csv')
test['fc_predicted'] = fc_test['scalar_coupling_constant']
train['fc_predicted'] = fc_train['oof']

# --- Geometry columns removed per coupling type --------------------------------
# Keyed by the first character of `type_` (presumably the number of bonds between
# the two atoms -- TODO confirm). Any other leading character drops nothing.
geometry_cols_to_drop = {
    '1': ['Angle', 'cosA', 'cos2A', 'Torsion', 'cosT', 'cos2T'],
    '2': ['Torsion', 'cosT', 'cos2T'],
    '3': ['Angle', 'cosA', 'cos2A'],
}
unused_geometry_cols = geometry_cols_to_drop.get(type_[0])
if unused_geometry_cols is not None:
    train = train.drop(unused_geometry_cols, axis=1)
    test = test.drop(unused_geometry_cols, axis=1)
 

    
def drop_all_null_cols(df_train, df_test):
    """Drop 'potential_*' feature columns that are entirely null in either frame.

    Candidate names have the form ``potential_<atom>_<i>_<atom_idx>`` with
    ``atom`` in H/C/N/O/F, ``i`` in 0..19 and ``atom_idx`` in atom_0/atom_1.
    Only candidates present in ``df_train`` are inspected; a candidate is
    dropped from BOTH frames as soon as it is all-null in one of them.

    Args:
        df_train: training DataFrame.
        df_test: test DataFrame; must contain every candidate column that
            exists in ``df_train`` (a missing one raises ``KeyError``).

    Returns:
        Tuple ``(df_train, df_test)`` of new frames without the dropped
        columns. The inputs are not modified.
    """
    train_columns = df_train.columns

    # Same enumeration order as three nested loops: atom_idx, then atom, then i.
    candidates = [
        f'potential_{atom}_{i}_{atom_idx}'
        for atom_idx in ['atom_0', 'atom_1']
        for atom in ['H', 'C', 'N', 'O', 'F']
        for i in range(20)
    ]

    empty_cols = [
        name
        for name in candidates
        if name in train_columns
        and (df_train[name].isnull().all() or df_test[name].isnull().all())
    ]

    return df_train.drop(empty_cols, axis=1), df_test.drop(empty_cols, axis=1)

# Remove 'potential_*' columns that are entirely null in train or in test.
train, test = drop_all_null_cols(train, test)

# Fill the remaining gaps in the base features with 0. This runs BEFORE the geometry
# merge, so missing values coming from the geometry tables are left as NaN.
train = train.fillna(0)
test = test.fillna(0)

# Attach the geometry features. The id frames for the OOF / submission files were
# built from the pre-merge rows, so a left merge that changes the row count (duplicate
# keys in the geometry table) would misalign the predictions -- fail early instead.
n_train_rows, n_test_rows = len(train), len(test)
merge_keys = ['id', 'atom_index_0', 'atom_index_1']
train = pd.merge(train, train_geo, on=merge_keys, how='left')
test = pd.merge(test, test_geo, on=merge_keys, how='left')
assert len(train) == n_train_rows, 'geometry merge changed the number of train rows'
assert len(test) == n_test_rows, 'geometry merge changed the number of test rows'

# Identifier columns are not features. 'molecule_name' is deliberately kept in X
# (train_model uses it to build the CV groups) but removed from X_test.
drop_cols = ['id', 'atom_index_0', 'atom_index_1', 'type']
X = train.drop(drop_cols + ['scalar_coupling_constant', 'fc'], axis=1)

# NOTE(review): the original cell assigned y = train['scalar_coupling_constant'] and
# immediately overwrote it with train['fc'], so the model has in effect been trained on
# 'fc'. That effective behaviour is kept and made explicit here; switch `target_col`
# to 'scalar_coupling_constant' if that was the intended target.
target_col = 'fc'
y = train[target_col]
X_test = test.drop(drop_cols + ['molecule_name'], axis=1)

if fc_predict:
    # Target of the optional first-stage model.
    y_fc = train['fc']

# The full frames are no longer needed; release the memory before training.
del train, test
gc.collect()

In [None]:
# Sanity check of the (rows, columns) of both feature matrices. X still carries
# 'molecule_name' at this point, X_test does not, so X is expected to be one column
# wider (assuming train and test otherwise share the same feature columns).
train_shape, test_shape = X.shape, X_test.shape
print(train_shape, test_shape)

In [None]:
def mean_log_mae(y_true, y_pred, floor=1e-9):
    """Return the natural log of the mean absolute error, clipped from below.

    Args:
        y_true: observed values (pandas Series or NumPy array).
        y_pred: predicted values, same length as ``y_true``.
        floor: lower bound applied to the MAE before taking the log, so that a
            perfect prediction yields ``log(floor)`` instead of ``-inf``.

    Returns:
        ``log(max(MAE, floor))`` as a float.

    Notes:
        ``np.abs`` is used instead of the pandas-only ``.abs()`` method so that
        plain NumPy arrays are accepted as well. For pandas inputs the result is
        unchanged: ``np.abs`` returns a Series and ``.mean()`` still skips NaN.
    """
    abs_error = np.abs(y_true - y_pred)
    mae = abs_error.mean()
    return np.log(max(mae, floor))


def train_model(X, y, X_test, params, is_plot):
    """Fit one LightGBM regressor per GroupKFold split and aggregate the results.

    Rows are grouped by ``X['molecule_name']``, so all rows of one molecule fall
    into the same fold. The group column is used only for splitting and is not
    passed to the model.

    Args:
        X: training features as a DataFrame; must contain a 'molecule_name'
            column. The caller's frame is NOT modified.
        y: training target as a pandas Series aligned positionally with ``X``.
        X_test: test features; expected to have the same columns as ``X``
            without 'molecule_name'.
        params: keyword arguments forwarded to ``lgb.LGBMRegressor``.
        is_plot: when True, draw a bar plot of the fold-averaged importances.

    Returns:
        dict with the keys
            'oof'                -- out-of-fold predictions for every row of ``X``
            'prediction'         -- test predictions averaged over the folds
            'scores'             -- per-fold ``mean_log_mae`` on the validation part
            'mae'                -- per-fold best validation MAE reported by LightGBM
            'feature_importance' -- DataFrame(feature, importance), averaged over
                                    folds and sorted by descending importance
    """
    folds = GroupKFold(n_splits=5)

    # Encode the molecule names as integer group ids. The encoded values are kept in a
    # local array instead of being written back into X, so the caller's DataFrame keeps
    # its original 'molecule_name' column and the function can be called repeatedly.
    groups = LabelEncoder().fit_transform(list(X['molecule_name'].values))
    X = X.drop('molecule_name', axis=1)
    features = list(X.columns)

    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    mae_scores = []
    fold_importances = []

    for fold, (train_idx, val_idx) in enumerate(folds.split(X, y, groups)):
        print(f'\nFold {fold + 1} started at {time.ctime()}')

        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = lgb.LGBMRegressor(**params)

        # NOTE(review): `early_stopping_rounds` and `verbose` as fit() keyword arguments
        # were removed in LightGBM 4.0; if the environment is upgraded they have to be
        # replaced by the lgb.early_stopping / lgb.log_evaluation callbacks.
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  eval_metric='mae',
                  early_stopping_rounds=200,
                  verbose=1000)

        y_pred_val = model.predict(X_val)
        y_pred = model.predict(X_test)
        oof[val_idx] = y_pred_val.reshape(-1)

        scores.append(mean_log_mae(y_val, y_pred_val))
        # 'valid_1' is the second eval_set entry (the validation fold); 'l1' is
        # LightGBM's name for the MAE metric.
        mae_scores.append(model.best_score_['valid_1']['l1'])

        prediction += y_pred

        fold_importances.append(pd.DataFrame({
            'feature': features,
            'importance': model.feature_importances_,
            'fold': fold + 1,
        }))

    # Average the summed test predictions over the folds.
    prediction /= folds.n_splits

    print('\nCV mean mae: {0: .4f}, std: {1: .4f}.'.format(np.mean(mae_scores), np.std(mae_scores)))
    print('\nCV mean score: {0: .4f}, std: {1: .4f}.'.format(np.mean(scores), np.std(scores)))

    # Concatenate once instead of growing a frame inside the loop, then average per feature.
    feature_importance = pd.concat(fold_importances, axis=0)
    best_features = (
        feature_importance[['feature', 'importance']]
        .groupby('feature')
        .mean()
        .sort_values(by='importance', ascending=False)
        .reset_index()
    )

    result_dict = {
        'oof': oof,
        'prediction': prediction,
        'scores': scores,
        'mae': mae_scores,
        'feature_importance': best_features,
    }

    if is_plot:
        plt.figure(figsize=(16,128))
        sns.barplot(x='importance', y='feature', data=best_features)
        plt.title('LGB Features (avg over folds)')

    return result_dict

In [None]:
# Hyper-parameters forwarded to lgb.LGBMRegressor; everything else stays at the
# library defaults.
params = dict(n_estimators=5000, metric='mae')

# Optional first stage: fit a model on y_fc and feed its out-of-fold / test
# predictions to the main model as the 'fc_predicted' feature.
if fc_predict:
    fc_result = train_model(X, y_fc, X_test, params, is_plot=False)
    X['fc_predicted'] = fc_result['oof']
    X_test['fc_predicted'] = fc_result['prediction']
    print(X.shape, X_test.shape)

# Main model on the target y; also draws the feature-importance plot.
result_dict = train_model(X, y, X_test, params, is_plot=True)

In [None]:
# Test-set predictions, written under the 'scalar_coupling_constant' column name
# regardless of which target the model was trained on.
test_predictions = result_dict['prediction']
submission['scalar_coupling_constant'] = test_predictions
submission.to_csv(f'submission_{type_}.csv', index=False)

# Fold-averaged feature importances.
importance_table = result_dict['feature_importance']
importance_table.to_csv(f'feature_importance_{type_}.csv', index=False)

# Out-of-fold predictions for the training rows.
oof_predictions = result_dict['oof']
oof_submission['oof'] = oof_predictions
oof_submission.to_csv(f'oof_prediction_{type_}.csv', index=False)