# LGBM001　ベースラインモデル
- オプティカルフローの予測値を使いたい。
- フローの速さを用いたい。
- 過去の発電量をid指定で用いたい。
- Interpolated_Dataset(Dataset)などをutils.pyにまとめたい。

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
import random
from sklearn.metrics import mean_squared_error
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.loss")
import lightgbm as lgb
import utils


class CFG:
    make_folder = True
    note_num = "StackingOpt/LGBM001"
    seed = 42


OUTPUT_DIR = f'H:/study/output/{CFG.note_num}/'

if CFG.make_folder:
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

class Interpolated_Dataset(Dataset):
    def __init__(self, dates, input_dir):
        self.input_dir = input_dir
        self.date_list = dates

    def __len__(self):
        return len(self.date_list)

    def __getitem__(self, idx):
        target_date = self.date_list[idx]
        input_date = (datetime.strptime(target_date, "%Y%m%d%H%M") - timedelta(minutes=30)).strftime("%Y%m%d%H%M")

        input_csv = os.path.join(self.input_dir, f"interpolated_mesh_data_{input_date}.csv")
        target_csv = os.path.join(self.input_dir, f"interpolated_mesh_data_{target_date}.csv")

        input_data = pd.read_csv(input_csv, index_col=0)
        target_data = pd.read_csv(target_csv, index_col=0)

        return input_data, target_data

def prepare_data_for_interpolated(train_dates, valid_dates, test_dates, input_dir):
    train_dataset = Interpolated_Dataset(train_dates, input_dir)
    valid_dataset = Interpolated_Dataset(valid_dates, input_dir)
    test_dataset = Interpolated_Dataset(test_dates, input_dir)

    def get_data_from_dataset(dataset):
        X_data, y_data = [], []
        for i in range(len(dataset)):
            input_data, target_data = dataset[i]
            X_data.append(input_data.values.flatten())
            y_data.append(target_data.values.flatten())
        return np.array(X_data), np.array(y_data)

    X_train, y_train = get_data_from_dataset(train_dataset)
    X_valid, y_valid = get_data_from_dataset(valid_dataset)
    X_test, y_test = get_data_from_dataset(test_dataset)

    return X_train, y_train, X_valid, y_valid, X_test, y_test

def time_series_split(start_date, end_date, input_dir, n_splits):
    start_date = datetime.strptime(start_date, "%Y%m%d%H%M")
    end_date = datetime.strptime(end_date, "%Y%m%d%H%M")
    delta = (end_date - start_date) // n_splits

    splits = []
    for i in range(n_splits):
        train_start = start_date + delta * i
        train_end = train_start + delta
        valid_start = train_end
        valid_end = valid_start + delta

        splits.append(((train_start.strftime("%Y%m%d%H%M"), train_end.strftime("%Y%m%d%H%M")),
                       (valid_start.strftime("%Y%m%d%H%M"), valid_end.strftime("%Y%m%d%H%M"))))

    return splits

def compute_mse(y_true, y_pred):
    mse = np.mean((y_true - y_pred) ** 2)
    return mse

def create_time_series_data(start_date, end_date):

    start_date = datetime.strptime(start_date, "%Y%m%d%H%M")
    end_date = datetime.strptime(end_date, "%Y%m%d%H%M")
    delta = timedelta(minutes=30)

    data_list = []
    date = start_date
    while date <= end_date:
        if date.hour > 6 and date.hour < 18:
            data_list.append(date.strftime("%Y%m%d%H%M"))
        elif date.hour == 18 and date.minute == 0:
            data_list.append(date.strftime("%Y%m%d%H%M"))
        date += delta
    
    return data_list

def train_lgbm(X_train, y_train, X_valid, y_valid, lgb_params):
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
    model = lgb.train(lgb_params, lgb_train , valid_sets=lgb_valid, early_stopping_rounds=10, keep_training_booster=True)
    return model


def set_seeds(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

def main():
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mse',
        'num_boost_round':100000,
        'early_stopping_rounds':100,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0,
        'device_type': 'gpu',
        'seed':CFG.seed
    }

    # 入力ディレクトリと日付の設定
    input_dir = "H:\study\output\StackingOpt\EDA004"
    train_start_date = "201406010000"
    train_end_date = "201407010000"
    test_start_date = "201407010000"
    test_end_date = "201408010000"

    # train_start_date = "201308150000"
    # train_end_date = "20130817000"
    # test_start_date = "201308170000"
    # test_end_date = "20130818000"

    # 時系列の分割設定
    n_splits = 3
    train_data_list = create_time_series_data(train_start_date,train_end_date)
    train_date_list_split = np.array_split(train_data_list, n_splits)

    test_dates = create_time_series_data(test_start_date,test_end_date)



    # 変数の初期化
    oof_preds = None
    oof_true = None
    test_preds_ensemble = None


    #特徴量
    features = ['two_weeks_max', 'id', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'hour_sin', 'hour_cos', 'year_sin', 'year_cos',\
                'prev_30m_generation', 'id_lat', 'id_lng', 'id_lat_mesh', 'id_lng_mesh', 'pvrate', 'observed_max']

    use_interpolated_features = True

    target = 'generation'

    #データセットの読み込み
    unique_id = ['10000095', '10000269', '1020000002', '1110000001', '1110000010', '1110000011', '1110000012', '1110000013', '1110000014', '1110000015', '1160000025', '1160000090', '1160000091', '1160000182', '1160000185', '1160000253', '1160000387', '1160000402', '1160000419', '1160000420', '1160000423', '1270000026', '1280000048', '1550000001', '1650000004', '1680000001', '1680000002', '1680000003', '1680000004', '1680000010', '1680000017', '1680000021', '1680000033', '1680000047', '1680000054', '1680000057', '1680000063', '1680000067', '1680000080', '1680000081', '1680000097', '1680000107', '1680000108', '1680000112', '1680000151', '1680000152', '1680000213', '1680000216', '1680000217', '1680000218', '1680000223', '1680000228', '1680000285', '1680000287', '1680000327', '1680000364', '2220000001', '2220000002', '2220000003', '2730000001', '2910000002', '3000000007', '3000000012', '3000000042', '5000000044', '5000000045', '6000000016', '6000000017', '6060000016', '6060000017', '6060000018', '6170000016', '6170000123', '6170000124', '6170000125', '6620000065', '6620000066', '6620000088', '6620000089', '6620000111', '6620000117', '6620000118', '6620000121', '6620000122', '6620000123', '6620000124', '6620000131', '6620000132', '6910000180', '6910000198', '6910000200', '6910000206', '6910000216', '6910000217', '6910000239', '6910000240', '6910000249', '6910000250', '6910000421', '6910000424', '6910000425', '6910000469', '6910000470']
    unique_id = [int(i) for i in unique_id]
    to_unique_id = [str(num).zfill(10) for num in unique_id]
    df = utils.get_preprocessing_data(to_unique_id)
    df.drop_duplicates(inplace=True)
    df.drop_duplicates(subset=["id","datetime"],inplace=True)
    df = df.groupby('id').apply(utils.prev_30m_generation).reset_index(level=0, drop=True)
    id_all_data = pd.read_csv("H:\study\preprocessing_data\id_all_data.csv",encoding='shift_jis')
    df = df.merge(id_all_data,on=["id"],how="left")
    df.dropna(subset=["year"],inplace=True) #utils.prev_30m_generationで30分間隔のデータセットになっているため欠損が出ている。
    df["generation"] /= df["observed_max"]

    set_seeds()
    for fold in range(len(train_date_list_split)):
        print(f"\nFold {fold + 1}")
        train_dates = np.concatenate(train_date_list_split[:fold] + train_date_list_split[fold+1:])
        valid_dates = train_date_list_split[fold]

        X_train, y_train = df.loc[df.datetime.isin(train_dates),features+["datetime"]],df.loc[df.datetime.isin(train_dates),target]
        X_valid, y_valid = df.loc[df.datetime.isin(valid_dates),features+["datetime"]],df.loc[df.datetime.isin(valid_dates),target]
        X_test, y_test = df.loc[df.datetime.isin(test_dates),features+["datetime"]],df.loc[df.datetime.isin(test_dates),target]

        if use_interpolated_features:
            X_train_interpolated, y_train_interpolated, X_valid_interpolated, y_valid_interpolated,\
                X_test_interpolated, y_test_interpolated = prepare_data_for_interpolated(train_dates, valid_dates, test_dates, input_dir)
            
            X_train_interpolated = pd.DataFrame(X_train_interpolated)
            X_train_interpolated["datetime"] = pd.to_datetime(train_dates, format="%Y%m%d%H%M")

            X_valid_interpolated = pd.DataFrame(X_valid_interpolated)
            X_valid_interpolated["datetime"] = pd.to_datetime(valid_dates, format="%Y%m%d%H%M")

            X_test_interpolated = pd.DataFrame(X_test_interpolated)
            X_test_interpolated["datetime"] = pd.to_datetime(test_dates, format="%Y%m%d%H%M")

            X_train = X_train.merge(X_train_interpolated,on=["datetime"],how="left")
            X_valid = X_valid.merge(X_valid_interpolated,on=["datetime"],how="left")
            X_test = X_test.merge(X_test_interpolated,on=["datetime"],how="left")

        X_train.drop("datetime",axis=1,inplace=True)
        X_valid.drop("datetime",axis=1,inplace=True)
        X_test.drop("datetime",axis=1,inplace=True)

        # Train LightGBM model
        model = train_lgbm(X_train, y_train, X_valid, y_valid, lgb_params)
        save_path = OUTPUT_DIR + f"/lgbm_fold{fold}.txt"
        model.save_model(save_path)

        # Evaluate model
        valid_preds = model.predict(X_valid, num_iteration=model.best_iteration)
        mse = compute_mse(y_valid, valid_preds)
        print(f"Fold {fold + 1} MSE: {mse}")

        # Make predictions for the test set
        test_preds = model.predict(X_test, num_iteration=model.best_iteration)

        if oof_preds is None:
            oof_preds = valid_preds
            oof_true = y_valid
            test_preds_ensemble = test_preds
            test_true = y_test  # 追加：テストセットの真のラベルを更新
            oof_dates = valid_dates  # 追加：Out-of-Fold 予測の日付を更新
        else:
            oof_preds = np.concatenate([oof_preds, valid_preds])
            oof_true = np.concatenate([oof_true, y_valid])
            test_preds_ensemble += test_preds
            oof_dates = np.concatenate([oof_dates, valid_dates])  # 追加：Out-of-Fold 予測の日付を更新

        #break

    test_preds_ensemble /= len(train_date_list_split)

    oof_mse = compute_mse(oof_true, oof_preds)
    test_mse = compute_mse(test_true, test_preds_ensemble)
    print(f"\nOverall Out-of-Fold MSE: {oof_mse:.4f}")
    print(f"\nOverall Test MSE: {test_mse:.4f}")

    np.save(f"{OUTPUT_DIR}/oof_preds.npy", oof_preds)
    np.save(f"{OUTPUT_DIR}/oof_true.npy", oof_true)
    np.save(f"{OUTPUT_DIR}/test_preds_ensemble.npy", test_preds_ensemble)
    np.save(f"{OUTPUT_DIR}/test_true.npy", test_true)

    np.save(f"{OUTPUT_DIR}/oof_dates.npy", oof_dates)
    np.save(f"{OUTPUT_DIR}/test_dates_ensemble", test_dates)



In [2]:
if __name__ == "__main__":
    main()


Fold 1




[1]	valid_0's l2: 0.069464
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l2: 0.0640706
[3]	valid_0's l2: 0.0592416
[4]	valid_0's l2: 0.0548928
[5]	valid_0's l2: 0.0508525
[6]	valid_0's l2: 0.0472251
[7]	valid_0's l2: 0.0439941
[8]	valid_0's l2: 0.0411708
[9]	valid_0's l2: 0.0382833
[10]	valid_0's l2: 0.0357152
[11]	valid_0's l2: 0.0334911
[12]	valid_0's l2: 0.0313768
[13]	valid_0's l2: 0.0294112
[14]	valid_0's l2: 0.0276236
[15]	valid_0's l2: 0.0259901
[16]	valid_0's l2: 0.0245021
[17]	valid_0's l2: 0.0231147
[18]	valid_0's l2: 0.0219659
[19]	valid_0's l2: 0.0208087
[20]	valid_0's l2: 0.0198708
[21]	valid_0's l2: 0.0188883
[22]	valid_0's l2: 0.0180028
[23]	valid_0's l2: 0.0172802
[24]	valid_0's l2: 0.0164539
[25]	valid_0's l2: 0.0157784
[26]	valid_0's l2: 0.0151562
[27]	valid_0's l2: 0.0145697
[28]	valid_0's l2: 0.0140151
[29]	valid_0's l2: 0.0135582
[30]	valid_0's l2: 0.0131273
[31]	valid_0's l2: 0.0126841
[32]	valid_0's l2: 0.0122594
[33]	valid_0's l2: 0



[1]	valid_0's l2: 0.100419
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l2: 0.0926587
[3]	valid_0's l2: 0.0854384
[4]	valid_0's l2: 0.0789477
[5]	valid_0's l2: 0.0729119
[6]	valid_0's l2: 0.06728
[7]	valid_0's l2: 0.0622528
[8]	valid_0's l2: 0.057742
[9]	valid_0's l2: 0.053637
[10]	valid_0's l2: 0.0498043
[11]	valid_0's l2: 0.0462743
[12]	valid_0's l2: 0.0432206
[13]	valid_0's l2: 0.0403875
[14]	valid_0's l2: 0.0377581
[15]	valid_0's l2: 0.0353976
[16]	valid_0's l2: 0.033068
[17]	valid_0's l2: 0.0309932
[18]	valid_0's l2: 0.0291949
[19]	valid_0's l2: 0.0277335
[20]	valid_0's l2: 0.0264817
[21]	valid_0's l2: 0.0252959
[22]	valid_0's l2: 0.0241128
[23]	valid_0's l2: 0.0231087
[24]	valid_0's l2: 0.022239
[25]	valid_0's l2: 0.0211682
[26]	valid_0's l2: 0.0202906
[27]	valid_0's l2: 0.0193997
[28]	valid_0's l2: 0.0185885
[29]	valid_0's l2: 0.017931
[30]	valid_0's l2: 0.0172712
[31]	valid_0's l2: 0.0166646
[32]	valid_0's l2: 0.0160984
[33]	valid_0's l2: 0.015570



[1]	valid_0's l2: 0.0549496
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l2: 0.0516601
[3]	valid_0's l2: 0.0484436
[4]	valid_0's l2: 0.0456724
[5]	valid_0's l2: 0.0430367
[6]	valid_0's l2: 0.0408253
[7]	valid_0's l2: 0.038723
[8]	valid_0's l2: 0.0370856
[9]	valid_0's l2: 0.0353534
[10]	valid_0's l2: 0.0338137
[11]	valid_0's l2: 0.0325335
[12]	valid_0's l2: 0.0312016
[13]	valid_0's l2: 0.0300296
[14]	valid_0's l2: 0.0289329
[15]	valid_0's l2: 0.0279297
[16]	valid_0's l2: 0.0270393
[17]	valid_0's l2: 0.0262031
[18]	valid_0's l2: 0.0254001
[19]	valid_0's l2: 0.0247897
[20]	valid_0's l2: 0.0241931
[21]	valid_0's l2: 0.0237286
[22]	valid_0's l2: 0.0231483
[23]	valid_0's l2: 0.0227895
[24]	valid_0's l2: 0.0223575
[25]	valid_0's l2: 0.0220378
[26]	valid_0's l2: 0.0216828
[27]	valid_0's l2: 0.021364
[28]	valid_0's l2: 0.021079
[29]	valid_0's l2: 0.0208362
[30]	valid_0's l2: 0.0204818
[31]	valid_0's l2: 0.0202269
[32]	valid_0's l2: 0.0200014
[33]	valid_0's l2: 0.0

In [5]:
0.0084**(1/2),0.011**(1/2)

(0.0916515138991168, 0.10488088481701516)

# 出力確認用

In [2]:
oof_dates = np.load(f"{OUTPUT_DIR}/oof_dates.npy")
oof_true = np.load(f"{OUTPUT_DIR}/oof_true.npy")
oof_preds = np.load(f"{OUTPUT_DIR}/oof_preds.npy")

In [3]:
scores = []
for i in range(len(oof_dates)):
    mesh = pd.read_csv(f"H:\study\output\StackingOpt\EDA004\interpolated_mesh_data_{oof_dates[i]}.csv",index_col=0)
    t = pd.DataFrame(oof_true[i][0]).to_numpy().reshape(-1)-mesh.to_numpy().reshape(-1)
    scores.append(t.sum())
    

In [4]:
np.min(scores),np.max(scores),np.sum(scores)

(-8.933830257795705e-07, 8.025684375745357e-06, 0.000391706833073744)

In [5]:
oof_mse = compute_mse(oof_true, oof_preds)
oof_mse

0.029770939

In [6]:
test_preds_ensemble = np.load(f"{OUTPUT_DIR}/test_preds_ensemble.npy")
test_dates = np.load(f"{OUTPUT_DIR}/test_dates_ensemble.npy")

test_true = np.load(f"{OUTPUT_DIR}/test_true.npy")

In [7]:
score = []
mesh_list = []
for i in range(len(test_dates)):
    mesh = pd.read_csv(f"H:\study\output\StackingOpt\EDA004\interpolated_mesh_data_{test_dates[i]}.csv",index_col=0)
    mesh = mesh.to_numpy().reshape(1, 15, 12)
    test = test_true[i].reshape(1, 15, 12)
    score.append(np.sum(mesh-test))
    mesh_list.append(mesh)

test_true_mesh = np.concatenate(mesh_list, axis=0)    

print(np.min(scores),np.max(scores),np.sum(scores))

test_mse = compute_mse(test_true, test_preds_ensemble)
test_mse_mesh = compute_mse(test_true_mesh, test_preds_ensemble)
test_mse-test_mse_mesh 

-8.933830257795705e-07 8.025684375745357e-06 0.000391706833073744


-0.0929877139688046

In [8]:
test_preds_ensemble = np.load(f"{OUTPUT_DIR}/test_preds_ensemble.npy")

In [9]:
test_preds_ensemble[0].shape

(1, 15, 12)

# テストデータの整理

In [10]:
test_preds_ensemble = np.load(f"{OUTPUT_DIR}/test_preds_ensemble.npy")
test_dates = np.load(f"{OUTPUT_DIR}/test_dates_ensemble.npy")

test_true = np.load(f"{OUTPUT_DIR}/test_true.npy")

In [11]:
lat = pd.read_csv(r"H:\study\preprocessing_data\3_mesh_place\lati_zenkoku.csv", header=None, index_col=None)
lon = pd.read_csv(r"H:\study\preprocessing_data\3_mesh_place\long_zenkoku.csv", header=None, index_col=None)
id_all_data = pd.read_csv(r"H:\study\preprocessing_data\id_all_data.csv", encoding="shift_jis")

unique_id = ['10000095', '10000269', '1020000002', '1110000001', '1110000010', '1110000011', '1110000012', '1110000013', '1110000014', '1110000015', '1160000025', '1160000090', '1160000091', '1160000182', '1160000185', '1160000253', '1160000387', '1160000402', '1160000419', '1160000420', '1160000423', '1270000026', '1280000048', '1550000001', '1650000004', '1680000001', '1680000002', '1680000003', '1680000004', '1680000010', '1680000017', '1680000021', '1680000033', '1680000047', '1680000054', '1680000057', '1680000063', '1680000067', '1680000080', '1680000081', '1680000097', '1680000107', '1680000108', '1680000112', '1680000151', '1680000152', '1680000213', '1680000216', '1680000217', '1680000218', '1680000223', '1680000228', '1680000285', '1680000287', '1680000327', '1680000364', '2220000001', '2220000002', '2220000003', '2730000001', '2910000002', '3000000007', '3000000012', '3000000042', '5000000044', '5000000045', '6000000016', '6000000017', '6060000016', '6060000017', '6060000018', '6170000016', '6170000123', '6170000124', '6170000125', '6620000065', '6620000066', '6620000088', '6620000089', '6620000111', '6620000117', '6620000118', '6620000121', '6620000122', '6620000123', '6620000124', '6620000131', '6620000132', '6910000180', '6910000198', '6910000200', '6910000206', '6910000216', '6910000217', '6910000239', '6910000240', '6910000249', '6910000250', '6910000421', '6910000424', '6910000425', '6910000469', '6910000470']
unique_id = [int(i) for i in unique_id]
to_unique_id = [str(num).zfill(10) for num in unique_id]
id_data = id_all_data[id_all_data.id.isin(unique_id)].reset_index(drop=True)

min_lat,max_lat = id_data.id_lat.min(),id_data.id_lat.max()
min_lng,max_lng = id_data.id_lng.min(),id_data.id_lng.max()
userow = (lon.iloc[:,0]>=min_lng)&(lon.iloc[:,0]<=max_lng)
usecol = (lat.iloc[0]>=min_lat)&(lat.iloc[0]<=max_lat)

lat = lat.loc[userow,usecol]
lon = lon.loc[userow,usecol]

In [12]:
preds = test_preds_ensemble.reshape(-1)
dates_np = np.repeat(test_dates, lat.shape[0]*lat.shape[1])

lat_np = lat.to_numpy().reshape(-1)
lat_np_repeated = np.tile(lat_np, (len(test_dates)))

lon_np = lon.to_numpy().reshape(-1)
lon_np_repeated = np.tile(lon_np, (len(test_dates)))

In [13]:
preds = pd.DataFrame(zip(dates_np,lat_np_repeated,lon_np_repeated,preds),columns=["datetime","id_lat_mesh","id_lng_mesh","pred"])
preds["datetime"] = pd.to_datetime(preds["datetime"], format='%Y%m%d%H%M')

In [14]:
preds

Unnamed: 0,datetime,id_lat_mesh,id_lng_mesh,pred
0,2014-07-01 07:00:00,35.66,139.90,0.383354
1,2014-07-01 07:00:00,35.68,139.90,0.480452
2,2014-07-01 07:00:00,35.70,139.90,0.453691
3,2014-07-01 07:00:00,35.72,139.90,0.501307
4,2014-07-01 07:00:00,35.74,139.90,0.503702
...,...,...,...,...
128335,2014-07-31 18:00:00,35.80,140.18,0.707020
128336,2014-07-31 18:00:00,35.82,140.18,0.705699
128337,2014-07-31 18:00:00,35.84,140.18,0.700958
128338,2014-07-31 18:00:00,35.86,140.18,0.696757


In [15]:
import utils 

df = utils.get_preprocessing_data(to_unique_id)
df["datetime"] = pd.to_datetime(df["datetime"])
df = df.merge(id_data[["id","id_lat_mesh","id_lng_mesh","pvrate","observed_max"]],on=["id"],how="left")

date_range = pd.to_datetime(test_dates, format='%Y%m%d%H%M')
df = df[df.datetime.isin(date_range)]

In [16]:
df = df.merge(preds,on=["datetime","id_lat_mesh","id_lng_mesh"],how="left")

df["pred*two_weeks_max"] = df["pred"]*df["two_weeks_max"]
df["nv*twoweeks_max"] = df["nv"]*df["two_weeks_max"]

In [17]:
df["APE"] = np.abs(df["pred*two_weeks_max"]-df["nv*twoweeks_max"])/df["observed_max"]*100

In [18]:
df["APE"].mean()

8.960476953899216

In [20]:
y_true = df["pred*two_weeks_max"] 
y_pred = df["nv*twoweeks_max"]
np.sqrt(np.mean((y_true - y_pred)**2))

1223.4059527433953

In [23]:
y_true = df["pred"]
y_pred = df["nv"]
np.mean((y_true - y_pred)**2)

0.045501480973918575