# Estimate LB noise

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm

In [None]:
# Competition training table; the cells below only use its "Pawpularity" column
# and its row count.
TRAIN_CSV = "/kaggle/input/petfinder-pawpularity-score/train.csv"
train = pd.read_csv(TRAIN_CSV)
train.head()

In [None]:
# All zero RMSE: error on the full train set when every prediction is 0.
# The root is taken explicitly with np.sqrt because the `squared=False` keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6; this form works on
# both old and new releases.
zeros_full = np.zeros(len(train))
np.sqrt(mean_squared_error(train["Pawpularity"], zeros_full))

In [None]:
# All mean RMSE: error on the full train set when every prediction is the
# train-set mean of the target.
mean_target = np.mean(train["Pawpularity"])
print("Mean target", mean_target)

# Constant prediction vector, built as an array rather than a Python list.
mean_full = np.full(len(train), mean_target)

# np.sqrt(MSE) instead of `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
np.sqrt(mean_squared_error(train["Pawpularity"], mean_full))

An all-zeros submission scores 43.50174 on the public LB.

Submitting the train mean for every photo scores 20.50574 on the public LB (see this [notebook](https://www.kaggle.com/kaushal2896/petfinder-my-initial-eda-mean-baseline)).

# Test dataset
* 6800 photos
* Public LB is scored on 25% of them (1700 photos)

Monte Carlo method:
* Choose 1700 random photos from train (without replacement)
* Calculate the RMSE of an all-zeros / train-mean prediction on that sample
* Repeat 10000 times

## All zero

In [None]:
# Monte Carlo estimate of how much the all-zeros RMSE varies between random
# subsets of the train set that are the size of the public-LB split.
N = 1700                 # photos per simulated subset
N_ITER_ZEROS = 10000     # number of random subsets to draw
SEED_ZEROS = 42          # fixed seed so Restart & Run All reproduces the figure

rng_zeros = np.random.default_rng(SEED_ZEROS)
targets_zeros = train["Pawpularity"].to_numpy()
zeros_subset = np.zeros(N)  # loop-invariant prediction vector

rmse_values = []
for _ in tqdm(range(N_ITER_ZEROS)):
    # Draw N distinct rows, matching the default of Series.sample (no replacement).
    sample = rng_zeros.choice(targets_zeros, size=N, replace=False)
    # np.sqrt(MSE) instead of `squared=False`, which scikit-learn removed in 1.6.
    rmse_values.append(np.sqrt(mean_squared_error(sample, zeros_subset)))

In [None]:
# Centre and spread of the simulated RMSE distribution (np.std uses its
# default ddof=0 here).
mc_mean, mc_std = (stat(rmse_values) for stat in (np.mean, np.std))
(mc_mean, mc_std)

In [None]:
# Histogram of the simulated all-zeros RMSE values with reference lines.
# The full-train RMSE is computed from `train` rather than hard-coded, so the
# line and its label cannot drift from the data.
PUBLIC_LB_RMSE_ZEROS = 43.50174  # public LB score of the all-zeros submission
full_train_rmse_zeros = np.sqrt(np.mean(np.square(train["Pawpularity"])))

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(rmse_values, bins=100)
ax.set_title("RMSE calculated using all zeros")
ax.set_xlabel("RMSE")
ax.set_ylabel("Number of samples")
ax.axvline(full_train_rmse_zeros, c="k",
           label=f"Full train RMSE ({full_train_rmse_zeros:.5f})")
ax.axvline(PUBLIC_LB_RMSE_ZEROS, c="r",
           label=f"Public LB RMSE ({PUBLIC_LB_RMSE_ZEROS:.5f})")
ax.axvline(mc_mean, c="g", label="Mean of sampled train RMSE")
ax.axvline(mc_mean - mc_std, c="g", linestyle="--", label="-1 std dev")
ax.axvline(mc_mean + mc_std, c="g", linestyle="--", label="+1 std dev")
ax.legend(loc="upper right");

## All mean

In [None]:
# Monte Carlo estimate of how much the train-mean RMSE varies between random
# subsets of the train set that are the size of the public-LB split.
N = 1700                # photos per simulated subset
N_ITER_MEAN = 10000     # number of random subsets to draw
SEED_MEAN = 42          # fixed seed so Restart & Run All reproduces the figure

rng_mean = np.random.default_rng(SEED_MEAN)
targets_mean = train["Pawpularity"].to_numpy()
# Loop-invariant prediction vector: the full-train mean repeated N times.
mean_subset = np.full(N, mean_target)

rmse_values = []
for _ in tqdm(range(N_ITER_MEAN)):
    # Draw N distinct rows, matching the default of Series.sample (no replacement).
    sample = rng_mean.choice(targets_mean, size=N, replace=False)
    # np.sqrt(MSE) instead of `squared=False`, which scikit-learn removed in 1.6.
    rmse_values.append(np.sqrt(mean_squared_error(sample, mean_subset)))

In [None]:
# Summary of the simulated train-mean RMSE values: average and standard
# deviation (np.std with its default ddof=0).
mc_mean, mc_std = (summarise(rmse_values) for summarise in (np.mean, np.std))
(mc_mean, mc_std)

In [None]:
# Histogram of the simulated train-mean RMSE values with reference lines.
# The train mean in the title and the full-train RMSE are derived from
# `mean_target` and `train` rather than hard-coded, so they cannot drift
# from the data.
PUBLIC_LB_RMSE_MEAN = 20.50574  # public LB score of the train-mean submission
full_train_rmse_mean = np.sqrt(
    np.mean(np.square(train["Pawpularity"] - mean_target))
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(rmse_values, bins=100)
ax.set_title(f"RMSE calculated using train mean: {mean_target:.5f}")
ax.set_xlabel("RMSE")
ax.set_ylabel("Number of samples")
ax.axvline(full_train_rmse_mean, c="k",
           label=f"Full train RMSE ({full_train_rmse_mean:.5f})")
ax.axvline(PUBLIC_LB_RMSE_MEAN, c="r",
           label=f"Public LB RMSE ({PUBLIC_LB_RMSE_MEAN:.5f})")
ax.axvline(mc_mean, c="g", label="Mean of sampled train RMSE")
ax.axvline(mc_mean - mc_std, c="g", linestyle="--", label="-1 std dev")
ax.axvline(mc_mean + mc_std, c="g", linestyle="--", label="+1 std dev")
ax.legend(loc="upper right");

In [None]:
nan

In [None]:
nan