# Model fitting

Saves

- lambda value
- sampled betas
- median beta
- 'actual' betas for fitting figure

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")

from scipy.optimize import minimize

from polymodel.utils import (
    find_beta_vectorised,
    truncated_exp_pdf,
)

# Beta

We collate all of the worst cultivars in the high-pressure locations across all years, and then use a kernel-density estimate with Gaussian kernels and a small bandwidth (0.05) to smooth the values.

We can use the single I0 value found above to give us a beta value for each of these smoothed final severities.

In [9]:
# The CSV holds the I0 value in its `I0_value` column; take the first row.
I0_value = pd.read_csv('../data/03_model_inputs/I0_value.csv')['I0_value'].iloc[0]
I0_value

0.0098570319660089

In [None]:
# Input table of STB severities; it is joined on year/location/cultivar further down.
worst_stb_with_cult_and_count_non_unique = pd.read_csv(
    '../data/03_model_inputs/worst_stb_with_cult_and_count_non_unique.csv'
)

In [None]:
# NOTE(review): the path ends in ',csv' (comma) rather than '.csv' — this looks
# like a typo, but the file on disk may have been saved under the same name.
# Confirm against the data directory before renaming either side.
high_pressure_locations = pd.read_csv('../data/03_model_inputs/high_pressure_locations,csv')

In [None]:
# Drop the summary columns, then align the severity table with the
# high-pressure table on the (year, location, cultivar) key.
# NOTE(review): DataFrame.join defaults to how='left', so every row of the
# severity table is kept; if the intent is to restrict to high-pressure
# locations only, how='inner' would be needed — confirm.
stb_values = (
    worst_stb_with_cult_and_count_non_unique
    .drop(columns=['stb_mean', 'count'])
    .set_index(['year', 'location', 'cultivar'])
    .join(high_pressure_locations.set_index(['year', 'location', 'cultivar']))
    .reset_index()
)

# Quick look at the raw severity distribution.
stb_values['stb'].hist()

In [None]:
stb_values

## Truncated exponential

In [None]:
stb_values.stb.describe()

In [None]:
# Evaluation grid for plotting the fitted density: 301 points from -0.5 to 100.5.
xx = np.linspace(start=-0.5, stop=100.5, num=301)

In [None]:
def neg_log_likelihood(lambd):
    """Negative log-likelihood of the observed `stb_values.stb` under
    `truncated_exp_pdf` with rate parameter `lambd`.

    The density is evaluated one observation at a time, the logs are summed
    and the sign flipped so the result can be handed to a minimiser.
    """
    densities = [truncated_exp_pdf(severity, lambd) for severity in stb_values.stb]
    log_densities = [np.log(density) for density in densities]
    return -np.sum(log_densities)

In [None]:
# Maximum-likelihood fit of the rate parameter: minimise the negative
# log-likelihood from an initial guess of 0.04, with lambda bounded to [1e-6, 100].
min_out = minimize(
    fun=neg_log_likelihood,
    x0=[0.04],
    bounds=[(1e-6, 100)],
    tol=1e-6,
)
min_out

In [None]:
# The optimiser result stores the parameter vector under 'x'; only one
# parameter was fitted, so take its first entry.
lambd_fitted = min_out['x'][0]
lambd_fitted

In [None]:
# Fitted density evaluated at every grid point, for overlaying on the histograms.
my_line = list(map(lambda x_point: truncated_exp_pdf(x_point, lambd_fitted), xx))

In [None]:
# Persist the severities used for fitting (read back later when computing
# the 'actual' betas).
stb_values.to_csv('../data/03_model_inputs/stb_vals.csv')

In [None]:
f, ax = plt.subplots(figsize=(8, 7))

# Normalised histogram (20 bins) of the observed severities ...
stb_values.stb.hist(ax=ax, bins=20, density=True)

# ... with the fitted truncated-exponential density drawn on top.
ax.plot(xx, my_line, color='r', linewidth=3)

In [None]:
f.savefig('../figures/paper_figs/trunc_exp_20.jpg')

In [None]:
f, ax = plt.subplots(figsize=(8, 7))

# Same comparison as the 20-bin figure, with a finer 40-bin histogram.
stb_values.stb.hist(ax=ax, bins=40, density=True)

ax.plot(xx, my_line, color='r', linewidth=3)

In [None]:
f.savefig('../figures/paper_figs/trunc_exp_40.jpg')

In [None]:
# One-row frame holding the fitted rate, ready to be written to CSV.
ldf = pd.DataFrame({'lambda_fitted': [lambd_fitted]})
ldf

## Save lambda (exponential value)

In [4]:
filename = '../data/03_model_inputs/lambda_fitted.csv'

if True:
    print(f'saving to {filename}')
    ldf.to_csv(filename)

# Read the value back from disk so the rest of the notebook uses exactly what
# was saved. Column 0 of the CSV is the index written by `to_csv`; column 1
# is `lambda_fitted`.
# Pull out the scalar with `.iloc[0, 1]` before converting: calling `float()`
# on a one-element Series is deprecated in pandas >= 2.1.
lambda_use = float(pd.read_csv(filename).iloc[0, 1])
lambda_use

0.0328774362127231

## Sample stb and beta

Need to check that the resulting beta values are sensible. Should be somewhere in the order of `1e-3`.

Then can post-hoc filter out any values that do something weird.

The pdf is `k * exp ( - lam * x )` on `[0, 100]`, where `k` is the normalising constant.

Integrating gives the antiderivative `- k /lam * exp(-lam*x)`; requiring the total probability on `[0, 100]` to equal 1 gives `1 = - k /lam * exp(-lam * 100) + k / lam`. So `k = lam / ( 1 - exp(-100 lam))`.

Then get integral from 0 to `S` is:

`p = (k /lam) * (1 - exp(-lam*S) )`

So `1 - exp(-lam * S) = lam * p / k`

So `S = - (1/lam) *ln(1 - lam * p / k)`

So `S = - (1/lam) * ln(1 - p * (1 - exp(-100 lam)))`

In [3]:
def find_stb(lambd, p, upper=100):
    """Inverse CDF (quantile function) of an exponential density truncated to [0, upper].

    For the density `k * exp(-lambd * x)` on `[0, upper]`, returns the value
    `S` whose cumulative probability is `p`:

        S = -(1 / lambd) * log(1 - p * (1 - exp(-upper * lambd)))

    Parameters
    ----------
    lambd : float
        Rate parameter of the exponential. Must be non-zero (it is divided by).
    p : float, numpy array or pandas Series
        Cumulative probability (or probabilities) in [0, 1]. Array-like
        inputs are handled element-wise and keep their type.
    upper : float, optional
        Upper truncation point of the distribution. Defaults to 100, the
        value previously hard-coded, so existing calls are unaffected.

    Returns
    -------
    Same kind as `p`
        Quantile(s) in `[0, upper]`; `p=0` maps to 0 and `p=1` maps to `upper`.
    """
    # Equivalent to 1 - p * (1 - exp(-upper * lambd)), expanded.
    arg = 1 - p + p*np.exp(-upper*lambd)
    out = -1/lambd * np.log(arg)
    return out

In [None]:
# Number of severities (and hence betas) to draw from the fitted distribution.
N_SAMPLE = 20_000

In [None]:
# Fix the global NumPy seed so the uniform draws (and everything derived
# from them) are reproducible.
np.random.seed(1)
random_unif = np.random.uniform(0.0, 1.0, size=N_SAMPLE)

In [None]:
# Inverse-transform sampling: push the uniform draws through the quantile function.
stb_generated = find_stb(lambd=lambda_use, p=random_unif)

In [None]:
pd.DataFrame(dict(stb=stb_generated)).describe()

In [None]:
I0_value

In [None]:
find_beta_vectorised([2e-3, 9.9e-1], I0_value)

In [None]:
# Pair every sampled severity with its beta; `stb` is scaled by 0.01 before
# being passed to find_beta_vectorised.
beta_df = pd.DataFrame({'stb': stb_generated})
beta_df['beta'] = find_beta_vectorised(0.01 * beta_df['stb'], I0_value)

beta_df.head()

In [None]:
# Written before the filtering cells further down, so this file contains
# every sampled (stb, beta) pair, including rows that are later excluded.
beta_df.to_csv('../data/03_model_inputs/many_sampled_betas.csv')

In [None]:
# `beta_df` has two columns (stb, beta), so DataFrame.hist draws two panels.
# Handing it a single Axes makes pandas emit a UserWarning and clear the
# figure; supply one axis per column instead.
f, axes = plt.subplots(ncols=2, figsize=(12, 5))

beta_df.hist(ax=axes
             # , bins=50
            )

In [None]:
beta_df.sort_values(['beta', 'stb']).loc[lambda df: np.isclose(df.beta, 1e-4)]

In [None]:
# Flag samples whose beta is at or below 1e-4, at or above 5e-2, or missing.
bad_betas = beta_df[
    beta_df.beta.le(1e-4)
    | beta_df.beta.ge(5e-2)
    | beta_df.beta.isna()
]

bad_betas

In [None]:
# Blank out (all columns set to NaN) every row whose beta is numerically
# equal to 1e-4. This mutates `beta_df` in place; the CSV written earlier
# still contains these rows.
# NOTE(review): presumably 1e-4 is the value find_beta_vectorised returns
# when no beta can reproduce the requested severity — confirm.
beta_df.loc[np.isclose(beta_df.beta,0.0001), :] = np.nan

In [None]:
# Keep only usable samples: drop rows with beta at 1e-4 and rows with a
# missing beta.
sampled_betas_use = (
    beta_df[~np.isclose(beta_df.beta, 0.0001)]
    .dropna(subset=['beta'])
)

sampled_betas_use.shape

NB: we think these lowest severities cannot be achieved with the same I0 - even with beta=0 there will be some minimum severity. So we just filter out these very rare cases - 13 out of 20000.

In [None]:
sampled_betas_use.describe()

In [None]:
# Severity against log(beta) for the retained samples.
(
    sampled_betas_use
    .assign(log_b=np.log(sampled_betas_use.beta))
    .plot.scatter(x='log_b', y='stb', alpha=0.1)
)

In [None]:
# f.savefig('../figures/paper_figs/stb_vs_beta_expo.jpg')

## Save sampled betas?

In [None]:
# Write only the beta column of the retained samples (the index is written too).
filename = '../data/03_model_inputs/beta_sampled_no_repeats.csv'
print(f'saving beta to {filename}')
sampled_betas_use['beta'].to_csv(filename)

## Resample to get more

In [None]:
# Reload the saved betas; the first CSV column is the written index, so keep
# everything from the second column onwards.
betas = pd.read_csv('../data/03_model_inputs/beta_sampled_no_repeats.csv')
betas = betas.iloc[:, 1:]
betas

In [None]:
# Enlarge the pool: the original frame followed by ten full reshuffles of it,
# each with its own fixed seed (0..9) for reproducibility.
betas2 = pd.concat(
    [
        betas,
        *(betas.sample(frac=1, random_state=seed) for seed in range(10)),
    ]
)

betas2.head(10)

In [None]:
betas2.shape

In [None]:
betas2.to_csv('../data/03_model_inputs/beta_sampled.csv')

### for nik cdf

In [None]:
# 100 evenly spaced probabilities in [0, 1] and the fitted distribution's
# value at each of them (stored in the `cdf` column).
cdf_df = pd.DataFrame({'x': np.linspace(0, 1, 100)})
cdf_df['cdf'] = find_stb(lambda_use, cdf_df['x'])

In [None]:
f, ax = plt.subplots(figsize=(14, 8))

# Fitted curve: probability on the x-axis, severity on the y-axis.
cdf_df.plot(x='x', y='cdf', ax=ax)

# Empirical counterpart: observed severities sorted ascending, plotted
# against their rank rescaled to [0, 1].
(
    stb_values
    .sort_values('stb')
    .reset_index(drop=True)
    .assign(quartile=lambda ranked: ranked.index / (len(ranked) - 1))
    .set_index('quartile')
    [['stb']]
    .plot(ax=ax)
)

# (
#     random_sample
#     .sort_values('random_stb')
#     .reset_index(drop=True)
#     .assign(quartile = lambda df: (
#         df.index / (df.shape[0] - 1)
#     )        
#     )
#     .set_index('quartile')
#     .loc[:, ['random_stb']]
#     .rename(columns={'random_stb': 'smoothed_stb'})
#     .plot(ax=ax)
# )

In [None]:
f.savefig('../figures/paper_figs/cdf.jpg')

## Save median beta?

Use the fitted distribution's inverse CDF (`find_stb` at `p = 0.5`) to find the median stb, then use this to find the median beta value.

In [13]:
# Median severity of the fitted distribution: the quantile at probability 0.5.
stb_median = find_stb(lambd=lambda_use, p=0.5)

stb_median

19.967778417272328

In [15]:
# Beta corresponding to the median severity (scaled by 0.01); the helper
# takes a sequence, so wrap the scalar and take the single result.
beta_median = find_beta_vectorised([stb_median * 0.01], I0_value)[0]
beta_median

0.007865415999695926

In [16]:
# betasm = pd.read_csv('../data/03_model_inputs/beta_sampled.csv').iloc[:, 1:]
# betasm.head()

Unnamed: 0,beta
0,0.007551
1,0.00872
2,0.007068
3,0.006144
4,0.005606


In [17]:
# betasm.mean(), betasm.median()

(beta    0.007808
 dtype: float64,
 beta    0.007895
 dtype: float64)

In [20]:
# One-row frame with the median beta, written to the model-inputs folder.
bdf = pd.DataFrame({'beta_median': [beta_median]})

filename = '../data/03_model_inputs/beta_value.csv'
print(f'saving to {filename}')
bdf.to_csv(filename)

bdf

saving to ../data/03_model_inputs/beta_value.csv


Unnamed: 0,beta_median
0,0.007865


## Save 'actual' betas for figure?

Not for model

In [None]:
# Betas implied by the observed severities (read back from the CSV saved
# earlier), with `stb` scaled by 0.01 before conversion.
b_actual = find_beta_vectorised(
    pd.read_csv('../data/03_model_inputs/stb_vals.csv')['stb'] * 0.01,
    I0_value,
)

In [None]:
# Save the data-derived betas (single `betas` column plus the default index).
print('saving')
pd.DataFrame({'betas': b_actual}).to_csv(
    '../data/03_model_inputs/beta_from_data_not_sampled_dist.csv'
)