# Fitting distributions

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import optuna

from optuna.visualization import (
    plot_optimization_history,
    plot_contour,
)

from optuna.samplers import TPESampler

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

from polymodel.fitting import (
    HostObjective,
    score_for_this_df_weighted,
    fitting_df,
)
    
from polymodel.config import Config

from polymodel.consts import (
    MUTATION_PROP,
    DEFAULT_P,
    HOST_MUTATION_SCALE,
    FUNG_MUTATION_SCALE,
)

# Host

## Fit

NB: we still need to decide how to weight the data points in the fit: by `n`, by `sqrt(n)`, or not at all.

In [None]:
# Model configuration used for the host fit below.
# NOTE(review): the meaning of 'single', n_k and n_l is defined by
# polymodel.config.Config, which is not visible from this notebook.
host_fit_config = Config(
    'single', 
    n_k=10,
    n_l=500,
    mutation_proportion=MUTATION_PROP,
    # Both mutation scales are the package constant multiplied by DEFAULT_P.
    mutation_scale_fung=DEFAULT_P * FUNG_MUTATION_SCALE,
    mutation_scale_host=DEFAULT_P * HOST_MUTATION_SCALE,
)

In [None]:
# Set optuna's log level to 0 for the rest of the notebook.
# NOTE(review): 0 is not one of optuna's named levels (optuna.logging.DEBUG /
# INFO / WARNING / ERROR / CRITICAL); presumably the intent is to hide the
# per-trial log lines — confirm, and consider a named level instead.
optuna.logging.set_verbosity(0)

In [None]:
# TPE sampler with a fixed seed (intended to make the search reproducible).
sampler = TPESampler(seed=0)
# No `direction` is passed, so optuna's default applies
# (documented as "minimize" — confirm for the installed version).
study = optuna.create_study(sampler=sampler)
# Objective handed to study.optimize in the cells below; it is also the
# source of the observed data (`obj_h.df`) used later in the notebook.
obj_h = HostObjective(host_fit_config)

In [None]:
%%time

# Run 300 trials of the host objective, then display the best objective
# value found so far, truncated to an integer by int().
study.optimize(obj_h, n_trials=300)
int(study.best_value)

In [None]:
%%time

# Same call as the previous cell, on the same `study` object.
# NOTE(review): presumably this adds a further 300 trials on top of the
# earlier ones rather than restarting the search — confirm against the
# optuna Study.optimize documentation.
study.optimize(obj_h, n_trials=300)
int(study.best_value)

In [None]:
# study.optimize(obj_h, n_trials=100)

In [None]:
# optuna's built-in plot of the objective value across the study's trials.
plot_optimization_history(study)

In [None]:
# optuna's built-in contour plot of the objective over the searched parameters.
plot_contour(study)

## Replicate result

In [None]:
# Display the parameter values of the study's best trial.
study.best_params

In [None]:
# Re-run the host model once, using the best parameters found by the study.
# To try hand-picked values instead, pass for example
#     params={'mean': 0.83, 'mutation_scale': 0.1}
yh = HostObjective(host_fit_config).run_model(params=study.best_params)

yh

In [None]:
# Take a copy of the three columns needed from the objective's data frame,
# then shift `year` so that the earliest year becomes 0.
control_data_h = obj_h.df[['data_control', 'n_data', 'year']].copy()
control_data_h['year'] = control_data_h['year'] - control_data_h['year'].min()

control_data_h.head()

In [None]:
# Score the re-run model output `yh` against the control data.
# NOTE(review): presumably this should reproduce the study's best value —
# confirm against polymodel.fitting.score_for_this_df_weighted.
score_for_this_df_weighted(control_data_h, yh)

In [None]:
f, ax = plt.subplots(figsize=(14, 7))

# Observed control data: one marker per row, marker size taken from `n_data`.
sns.scatterplot(
    data=control_data_h,
    x='year',
    y='data_control',
    size='n_data',
    ax=ax,
)

# Model output from the re-run, drawn on the same axes as a thick red line.
ax.plot(yh, color='red', linewidth=4)

# Fix the y-axis to the range 0-100.
ax.set_ylim(0, 100)

## Save best values

In [None]:
# Relative path of the CSV that stores fitted values; it is read in the next
# cell and the merged table is written back to the same location at the end.
filename = '../data/03_model_inputs/fitted.csv'

In [None]:
# Load the previously saved fits; read_csv raises if the file does not exist yet.
fitted = pd.read_csv(filename)
fitted

In [None]:
# Build the table for the current fit from the config and the finished study.
# NOTE(review): the column layout comes from polymodel.fitting.fitting_df and
# is expected to line up with the columns of `fitted` — confirm.
data = fitting_df(host_fit_config, study)
data

In [None]:
# Stack the saved fits and the current fit, ordered by 'date' descending.
combined = pd.concat([fitted, data]).sort_values('date', ascending=False)

# Remove rows that are exact duplicates and renumber the index from 0.
combined = combined.drop_duplicates().reset_index(drop=True)

# Force `trial_number` to a 64-bit integer column.
# NOTE(review): presumably needed because the concat / CSV round-trip can
# leave it with a different dtype — confirm.
combined = combined.astype({'trial_number': 'int64'})

combined

In [None]:
# Write the merged table back to the file it was loaded from, without the
# index column. `filename` is reused instead of repeating the path literal so
# that the read and write locations cannot drift apart when the path is edited.
combined.to_csv(filename, index=False)