In [None]:
%matplotlib inline

import os 
import json 

import boto3
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from mirrorverse.utils import read_data_w_cache

# Process-wide settings that must be in place before any data access below:
# the database name and the AWS profile are exposed to downstream libraries
# through environment variables.
os.environ.update({
    'HAVEN_DATABASE': 'haven',
    'AWS_PROFILE': 'admin',
})

# Shared five-colour palette (hex codes) for plots in this notebook.
COLOR_PALETTE = [
    '#648FFF',
    '#785EF0',
    '#DC267F',
    '#FE6100',
    '#FFB000',
]

In [None]:
s3_client = boto3.client('s3')
bucket = 'mimic-log-odds-models'
version = 7
prefix = f'movement-model-m3-a4-v{version}'

# A single list_objects_v2 call returns at most 1,000 keys, so walk every
# page of the listing instead of reading only the first response; otherwise
# runs beyond the first page would be silently missing from `configs`.
paginator = s3_client.get_paginator('list_objects_v2')

# Maps run_id -> the 'model' section of that run's config.json.
configs = {}
found_objects = False
for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
    # Pages with no matching keys omit 'Contents' entirely.
    for obj in page.get('Contents', []):
        found_objects = True
        if not obj['Key'].endswith('config.json'):
            continue
        raw_body = s3_client.get_object(Bucket=bucket, Key=obj['Key'])['Body'].read()
        run_config = json.loads(raw_body.decode('utf-8'))
        # Configs without a 'model' section are ignored.
        if 'model' in run_config:
            configs[run_config['run_id']] = run_config['model']

if not found_objects:
    print("No objects found.")

# Flatten each run's model config into one row of hyperparameters.
# Runs whose config has no 'num_layers' entry are skipped.
rows = []
for run_id, config in configs.items():
    if 'num_layers' not in config:
        continue

    layers = config['layers']

    # The first layer spec is one leading character followed by an integer
    # (presumably the layer width, e.g. 'D128' -- TODO confirm the format).
    neurons = int(layers[0][1:])

    # When the second layer spec is a Dropout entry, its final character is
    # read as a digit and divided by ten to get the rate.
    # NOTE(review): this only handles a single trailing digit, and assumes
    # `layers` always has at least two entries -- confirm against the trainer.
    if layers[1].startswith('Dropout'):
        dropout = float(layers[1][-1]) / 10
    else:
        dropout = 0.0

    rows.append(dict(
        run_id=run_id,
        num_layers=config['num_layers'],
        neurons=neurons,
        dropout=dropout,
        learning_rate=config['optimizer_kwargs']['learning_rate'],
    ))

configs_data = pd.DataFrame(rows)

# Read the experiment table for this version and order it so that each
# run's rows appear by ascending epoch (later cells rely on this ordering
# when they take the last row per run).
results = (
    read_data_w_cache(f'select * from movement_model_experiment_m3_a4_v{version}')
    .sort_values(['run_id', 'epoch'], ascending=True)
)

# Attach the hyperparameters to every metrics row.
# - The join key is named explicitly so the join cannot silently change if
#   the two frames ever come to share another column name.
# - Inner join: runs with no parsed config are dropped from `results`.
# - validate='many_to_one' fails loudly if configs_data ever holds more than
#   one row for a run_id, which would duplicate metrics rows.
results = results[['run_id', 'loss', 'val_loss', 'train_loss', 'epoch']].merge(
    configs_data,
    on='run_id',
    how='inner',
    validate='many_to_one',
)
results.head()

In [None]:
# One row per run: the last entry of each listed column within the run,
# ordered from lowest to highest val_loss.
# NOTE(review): GroupBy.last() skips nulls per column by default, so if a
# metric is NaN on a run's final epoch the row may mix values from different
# epochs -- confirm that is acceptable.
final = (
    results
    .groupby('run_id')[[
        'val_loss',
        'train_loss',
        'loss',
        'epoch',
        'neurons',
        'dropout',
        'num_layers',
        'learning_rate',
    ]]
    .last()
    .reset_index()
    .sort_values(by='val_loss')
)
final.head()

In [None]:
# run_id of the row in `final` with the smallest val_loss.
ranked_by_val_loss = final.sort_values(by='val_loss')
x = ranked_by_val_loss['run_id'].iloc[0]
x

In [None]:
# One point per run: train_loss against val_loss as recorded in `final`.
px.scatter(
    data_frame=final,
    x='train_loss',
    y='val_loss',
)

In [None]:
# Plot validation and training loss per epoch for the run with the lowest
# val_loss in `final`.
x = final.sort_values(by='val_loss')['run_id'].iloc[0]
subset = results.loc[results['run_id'] == x]

# Reshape to long form: a shared 'loss' column plus a 'case' label, so both
# curves can be drawn from one frame and coloured by case.
val = (
    subset[['epoch', 'val_loss']]
    .rename(columns={'val_loss': 'loss'})
    .assign(case='val')
)
train = (
    subset[['epoch', 'train_loss']]
    .rename(columns={'train_loss': 'loss'})
    .assign(case='train')
)
px.line(
    data_frame=pd.concat([val, train]),
    x='epoch',
    y='loss',
    color='case',
)

In [None]:
# One point per run: learning_rate against val_loss as recorded in `final`.
px.scatter(
    data_frame=final,
    x='learning_rate',
    y='val_loss',
)

In [None]:
# Per run, the column-wise minimum of val_loss, neurons and num_layers over
# all of its rows, listed from lowest to highest minimum val_loss.
(
    results
    .groupby('run_id')[['val_loss', 'neurons', 'num_layers']]
    .min()
    .sort_values(by='val_loss')
)

In [None]:
# Show every row of `results` for one specific, hard-coded run id.
results.loc[
    results['run_id'] == 'c98bb56a7d5c4ce058e26c97889da98188b2551a65afe407371388cd3c5fd166'
]

In [None]:
# Smallest and largest val_loss in `final` for each distinct combination of
# neurons, learning_rate, num_layers and dropout.
(
    final
    .groupby(['neurons', 'learning_rate', 'num_layers', 'dropout'])['val_loss']
    .agg(['min', 'max'])
)