In [2]:
import pandas as pd
from prophet import Prophet
from pymongo import MongoClient
from prophet.plot import plot_plotly, plot_components_plotly
import os
import time
import pickle
from datetime import datetime
import itertools
import dask
from dask.distributed import Client

Importing plotly failed. Interactive plots will not work.


In [3]:
# Load the table of already-trained parent models. The columns read from it
# further down are GISJOIN, rmse, changepoint_prior_scale and
# seasonality_prior_scale.
df_parent = pd.read_csv('covid_parents_trained.csv')
# Last expression of the cell: display (rows, columns) as a quick sanity check.
df_parent.shape

(56, 4)

In [4]:
class TrainedParent:
    """Plain record describing one trained parent model.

    Attributes:
        gis_join: GISJOIN identifier of the parent.
        rmse: RMSE recorded for the parent model.
        changepoint_prior_scale: changepoint prior scale recorded for the parent.
        seasonality_prior_scale: seasonality prior scale recorded for the parent.
    """

    def __init__(self, gis_join, rmse, changepoint_prior_scale, seasonality_prior_scale):
        # Store the four values verbatim; no validation or conversion is done.
        (
            self.gis_join,
            self.rmse,
            self.changepoint_prior_scale,
            self.seasonality_prior_scale,
        ) = (gis_join, rmse, changepoint_prior_scale, seasonality_prior_scale)

    def __str__(self):
        """Return '<gis_join>: (rmse=..., changepoint_prior_scale=..., seasonality_prior_scale=...)'."""
        details = ', '.join((
            f'rmse={self.rmse}',
            f'changepoint_prior_scale={self.changepoint_prior_scale}',
            f'seasonality_prior_scale={self.seasonality_prior_scale}',
        ))
        return f'{self.gis_join}: ({details})'
    
# Turn every row of df_parent into a TrainedParent record, preserving row order.
trained_parents_list = [
    TrainedParent(
        parent_row['GISJOIN'],
        parent_row['rmse'],
        parent_row['changepoint_prior_scale'],
        parent_row['seasonality_prior_scale'],
    )
    for _, parent_row in df_parent.iterrows()
]

# Sanity check: number of parents, then the first record.
print(len(trained_parents_list))
print(trained_parents_list[0])

56
G0400130: (rmse=316.3039434010886, changepoint_prior_scale=0.5, seasonality_prior_scale=0.1)


In [5]:
# Load the cluster assignment table. The columns used further down are
# GISJOIN, is_parent and sample_percent.
df_clusters = pd.read_csv('~/ucc-21/clusters-covid.csv')
# Last expression of the cell: preview the first rows.
df_clusters.head()

Unnamed: 0.1,Unnamed: 0,GISJOIN,cluster_id,distance,is_parent,frac_distance,sample_percent
0,0,G0100010,39,7.582524,0.0,0.046117,0.059223
1,1,G0100030,37,21.277778,0.0,0.109459,0.071892
2,2,G0100050,47,22.647059,0.0,0.288432,0.107686
3,3,G0100070,22,53.160338,0.0,0.611449,0.17229
4,4,G0100090,29,55.71875,0.0,0.522091,0.154418


In [6]:
# Map each child GISJOIN to its sample_percent, keeping only non-parent rows
# whose sample_percent lies in the half-open interval (0.15, 0.25].
child_map = {
    cluster_row['GISJOIN']: cluster_row['sample_percent']
    for _, cluster_row in df_clusters.iterrows()
    if not cluster_row['is_parent'] and 0.15 < cluster_row['sample_percent'] <= 0.25
}

no_of_children = len(child_map)
no_of_parents = len(trained_parents_list)

# Disabled check (it would only apply if no sample_percent filter were used):
# assert no_of_children == (df_clusters.shape[0] - no_of_parents)
print(no_of_children)

1242


In [7]:
# NOTE: despite its name, `db` holds the MongoClient connection; the database
# itself is selected as `db.sustaindb` inside get_df_by_gis_join.
db = MongoClient("lattice-100", 27018)
# Name of the collection queried by get_df_by_gis_join.
collection = 'covid_county_formatted'

def get_df_by_gis_join(gis_join, sample_percent=1.0, random_state=None):
    """Fetch the records of one GISJOIN and return a (sub)sample as a ds/y frame.

    Args:
        gis_join: GISJOIN value matched against the documents' "GISJOIN" field.
        sample_percent: fraction of the rows to keep, passed to
            ``DataFrame.sample(frac=...)``. The default 1.0 keeps every row
            (in shuffled order).
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            sampled subset can be reproduced. ``None`` (the default) keeps
            the previous unseeded behaviour.

    Returns:
        A DataFrame with columns ``ds`` (from the documents' ``date`` field)
        and ``y`` (from ``cases``).

    Raises:
        KeyError: if no document matches ``gis_join``, or if the matched
            documents lack the ``date``/``cases`` fields.
    """
    # Progress trace: GISJOINs are printed on one line, separated by spaces.
    print(gis_join, end=' ')
    cursor = db.sustaindb[collection].aggregate([{"$match": {"GISJOIN": gis_join}}])
    records = list(cursor)
    if not records:
        # An empty result would otherwise fail with an opaque column KeyError
        # below; keep the exception type but say what actually went wrong.
        raise KeyError(f'no documents found for GISJOIN {gis_join!r} in collection {collection!r}')
    df = pd.DataFrame(records)[['date', 'cases']]
    # Rename to the column names Prophet expects for training data.
    df.columns = ['ds', 'y']
    return df.sample(frac=sample_percent, random_state=random_state)

In [13]:
def predict_transfer(df_train, changepoint_prior_scale=0.5, seasonality_prior_scale=10.0,
                     periods=300, freq='H'):
    """Fit a Prophet model on ``df_train``, forecast, and time the whole run.

    The prior scales used to be hard-coded even though the intent is to reuse
    the hyperparameters of a parent model; they are now keyword parameters
    whose defaults equal the previous constants, so existing callers behave
    exactly as before.

    Args:
        df_train: training frame passed to ``Prophet.fit``.
        changepoint_prior_scale: Prophet ``changepoint_prior_scale``
            (default 0.5, the previously hard-coded value).
        seasonality_prior_scale: Prophet ``seasonality_prior_scale``
            (default 10.0, the previously hard-coded value).
        periods: number of future periods appended by
            ``make_future_dataframe`` (default 300).
        freq: frequency string for the future periods (default 'H').

    Returns:
        Tuple ``(model, future_frame, forecast_frame, elapsed_seconds)`` where
        ``elapsed_seconds`` covers construction, fit, and prediction, measured
        with ``time.monotonic``.
    """
    time1 = time.monotonic()
    # Initialize the model with the supplied hyperparameters (e.g. those of
    # the parent model).
    m = Prophet(
        seasonality_prior_scale=seasonality_prior_scale,
        changepoint_prior_scale=changepoint_prior_scale,
    )
    # Extra keyword arguments to fit() are handed on to the optimizer.
    m.fit(df_train, algorithm='LBFGS')
    df_train_future = m.make_future_dataframe(periods=periods, freq=freq)
    df_train_forecast = m.predict(df_train_future)
    time2 = time.monotonic()

    return m, df_train_future, df_train_forecast, (time2 - time1)


def predict_transfer_task(df_train, gis_join):
    """Train on ``df_train`` via ``predict_transfer`` and report only the timing.

    Returns:
        Tuple ``(gis_join, elapsed_seconds)``; the fitted model and the
        forecast frames produced by ``predict_transfer`` are discarded.
    """
    # predict_transfer yields four values; only the last (the duration) is kept.
    _, _, _, elapsed = predict_transfer(df_train)
    return (gis_join, elapsed)


# The commented-out loop below shows how the two lists can be rebuilt from
# MongoDB via get_df_by_gis_join; presumably it was run once and its results
# cached in the pickle files loaded underneath.
# child_list = []
# child_dfs_list = []

# for gis_join, sample_percent in child_map.items():
#     child_list.append(gis_join)
#     child_dfs_list.append(get_df_by_gis_join(gis_join, sample_percent))

# Load the cached lists. Context managers make sure the file handles are
# closed (the previous pickle.load(open(...)) form left them open).
# NOTE(security): pickle.load can execute arbitrary code -- only load pickle
# files that come from a trusted source.
with open('ucc-21/child_list_15_25.pkl', 'rb') as child_list_file:
    child_list = pickle.load(child_list_file)
with open('ucc-21/child_dfs_list_15_25.pkl', 'rb') as child_dfs_file:
    child_dfs_list = pickle.load(child_dfs_file)

In [14]:
# Both lists should have the same length (one data frame per child GISJOIN).
print(len(child_list), len(child_dfs_list), sep='\n')

1242
1242


In [12]:
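# Presumably executed once to cache the child lists on disk; kept commented out
# so that re-running the notebook does not overwrite the pickle files.
# pickle.dump(child_list, open('ucc-21/child_list_15_25.pkl', 'wb'))
# pickle.dump(child_dfs_list, open('ucc-21/child_dfs_list_15_25.pkl', 'wb'))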

In [15]:
# Connect to the Dask scheduler.
client = Client('lattice-150:8786')

# Wrap one predict_transfer_task call per child in a delayed object and print
# the running count after every 100th task.
lazy_results = []
for counter, (gis_join, df_) in enumerate(zip(child_list, child_dfs_list), start=1):
    lazy_results.append(dask.delayed(predict_transfer_task)(df_, gis_join))
    if not counter % 100:
        print(counter, end=', ')

# persist() triggers the computation in the background; compute() then
# collects the (gis_join, seconds) tuples.
futures = dask.persist(*lazy_results)
results = dask.compute(*futures)
results[:5]

100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 

(('G0100070', 73.10810322500765),
 ('G0100090', 52.46689986297861),
 ('G0100130', 77.76051817391999),
 ('G0100170', 75.40392595203593),
 ('G0100190', 76.44265002198517))

In [18]:
# Split the (gis_join, seconds) result tuples into two parallel lists.
gis_joins = [gis_join_ for gis_join_, _ in results]
times = [seconds for _, seconds in results]

# NOTE(review): the variable name says 0_15 but the output file is for the
# 15_25 sample range -- the name looks like a leftover; confirm before renaming.
times_0_15_df = pd.DataFrame({'GISJOIN': gis_joins, 'time': times})
times_0_15_df.to_csv('ucc-21/child_training_tl_times_15_25.csv', index=False)