In [922]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_scatter import scatter_mean
import pickle
import pandas as pd
import numpy as np 
import torch_geometric_temporal
from torch_geometric.nn.models.re_net import RENet

In [923]:
from torch_geometric_temporal.data.splitter import discrete_train_test_split
from torch_geometric_temporal.data.discrete.dynamic_graph_discrete_signal import DynamicGraphDiscreteSignal
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
#from rise_utils import to_torch_inputs
from covid_CA_forecasting import torch_graph

In [924]:
# Directory holding the pickled inputs read below; the exported dataset is
# written back into the same directory at the end of the notebook.
# NOTE(review): absolute, user-specific path; adjust when running elsewhere.
path = '/home/mxenoc/workspace/pickles/RISE/'

In [925]:
# Load the two pickled inputs for SC from the pickle directory:
#   df - contents of 'weekly_movements_SC.pkl'
#   cc - contents of 'weekly_infections.pkl'
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with open(path + 'weekly_movements_SC.pkl', 'rb') as movements_file:
    df = pickle.load(movements_file)

with open(path + 'weekly_infections.pkl', 'rb') as infections_file:
    cc = pickle.load(infections_file)


In [926]:
# Load the collection of SC city names. The file sits in the same pickle
# directory as the other inputs, so build the location from `path` rather than
# repeating the directory literal (keeps every input pointing at one place).
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with open(path + 'SC_cities.pkl', 'rb') as f:
    SC_cities = pickle.load(f)

In [927]:
# Distinct city names appearing as either endpoint of a movement record.
endpoint_names = df[['origin_city', 'destination_city']].values.ravel('K')
all_cities = pd.unique(endpoint_names)

# Sorted distinct dates present in the movement table.
all_days = np.unique(df['date'])

In [928]:
# Fit a label encoder on every city name found in the movement data so that
# each city gets an integer code; `le` is reused by later cells to encode and
# decode city names consistently.
le = LabelEncoder()
le.fit(all_cities)

LabelEncoder()

In [929]:
# Keep only infection rows whose city appears in the movement data and whose
# date is one of the movement dates.
known_city = cc['place'].isin(all_cities)
known_date = cc['date'].isin(all_days)
cc = cc[known_city & known_date]

In [930]:
# Translate city names into the integer codes of the fitted encoder, for both
# endpoints of each movement record and for the infection table.
origin_codes = le.transform(df['origin_city'])
destination_codes = le.transform(df['destination_city'])
place_codes = le.transform(cc['place'])

df['origin_city_encoded'] = origin_codes
df['destination_city_encoded'] = destination_codes
cc['city_encoded'] = place_codes

In [931]:
# Census table with the per-city mean features.
# NOTE(review): absolute, user-specific path.
census_mean_csv = '/home/mxenoc/workspace/covid-CA-forecasting/data/census_features/censusFeature_mean.csv'
census_data_mean = pd.read_csv(census_mean_csv)

In [932]:
# Full census feature table.
# NOTE(review): absolute, user-specific path.
census_csv = '/home/mxenoc/workspace/covid-CA-forecasting/data/census_features/censusFeature.csv'
census_data = pd.read_csv(census_csv)

In [933]:
# Columns present in both census tables, excluding the 'city' join key.
shared_columns = census_data.columns.intersection(census_data_mean.columns)
common_columns = shared_columns.drop('city')

In [934]:
# Remove the duplicated columns from the full census table so that the later
# merge with the mean table does not produce clashing column names.
census_data = census_data.drop(columns=common_columns)

In [935]:
# Keep the join key plus the selected mean features from the mean table.
mean_feature_columns = [
    'city',
    'mean_male_age',
    'mean_female_age',
    'mean_income',
    'mean_pphh',
    'mean_B08202',
    'mean_B19101',
    'mean_B25014_owner',
    'mean_B25014_renter',
    'mean_B25017',
]
census_data_mean = census_data_mean[mean_feature_columns]

In [936]:
# Attach the selected mean features to the census table, matching on city name
# (default inner join: only cities present in both tables are kept).
census_data = census_data.merge(census_data_mean, on="city")

In [937]:
# Keep only the census rows for cities that appear in the movement data
# (i.e. cities that interact with cities in SC county).
in_movement_data = census_data['city'].isin(all_cities)
census_data = census_data[in_movement_data]

# Add the integer city code as a column; it becomes the index in the next cell.
census_data['city_encoded'] = le.transform(census_data['city'])

In [938]:
# Index the census table by city code and expand it to one row per known city;
# cities without census data get a row of NaN.
every_city_code = le.transform(all_cities)
census_data = census_data.set_index('city_encoded').reindex(every_city_code)

# Restore the city name for every row, including the newly added empty ones.
census_data['city'] = le.inverse_transform(census_data.index)

# Order rows by city code.
census_data = census_data.sort_index()

In [1005]:
# Join the infection rows with the census features of the same city code.
feature_df = pd.merge(cc, census_data, how='inner', on='city_encoded')

In [1237]:
# Sorted distinct dates in the feature table, restricted to positions 54..318.
# NOTE(review): 54 and 319 are unexplained magic numbers; confirm which
# calendar window they are meant to select.
all_weeks = np.unique(feature_df['date'])[54:319]

# Both splits start at the first date of the window: train stops 7 dates before
# the end and test stops 6 before the end, so test is the train range plus one
# extra date. NOTE(review): the overlap looks deliberate (history for a
# temporal model), but confirm.
n_dates = len(all_weeks)
all_weeks_train = all_weeks[:n_dates - 7]
all_weeks_test = all_weeks[:n_dates - 6]

In [1213]:
# NOTE(review): this cell has execution count 1213, lower than the cell above
# it (1237), so it was run BEFORE the windowed split and the lengths recorded
# below (265 / 258 / 259) come from the cell above, not from this one.
# Running the notebook top to bottom executes this cell last and overrides
# all_weeks / all_weeks_train / all_weeks_test with the full, unwindowed date
# range. Keep only one of the two split cells.
#
# Alternative split: train on every date except the last, test on all dates.
all_weeks = np.unique(feature_df['date'])

all_weeks_train = all_weeks[:-1]
all_weeks_test = all_weeks
#all_weeks_train = all_weeks[:(2*len(all_weeks))//4]
#all_weeks_test = all_weeks[(2*len(all_weeks))//4:]

In [1238]:
# Sanity check: number of dates in the selected window.
len(all_weeks)

265

In [1239]:
# Sanity check: number of dates in the training range.
len(all_weeks_train)

258

In [1240]:
# Sanity check: number of dates in the test range.
len(all_weeks_test)

259

In [1241]:
# First date of the test range.
all_weeks_test[0]

numpy.datetime64('2020-06-01T00:00:00.000000000')

In [1242]:
# Last date of the test range.
all_weeks_test[-1]

numpy.datetime64('2021-02-14T00:00:00.000000000')

In [1243]:
# Row subsets of the feature table for the training and the test date ranges.
in_train_range = feature_df['date'].isin(all_weeks_train)
in_test_range = feature_df['date'].isin(all_weeks_test)

feature_df_train = feature_df[in_train_range]
feature_df_test = feature_df[in_test_range]

In [1244]:
# Model inputs: every census column except the city name, followed by the
# current case rate. (To train on the case rate alone, use
# ['new_cases_per1000'] instead.)
features_to_use = [*census_data.columns.drop('city'), 'new_cases_per1000']

In [1245]:
# Replace +/-inf with 0 in both splits.
# Both frames are row selections of `feature_df`, so an in-place replace makes
# pandas emit SettingWithCopyWarning ("A value is trying to be set on a copy of
# a slice from a DataFrame"). Rebinding the returned frame gives the same
# values without the warning and keeps the cell safe to re-run.
feature_df_train = feature_df_train.replace([np.inf, -np.inf], 0)
feature_df_test = feature_df_test.replace([np.inf, -np.inf], 0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [1246]:
# Fit the standardiser on the training rows only, so the scaling statistics do
# not depend on the extra test date(s).
# fit() returns the estimator itself, so it is chained here instead of being
# bound to a separate name (the original stored it as `feature_list`, a name
# that is later reused for a list of per-city feature vectors).
scaler = StandardScaler().fit(feature_df_train[features_to_use])

In [1253]:
# Which split to export: 'train' or 'test'. The value also becomes the prefix
# of the output pickle's file name.
choose_set = 'test'

In [1254]:
# Pick the date range and feature rows of the requested split.
if choose_set == 'train':
    all_weeks_set = all_weeks_train
    feature_df_set = feature_df_train
elif choose_set == 'test':
    all_weeks_set = all_weeks_test
    feature_df_set = feature_df_test
else:
    # Fail loudly: without this branch a typo would silently reuse whatever
    # all_weeks_set / feature_df_set were left over from an earlier run.
    raise ValueError(
        "choose_set must be 'train' or 'test', got {!r}".format(choose_set)
    )

In [1255]:
# Lookup table of (city name, city code) pairs, one row per distinct pair.
cities_codes = (
    cc.reset_index(drop=True)[['place', 'city_encoded']]
    .drop_duplicates()
)

# Restrict the lookup table to the SC cities.
is_sc_city = cities_codes['place'].isin(SC_cities)
codes_SC = cities_codes[is_sc_city]

In [1256]:
# Build one graph snapshot per date of the selected split.
# For every date the four lists receive, in order:
#   edge_indices - [origin codes, destination codes] of that date's movements
#   edge_weights - 'device_count_normalised' of those movements
#   features     - list with one scaled feature vector per city (city-code order)
#   targets      - 'new_cases_per1000_in_10_days' per city (city-code order)
edge_indices_tensorlist = []
edge_weights_tensorlist = []
features_tensorlist = []
targets_tensorlist = []

# Loop-invariant work, done once instead of once per date.
all_city_codes = le.transform(all_cities)
feature_df_sorted = feature_df_set.sort_values(['city_encoded', 'date'])

for selected_week in all_weeks_set:

    # Movement records (edges) of this date; missing values become 0.
    df_selected = df.loc[df.date == selected_week, :].fillna(0)

    # Feature rows of this date, expanded to one row per known city. Cities
    # with no row on this date are added by reindex and filled with 0 below.
    feature_df_selected = (
        feature_df_sorted.loc[feature_df_sorted.date == selected_week, :]
        .set_index('city_encoded')
        .reindex(all_city_codes)
    )
    feature_df_selected['city'] = le.inverse_transform(feature_df_selected.index)
    feature_df_selected['date'] = selected_week
    feature_df_selected = feature_df_selected.sort_index().fillna(0)

    edge_indices = [
        np.array(df_selected['origin_city_encoded']),
        np.array(df_selected['destination_city_encoded']),
    ]
    edge_weights = np.array(df_selected['device_count_normalised'])

    # Standardise with the scaler fitted on the training rows, then split the
    # resulting matrix into one 1-D vector per city (same row order as above).
    scaled = scaler.transform(feature_df_selected[features_to_use])
    features = list(np.asarray(scaled))

    # Targets are kept for every city, not only the SC ones.
    targets = np.array(feature_df_selected['new_cases_per1000_in_10_days'])

    edge_indices_tensorlist.append(edge_indices)
    edge_weights_tensorlist.append(edge_weights)
    features_tensorlist.append(features)
    targets_tensorlist.append(targets)

In [1257]:
# Bundle the per-date snapshots into a single dynamic-graph signal object.
data = DynamicGraphDiscreteSignal(
    edge_indices=edge_indices_tensorlist,
    edge_weights=edge_weights_tensorlist,
    features=features_tensorlist,
    targets=targets_tensorlist,
)

In [1258]:
# Persist the signal next to the inputs; the file name starts with the split.
# NOTE(review): the '14_02' suffix is hard-coded; presumably it tags the last
# test date (14 Feb), so update it if the date window changes.
output_file = path + choose_set + '_SC_all_14_02.pkl'
with open(output_file, 'wb') as out_handle:
    pickle.dump(data, out_handle)

In [1211]:
# Display the directory the pickle was written to.
path

'/home/mxenoc/workspace/pickles/RISE/'

In [970]:
# Baseline on the training rows: mean squared difference between the current
# case rate and the 10-days-ahead case rate.
train_gap = feature_df_train['new_cases_per1000'] - feature_df_train['new_cases_per1000_in_10_days']
np.mean(train_gap ** 2)

0.04330999694841679

In [971]:
# Same baseline (mean squared difference) on the test rows dated 2021-02-09.
rows_on_date = feature_df_test.loc[feature_df_test.date == '2021-02-09']
current_rate = rows_on_date['new_cases_per1000'].values
future_rate = rows_on_date['new_cases_per1000_in_10_days'].values
np.mean((current_rate - future_rate) ** 2)

0.06061878772217091

In [1117]:
# Baseline on the training rows: mean absolute difference between the current
# case rate and the 10-days-ahead case rate.
train_gap = feature_df_train['new_cases_per1000'] - feature_df_train['new_cases_per1000_in_10_days']
np.mean(abs(train_gap))

0.11636441142660875

In [1264]:
# Same baseline (mean absolute difference) on the test rows dated 2021-02-09.
rows_on_date = feature_df_test.loc[feature_df_test.date == '2021-02-09']
current_rate = rows_on_date['new_cases_per1000'].values
future_rate = rows_on_date['new_cases_per1000_in_10_days'].values
np.mean(abs(current_rate - future_rate))

0.1704354728631061

In [1227]:
# Last date of the test range (matches the '14_02' suffix of the output file).
all_weeks_test[-1]

numpy.datetime64('2021-02-14T00:00:00.000000000')

In [None]:
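# Values noted by hand, presumably the test-set mean absolute difference from
# successive runs (the last one matches the 0.1704 computed above) - TODO confirm:
# 0.1044, 0.1208, 0.1232, 0.1315, 0.1452, 0.1704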