In [14]:
# Mount Google Drive into the Colab filesystem; every data path used later in
# this notebook lives under this mount point.
from google.colab import drive

gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [15]:
import glob
import os
import numpy as np
import pandas as pd
import numpy as np
import glob
import os
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
from torch import Tensor
import os

In [27]:
## List the per-day mobility CSV files and count them.
# The %cd magic changes the kernel's working directory, so the relative
# "*.csv" pattern below is resolved inside the mobility folder. Later cells
# that pass bare file names to glob.glob() rely on this directory change.

%cd /content/gdrive/My Drive/Mobility_data/
mobility_csv_files = glob.glob("*.csv")
(len(mobility_csv_files)) # Number of CSV files found (one file per day, per the original note)

/content/gdrive/My Drive/Mobility_data


In [17]:
# Static per-county census features. Per the original note these static
# features are not fed to the model at the moment; the frame is used further
# down as the source of county FIPS codes.

census_csv_path = '/content/gdrive/My Drive/nodes.csv'
census_raw = pd.read_csv(census_csv_path)
# Discard the first column of the file (dropped by label, like the original).
census_df = census_raw.drop(columns=census_raw.columns[:1])
#census_df

In [28]:
# Expected mobility file name for every calendar day, in chronological order.
# Iterating this list (instead of a directory listing) keeps the days in
# sequence when the per-day mobility files are read.

import datetime

start_date = datetime.date(2020, 5, 12)
number_of_days = 235
mobility_file_suffix = "-social-distancing.csv.mobility.csv"

date_list = [
    (start_date + datetime.timedelta(days=offset)).isoformat() + mobility_file_suffix
    for offset in range(number_of_days)
]

#print (len(date_list))

In [19]:
# Death-count series Z, described by the original note as a days x counties
# table.

deaths_csv_path = '/content/gdrive/My Drive/DeepAR-pytorch-master/data/elect/COUNTY_DEATHS_DATA.csv'
deaths_data = pd.read_csv(deaths_csv_path)
#deaths_data

In [11]:
# Build the ordered list of county FIPS codes shared by the mobility data and
# the NYT COVID cases/deaths data. `county_list` becomes the column labels of
# the cases table and the row/column labels of the daily adjacency matrices.

MAX_CENSUS_COUNTIES = 300  # only the first 300 census rows are considered
MAX_MODEL_COUNTIES = 277   # number of counties kept for the model

census_df = census_df.iloc[0:MAX_CENSUS_COUNTIES, :]

# Counties removed from the NYT COVID data because of missing values; they
# must be removed here too so both sources cover the same counties.
# (The original list contained 2164 twice; each code is listed once here.)
drop_counties = [1029, 2013, 2016, 2060, 2068, 2100, 2164, 2185, 2188, 2232,
                 2280, 4023, 5081, 6035, 6049, 8033, 8105]
excluded_fips = set(drop_counties)

# One pass with O(1) membership tests instead of repeated list.remove() scans.
column_name = [
    fips
    for fips in census_df["origin_county_FIPS"].tolist()
    if fips not in excluded_fips
]

county_list = column_name[:MAX_MODEL_COUNTIES]
len(county_list)


277

In [12]:
# Case-count time series. Case counts and mobility are the node features used
# to predict death counts.

cases_csv_path = "/content/gdrive/My Drive/DeepAR-pytorch-master/data/elect/COUNTY_CASES_DATA.csv"
# Skip the first column of the file, then label the remaining columns with the
# county FIPS codes so a county's cases can be looked up by FIPS.
cases_data_df = pd.read_csv(cases_csv_path).iloc[:, 1:]
cases_data_df.columns = county_list
#cases_data_df


In [13]:
# One day of mobility data (2020-12-30), loaded for inspection. Mobility data
# describes cross-county and within-county movement patterns.

sample_mobility_csv_path = "/content/gdrive/My Drive/Mobility_data/2020-12-30-social-distancing.csv.mobility.csv"
mobility_data = pd.read_csv(sample_mobility_csv_path)
#mobility_data

In [None]:
# Build one graph-convolved covariate matrix per day and write it to Drive.
#
# For every entry of `date_list` whose mobility CSV exists:
#   1. build a county x county adjacency matrix from `agg_visits`;
#   2. take the Pearson correlation between the rows of that matrix;
#   3. degree-normalise the correlation matrix (D^-1 . A);
#   4. compute W . (D^-1 . A) . X, where W is a freshly initialised random
#      weight matrix and X holds [mean distance travelled, cases] per county;
#   5. save the (nodes x cov_feats) result as covariate_matrix_day_<k>.csv.
#
# Requires `date_list`, `county_list` and `cases_data_df` from earlier cells.

MOBILITY_DIR = "/content/gdrive/My Drive/Mobility_data"
OUTPUT_DIR = "/content/gdrive/My Drive/FINAL_135_covariate_mat_300_CASES_MOBILITY_ADJ_MAT_200_VISITS/"
ORIGIN_COL = "origin_county_FIPS"
DEST_COL = "destination_county_FIPS"
MEAN_DIST_COL = "mean_distance_traveled_from_home"
MIN_EDGE_VISITS = 200      # an edge keeps its visit count only above this
NO_EDGE_WEIGHT = 0.000001  # weight used for weak or absent connections
WEIGHT_INIT_GAIN = 0.02    # gain passed to Xavier-normal initialisation
SEED = 0                   # makes the random weight matrices reproducible

file_count = 1  # 1-based index of the next day that has a mobility file
nodes = len(county_list)  # number of counties (277 in the original run)
cov_feats = 2  # node features per county: mean distance travelled and cases


def _build_daily_graph(mob_data, county_list, cases_row):
    """Return (adjacency, node_features) for one day of mobility data.

    Only rows whose origin AND destination are both in `county_list` are used.

    adjacency: float DataFrame indexed and labelled by `county_list`. A pair
        keeps `agg_visits` when it exceeds MIN_EDGE_VISITS; every other pair,
        including pairs that never occur, gets NO_EDGE_WEIGHT. If a pair occurs
        more than once the last row wins.
    node_features: float DataFrame indexed by `county_list` with the columns
        MEAN_DIST_COL and "Cases". Counties that never occur as an origin in
        the filtered rows are NaN in both columns.
    """
    in_scope = mob_data[ORIGIN_COL].isin(county_list) & mob_data[DEST_COL].isin(county_list)
    edges = mob_data.loc[in_scope]

    # --- adjacency matrix -------------------------------------------------
    pairs = edges.drop_duplicates([ORIGIN_COL, DEST_COL], keep="last")
    visits = pairs["agg_visits"]
    # A NaN visit count fails the comparison and so falls back to NO_EDGE_WEIGHT.
    edge_weight = visits.where(visits > MIN_EDGE_VISITS, NO_EDGE_WEIGHT)
    adj_mat = (
        pairs.assign(edge_weight=edge_weight)
        .pivot(index=ORIGIN_COL, columns=DEST_COL, values="edge_weight")
        .reindex(index=county_list, columns=county_list)
        .fillna(NO_EDGE_WEIGHT)
        .astype(float)  # explicit dtype; do not rely on object-dtype downcasting
    )

    # --- node features ----------------------------------------------------
    last_per_origin = (
        edges.drop_duplicates(ORIGIN_COL, keep="last").set_index(ORIGIN_COL)[MEAN_DIST_COL]
    )
    seen_as_origin = pd.Index(county_list).isin(last_per_origin.index)
    cases = cases_row.reindex(county_list).to_numpy(dtype=float)
    node_feats = pd.DataFrame(
        {
            MEAN_DIST_COL: last_per_origin.reindex(county_list).to_numpy(dtype=float),
            "Cases": np.where(seen_as_origin, cases, np.nan),
        },
        index=county_list,
    )
    return adj_mat, node_feats


def _graph_convolve(adj_mat, node_feats):
    """Return W . (D^-1 . A) . X as a float32 tensor of shape (nodes, cov_feats).

    A is the Pearson correlation between the rows of `adj_mat`, D is the
    diagonal matrix of A's column sums and W is a new Xavier-normal matrix
    drawn on every call.
    """
    corr = np.corrcoef(adj_mat.to_numpy(dtype=float))
    adj_mat_coerr = torch.from_numpy(corr).float()
    inp = torch.from_numpy(node_feats.to_numpy(dtype=float)).float()
    inp = torch.reshape(inp, (nodes, cov_feats))  # also validates the shape

    # No gradients are needed here, so a plain tensor replaces the Parameter.
    weight = torch.empty(nodes, nodes)
    nn.init.xavier_normal_(weight, gain=WEIGHT_INIT_GAIN)

    # Degree normalisation. The column sums of a correlation matrix may be
    # zero or negative; non-finite results are reported by the caller.
    D_inverse = torch.diag(1 / torch.sum(adj_mat_coerr, 0))
    norm_A = torch.matmul(D_inverse, adj_mat_coerr)

    # Both operands are dense, so use torch.mm rather than torch.spmm.
    support = torch.mm(weight, norm_A)
    return torch.mm(support, inp)


# NOTE(review): W is re-drawn for every day, so consecutive days are projected
# with different random matrices -- confirm this is intended. The seed only
# makes the sequence of draws repeatable.
torch.manual_seed(SEED)
os.makedirs(OUTPUT_DIR, exist_ok=True)
missing_days = []

for f in date_list:  # loop through all days in sequence
    # Absolute path: do not depend on the kernel's current working directory.
    day_path = os.path.join(MOBILITY_DIR, f)
    if not os.path.isfile(day_path):
        missing_days.append(f)
        continue
    mob_data = pd.read_csv(day_path)

    # NOTE(review): the cases row is selected by the running count of files
    # found, not by calendar position. If a day's mobility file is missing,
    # later days read an earlier cases row -- confirm that the cases table has
    # exactly one row per available mobility file.
    cases_row = cases_data_df.iloc[file_count - 1]

    adj_mat, df_X = _build_daily_graph(mob_data, county_list, cases_row)
    output = _graph_convolve(adj_mat, df_X)

    if not bool(torch.isfinite(output).all()):
        # NaN features (county absent that day) or a zero/NaN degree propagate
        # through the matrix products; surface this instead of saving silently.
        print ("WARNING: non-finite values in covariate matrix for", f)

    # Save the graph-convolved covariate matrix for this day.
    covariate_matrix = 'covariate_matrix_day_' + str(file_count) + ".csv"
    file_count = file_count + 1

    save_path = OUTPUT_DIR + covariate_matrix
    print ("save path:", save_path)
    output_final = pd.DataFrame(output.numpy())
    output_final.to_csv(save_path, index=False)

print ("days written:", file_count - 1, "| days without a mobility file:", len(missing_days))

    

In [21]:
# Show the current working directory, then switch to the DeepAR project
# folder. The `!python3 <script>.py` cells below name their scripts with
# relative paths, so they must be run from this directory.

!pwd

%cd /content/gdrive/My Drive/DeepAR-pytorch-master/

/content/gdrive/MyDrive/Mobility_data
/content/gdrive/My Drive/DeepAR-pytorch-master


In [None]:
# Preprocess the data by running the project's preprocessing script in a
# subprocess. The script name is relative, so the working directory must be
# the DeepAR project folder.
# NOTE(review): presumably this script consumes the per-day covariate CSVs
# written earlier in this notebook -- confirm against the script.

!python3 preprocess_mobility_cases_dynamic_feats.py

In [25]:
# Train the model by running the project's training script in a subprocess
# (relative path: run from the DeepAR project folder). The script prints its
# hyper-parameters and the model summary to the cell output.

!python3 train.py

PARAMETERS: {'learning_rate': 0.001, 'batch_size': 4, 'lstm_layers': 5, 'num_epochs': 40, 'train_window': 12, 'test_window': 12, 'predict_start': 8, 'test_predict_start': 8, 'predict_steps': 4, 'num_class': 277, 'cov_dim': 554, 'lstm_hidden_dim': 40, 'embedding_dim': 20, 'sample_times': 12, 'lstm_dropout': 0.3, 'predict_batch': 10}
[18:25:42] DeepAR.Train: Loading the datasets...
feat shape: (4950, 12, 556)
label shape: (4950, 12)
[18:25:42] DeepAR.Data: train_len: 4950
[18:25:42] DeepAR.Data: building datasets from data/elect...
[18:25:42] DeepAR.Data: test_len: 165
[18:25:42] DeepAR.Data: building datasets from data/elect...
[18:25:42] DeepAR.Data: weights: tensor([3.8790e-06, 5.5760e-06, 6.7882e-06,  ..., 5.0911e-06, 5.0911e-06,
        5.0911e-06], dtype=torch.float64)
[18:25:42] DeepAR.Data: num samples: 4950
  cpuset_checked))
[18:25:42] DeepAR.Train: Loading complete.
[18:25:42] DeepAR.Train: Model: 
Net(
  (embedding): Embedding(277, 20)
  (lstm): LSTM(575, 40, num_layers=5, dr

In [26]:
# Evaluate the model by running the project's evaluation script in a
# subprocess (relative path: run from the DeepAR project folder).

!python3 evaluate.py

PARAMETERS: {'learning_rate': 0.001, 'batch_size': 4, 'lstm_layers': 5, 'num_epochs': 40, 'train_window': 12, 'test_window': 12, 'predict_start': 8, 'test_predict_start': 8, 'predict_steps': 4, 'num_class': 277, 'cov_dim': 554, 'lstm_hidden_dim': 40, 'embedding_dim': 20, 'sample_times': 12, 'lstm_dropout': 0.3, 'predict_batch': 10}
[19:21:17] DeepAR.Eval: Not using cuda...
[19:21:17] DeepAR.Eval: Loading the datasets...
[19:21:17] DeepAR.Data: test_len: 165
[19:21:17] DeepAR.Data: building datasets from data/elect...
  cpuset_checked))
[19:21:17] DeepAR.Eval: - done.
model:  Net(
  (embedding): Embedding(277, 20)
  (lstm): LSTM(575, 40, num_layers=5, dropout=0.3)
  (relu): ReLU()
  (distribution_mu): Linear(in_features=200, out_features=1, bias=True)
  (distribution_presigma): Linear(in_features=200, out_features=1, bias=True)
  (distribution_sigma): Softplus(beta=1, threshold=20)
)
[19:21:17] DeepAR.Eval: Starting evaluation
100% 17/17 [00:04<00:00,  4.01it/s]
[19:21:22] DeepAR.Eval: 