In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
import random
import sklearn
import tslearn

from tslearn.clustering import KernelKMeans
from tslearn.utils import to_time_series_dataset
import numpy as np
import networkx as nx

import sys
sys.path.append("../")

from pyspark.sql import SparkSession
from pyspark.sql.functions import format_string
from sklearn.manifold import TSNE
from analysis.spark import SparkEngine
from analysis.preprocess import extract_states

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()
# Wrap the session in the project's SparkEngine helper; later cells call
# join_population, add_idx and generate_state_metric_corr on this object.
spark_engine = SparkEngine(spark)

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [2]:
import altair as alt
from vega_datasets import data
# Turn off Altair's max-rows check so charts built from larger DataFrames are
# not rejected; the call returns the active data-transformer registry, which
# is what the cell output shows.
alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [3]:
# Load path for when the integrated CSV has already been uploaded to HDFS.
filepath = "/user/gw2145/integrated.csv"
spark = SparkSession.builder.getOrCreate()

# Read the CSV with a header row and let Spark infer the column types.
covid_data = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferschema', 'true')
    .load(filepath)
)
# Make the Spark DataFrame queryable from Spark SQL under the name "covid".
covid_data.createOrReplaceTempView("covid")

# Collect to pandas: keep the untouched frame and a copy with missing values
# replaced by 0, then list the states found in the zero-filled copy.
integrated_src_data = covid_data.toPandas()
integrated_data = integrated_src_data.fillna(value=0)
states = extract_states(integrated_data)
states

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

In [4]:
# Load path for when the integrated CSV has not been uploaded to HDFS:
# read it from the local working directory with pandas instead.
local_path = "./integrated_data.csv"
covid_pd = pd.read_csv(local_path)

# Hand the pandas frame to Spark; covid_df and covid_data name the same
# Spark DataFrame.
covid_data = covid_df = spark.createDataFrame(covid_pd)
covid_data.createOrReplaceTempView("covid")

# Collect back to pandas: keep the untouched frame and a copy with missing
# values replaced by 0, then list the states found in the zero-filled copy.
integrated_src_data = covid_data.toPandas()
integrated_data = integrated_src_data.fillna(value=0)
states = extract_states(integrated_data)
states

In [4]:
# Data Analysis



In [5]:
# Cluster state data by time series k-means


In [4]:
from analysis.time_series import get_state_embeddings, add_cluster_info, extract_state_series, cluster_states_by_ts
# This cell holds the functions for layout

In [5]:
# Build the clustering input from the zero-filled pandas frame, then cluster
# it. Both helpers are imported from analysis.time_series.
X = extract_state_series(integrated_data)
# NOTE(review): no random seed is set anywhere in the visible cells; if
# cluster_states_by_ts is stochastic, `labels` may differ between runs — confirm.
labels = cluster_states_by_ts(X)


In [6]:
# Combine the state names with their cluster labels into `embeddings`
# (exact structure is defined in analysis.time_series — not visible here).
embeddings = get_state_embeddings(states, labels)
# NOTE: this rebinds integrated_data to the helper's return value, so running
# the cell a second time feeds the already-augmented frame back into
# add_cluster_info. Reload the data first if a clean re-run is needed.
integrated_data = add_cluster_info(integrated_data, embeddings)

In [8]:
from analysis.visualization import plot_ts_clusters

# Visualise the clustering result. Expects integrated_data after
# add_cluster_info has been applied and the matching `embeddings` object.
plot_ts_clusters(integrated_data, embeddings)

In [4]:
from analysis.preparation import load_state_info
# Returns two objects, used in later cells as a state-to-FIPS table and as
# population data (meaning inferred from the names — confirm in
# analysis.preparation).
state2fips_df, populations = load_state_info()

In [5]:
from analysis.visualization import plot_map

# NOTE(review): the recorded output of this cell is
# "TypeError: plot_map() got an unexpected keyword argument 'date'", i.e. the
# plot_map that was loaded when this ran does not accept `date`. Check the
# current signature in analysis.visualization (or restart the kernel so the
# latest module version is imported) before relying on this call.
plot_map(integrated_data, state2fips_df, populations, date="2021-01-31", attr="cases_by_population")

TypeError: plot_map() got an unexpected keyword argument 'date'

In [5]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Load training data


In [6]:
import pyspark.sql.functions as F

# Combine the Spark COVID DataFrame with `populations` from load_state_info()
# (presumably adds population-based columns — verify in analysis.spark).
df = spark_engine.join_population(covid_data, populations)
# df1 is the input to generate_state_metric_corr in the next cell; what
# add_idx adds is defined in analysis.spark and not visible here.
df1 = spark_engine.add_idx(df)


In [7]:
# Mobility columns of interest. In the cells visible here this list is passed
# to the plotting helpers (plot_state_metric_corr, plot_corr_map); it is NOT
# passed to generate_state_metric_corr below.
mobility_metrics = [
    "retail_and_recreation_percent_change_from_baseline",
    "grocery_and_pharmacy_percent_change_from_baseline",
    "workplaces_percent_change_from_baseline",
    "residential_percent_change_from_baseline"
]
# Slow step: the recorded progress bars show one pass over all 51 states per
# metric, about 26 minutes in total for that run.
state_metric_correlation = spark_engine.generate_state_metric_corr(df1, states)
# Bare expression so the notebook renders the result as the cell output.
state_metric_correlation

processing retail_and_recreation_percent_change_from_baseline: 100%|██████████| 51/51 [08:44<00:00, 10.28s/it]
processing grocery_and_pharmacy_percent_change_from_baseline: 100%|██████████| 51/51 [07:57<00:00,  9.36s/it]
processing workplaces_percent_change_from_baseline: 100%|██████████| 51/51 [06:31<00:00,  7.67s/it]
processing residential_percent_change_from_baseline: 100%|██████████| 51/51 [03:15<00:00,  3.83s/it]


Unnamed: 0,state,retail_and_recreation_percent_change_from_baseline_cases,retail_and_recreation_percent_change_from_baseline_deaths,grocery_and_pharmacy_percent_change_from_baseline_cases,grocery_and_pharmacy_percent_change_from_baseline_deaths,workplaces_percent_change_from_baseline_cases,workplaces_percent_change_from_baseline_deaths,residential_percent_change_from_baseline_cases,residential_percent_change_from_baseline_deaths
0,Alabama,0.03827,0.0,-0.011976,-0.178374,0.020996,0.299997,-0.002186,0.0
1,Alaska,-0.107323,-1.611878,-0.127425,0.0,0.034287,0.0,0.020516,0.315854
2,Arizona,0.028557,0.0,0.0,0.0,0.013195,-1.054756,0.0,0.393135
3,Arkansas,0.0,0.0,-0.042979,-1.365108,0.01651,0.245702,0.0,0.0
4,California,0.074912,1.988404,0.011841,-1.583475,0.0,-2.343522,-0.014824,0.364444
5,Colorado,-0.008803,-7.076329,-0.052,-0.580948,0.09736,-11.713867,0.0,4.863479
6,Connecticut,0.043125,-7.791783,0.0,-2.086468,0.0,-6.73981,0.0,3.618886
7,Delaware,-0.050821,-2.126141,-0.046884,-1.020633,0.037138,-1.467508,0.0,0.943806
8,District of Columbia,-0.084432,-5.964025,-0.101007,-2.959307,0.152609,-7.679318,-0.002485,4.113601
9,Florida,0.157273,6.294405,0.094106,0.0,0.112027,1.554972,-0.052375,-1.026304


In [18]:
# Reload analysis.visualization so edits to the module on disk are picked up
# without restarting the kernel. The reload has to run before the from-import
# below so the imported names bind to the refreshed module.
import importlib
import analysis.visualization
importlib.reload(analysis.visualization)
from analysis.visualization import plot_state_metric_corr, plot_corr_map
import warnings
# Suppresses every Python warning from here on in this kernel session.
warnings.filterwarnings('ignore')
# NOTE(review): not referenced by any other cell visible here — possibly a leftover.
source_corr_matrix = []
        
# Plot the per-state correlation table for the listed mobility metrics.
plot_state_metric_corr(state_metric_correlation, mobility_metrics)
# state_metric_correlation_log

In [19]:
# Plot the correlation table using the state/FIPS table from load_state_info()
# (presumably as a US map, going by the helper's name — see analysis.visualization).
plot_corr_map(state_metric_correlation, state2fips_df, mobility_metrics)

In [17]:
# Register `df` (the result of spark_engine.join_population) as the Spark SQL
# temp view "analysis_data" so it can be queried with spark.sql.
df.createOrReplaceTempView("analysis_data")

Unnamed: 0,state,date,cases_by_population
0,Missouri,2021-03-08,0.008202
1,New Jersey,2021-01-04,0.005656
2,Rhode Island,2021-01-04,0.004482
3,Rhode Island,2020-12-07,0.003467
4,Rhode Island,2020-12-28,0.00333
5,North Dakota,2020-12-08,0.003098
6,Rhode Island,2020-12-14,0.003064
7,North Dakota,2020-11-14,0.003006
8,Rhode Island,2021-01-11,0.002819
9,Rhode Island,2020-11-30,0.002608


+--------------------+--------+------------------+
|               state|subgroup|        avg(cases)|
+--------------------+--------+------------------+
|             Vermont|       0| 48.73024523160763|
|              Hawaii|       0| 78.75204359673025|
|District of Columbia|       0|118.28610354223433|
|               Maine|       0| 132.7057220708447|
|             Wyoming|       0|151.97547683923705|
|              Alaska|       0|166.73297002724794|
|       New Hampshire|       0| 220.8910081743869|
|            Delaware|       0|252.21798365122615|
|        North Dakota|       0|277.56948228882834|
|             Montana|       0| 282.1525885558583|
|        South Dakota|       0| 316.3106267029973|
|        Rhode Island|       0|  365.858310626703|
|       West Virginia|       0|377.15803814713894|
|              Oregon|       0| 441.3188010899183|
|               Idaho|       0|485.59128065395095|
|          New Mexico|       0| 517.7302452316077|
|            Nebraska|       0|

In [None]:
from datetime import datetime, timedelta




DataFrame[state: string, _c0: int, date: timestamp, country_region: string, place_id: string, retail_and_recreation_percent_change_from_baseline: double, grocery_and_pharmacy_percent_change_from_baseline: double, parks_percent_change_from_baseline: double, transit_stations_percent_change_from_baseline: double, workplaces_percent_change_from_baseline: double, residential_percent_change_from_baseline: double, cases: int, deaths: int, acc_cases: int, acc_deaths: int, tmp: int, population: bigint, cases_by_population: double, deaths_by_population: double]

In [20]:
# One-time environment setup (run manually if the packages are missing):
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !pip install nltk
# !pip install nl4dv

# Reload st_nli so edits to the package on disk are picked up without a kernel
# restart; the reload must precede the from-import so NLInterface is current.
import importlib
import st_nli
importlib.reload(st_nli)
from st_nli import NLInterface


# Example queries. Previously these were assigned to query_sent and then
# immediately overwritten by input(), so they never had any effect; the last
# one now serves as the default when no query is supplied.
example_queries = [
    "Show the case rate in New England",
    "Show the case rate on 01/31/2021 in New England",
]
default_query = example_queries[-1]

# Prompt for a query. A frontend that cannot service stdin requests (for
# example a headless "run all") raises instead of prompting; treat that the
# same as an empty answer so the cell still runs.
try:
    typed_query = input()
except (EOFError, NotImplementedError):
    typed_query = ""

# Non-blank input is passed through unchanged; blank input uses the default.
query_sent = typed_query if typed_query.strip() else default_query
interface = NLInterface()
interface.run(query_sent, df)


Show the case rate on 01/31/2021 in New England
input:  Show the case rate on 01/31/2021 in New England
spec: [<st_nli.libs.filters.TemporalFilter object at 0x7f1502c916d0>, <st_nli.libs.filters.SpatialFilter object at 0x7f1503dc8110>]


Show the cases and retail mobility of the Pacific and Mountain areas


Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 12.7 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


['Show', 'cases', 'retail', 'mobility', 'Pacific', 'Mountain', 'areas', 'Show cases', 'cases retail', 'retail mobility', 'mobility Pacific', 'Pacific Mountain', 'Mountain areas', 'Show cases retail', 'cases retail mobility', 'retail mobility Pacific', 'mobility Pacific Mountain', 'Pacific Mountain areas']
['Show', 'cases', 'retail', 'mobility', 'Pacific', 'Mountain', 'areas', 'Show cases', 'cases retail', 'retail mobility', 'mobility Pacific', 'Pacific Mountain', 'Mountain areas', 'Show cases retail', 'cases retail mobility', 'retail mobility Pacific', 'mobility Pacific Mountain', 'Pacific Mountain areas']
Show
cases
retail
mobility
Pacific
Mountain
areas
Show cases
cases retail
retail mobility
mobility Pacific
Pacific Mountain
Mountain areas
Show cases retail
cases retail mobility
retail mobility Pacific
mobility Pacific Mountain
Pacific Mountain areas
