In [17]:
import numpy as np
from pathlib import Path
import pandas as pd
import kmapper
from sklearn import datasets
from datetime import date, timedelta, datetime
from sklearn.preprocessing import normalize
import networkx as nx
import IPython
import tkinter

In [18]:
data_folder = Path("csse_covid_19_data/csse_covid_19_daily_reports/")

pandemic_start = date(2020, 1, 22)

start_date = date(2020, 3, 22)
end_date = date(2020, 12, 19)

delete_location = True
delete_unassigned = True
normalize_data = True
sort_by_location = False

delta = end_date - start_date

In [19]:
for day in range(delta.days + 1):

    date = start_date + timedelta(days=day)
    day_file = date.strftime("%m-%d-%Y") + ".csv"

    file_to_open = data_folder / day_file
    raw_data = pd.read_csv(file_to_open, header=0, delimiter=',', encoding=None, usecols=(
        "Admin2", "Province_State", "Country_Region", "Lat", "Long_", "Confirmed"))

    days_since_start = date - pandemic_start

    raw_data['Days since start'] = days_since_start.days  # number of days
    raw_data['Date'] = date.strftime("%m-%d-%Y")
    raw_data = raw_data[raw_data['Lat'].notna()] #drop rows where Lat is empty
    raw_data = raw_data[raw_data['Long_'].notna()] #drop rows where Long_ is empty
    raw_data = raw_data.fillna(value = "N/A")
    
    if day == 0:
        final_array = raw_data.copy(deep=False)  # make shallow copy
    else:
        final_array = final_array.append(raw_data)

In [20]:
final_array.drop(
    final_array[final_array['Country_Region'] != "US"].index, inplace=True)

if delete_unassigned:
    final_array.drop(
        final_array[final_array['Admin2'] == "Unassigned"].index, inplace=True)
    final_array.drop(
        final_array[final_array.Admin2.str.contains("Out of")].index, inplace=True)

if sort_by_location:
    final_array = final_array.sort_values(['Province_State', 'Admin2'])

if delete_location:
    #matrix: "Admin2", "Province_State", "Country_Region", "Lat", "Long_", "Confirmed", "days since start", "date"
    temp = final_array.to_numpy()
    indices = np.empty(len(temp), dtype=object)
    for i in range(len(temp)):
        indices[i] = str(temp[i, 0]) + ", " + str(temp[i, 1]) + ", " + str(temp[i, 7])

    print(indices)
    final_array = final_array.drop(
        columns=["Admin2", "Province_State", "Country_Region", "Date"])
    final_array = final_array.to_numpy()
    
    if normalize_data: #normalize the columns (axis = 0)            
        data = normalize(final_array, axis=0, norm='l2')
    else:
        data = final_array.copy()
else:
    data = final_array.copy()

['Chesterfield, Virginia, 03-22-2020' 'Alexandria, Virginia, 03-22-2020'
 'Stafford, Virginia, 03-22-2020' ... 'Cedar, Missouri, 12-19-2020'
 'Crawford, Missouri, 12-19-2020' 'Daviess, Missouri, 12-19-2020']


In [21]:
km = kmapper.KeplerMapper()
lens = km.project(data)
graph = km.map(X=data, lens=lens, cover=kmapper.Cover(n_cubes=10, perc_overlap=0.08))

In [22]:
url = 'make_circles_keplermapper_output.html'


km.visualize(graph,
                 path_html=url,
                 title="COVID-19 Cases Dataset", custom_tooltips = indices)

# iframe = '<iframe src=' + url + ' width=1200 height=800></iframe>'
# IPython.display.HTML(iframe)
IPython.display.IFrame(url, width=1200, height=800)

In [23]:
# nx.draw(nx_graph)

In [24]:
print(data)

[[ 1.57225311e-03 -1.34782258e-03  1.19024238e-06  4.60715067e-04]
 [ 1.63284300e-03 -1.33905033e-03  7.43901490e-07  4.60715067e-04]
 [ 1.61616643e-03 -1.34560540e-03  7.43901490e-07  4.60715067e-04]
 ...
 [ 1.58698473e-03 -1.63042103e-03  7.18608840e-05  2.54929004e-03]
 [ 1.59761637e-03 -1.58613729e-03  2.19897281e-04  2.54929004e-03]
 [ 1.68111439e-03 -1.63269226e-03  6.32316267e-05  2.54929004e-03]]
