# Simulation notebook - Colombia data

This notebook generates a people dataframe with 500k+ people from the manzanas data, and runs the HMM-based simualtion with fixed transitions over them.

In [1]:
## General Imports
import sys
import os
import geopandas as gpd
import pandas as pd
import numpy as np
import scipy
from IPython.display import display
from timeit import default_timer as timer
import swifter
from tqdm.notebook import tqdm
import dask.dataframe as dd
from dask.multiprocessing import get
from shapely.ops import nearest_points
from shapely import wkt

%load_ext autoreload
%autoreload 2


# Project Imports
sys.path.append("../..")
import model 
from model.sim import location, state, contacts
from colombia_utils import *


## Constants
shp_path = '../../../shp' # Change to your local shape file directory

## Print info on manzana data

In [2]:
mnz_data = gpd.read_file(os.path.join(shp_path,"Censo_personas_manzanas_2018.shp"))
mnz_data = mnz_data.to_crs({"init": "EPSG:3857"})
print("Average number of people in mnz:", mnz_data['SEXO_TOTAL'].mean())
print("Number of manzanas:", len(mnz_data))
print("Total number of people:", mnz_data['SEXO_TOTAL'].sum())

Average number of people in mnz: 89.66901408450704
Number of manzanas: 5964
Total number of people: 534786


# Function to calculate daily contacts
Unused

In [3]:
def calculate_daily_contacts(people_proj, sim, load_from_csv=True):
    
    if load_from_csv:
        return pd.read_csv("improved_daily_contacts.csv")
    
    start = timer()

    # calculate daily contacts for day 1 as they will be repeated for the other days
    date = sim["dates"]["date"][0]
    daily_contacts = people_proj.apply(calculate_contacts, axis=1, args=[people_proj, date])

    original_daily_contacts = pd.concat(list(daily_contacts), sort=False)
    original_daily_contacts.reset_index(drop=True, inplace=True)

    # calculate the table for all the dates
    improved_daily_contacts = original_daily_contacts.copy()
    for date in sim["dates"]["date"]:
        if len(original_daily_contacts[original_daily_contacts['date'] == date]) < 1:
            new_df = original_daily_contacts.copy()
            new_df['date'] = date
            improved_daily_contacts = improved_daily_contacts.append(new_df)

    improved_daily_contacts.reset_index(drop=True, inplace=True)

    end = timer()
    print("Compute time:",end-start)
    
    return improved_daily_contacts

# improved_daily_contacts = calculate_daily_contacts(people_proj, sim)
# sim["contacts"] = improved_daily_contacts
# sim["N_c"] = contacts.calculate_Nc(sim)
# print("Average daily contacts: {}".format(sim["N_c"]))

In [4]:
# print(improved_daily_contacts["date"].max())

In [5]:
# save=False
# if save:   
#     improved_daily_contacts.to_csv("improved_daily_contacts.csv")

In [6]:
# gdf = gpd.GeoDataFrame(
#     df,
#     geometry=gpd.points_from_xy(
#         df["longitude"],
#         df["latitude"],
#     ),
#     crs={"init":"EPSG:4326"},
# )

# # 10 records
# filtered_df

# filtered_gdf = gpd.GeoDataFrame(
#     filtered_df, 
#     geometry=gpd.points_from_xy(
#         filtered_df["longitude"],
#         filtered_df["latitude"],
#     ),
#     crs={"init":"EPSG:4326"},
# )

# # EPSG:3857 converts it to meters, correct?

# gdf_proj = gdf.to_crs({"init": "EPSG:3857"})
# filtered_gdf_proj = filtered_gdf.to_crs({"init": "EPSG:3857"})

# # so 100 miles would be 160934 meters

# x = filtered_gdf_proj.buffer(100).unary_union

# neighbours = gdf_proj["geometry"].intersection(x)

# Create people dataframe

In [7]:
# Build people_full dataframe containing census data, infected data + estimated people from neighborhood data

def create_people_df_from_mnz_data(mnz_data, create_contacts=True):

#     people_df = pd.DataFrame()
    contacts_df_list = []
    i = 0
    first_pass = True
    for idx, row in tqdm(mnz_data.iterrows()):
        n_ppl_in_mnz = row["SEXO_TOTAL"]
        if not n_ppl_in_mnz:
            continue
        
#         print("row.geometry.centroid",row.geometry.centroid)
#         dummy_row.geometry = row.geometry
        person_row = pd.DataFrame({"patient": [0], "position": [row.geometry.centroid]})
        
#         dummy_row["calculated_centroid"] = row.geometry.centroid
#         print(dummy_row.columns)
        ppl_in_mnz = pd.concat([person_row]*n_ppl_in_mnz)
        ids = np.arange(i, i+n_ppl_in_mnz)
        ppl_in_mnz.patient = ids
        i+=n_ppl_in_mnz
        if first_pass:
            people_df = ppl_in_mnz
            first_pass = False
        else:
            people_df = people_df.append(ppl_in_mnz, ignore_index=True)
            
        
        if create_contacts:
            for date in range(90):
                for id_ in ids:
                    data = {
                            'date': np.repeat(date, n_ppl_in_mnz),
                            'patient1': np.repeat(id_, n_ppl_in_mnz),
                            'patient2': ids
                        }
                    patient_contact = pd.DataFrame(data=data, columns = ["date", "patient1", "patient2"])
                    csv_name = "contacts/contacts_patient%d_date%d.csv" % (id_, date)
                    if not os.path.exists(csv_name):
                        patient_contact.to_csv(csv_name)
#                     contacts_df_list.append(pd.DataFrame(data=data, columns = ["date", "patient1", "patient2"]))    
    print("Final shape of people_df:",people_df.shape)
    if create_contacts:
        contacts_df = pd.concat(contacts_df_list, sort=False)
        contacts_df.reset_index(drop=True, inplace=True)
        return people_df, contacts_df
    else:
        return people_df, None
    
    


large_people_df, large_contacts_df = create_people_df_from_mnz_data(mnz_data, 
                                                                    create_contacts=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Final shape of people_df: (534786, 2)


In [8]:
print("Number of people before adding infected:",len(large_people_df))

Number of people before adding infected: 534786


In [9]:
load_people_df=False
if load_people_df:
    large_people_df = pd.read_csv("people_df_from_manzanas.csv")
    
print("Loaded people df with shape:", large_people_df.shape)
print("Columns in people df:", large_people_df.columns)

Loaded people df with shape: (534786, 2)
Columns in people df: Index(['patient', 'position'], dtype='object')


In [10]:
save_people_df=False
if save_people_df:
    large_people_df.to_csv("people_df_from_manzanas.csv")

In [11]:
# print("Shape of contacts df (should be people_df*90*avg_n_contacts)", contacts_df.shape)
# contacts_df.iloc[3]

# Extrapolate contacts (DEPRECATED)

In [12]:

# def nearest(row, geom_union, df1, df2, geom1_col='geometry', geom2_col='geometry', src_column=None):
#     """Find the nearest point and return the corresponding value from specified column."""
#     # Find the geometry that is closest
#     nearest = df2[geom2_col] == nearest_points(row[geom1_col], geom_union)[1]
#     # Get the corresponding value from df2 (matching is based on the geometry)
#     value = df2[nearest][src_column].get_values()[0]
#     return value

# def nearest(point, people_df):
#     people_locs = people_df.geometry.centroid.unary_union
    
# #     print("Trying to find nearest point to %s" % point)
# #     print("Candidates are:", people_locs[:5], "and more (%d total)" % len(people_locs)) # "with ids:", people_df["patient"][:5]
    
#     # find the nearest person for which we have data
# #     print("people_df.geometry.centroid", people_df.geometry.centroid)
# #     print("nearest_points(point, people_locs)", nearest_points(point, people_locs))
#     nearest = people_df.geometry.centroid == nearest_points(point, people_locs)[1]
# #     print(nearest)
    
#     return people_df[nearest]

# def calculate_contacts(polygon, points, date):
#     _people = points[points.geometry.centroid.within(polygon.geometry.buffer(100))]
#     data = {
#         'date': np.repeat(date, len(_people)),
#         'patient1': np.repeat(polygon['patient'], len(_people)),
#         'patient2': list(_people['patient'])
#     }
    
#     exposure_df = pd.DataFrame(data=data, columns = ["date", "patient1", "patient2"])
#     return exposure_df

# def contacts_from_closest_person(row, people, contacts):
#     print("\nGetting contacts for person", row["patient"])
#     nearest_person = nearest(row.calculated_centroid, people)
#     contact_df = contacts[contacts['patient1']==nearest_person["patient"].iloc[0]]
#     contact_df.patient1 = row["patient"]
# #     print(contact_df)
#     print("Found nearest person. ID: %s. Centroid: %s. Barrio: %s. Contacts: %d" % (nearest_person["patient"], nearest_person.geometry.centroid,
#                                                                                    nearest_person.BARRIO, len(contact_df))) 
    
#     return contact_df

# def contacts_from_neigh(row, people, contacts):
    
#     contacts_for_this_id = neigh2contacts[row[""]]
#     return contacts

# def dummy_contacts(row, people, contacts):
#     contact_df = contacts.iloc[:30]
#     contact_df.patient1 = row["patient"]
#     data = {
#         'date': contact_df["date"].to_numpy(),
#         'patient1': contact_df["patient1"].to_numpy(),
#         'patient2':  contact_df["patient2"].to_numpy()
#     }
#     contact_df = pd.DataFrame(data=data, columns = ["date", "patient1", "patient2"])
#     return contact_df
    
# def extrapolate_contacts(contacts, people, full_people):
#     print("Extrapolating contacts for full dataframe of length %d from small dataframe of length %d" % (len(full_people), len(people)))
# #     extrapolated_contacts = full_people.apply(contacts_from_closest_person, axis=1, args=[people, contacts])
#     ddata = dd.from_pandas(full_people, npartitions=100)
#     extrapolated_contacts = ddata.map_partitions(lambda df: df.apply(calculate_contacts, axis=1, args=[people, date])).compute(get=get) 
#     return extrapolated_contacts
    

In [13]:
# print(improved_daily_contacts.columns)
# extrapolated_contacts = extrapolate_contacts(improved_daily_contacts, people_proj, large_people_df)

In [14]:
# print("shape extrapolated contacts:", extrapolated_contacts.shape)

In [139]:
# save=True
# if save:
#     extrapolated_contacts.to_csv("extrapolated_contacts.csv")
    
# load=False
# if load:
#     extrapolated_contacts = pd.read_csv("extrapolated_contacts.csv")

## Get mnz2id dictionaries

In [197]:
mnz2ids = get_mnz2ids(mnz_data)
id2mnz = get_id2mnz(mnz2ids)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5964.0), HTML(value='')))




In [198]:
# print(len(mnz2ids.keys()))
# s=0
# for k,v in mnz2ids.items():
#     s += len(v)
    
# print(s)


# c=0
# for idx, row in tqdm(mnz_data.iterrows()):
#     n_ppl_in_mnz = row["SEXO_TOTAL"]
#     print()
#     print(n_ppl_in_mnz)
#     print(len(mnz2ids[idx]))
# #     i += n_ppl_in_mnz
#     c+=1
#     if c> 10: break

# Get list of infected

Some of the infected have an address that is in none of the manzanas

In [199]:

def include_infected_to_people_df(people_df, 
                                 mnz2ids, 
                                 id2mnz,
                                 path_to_infected_file=os.path.join(shp_path,"POSITIVOS_COVID_19.shp"), 
                                 T=90):
    
    infected = gpd.read_file(path_to_infected_file)
    inf_active = infected[infected["estado_ate"]==1]
    

    print("Loaded %d infected rows" % len(inf_active))
    print("people_df.shape before infected concat", people_df.shape)
    max_id = people_df.patient.max()
    assert(max_id == max(id2mnz.keys()))
    infected_ids = np.arange(len(inf_active))+max_id+1

    infected_of_interest = []
    
    # Find mnz corresponding to each infected
    c=0
    for i, mnz in tqdm(mnz_data.iterrows()):
#         print("\nsearching if any infected in mnz %d" % i)
        infected_ids_in_mnz = infected_ids[inf_active.geometry.centroid.within(mnz.geometry.buffer(100))]
#         print("found %d infected in mnz" % len(infected_ids_in_mnz), i, "with ids: ", infected_ids_in_mnz)
        if len(infected_ids_in_mnz)==0:
            continue
            
#         print("adding infected ids to mnz. len mnz2ids[i] before:", len(mnz2ids[i]))
        mnz2ids[i] = np.concatenate([mnz2ids[i], infected_ids_in_mnz])
#         print("after:",len(mnz2ids[i]), mnz2ids[i].shape) 
#         print("updating id2mnz")
        for inf_id in infected_ids_in_mnz:
            id2mnz[inf_id] = id2mnz.get(inf_id, [])
            id2mnz[inf_id].append(i)
#             print("id2mnz[inf_id]:",id2mnz[inf_id])
        
#         print( infected_ids_in_mnz[0])
#         if 534833 in infected_ids_in_mnz:
#             print("534833 found", infected_ids_in_mnz[0])
#         print("infected_ids_in_mnz",infected_ids_in_mnz)
        infected_of_interest.append(infected_ids_in_mnz)
#         c+=1
#         if c>10:
#             break

    infected_of_interest = np.unique(np.concatenate(infected_of_interest))
    print("infected_of_interest.shape",infected_of_interest.shape)
    people_df = pd.concat([people_df,pd.DataFrame({"patient":infected_of_interest, 
                                                   "position":inf_active.iloc[infected_of_interest-max_id-1].geometry})])
    print("Concatenated infected to people_df. People_df is now of shape:", people_df.shape)
    print(people_df.columns)
    
#     latitudes = inf_active.geometry.centroid.y
#     longitudes = inf_active.geometry.centroid.x
#     infected_ids = np.arange(start_id, len(inf_active)+start_id)
#     infected_locations = pd.DataFrame({"patient": np.tile(infected_ids, T), 
#                                        "date": np.tile(np.arange(T), len(inf_active)), 
#                                        "latitude": np.repeat(latitudes, T), 
#                                        "longitude": np.repeat(longitudes, T)})
    
#     sim["location"].append(infected_locations)
#     sim["patients"] = sim["patients"].append(pd.DataFrame({"patient": infected_ids}))
    
    return people_df, mnz2ids, id2mnz, infected_of_interest


# def include_infected_to_contacts_df(sim, 
#                                     mnz_data, 
#                                     path_to_infected_file=os.path.join(shp_path,"POSITIVOS_COVID_19.shp"), 
#                                     T=90, 
#                                     start_id=0):
#     contacts_df_list = []
#     c=0
#     for i, mnz in tqdm(mnz_data.iterrows()):
# #         print("searching if any infected in mnz %d" % i)
#         infected_ids_in_mnz = infected_ids[inf_active.geometry.centroid.within(mnz.geometry.buffer(100))]
# #         print("found %d infected in mnz" % len(infected_ids_in_mnz))
# #         print(infected_ids_in_mnz)
#         if len(infected_ids_in_mnz)==0:
#             continue
            
#         ids_in_mnz = mnz2ids[i]
#         n_ppl_in_mnz = len(ids_in_mnz)
# #         print("People in this mnz:", n_ppl_in_mnz)
        
#         for date in range(90):
#             for id_inf in infected_ids_in_mnz:
#                 data = {
#                     'date': np.repeat(date, n_ppl_in_mnz),
#                     'patient1': np.repeat(id_inf, n_ppl_in_mnz),
#                     'patient2': ids_in_mnz
#                 }

#                 contacts_df_list.append(pd.DataFrame(data=data, columns = ["date", "patient1", "patient2"]))
                
#         c+=1
#         if c>100:break
#     print("Number of contact dataframes (each correspond to the contacts of one infected on one date): %d" % len(contacts_df_list))
#     infected_contacts = pd.concat(contacts_df_list)
#     sim["contacts"].append(infected_contacts)
    
#     return sim, infected_ids


In [200]:

large_people_df_inf, mnz2ids_inf, id2mnz_inf, infected_ids = include_infected_to_people_df(large_people_df, mnz2ids, id2mnz)

Loaded 456 infected rows
people_df.shape before infected concat (534786, 2)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


infected_of_interest.shape (414,)
Concatenated infected to people_df. People_df is now of shape: (535200, 2)
Index(['patient', 'position'], dtype='object')


In [201]:
print(sum(infected_ids==534833))

0


In [175]:
print(max(id2mnz.keys()))

535241


# Run full simulation

In [148]:
print("number of people including infected:",len(large_people_df_inf))
print("min and max ids:", large_people_df_inf["patient"].max(), large_people_df_inf["patient"].min())
print("duplicates:", large_people_df[large_people_df["patient"].duplicated()])

number of people including infected: 535200
min and max ids: 535241 0
duplicates: Empty DataFrame
Columns: [patient, position]
Index: []


In [155]:
T=90
# if load_people_df:
# #     print(type(large_people_df["position"].iloc[0]))
#     large_people_df["position"] = large_people_df["position"].apply(wkt.loads)

sim_full = instantiate_sim(mnz_data, large_people_df_inf, T=T, reindex=False)
sim_full["mnz2ids"] = mnz2ids_inf
sim_full["id2mnz"] = id2mnz_inf
print("Simulation instantiated. Current people in simulation: %d" % len(sim_full["patients"]))
print("Current simulation keys:", sim_full.keys())
# print("Adding infected")
# # sim_full, infected_ids = include_infected(sim_full, T=T, start_id=len(large_people_df))
# print("Included %d infected" % len(infected_ids))
print("Final number of people to simulate: %d" % len(sim_full["patients"]))
sim_full["N_c"] = contacts.calculate_Nc_for_mnz_approximation(sim_full)
print("Average daily contacts for full df: {}".format(sim_full["N_c"]))
# np.savetxt('sim_states_full.csv', sim_results["states"], delimiter=',')

Instantiating Simulation
535200 Empty DataFrame
Columns: [patient, position]
Index: []
Simulation instantiated. Current people in simulation: 535200
Current simulation keys: dict_keys(['map', 'location', 'patients', 'dates', 'mnz2ids', 'id2mnz'])
Final number of people to simulate: 535200
Average daily contacts for full df: 193.7652615844544


In [202]:
# Run sim
def run_sim(sim, N_infected=15, infected=None):
    sim["states"], sim["tests"] = state.simulate_states(sim, N_infected=N_infected, infected=infected)
    sim["hospital"] = state.get_first_occurrence(sim["states"], 6)
    sim["deaths"] = state.get_first_occurrence(sim["states"], 8)
    return sim

sim_results = run_sim(sim_full, N_infected=len(infected_ids), infected = infected_ids)

Calculating states...
t = 0; 414 infectious; 1438 exposed; 534786 susceptible; 0 dead
t = 9; 114 infectious; 300 exposed; 534771 susceptible; 0 dead
t = 18; 31 infectious; 1438 exposed; 534760 susceptible; 0 dead
t = 27; 23 infectious; 111 exposed; 534734 susceptible; 0 dead
t = 36; 16 infectious; 111 exposed; 534731 susceptible; 0 dead
t = 45; 6 infectious; 263 exposed; 534728 susceptible; 0 dead
t = 54; 7 infectious; 263 exposed; 534726 susceptible; 0 dead
t = 63; 5 infectious; 263 exposed; 534721 susceptible; 0 dead
t = 72; 4 infectious; 263 exposed; 534715 susceptible; 0 dead
t = 81; 5 infectious; 263 exposed; 534712 susceptible; 0 dead
Calculated states in 543.66 seconds.


In [203]:
np.savetxt('sim_states_full2.csv', sim_results["states"], delimiter=',')

# Alternative: compute full contacts with heavy parallelism

In [None]:
def calculate_contacts(polygon, points, date):
    
    _people = points[points.geometry.centroid.within(polygon.geometry.buffer(100))]
#     print(len(_people))
    data = {
        'date': np.repeat(date, len(_people)),
        'patient1': np.repeat(polygon['patient'], len(_people)),
        'patient2': list(_people['patient'])
    }
    
    exposure_df = pd.DataFrame(data=data, columns = ["date", "patient1", "patient2"])
    return exposure_df

def calculate_daily_contacts(people_proj, sim, load_from_csv=False):
    
    if load_from_csv:
        return pd.read_csv("improved_daily_contacts.csv")
    
    start = timer()

    # calculate daily contacts for day 1 as they will be repeated for the other days
    date = sim["dates"]["date"][0]
    daily_contacts = people_proj.swifter.apply(calculate_contacts, axis=1, args=[people_proj, date])

    original_daily_contacts = pd.concat(list(daily_contacts), sort=False)
    original_daily_contacts.reset_index(drop=True, inplace=True)

    # calculate the table for all the dates
    improved_daily_contacts = original_daily_contacts.copy()
    for date in sim["dates"]["date"]:
        if len(original_daily_contacts[original_daily_contacts['date'] == date]) < 1:
            new_df = original_daily_contacts.copy()
            new_df['date'] = date
            improved_daily_contacts = improved_daily_contacts.append(new_df)

    improved_daily_contacts.reset_index(drop=True, inplace=True)

    end = timer()
    print("Compute time:",end-start)
    
    return improved_daily_contacts

full_daily_contacts = calculate_daily_contacts(large_people_df, sim_full, load_from_csv=False)
sim_full["contacts"] = full_daily_contacts
sim_full["N_c"] = contacts.calculate_Nc(sim_full)
print("Average daily contacts: {}".format(sim_full["N_c"]))

In [None]:
print(full_daily_contacts)

In [None]:
sim_results = run_sim(sim_full, N_infected=0, infected = infected_ids)