# SImulation notebook - Colombia data

In [14]:
import sys
import os
sys.path.append("../..")
import model 
import geopandas as gpd
import pandas as pd
import numpy as np
import scipy
from model.sim import location, state, contacts
from IPython.display import display
from timeit import default_timer as timer

In [15]:
crs = {'epsg:3857'}
shp_path = '../../../shp' # Change to your local shape file directory

In [16]:
file1 = gpd.read_file(os.path.join(shp_path,"Barrios_info_DANE.shp")) 
file1['SEXO_TOT_1'].sum()

551559

In [17]:
print(file1['SEXO_TOT_1'])

0      3911
1      2079
2      2222
3      5425
4      2666
       ... 
227     184
228     418
229    3000
230       0
231       0
Name: SEXO_TOT_1, Length: 232, dtype: int64


In [18]:
people = gpd.read_file(os.path.join(shp_path,"Censo_personas_manzanas_2018.shp"))

In [19]:
len(people)

5964

In [20]:
people['SEXO_TOTAL'].sum()

534786

In [21]:
inhabitants = people['SEXO_TOTAL'].sum()

In [22]:
len(people['SEXO_TOTAL'])

5964

In [23]:
# utils functions  

def calculate_contacts(polygon, points, date):
    _people = points[points.geometry.centroid.within(polygon.geometry.buffer(100))]
    data = {
        'date': np.repeat(date, len(_people)),
        'patient1': np.repeat(polygon['patient'], len(_people)),
        'patient2': list(_people['patient'])
    }
    
    exposure_df = pd.DataFrame(data=data, columns = ["date","patient1","patient2"])
    return exposure_df

# def extrapolation(df):
#     transformation_table = []
#     for i, value in df.sort_values("patient").iterrows():
#         original_index = value['patient']
#         new_index = 0
#         step = value['SEXO_TOTAL']
#         transformation_table[original_index] = [np.arange(new_index,new_index+step-1)]
#         new_index = new_index + step
#     return transformation_table

In [24]:
def instantiate_sim(neigh_info_path, people_df, T=100, reindex = True): 
    sim = {"map":gpd.read_file(neigh_info_path)}
    pop_per_neigh = sim["map"]['SEXO_TOT_1']
    N0 = len(people_df)
    
    if reindex:
        people_df['patient'] = np.arange(len(people_df))
    #N0 = people.SEXO_TOTAL.sum()
    #people = people.reindex(people.index.repeat(people.SEXO_TOTAL))
                
    lat = people_df.centroid.y
    lng = people_df.centroid.x

    people_proj['calculated_centroid'] = people_proj.centroid

    sim["location"] = pd.DataFrame(
    {
        "patient": np.repeat(np.arange(N0), T),
        "date": np.tile(np.arange(T), N0),
        "latitude": np.repeat(lat, T),
        "longitude": np.repeat(lng, T),
        }
    )

#     people_points = people_df.centroid

    distance_cutoff = 0.012
    sim["patients"] = pd.DataFrame({"patient": np.unique(sim["location"]["patient"])})
    sim["dates"] = pd.DataFrame({"date": np.unique(sim["location"]["date"])})
    
    return sim

neigh_info_path = shp_path + "/Barrios_info_DANE.shp"  
census_path = shp_path + "/Censo_personas_manzanas_2018.shp"
people = gpd.read_file(census_path, index=None) 
people_proj = people.to_crs({"init": "EPSG:3857"})
sim = instantiate_sim(neigh_info_path, people_proj)

In [28]:
def calculate_daily_contacts(people_proj, sim, load_from_csv=True):
    
    if load_from_csv:
        return pd.read_csv("improved_daily_contacts.csv")
    
    start = timer()

    # calculate daily contacts for day 1 as they will be repeated for the other days
    date = sim["dates"]["date"][0]
    daily_contacts = people_proj.apply(calculate_contacts, axis=1, args=[people_proj, date])

    original_daily_contacts = pd.concat(list(daily_contacts), sort=False)
    original_daily_contacts.reset_index(drop=True, inplace=True)

    # calculate the table for all the dates
    improved_daily_contacts = original_daily_contacts.copy()
    for date in sim["dates"]["date"]:
        if len(original_daily_contacts[original_daily_contacts['date'] == date]) < 1:
            new_df = original_daily_contacts.copy()
            new_df['date'] = date
            improved_daily_contacts = improved_daily_contacts.append(new_df)

    improved_daily_contacts.reset_index(drop=True, inplace=True)

    end = timer()
    print("Compute time:",end-start)
    
    return improved_daily_contacts

improved_daily_contacts = calculate_daily_contacts(people_proj, sim)
sim["contacts"] = improved_daily_contacts
sim["N_c"] = contacts.calculate_Nc(sim)
print("Average daily contacts: {}".format(sim["N_c"]))

Average daily contacts: 330.0335345405768


In [29]:
save=False
if save:   
    improved_daily_contacts.to_csv("improved_daily_contacts.csv")

In [88]:
# Run sim
def run_sim(sim, N_infected=15, infected=None):
    sim["states"], sim["tests"] = state.simulate_states(sim, N_infected=N_infected, infected=infected)
    sim["hospital"] = state.get_first_occurrence(sim["states"], 6)
    sim["deaths"] = state.get_first_occurrence(sim["states"], 8)
    return sim
    
# run_sim(sim)

In [124]:
# gdf = gpd.GeoDataFrame(
#     df,
#     geometry=gpd.points_from_xy(
#         df["longitude"],
#         df["latitude"],
#     ),
#     crs={"init":"EPSG:4326"},
# )

# # 10 records
# filtered_df

# filtered_gdf = gpd.GeoDataFrame(
#     filtered_df, 
#     geometry=gpd.points_from_xy(
#         filtered_df["longitude"],
#         filtered_df["latitude"],
#     ),
#     crs={"init":"EPSG:4326"},
# )

# # EPSG:3857 converts it to meters, correct?

# gdf_proj = gdf.to_crs({"init": "EPSG:3857"})
# filtered_gdf_proj = filtered_gdf.to_crs({"init": "EPSG:3857"})

# # so 100 miles would be 160934 meters

# x = filtered_gdf_proj.buffer(100).unary_union

# neighbours = gdf_proj["geometry"].intersection(x)

# Simulation with 500k people from neighborhoods

In [30]:
# Build people_full dataframe containing census data, infected data + estimated people from neighborhood data


def create_people_df_from_neighborhood_data(neighborhood_data, dummy_row):
    people_df = dummy_row
    people_df.patient = [0]
    i=1
    for idx, row in neighborhood_data.iterrows():
        if not row["SEXO_TOT_1"]:
            continue
#         print("row.geometry.centroid",row.geometry.centroid)
#         dummy_row.geometry = row.geometry
        dummy_row["calculated_centroid"] = row.geometry.centroid
#         print(dummy_row.columns)
        ppl_from_nb = [dummy_row]*row["SEXO_TOT_1"]
        ppl_from_nb = pd.concat(ppl_from_nb)
        ppl_from_nb.patient = np.arange(i, i+len(ppl_from_nb))
        i+=len(ppl_from_nb)
        people_df = people_df.append(ppl_from_nb, ignore_index=True)        
        
    print("Final shape of people_df:",people_df.shape)
    return people_df


sim["map"]["calculated_centroid"] = sim["map"].centroid
large_people_df = create_people_df_from_neighborhood_data(sim["map"], people_proj[["geometry","patient"]].iloc[:1])

Final shape of people_df: (551560, 3)


In [32]:
print(large_people_df.columns, large_people_df.shape)

Index(['geometry', 'patient', 'calculated_centroid'], dtype='object') (551560, 3)


# Extrapolate contacts

In [108]:
from shapely.ops import nearest_points

# def nearest(row, geom_union, df1, df2, geom1_col='geometry', geom2_col='geometry', src_column=None):
#     """Find the nearest point and return the corresponding value from specified column."""
#     # Find the geometry that is closest
#     nearest = df2[geom2_col] == nearest_points(row[geom1_col], geom_union)[1]
#     # Get the corresponding value from df2 (matching is based on the geometry)
#     value = df2[nearest][src_column].get_values()[0]
#     return value

def nearest(point, people_df):
    people_locs = people_df.geometry.centroid.unary_union
    
#     print("Trying to find nearest point to %s" % point)
#     print("Candidates are:", people_locs[:5], "and more (%d total)" % len(people_locs)) # "with ids:", people_df["patient"][:5]
    
    # find the nearest person for which we have data
#     print("people_df.geometry.centroid", people_df.geometry.centroid)
#     print("nearest_points(point, people_locs)", nearest_points(point, people_locs))
    nearest = people_df.geometry.centroid == nearest_points(point, people_locs)[1]
#     print(nearest)
    
    return people_df[nearest]

def calculate_contacts(polygon, points, date):
    _people = points[points.geometry.centroid.within(polygon.geometry.buffer(100))]
    data = {
        'date': np.repeat(date, len(_people)),
        'patient1': np.repeat(polygon['patient'], len(_people)),
        'patient2': list(_people['patient'])
    }
    
    exposure_df = pd.DataFrame(data=data, columns = ["date", "patient1", "patient2"])
    return exposure_df

def contacts_from_closest_person(row, people, contacts, this_id):
    print("\nGetting contacts for person", row["patient"])
    nearest_person = nearest(row.calculated_centroid, people)
    contact_df = contacts[contacts['patient1']==nearest_person["patient"].iloc[0]]
    contact_df.patient1 = row["patient"]
#     print(contact_df)
#     print("Found nearest person. ID: %s. Centroid: %s. Barrio: %s. Contacts: %d" % (nearest_person["patient"], nearest_person.geometry.centroid,
#                                                                                    nearest_person.BARRIO, len(contact_df))) 
    
    return contact_df

def contacts_from_neigh(row, people, contacts):
    
    contacts_for_this_id = neigh2contacts[row[""]]
    return contacts

cnt=0
def dummy_contacts(row, people, contacts):
    contact_df = contacts.iloc[:30]
    contact_df.patient1 = row["patient"]
    return contact_df
    
def extrapolate_contacts(contacts, people, full_people):
    print("Extrapolating contacts for full dataframe of length %d from small dataframe of length %d" % (len(full_people), len(people)))
    extrapolated_contacts = full_people.apply(dummy_contacts, axis=1, args=[people, contacts])
    return extrapolated_contacts
    

In [109]:
print(improved_daily_contacts.columns)

Index(['Unnamed: 0', 'date', 'patient1', 'patient2'], dtype='object')


In [110]:
extrapolated_contacts = extrapolate_contacts(improved_daily_contacts, people_proj, large_people_df)

Extrapolating contacts for full dataframe of length 551560 from small dataframe of length 5964


In [112]:
save=True
if save:
    extrapolated_contacts.to_csv("extrapolated_contacts.csv")

# Get list of infected

In [113]:

def include_infected(sim, path_to_infected_file=os.path.join(shp_path,"POSITIVOS_COVID_19.shp"), T=90, start_id=0):
    posi = gpd.read_file(path_to_infected_file)

    inf_active = posi[posi["estado_ate"]==1]
    latitudes = inf_active.geometry.centroid.y
    longitudes = inf_active.geometry.centroid.x
    infected_ids = np.arange(start_id, len(inf_active)+start_id)
    infected_locations = pd.DataFrame({"patient": np.tile(infected_ids, T), 
                                       "date": np.tile(np.arange(T), len(inf_active)), 
                                       "latitude": np.repeat(latitudes, T), 
                                       "longitude": np.repeat(longitudes, T)})
#     print(infected_locations)
    sim["location"].append(infected_locations)
    return sim, infected_ids
    
include_infected(sim)

({'map':       Id              BARRIO           area        COMUNA  \
  0      0        Las Gaviotas  103710.737439  Centro Norte   
  1      1        Los Balkanes   58750.503679  Centro Norte   
  2      2     Las Gaviotas II   80384.347238  Centro Norte   
  3      3    Villa Del Carmen  133642.372386  Centro Norte   
  4      4       Villa Cecilia   73397.793457  Centro Norte   
  ..   ...                 ...            ...           ...   
  227  232  Urb. DoÃ±a Manuela   86397.705219           Sur   
  228  233      Matha  Gisella   17258.161905           Sur   
  229  234         Urb. El Rio  113650.306502      Oriental   
  230  199    Rigoberta Menchu   11870.886466      Oriental   
  231  212      Linda Maria II   12053.239505           Sur   
  
                                     GlobalID  CreationDa           Creator  \
  0    {DA37628A-E472-4563-A5D9-BAFBCE2FF0C5}  2020-05-11  alcaldia_soledad   
  1    {DA43EBEF-6C64-42D0-9C20-DD7BFC415CFD}  2020-05-11  alcaldia_soledad 

# Run full simulation

In [119]:
T=90
sim_full = instantiate_sim(neigh_info_path, large_people_df, T=T, reindex=True)
sim_full, infected_ids = include_infected(sim, T=T, start_id=len(large_people_df))
sim_full["contacts"] = extrapolated_contacts
sim_full["N_c"] = pd.DataFrame(extrapolated_contacts).calculate_Nc(sim_full)
print("Average daily contacts for full df: {}".format(sim_full["N_c"]))
sim_results = run_sim(sim_full, N_infected=0, infected = infected_ids)
with open("sim_results_full.json", "w+") as f:
    json.dump(sim_results, f)

AttributeError: 'DataFrame' object has no attribute 'calculate_Nc'