In [None]:
import pandas as pd
import numpy as np
import calendar
import time
import glob
import os

#Disable the 'SettingWithCopyWarning' when adding new columns to an existing dataframe
pd.options.mode.chained_assignment = None  # default='warn'

#Data cleaning
from functools import reduce
import unicodedata
import re

#Ignore warnings about the DataFrameGroupBy() method
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#For the graphics
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import matplotlib.patches as mpatches

#Feature selection
from sklearn.inspection import permutation_importance

#Modeling
from sklearn.model_selection import cross_validate
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import ParameterGrid   #to implement manual GridSearch

#Dynamic time warping: grouping
from dtaidistance import dtw
from dtaidistance import dtw_ndim

#Data

# Dengue or Zika: Run this first

In [25]:
#Disease whose SINAN weekly files are loaded by the next cell ('dengue' or 'zika').
disease = 'zika'
#Folder that holds the 'SINAN_<disease>_weekly_<year>.csv' files; replace the placeholder.
path = "your_path/"

In [33]:
#Read all data
#-> IMPORTANT: I'm using a naming convention of choice here for the data coming from the "data_cleaning" notebooks.
#-> If you're doing it differently, remember to change it below.

#Per-disease settings: first year, (exclusive) end year and number of leading
#-> descriptive columns that come before the weekly case counts in each yearly file.
if disease == 'dengue':
  init_year = 2014
  end_year = 2023
  columns_filter = 9
elif disease == 'zika':
  init_year = 2016
  end_year = 2024
  columns_filter = 3
else:
  #Fail loudly instead of silently reusing settings left in the kernel by a previous run.
  raise ValueError("This cell only handles 'dengue' or 'zika', got: " + repr(disease))

yearly_frames = []   #one long-format frame per year, concatenated once at the end

for year in range(init_year, end_year, 1):   #end_year itself is not read
  #Open a table for each year that we have weekly data.
  df_year = pd.read_csv(str(path + '/SINAN_' + disease + '_weekly_' + str(year) + '.csv'))
  df_year = df_year.drop(columns='Unnamed: 0')

  #Each city must appear in a single row, otherwise cases and city rows would not line up.
  if df_year['cod_city'].duplicated().any():
    raise ValueError('Duplicated cod_city values in the ' + str(year) + ' file.')

  #The first `columns_filter` columns describe the city; every remaining column is one week of cases.
  city_info = df_year.iloc[:, :columns_filter]
  weekly_cases = df_year.iloc[:, columns_filter:]
  n_weeks = weekly_cases.shape[1]   #taken from the file instead of assuming 52

  #Expand the dataframe: one row per (city, week), keeping the original column dtypes.
  row_positions = np.repeat(np.arange(len(city_info)), n_weeks)
  df_year_mod = city_info.iloc[row_positions].reset_index(drop=True)

  #Adds the year, week and cases columns to the new dataframe.
  #-> ravel() is row-major, so the values come out city by city, week by week,
  #-> matching the row expansion above.
  df_year_mod['Year'] = year
  df_year_mod['week'] = np.tile(np.arange(1, n_weeks + 1), len(city_info))
  df_year_mod['cases'] = weekly_cases.to_numpy().ravel()

  yearly_frames.append(df_year_mod)

#Concatenate all years at once (cheaper than growing the dataframe inside the loop)
df = pd.concat(yearly_frames)

#only one language in the dataframe, please.
df = df.rename(columns={'UF': 'state', 'Município de notificação': 'city'})

#Arrange by state/city (or city code) and year, then keep the columns in the
#-> same order used in filtered data. Region names are left out for dengue.
if disease == 'dengue':
  df = df.sort_values(by=['state', 'city', 'Year'], ascending=[True, True, True])
  df = df.reset_index(drop=True)
  df = df[['state', 'cod_rgi', 'cod_rgint', 'cod_city', 'city', 'x_coord', 'y_coord', 'week', 'Year', 'cases']]
else:   #zika
  df = df.sort_values(by=['cod_city', 'Year'], ascending=[True, True])
  df = df.reset_index(drop=True)
  df = df[['cod_city','x_coord', 'y_coord', 'week', 'Year', 'cases']]

# COVID-19 or Influenza: Run this first

In [51]:
disease = 'covid'
#path = 'your_path/' + disease + '/'
#NOTE(review): machine-specific absolute path; point it to your own data folder before running.
path = '/media/llober/Time/Doutorado/Trabalhos/Projeto-dengue/test_verif/'

#packs all yearly files in a single data frame.
#-> glob returns the files in no particular order, so they are sorted by name:
#-> the row order of `df` is used later to build the lags and must not change between runs.
#-> NOTE(review): this assumes the file names sort chronologically - confirm.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
  raise FileNotFoundError("No .csv files found in: " + path)

df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
df = df.drop(columns='Unnamed: 0')

# Filtering the data by years

In [52]:
#save the cities and the amount of years each one contains in its time series.
#-> These two are not used by this cell; kept in case other cells rely on them.
removed_cities = 0
removed_cities_names = []

#Missing values are treated as zeros.
df_filled = df.fillna(0)

#Number of distinct years per city, with the cities in order of first appearance.
#-> A single groupby replaces one full scan of the dataframe per city.
years_per_city = df_filled.groupby('cod_city', sort=False)['Year'].nunique()
test_cities = years_per_city.index.tolist()
test_years = years_per_city.tolist()
city_counter = len(test_cities)

cities_lenght_data = {'city': test_cities, 'n_years': test_years}
cities_lenght = pd.DataFrame(data=cities_lenght_data)
cities_lenght = cities_lenght.sort_values(by='n_years', ascending=False)

#Keep only the cities whose series cover the maximum number of years.
cities = cities_lenght.loc[cities_lenght['n_years'] == cities_lenght['n_years'].max()]

#Stack the rows of the selected cities, city by city, following the order of `cities`.
#-> Rows of each city keep the order they have in `df_filled`.
kept_rows = df_filled.loc[df_filled['cod_city'].isin(cities['city'])]
rows_by_city = {code: rows for code, rows in kept_rows.groupby('cod_city', sort=False)}
selected_frames = [rows_by_city[code] for code in cities['city']]
df_max_years = pd.concat(selected_frames) if selected_frames else pd.DataFrame()

# Adding lags

In [53]:
y_disease = df_max_years['cases']

#Lags have to be computed inside each city: shifting the whole stacked column
#-> would copy the last weeks of one city into the first lags of the next one.
#-> Rows of each city are assumed to already be in chronological order (they are not re-sorted here).
cases_per_city = df_max_years.groupby('cod_city', sort=False)['cases']

lag_columns = {}
for lag in range(1, 6):
  lag_columns['cases_lag_' + str(lag)] = cases_per_city.shift(lag)

df_lags = pd.DataFrame({
    #saving city info for ease-of-use
    'cod_city': df_max_years['cod_city'],
    'x_coord': df_max_years['x_coord'],
    'y_coord': df_max_years['y_coord'],
    'year': df_max_years['Year'],
    'week': df_max_years['week'],
    #disease cases
    #'cases_percent': y_dengue_percent,
    'cases': y_disease,
    #cases_lag_1 ... cases_lag_5
    **lag_columns
})

#Fill rows with missing data with zeros (the first weeks of each city have no previous values)
df_lags = df_lags.fillna(0)

In [54]:
#path = 'C:/Users/your_path/' + disease + '/'
#df_lags.to_csv(path + disease + '_weekly_first_5_lags.csv')
#Writes the lag table to the current working directory under a fixed test name.
#-> NOTE(review): the GDP section reads 'covid_weekly_first_5_lags.csv' instead; confirm which file name is intended.
df_lags.to_csv('covid_lags_test.csv')

# DTW of the time series of cases

In [None]:
#Output folder for this section, built from the current value of `disease`; replace the placeholder.
path = 'your_path/' + disease + '/'  #path to save the resulting files

In [None]:
#Reshape the cases to wide format: one row per city, one 'obs_<i>' column per observation.
grouped_cases = df_lags[['cod_city', 'cases']].groupby(['cod_city'])['cases']
series_per_city = grouped_cases.apply(lambda city_cases: city_cases.values.flatten())

cases_by_cities = pd.DataFrame(
    list(series_per_city.values),
    index=series_per_city.index,
).add_prefix('obs_')

In [None]:
#DTW distances between the case series of all cities; rows and columns are labelled with the city codes.
cases_matrix = cases_by_cities.to_numpy(dtype='float')
dtw_dist = dtw.distance_matrix_fast(cases_matrix)

dtw_labels = cases_by_cities.index
df_dtw = pd.DataFrame(dtw_dist, index=dtw_labels, columns=dtw_labels)

In [None]:
#For every city, list its distance to every other city and keep the 10 smallest.
#-> We'll only save the closest 10 cities, to avoid a 5237^2 sized (immense) dataframe
closest_frames = []   #one small frame per city, concatenated once at the end
all_codes = df_dtw.index.to_numpy()

for city1 in df_dtw.index:
  is_other = all_codes != city1   #leave the city itself out
  #Distances are read from the column of `city1`, one whole column at a time.
  df_swp = pd.DataFrame({
      'city_1': [city1] * int(is_other.sum()),
      'city_2': all_codes[is_other],
      'distance': df_dtw[city1].to_numpy()[is_other],
  })
  closest_frames.append(df_swp.sort_values(by='distance', ascending=True).head(10))

df_distances_dtw = pd.concat(closest_frames) if closest_frames else pd.DataFrame()

df_distances_dtw.to_csv(path + disease + '_distances_cases_dtw.csv')

# DTW of PIB/GDP

In [None]:
path = 'your_path/'
df_pib = pd.read_excel(path + 'pib_filtered_years.xls')


def _to_plain_ascii(text_column):
  """Decompose accented characters (NFKD) and drop everything that is not ASCII."""
  decomposed = text_column.str.normalize('NFKD')
  ascii_bytes = decomposed.str.encode('ascii', errors='ignore')
  return ascii_bytes.str.decode('utf-8')


#City names in upper case
df_pib['city'] = df_pib['city'].str.upper()
#-> Removing accents from every object (text) column
cols = df_pib.select_dtypes(include=[object]).columns
df_pib[cols] = df_pib[cols].apply(_to_plain_ascii)

In [None]:
#Using another dataframe to recover correct city codes for the GDP dataframe.
#-> Here I use the COVID19 one given its ample coverage of cities.
#-> NOTE(review): this file needs 'state', 'city' and 'cod_city' columns; the lags table
#-> built in this notebook has no 'state'/'city' columns - confirm the file being read.
path = 'your_path/COVID19/'
df_dengue = pd.read_csv(path + 'covid_weekly_first_5_lags.csv')

#Now we're going to compare this new database with the one containing only cities with continuous timeseries from 2020 to 2023.
#-> Cities are matched by name inside each state, as identical city names exist in different states.
matched_frames = []   #GDP rows of every matched city, concatenated once at the end

for state in list(df_pib.state.unique()):
  state_data1 = df_pib.loc[df_pib['state'] == state]
  state_data2 = df_dengue.loc[df_dengue['state'] == state]

  #Code of each city of this state, taken from the first row where the city shows up.
  first_codes = state_data2.drop_duplicates(subset='city').set_index('city')['cod_city']

  for city in list(state_data1.city.unique()):
    if pd.isna(city) or city not in first_codes.index:
      continue   #city without a known code
    swp_connections = state_data1.loc[state_data1['city'] == city]
    #assign() returns a new frame, so nothing is written into a slice of df_pib
    matched_frames.append(swp_connections.assign(cod_city=first_codes.loc[city]))

if matched_frames:
  df_pib_filtered = pd.concat(matched_frames)
else:
  #No city matched: keep an empty frame that still has the expected columns.
  df_pib_filtered = df_pib.iloc[:0].assign(cod_city=[])

#Now, filter by the cities we have for the disease being studied
df_pib_filtered = df_pib_filtered.loc[df_pib_filtered['cod_city'].isin(df_max_years.cod_city.unique())]

Applies DTW:

In [None]:
#Reshape the GDP values to wide format: one row per city, one 'year_<i>' column per value.
grouped_pib = df_pib_filtered[['cod_city', 'pib_percapta_x1000rs']].groupby(['cod_city'])['pib_percapta_x1000rs']
pib_per_city = grouped_pib.apply(lambda city_pib: city_pib.values.flatten())

pib_by_city = pd.DataFrame(
    list(pib_per_city.values),
    index=pib_per_city.index,
).add_prefix('year_')

In [None]:
#DTW distances between the GDP series of all cities; rows and columns are labelled with the city codes.
pib_matrix = pib_by_city.values
dtw_dist = dtw.distance_matrix_fast(pib_matrix)

pib_labels = pib_by_city.index
df_dtw = pd.DataFrame(dtw_dist, index=pib_labels, columns=pib_labels)

In [None]:
path = 'your_path/'

#For every city, list its distance to every other city and keep the 10 smallest.
#-> We'll only save the closest 10 cities, to avoid a 1772^2 sized (immense) dataframe
closest_frames = []   #one small frame per city, concatenated once at the end
all_codes = df_dtw.index.to_numpy()

for city1 in df_dtw.index:
  is_other = all_codes != city1   #leave the city itself out
  #Distances are read from the column of `city1`, one whole column at a time.
  df_swp = pd.DataFrame({
      'city_1': [city1] * int(is_other.sum()),
      'city_2': all_codes[is_other],
      'distance': df_dtw[city1].to_numpy()[is_other],
  })
  closest_frames.append(df_swp.sort_values(by='distance', ascending=True).head(10))

df_pib_dtw = pd.concat(closest_frames) if closest_frames else pd.DataFrame()

df_pib_dtw.to_csv(path + disease + '_pib_dtw.csv')

In [None]:
#Cities to be analyzed for all methods: the distinct city codes left in the GDP table.
kept_codes = df_pib_filtered['cod_city'].unique()
cities_to_use = pd.DataFrame({'cities_to_keep': kept_codes})

output_file = path + disease + '_cities_to_keep.csv'
cities_to_use.to_csv(output_file)

# Generate the distances dataframe

Depending on the number of cities in the dataframe being investigated, this block can take a long time to finish, as it runs sequentially (it is not parallelized).

In [None]:
#Folder where the geographic distances file is written; replace the placeholder.
path = 'your_path/'

In [None]:
data_to_use = df_lags

#Coordinates of each city, read once from its first row (cities in order of first appearance).
#-> This replaces four full scans of the dataframe for every pair of cities.
city_coords = data_to_use.drop_duplicates(subset='cod_city')[['cod_city', 'x_coord', 'y_coord']]
all_codes = city_coords['cod_city'].to_numpy()
all_x = city_coords['x_coord'].to_numpy(dtype=float)
all_y = city_coords['y_coord'].to_numpy(dtype=float)
n_cities = len(all_codes)

closest_frames = []   #one small frame per city, concatenated once at the end

for n, (city, x1, y1) in enumerate(zip(all_codes, all_x, all_y), start=1):
  print('City', n, 'of', n_cities)

  is_other = all_codes != city   #leave the city itself out
  #Straight-line (Euclidean) distance between the coordinates of the two cities
  dist = np.sqrt( np.power( (x1-all_x[is_other]) ,2) + np.power( (y1-all_y[is_other]) ,2) )

  df_swp = pd.DataFrame({
      'city_1': [city] * len(dist),
      'city_2': all_codes[is_other],
      'distance': dist,
  })

  #We'll only save the closest 10 cities
  closest_frames.append(df_swp.sort_values(by='distance', ascending=True).head(10))

df_distances = pd.concat(closest_frames) if closest_frames else pd.DataFrame()

df_distances.to_csv(path + disease + '_geo_distances_cities.csv')