# Preparation

## Imports and libraries

In [1]:
# ensuring requests library is installed
# (the leading "!" runs the line as a shell command, so this cell only works under IPython/Jupyter;
#  the release is pinned in the requirement specifier)
!pip install requests==v2.21.0 --user

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
# ensuring the folium library is installed
# (the leading "!" runs the line as a shell command, so this cell only works under IPython/Jupyter;
#  the release is pinned in the requirement specifier)
!pip install folium==0.8.2 --user

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import pandas as pd
import numpy as np

import requests
import folium

## Global Constants

In [4]:
# prefix of every google drive direct download link; a file id is appended to it
google_drive_baseurl = "https://drive.google.com/uc?export=download&id="

# folder, relative to the working directory, holding the local copies of the datasets
path_local = "./csv/"

## Functions

In [5]:
# function to read a dataset from local or remote
def read_csv_master(src, verbose=False):
  """Load a dataset as a DataFrame, trying each known location in turn.

  The locations are tried in this order and the first one that can be read wins:
    1. ``src['filename']``   - running on Colab with the file uploaded to the virtual machine
    2. ``src['src_local']``  - running on Jupyter
    3. ``src['src_remote']`` - running on Colab without the file uploaded

  Parameters
  ----------
  src : dict
      Mapping holding the three keys listed above. A missing key is treated
      like an unreadable location.
  verbose : bool, default False
      When True, print a message for every location that could not be read.

  Returns
  -------
  pandas.DataFrame or None
      The dataset, or None when none of the locations could be read.
  """
  for key in ('filename', 'src_local', 'src_remote'):
    try:
      return pd.read_csv( src[key] )
    # Exception instead of a bare except, so KeyboardInterrupt and SystemExit
    # still propagate while any read failure falls through to the next location
    except Exception:
      if verbose:
        print('fail to read from ' + key)

  return None

In [6]:
# function to verify value counts of a dataframe
def check_value_counts(df, cols=None, values_limit=50, print_size=15):
  """Print the value counts of the selected columns of a dataframe.

  Parameters
  ----------
  df : pandas.DataFrame
      Dataframe to be inspected.
  cols : list, optional
      Labels of the columns to inspect. All columns are used when it is
      None or empty.
  values_limit : int, default 50
      Columns with more distinct values than this get a truncated listing,
      preceded by the number of distinct values and of NaN values.
  print_size : int, default 15
      Number of entries printed for a truncated listing.

  Returns
  -------
  None
      The report is written to standard output.
  """
  # use all columns of the dataframe if not specified
  if cols is None or len(cols) == 0:
    cols = df.columns.tolist()

  # checking counts for each column
  for col in cols:

    # value counts for current column (NaN included, least frequent first)
    counts = df[col].value_counts(dropna=False, ascending=True)

    # number of distinct values
    length = counts.size

    # count of null values
    nulls = df[col].isnull().sum()

    # header to be printed; str() keeps non-string labels (e.g. integers) from
    # breaking the concatenation
    header = 'Column [' + str(col) + ']'

    # long listings are truncated and summarised instead
    if length > values_limit:

      # limiting counts output
      counts = counts.head(print_size)

      # printing extra information
      header += '\n' + str(length) + ' unique values'
      header += '\n' + str(nulls) + ' NaN values'

    # blank line between the header and the counts
    header += '\n'

    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
    print(header)
    print(counts)
    print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
    print('')

## Creating Datasets Dictionaries

In [7]:
# list with all filenames
filename_list = [
    'estupro-por-capital.csv',
    'estupros-por-uf.csv',
    'homicidios-por-capital.csv',
    'homicidios-por-uf.csv',
    # the '.csv' extension was missing on this entry only; the dictionary key
    # derived below ('homi-feminicidios-por-uf') is not affected by adding it
    'homi-feminicidios-por-uf.csv',
    'lesao-corporal-por-uf.csv'
]

# list with IDs from google drive links (same order as filename_list)
fileid_list = [
    '1iMD4rjCHTcNG9XrrMy6HoCqN0-f9e8YD',
    '17vEfD2XIn7LFZV5NGgEdSqJrxXGwDdqZ',
    '1unci2LT7_94SOXBHtoWsYZZs72N-NfDg',
    '1eLwfW8ONzRQ6KKM_m_mjbvUP-bsCuAFk',
    '1U12Q3pL4R3nHPbB_1mDU-rex1uPUER_c',
    '1igMkgjuYIDTzslsiztYIWnIvuyeVpb6b'
]

# zip() silently stops at the shorter list, so a mismatch has to be caught explicitly
assert len(filename_list) == len(fileid_list), 'every filename needs a google drive id'

# creating the files source dictionary (filename -> google drive id)
files_dict = dict(zip(filename_list, fileid_list))

# initializing an empty datasets dictionary
datasets = {}

# populating the datasets dictionary with inner dictionaries
for filename, driveid in files_dict.items():

  # name for key of inner dictionary: the filename without its extension
  innerkey = filename.replace('.csv', '')

  # creating and appending the inner dictionary
  datasets[innerkey] = {
      'filename' : filename,
      'driveid' : driveid,
      'path' : path_local,
      'src_remote' : google_drive_baseurl + driveid,
      'src_local' : path_local + filename
  }

## Preparing Datasets

### Homicides per Federative Unit

In [8]:
# loading the homicides per federative unit dataset from its sources dictionary
homicides = read_csv_master(src=datasets['homicidios-por-uf'])

In [9]:
# cells holding the '...' placeholder are turned into NaN
homicides.replace('...', np.nan, inplace=True)

In [10]:
# rows whose federative unit is 'Brasil'
mask_brazil = homicides['unidade_federativa'].eq('Brasil')

# rows without a value in at least one of the two year columns
mask_null = homicides[['2016', '2017']].isnull().any(axis=1)

# a row is selected for removal when either mask matches
mask_to_drop = mask_brazil | mask_null

In [11]:
# index labels of every row flagged by the mask
to_drop = homicides.loc[mask_to_drop].index.tolist()

# removing the flagged rows from the dataframe itself
homicides.drop(labels=to_drop, axis='index', inplace=True)

In [12]:
# converting the year columns to float (decimal commas are swapped for dots first)
for col in ('2016', '2017'):
  homicides[col] = homicides[col].str.replace(',', '.').map(float)

In [13]:
# column label
col_fu = 'unidade_federativa'

# removing parenthesised numbers such as '(1)' from the labels;
# the pattern is a raw string because '\(' and '\d' are not valid Python string
# escapes and trigger an invalid-escape warning in a regular literal
homicides[col_fu] = homicides[col_fu].str.replace(r'\(\d+\)', '', regex=True)

In [14]:
# trimming leading and trailing white space from the text columns
for col in ('unidade_federativa', 'grandeza', 'medida'):
  homicides[col] = homicides[col].str.strip()

In [15]:
# checking the dataset
# homicides.sample(10)

### Homicides and Femicides per FU

In [16]:
# loading the homicides and femicides per federative unit dataset from its sources dictionary
femicides = read_csv_master(src=datasets['homi-feminicidios-por-uf'])

In [17]:
# cells holding the '...' placeholder are turned into NaN
femicides.replace('...', np.nan, inplace=True)

In [18]:
# rows whose federative unit is 'Brasil'
mask_brazil = femicides['un_federativa'].eq('Brasil')

# rows without a value in at least one of the two year columns
mask_null = femicides[['2016', '2017']].isnull().any(axis=1)

# a row is selected for removal when either mask matches
mask_to_drop = mask_brazil | mask_null

# index labels of every row flagged by the mask
to_drop = femicides.loc[mask_to_drop].index.tolist()

# removing the flagged rows from the dataframe itself
femicides.drop(labels=to_drop, axis='index', inplace=True)

In [19]:
# converting the year columns to float (decimal commas are swapped for dots first)
for col in ('2016', '2017'):
  femicides[col] = femicides[col].str.replace(',', '.').map(float)

In [20]:
# list of columns to remove marks
cols_clean_marks = ['un_federativa', 'grandeza', 'medida']

# cleaning marks for each column
for col in cols_clean_marks:

  # removing parenthesised numbers such as '(1)'; the pattern is a raw string
  # because '\(' and '\d' are not valid Python string escapes and trigger an
  # invalid-escape warning in a regular literal
  femicides[col] = femicides[col].str.replace(r'\(\d+\)', '', regex=True)

  # stripping white spaces from the values
  femicides[col] = femicides[col].str.strip()

In [21]:
# checking the result
# femicides.sample(10)

**Note**

For convenience, let's rename the federative unit column labels.

In [22]:
# both dataframes get the same 'uf' label for their federative unit column
femicides.rename(columns={'un_federativa': 'uf'}, inplace=True)
homicides.rename(columns={'unidade_federativa': 'uf'}, inplace=True)

# Visualizing Data

**Note**

Possible plots:

* Total of homicides per federative unit in absolute numbers for each year (from *homicides* dataset) [2 plots]
* Total of homicides against women and femicides per federative unit in absolute numbers for each year (from *femicides* dataset, doing a `groupby`) [2 plots]
* Total of femicides, specifically, per federative unit in absolute numbers for each year (from *femicides* dataset) [2 plots]
* Proportion between the total of homicides and the total of femicides and women homicides in absolute numbers for each year [2 plots]
* The same plots above in terms of variation from 2016 to 2017 [8 plots]
* The set of plots in terms of rates [16 plots]

## GeoJSON Data

We will use the [IBGE](https://servicodados.ibge.gov.br/api/docs/malhas?versao=2) API to plot geographic meshes.

### Data Requests

In [23]:
# a generic set of headers sent with every data request
headers = {
    'Content-Type': 'application/json;charset=UTF-8',
    'User-Agent': 'google-colab',
    'Accept': 'application/json, text/plain, */*',
    # 'br' (Brotli) is not advertised: the pinned requests release cannot decode it,
    # so a Brotli-compressed answer would break the .json() parsing of the response
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
    'Connection': 'keep-alive',
}

In [24]:
# url to request meshes of all Federative Units in Brazil in geojson format
meshes_url = 'https://servicodados.ibge.gov.br/api/v2/malhas/?resolucao=2&formato=application/vnd.geo+json'

# requesting geojson data; the timeout keeps a stalled connection from hanging the notebook
meshes_response = requests.get(meshes_url, headers=headers, timeout=60)

# an HTTP error is reported here instead of surfacing as a JSON decoding failure
meshes_response.raise_for_status()

# assigning the decoded geojson to a variable
meshes_data = meshes_response.json()

**Note**

The meshes data requested just above do not include the names of the Federative Units (FUs), only their id numbers.

We need to link the meshes to the FUs names and their respective homicides and femicides data.

IBGE provides an API to retrieve the identification number and name of each FU.

We can request this data and merge this information to the datasets about homicides and femicides.

In [25]:
# url to request information of all Federative Units in Brazil (contains the id number)
states_url = 'https://servicodados.ibge.gov.br/api/v1/localidades/estados'

# requesting the information; the timeout keeps a stalled connection from hanging the notebook
states_response = requests.get(states_url, headers=headers, timeout=60)

# an HTTP error is reported here instead of surfacing as a JSON decoding failure
states_response.raise_for_status()

# assigning the decoded information to a variable
states_data = states_response.json()

### Merging Dataframes

In [26]:
# identification numbers of the meshes, as sorted strings
meshes_ids = sorted( str(feature['properties']['codarea']) for feature in meshes_data['features'] )

# parallel lists about the Federative Units: position i of each list describes the same unit
states_ids = []
states_names = []
states_codes = []

# the records are sorted by id BEFORE being split into the lists; sorting
# states_ids alone afterwards would break its alignment with the names and
# codes whenever the API does not already answer in id order
for state in sorted(states_data, key=lambda record: str(record['id'])):
    states_ids.append( str(state['id']) )
    states_names.append( state['nome'] )
    states_codes.append( state['sigla'] )

# checking if identification numbers matches
meshes_ids == states_ids

True

In [27]:
# creating a dataframe of Federative Units to be merged
states = pd.DataFrame( {'id': states_ids, 'nome': states_names, 'sigla': states_codes} )

# appending centroid coordinates columns; they start as floats (0.0, not 0) so
# the coordinates assigned below do not force an integer column to change dtype
states['lat'] = 0.0
states['lng'] = 0.0

# the id becomes the index so each mesh can address its row by code
states.set_index('id', inplace=True)

# retrieving centroid data
for feature in meshes_data['features']:

    # the centroid is read as a (longitude, latitude) pair
    centroid = feature['properties']['centroide']
    lat = centroid[1]
    lng = centroid[0]

    cod = str(feature['properties']['codarea'])

    states.loc[cod,'lat'] = lat
    states.loc[cod,'lng'] = lng

# bringing the id back as a regular column
states.reset_index(inplace=True)

In [28]:
# previewing the first five Federative Units
states.head(5)

Unnamed: 0,id,nome,sigla,lat,lng
0,11,Rondônia,RO,-10.913325,-62.841698
1,12,Acre,AC,-9.212917,-70.473083
2,13,Amazonas,AM,-4.154223,-64.653187
3,14,Roraima,RR,2.08409,-61.399162
4,15,Pará,PA,-3.974815,-53.064197


#### Homicides

In [29]:
# True when every row of the dataset names a Federative Unit known to the requested data
len(homicides) == homicides['uf'].isin(states_names).sum()

True

In [30]:
# attaching id, code and centroid of each Federative Unit, matched by name;
# the duplicated name column coming from `states` is discarded afterwards
homicides = homicides.merge(
    right=states,
    how='inner',
    left_on='uf',
    right_on='nome'
).drop(columns='nome')

In [31]:
# previewing the first five rows of the merged dataframe
homicides.head(5)

Unnamed: 0,uf,grandeza,medida,2016,2017,id,sigla,lat,lng
0,Alagoas,Nº de Vítimas,Ns. Absolutos,1696.0,1703.0,27,AL,-9.513973,-36.624728
1,Alagoas,Nº de Vítimas,Taxas,50.491774,50.446958,27,AL,-9.513973,-36.624728
2,Alagoas,Nº de Ocorrências,Ns. Absolutos,1627.0,1617.0,27,AL,-9.513973,-36.624728
3,Alagoas,Nº de Ocorrências,Taxas,48.437568,47.899431,27,AL,-9.513973,-36.624728
4,Ceará,Nº de Vítimas,Ns. Absolutos,3331.0,5042.0,23,CE,-5.093338,-39.615608


#### Femicides

In [32]:
# True when every row of the dataset names a Federative Unit known to the requested data
len(femicides) == femicides['uf'].isin(states_names).sum()

True

In [33]:
# attaching id, code and centroid of each Federative Unit, matched by name;
# the duplicated name column coming from `states` is discarded afterwards
femicides = femicides.merge(
    right=states,
    how='inner',
    left_on='uf',
    right_on='nome'
).drop(columns='nome')

In [34]:
# previewing the first five rows of the merged dataframe
femicides.head(5)

Unnamed: 0,uf,grandeza,medida,2016,2017,id,sigla,lat,lng
0,Acre,Homicidios,Ns. Absolutos,26.0,34.0,12,AC,-9.212917,-70.473083
1,Acre,Homicidios,Taxas,6.422814,8.264423,12,AC,-9.212917,-70.473083
2,Acre,Feminicídios,Ns. Absolutos,14.0,13.0,12,AC,-9.212917,-70.473083
3,Acre,Feminicídios,Taxas,3.458438,3.159926,12,AC,-9.212917,-70.473083
4,Alagoas,Homicidios,Ns. Absolutos,54.0,74.0,27,AL,-9.513973,-36.624728


## Splitting Dataframes

Instead of splitting the dataframes into lots of copies, we can use a set of masks to select data when requested.

In [35]:
# masks over the kind of quantity reported in each row
homi_victims = homicides['grandeza'].eq('Nº de Vítimas')
homi_occurences = homicides['grandeza'].eq('Nº de Ocorrências')

# masks over the unit of the reported figures
homi_absolute = homicides['medida'].eq('Ns. Absolutos')
homi_rates = homicides['medida'].eq('Taxas')

In [36]:
# five random rows of occurrences in absolute numbers, as a check of the masks
homicides.loc[homi_occurences & homi_absolute].sample(n=5)

Unnamed: 0,uf,grandeza,medida,2016,2017,id,sigla,lat,lng
58,Amazonas,Nº de Ocorrências,Ns. Absolutos,981.0,1010.0,13,AM,-4.154223,-64.653187
20,Mato Grosso,Nº de Ocorrências,Ns. Absolutos,1086.0,924.0,51,MT,-12.948919,-55.911975
90,Rondônia,Nº de Ocorrências,Ns. Absolutos,535.0,459.0,11,RO,-10.913325,-62.841698
74,Paraná,Nº de Ocorrências,Ns. Absolutos,1450.0,2074.0,41,PR,-24.63584,-51.6164
66,Distrito Federal,Nº de Ocorrências,Ns. Absolutos,591.0,498.0,53,DF,-15.780746,-47.797341


In [37]:
# masks over the kind of quantity reported in each row
femi_femicides = femicides['grandeza'].eq('Feminicídios')
femi_homicides = femicides['grandeza'].eq('Homicidios')

# masks over the unit of the reported figures
femi_absolute = femicides['medida'].eq('Ns. Absolutos')
femi_rates = femicides['medida'].eq('Taxas')

In [38]:
# five random rows of femicides in absolute numbers, as a check of the masks
femicides.loc[femi_femicides & femi_absolute].sample(n=5)

Unnamed: 0,uf,grandeza,medida,2016,2017,id,sigla,lat,lng
40,Mato Grosso do Sul,Feminicídios,Ns. Absolutos,34.0,27.0,50,MS,-20.327475,-54.845564
60,Pernambuco,Feminicídios,Ns. Absolutos,112.0,76.0,26,PE,-8.32605,-37.998299
6,Alagoas,Feminicídios,Ns. Absolutos,36.0,31.0,27,AL,-9.513973,-36.624728
56,Paraná,Feminicídios,Ns. Absolutos,20.0,21.0,41,PR,-24.63584,-51.6164
88,Santa Catarina,Feminicídios,Ns. Absolutos,54.0,48.0,42,SC,-27.24733,-50.474101


## Creating Maps

In [39]:
# coordinates of Federal District, used as the initial center of the map
federal_district = [-15.7757875,-48.0778477]

# tile set of the base map
base_tiles = 'cartodbpositron'

# creating the map object
basemap = folium.Map(
    location=federal_district,
    zoom_start=4,
    tiles=base_tiles
)

# list of tiles offered as alternatives, each listed once
# ('stamenterrain' used to appear twice)
tiles = ['openstreetmap','Mapbox Bright','Mapbox Control Room',
         'stamenterrain','stamentoner',
         'stamenwatercolor','cartodbpositron','cartodbdark_matter']

# iterating over the tiles and adding them to the map; the tile set already
# used by the base map is skipped so it is not added a second time
for tile in tiles:
    if tile != base_tiles:
        folium.TileLayer(tile).add_to(basemap)

year = '2016'

# one choropleth per measure: (legend prefix, mask selecting the measure)
victims_layers = [
    ('Victims in Absolute Number ', homi_absolute),
    ('Victims in Rates ', homi_rates),
]

# plotting the choropleths
for legend_prefix, measure_mask in victims_layers:
    legends = legend_prefix + year
    folium.Choropleth(
        geo_data=meshes_data,
        data=homicides[homi_victims & measure_mask],
        name=legends,
        columns=['id',year],
        key_on='feature.properties.codarea',
        fill_color='YlOrRd',
        fill_opacity=1.0,
        line_opacity=0.7,
        legend_name=legends
    ).add_to(basemap)

# adding all controls to the map
folium.LayerControl().add_to(basemap)

# finally displaying the map
basemap

In [47]:
# coordinates of Federal District, used as the initial center of the map
federal_district = [-15.7757875,-48.0778477]

# creating the map object
basemap = folium.Map(
    location=federal_district,
    zoom_start=4,
    tiles='cartodbpositron'
)

year = '2017'


def _count_as_text(selection):
    """Return the single value of `selection` as an integer string.

    'Unknown' is returned when the selection does not hold exactly one
    non-null value, which is the case for states missing from a dataset.
    """
    if selection.size != 1 or selection.isnull().any():
        return 'Unknown'
    return str(int(selection.iloc[0]))


# one choropleth per measure: (legend prefix, mask selecting the measure)
victims_layers = [
    ('Victims in Absolute Number ', homi_absolute),
    ('Victims in Rates ', homi_rates),
]

# plotting the choropleths
for legend_prefix, measure_mask in victims_layers:
    legends = legend_prefix + year
    folium.Choropleth(
        geo_data=meshes_data,
        data=homicides[homi_victims & measure_mask],
        name=legends,
        columns=['id',year],
        key_on='feature.properties.codarea',
        fill_color='YlOrRd',
        fill_opacity=1.0,
        line_opacity=0.7,
        legend_name=legends
    ).add_to(basemap)

# adding all controls to the map
folium.LayerControl().add_to(basemap)

# adding markers with popups to the map
for index, row in states.iterrows():

    # mask to rows of current state in homicides dataset
    mask_state_homicides = homicides.uf == row.nome

    # total of homicides in string format ('Unknown' instead of an error when
    # the state has no row left in the homicides dataset)
    tot_homicides = _count_as_text(
        homicides.loc[homi_victims & homi_absolute & mask_state_homicides, year]
    )

    # mask to rows of current state in femicides dataset
    mask_state_femicides = femicides.uf == row.nome

    # number of femicides in string format
    num_femicides = _count_as_text(
        femicides.loc[femi_femicides & femi_absolute & mask_state_femicides, year]
    )

    # number of homicides against women in string format
    num_homicides = _count_as_text(
        femicides.loc[femi_homicides & femi_absolute & mask_state_femicides, year]
    )

    # making popup html content
    popup_content  = '<strong>' + row.nome + '</strong><br>'
    popup_content += 'Total of Homicides in ' + year + ': ' + tot_homicides + '<br>'
    popup_content += 'Total of Homicides against Women in ' + year + ': ' + num_homicides + '<br>'
    popup_content += 'Total of Femicides in ' + year + ': ' + num_femicides + '<br>'
    popup_content  = '<div style="width:300px">' + popup_content + '</div>'

    # adding the marker at the centroid of the state
    folium.Marker(
        location = (row.lat, row.lng),
        popup = popup_content,
        tooltip = row.sigla
    ).add_to(basemap)

# saving the map to a file that can be opened in a browser
basemap.save('../maps/homicides.html')

# finally displaying the map
basemap