# Preparation

### Imports and libraries

In [1]:
import pandas as pd
import numpy as np

### Global Constants

In [2]:
# base URL of Google Drive direct-download links; a file ID is appended to it
# to build the remote source of each dataset
google_drive_baseurl = 'https://drive.google.com/uc?export=download&id='

# relative path to the local folder of datasets; a filename is appended to it
# to build the local source of each dataset
path_local = './csv/'

### Functions

In [3]:
# function to read a dataset from a local or remote source
def read_csv_master(src, verbose=False):
  """Read a dataset, trying each known location in turn.

  The locations are tried in the order below and the first one that loads
  is returned:

  1. ``src['filename']``   - running on Colab with the file uploaded to the
     virtual machine
  2. ``src['src_local']``  - running on Jupyter
  3. ``src['src_remote']`` - running on Colab without the file uploaded to
     the virtual machine

  Parameters
  ----------
  src : dict
      Mapping with the 'filename', 'src_local' and 'src_remote' entries.
      A missing entry is handled like a failed read.
  verbose : bool, default False
      When True, print one message for every location that could not be read.

  Returns
  -------
  pandas.DataFrame or None
      The loaded dataset, or None when no location could be read.
  """
  for key in ('filename', 'src_local', 'src_remote'):
    try:
      return pd.read_csv( src[key] )
    # `Exception` instead of a bare `except`, so that KeyboardInterrupt and
    # SystemExit are not swallowed (e.g. while a remote download is running)
    except Exception:
      if verbose:
        print('fail to read from ' + key)

  # best effort: every location failed
  return None

In [4]:
# function to verify value counts of a dataframe
def check_value_counts(df, cols=None, values_limit=50, print_size=15):
  """Print the value counts of the selected columns of a dataframe.

  For each column the counts are computed with NaN included and sorted in
  ascending order. When a column has more than `values_limit` distinct values
  (NaN counting as one), only the first `print_size` counts are printed and
  the header also reports the number of distinct values and of NaN values.

  Parameters
  ----------
  df : pandas.DataFrame
      Dataframe to inspect.
  cols : list, optional
      Labels of the columns to inspect. None or an empty list means all
      columns of `df`.
  values_limit : int, default 50
      Number of distinct values above which the output is truncated.
  print_size : int, default 15
      Number of counts printed when the output is truncated.

  Returns
  -------
  None
      The report is written to standard output.
  """
  # use all columns of the dataframe if not specified
  if cols is None or len(cols) == 0:
    cols = df.columns.tolist()

  # checking counts for each column
  for col in cols:

    # value counts for current column (NaN included, rarest values first)
    counts = df[col].value_counts(dropna=False, ascending=True)

    # number of distinct values
    length = counts.size

    # header to be printed; str() keeps non-string labels (e.g. 2016) working
    header = 'Column [' + str(col) + ']'

    # formatting the output
    if length > values_limit:

      # count of null values, only reported for truncated outputs
      nulls = df[col].isnull().sum()

      # limiting counts output
      counts = counts.head(print_size)

      # printing extra information
      header += '\n' + str(length) + ' unique values'
      header += '\n' + str(nulls) + ' NaN values'

    # blank line between the header and the counts
    header += '\n'

    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
    print(header)
    print(counts)
    print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
    print('')

### Creating Datasets Dictionaries

In [5]:
# list with all filenames
filename_list = [
  'estupro-por-capital.csv',
  'estupros-por-uf.csv',
  'homicidios-por-capital.csv',
  'homicidios-por-uf.csv',
  # the '.csv' extension is required like for every other file; the dataset
  # key derived below stays 'homi-feminicidios-por-uf'
  'homi-feminicidios-por-uf.csv',
  'lesao-corporal-por-uf.csv'
]

# list with IDs from google drive links (same order as filename_list)
fileid_list = [
  '1iMD4rjCHTcNG9XrrMy6HoCqN0-f9e8YD',
  '17vEfD2XIn7LFZV5NGgEdSqJrxXGwDdqZ',
  '1unci2LT7_94SOXBHtoWsYZZs72N-NfDg',
  '1eLwfW8ONzRQ6KKM_m_mjbvUP-bsCuAFk',
  '1U12Q3pL4R3nHPbB_1mDU-rex1uPUER_c',
  '1igMkgjuYIDTzslsiztYIWnIvuyeVpb6b'
]

# zip() silently truncates to the shorter list, so a forgotten entry would
# otherwise drop a dataset without any error
assert len(filename_list) == len(fileid_list), 'filenames and drive IDs do not match'

# creating the files source dictionary (filename -> drive ID)
files_dict = dict(zip(filename_list, fileid_list))

# initializing an empty datasets dictionary
datasets = {}

# populating the datasets dictionary with inner dictionaries
for filename, driveid in files_dict.items():

  # name for key of inner dictionary (filename without the extension)
  innerkey = filename.replace('.csv', '')

  # creating and appending the inner dictionary
  datasets[innerkey] = {
      'filename' : filename,
      'driveid' : driveid,
      'path' : path_local,
      'src_remote' : google_drive_baseurl + driveid,
      'src_local' : path_local + filename
  }

# Initial Cleaning

### Datasets to Work

In [6]:
# listing the name of every dataset available to work with
for key in datasets:
  print(key)

homicidios-por-uf
homicidios-por-capital
homi-feminicidios-por-uf
lesao-corporal-por-uf
estupro-por-capital
estupros-por-uf


### Homicides per Federative Unit

#### Initial Exploration

In [7]:
# reading the dataset
homicides = read_csv_master( datasets['homicidios-por-uf'] )

# read_csv_master returns None when every source fails; stop here with a clear
# message instead of an AttributeError on the .copy() call below
if homicides is None:
  raise RuntimeError('could not read dataset homicidios-por-uf from any source')

# making a backup of the original dataset
homicides_raw = homicides.copy()

In [8]:
# exploring
homicides.sample(10)

Unnamed: 0,unidade_federativa,grandeza,medida,2016,2017
6,Mato Grosso (4),Nº de Vítimas,Ns. Absolutos,1086,985
16,Amazonas,Nº de Vítimas,Ns. Absolutos,1023,1119
53,Sergipe,Nº de Vítimas,Taxas,576402199861505,489922713708571
29,Alagoas,Nº de Vítimas,Taxas,504917738004259,504469576752099
104,Paraná,Nº de Ocorrências,Taxas,128972348328518,183201111714519
11,Rio de Janeiro,Nº de Vítimas,Ns. Absolutos,5042,5346
84,Brasil,Nº de Ocorrências,Taxas,221873458255084,231160479880161
42,Acre (4),Nº de Vítimas,Taxas,433458595520683,605097038520092
35,Pará,Nº de Vítimas,Taxas,441088086584298,45657581525078
20,Paraná,Nº de Vítimas,Ns. Absolutos,2498,2187


In [9]:
# exploring
homicides.head(10)

Unnamed: 0,unidade_federativa,grandeza,medida,2016,2017
0,Brasil,Nº de Vítimas,Ns. Absolutos,"54.338,00","55.900,00"
1,Alagoas,Nº de Vítimas,Ns. Absolutos,1696,1703
2,Ceará (6),Nº de Vítimas,Ns. Absolutos,3331,5042
3,Espírito Santo,Nº de Vítimas,Ns. Absolutos,1181,1405
4,Goiás,Nº de Vítimas,Ns. Absolutos,2576,2254
5,Maranhão (7),Nº de Vítimas,Ns. Absolutos,2071,1816
6,Mato Grosso (4),Nº de Vítimas,Ns. Absolutos,1086,985
7,Pará,Nº de Vítimas,Ns. Absolutos,3649,3820
8,Paraíba (4),Nº de Vítimas,Ns. Absolutos,1280,1242
9,Pernambuco (4),Nº de Vítimas,Ns. Absolutos,4277,5139


In [10]:
# exploring
homicides.tail(10)

Unnamed: 0,unidade_federativa,grandeza,medida,2016,2017
102,Distrito Federal,Nº de Ocorrências,Taxas,198507599045551,163845756000111
103,Minas Gerais (4) (9),Nº de Ocorrências,Taxas,192498556975191,180780486844029
104,Paraná,Nº de Ocorrências,Taxas,128972348328518,183201111714519
105,Rio Grande do Sul (4),Nº de Ocorrências,Taxas,234439374473929,230153154294904
106,São Paulo,Nº de Ocorrências,Taxas,786820934817908,730460092729846
107,Mato Grosso do Sul (4) (8),Nº de Ocorrências,Taxas,213988590754649,195345110309172
108,Rondônia (4),Nº de Ocorrências,Taxas,299337708326456,254182661530589
109,Sergipe,Nº de Ocorrências,Taxas,...,...
110,Roraima (10),Nº de Ocorrências,Taxas,373374508244381,405636044972026
111,Tocantins,Nº de Ocorrências,Taxas,272033045817671,214811823552407


In [11]:
# exploring
homicides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 5 columns):
unidade_federativa    112 non-null object
grandeza              112 non-null object
medida                112 non-null object
2016                  112 non-null object
2017                  112 non-null object
dtypes: object(5)
memory usage: 4.5+ KB


**Note**

We will not use the values about Brazil for now, so we can drop them.

We also need to drop rows that do not have any value (that is, rows filled with "...").

#### Dropping Rows

In [12]:
# marking the "..." placeholders as missing values (NaN)
homicides = homicides.replace('...', np.nan)

In [13]:
# checking the replacement
homicides.tail()

Unnamed: 0,unidade_federativa,grandeza,medida,2016,2017
107,Mato Grosso do Sul (4) (8),Nº de Ocorrências,Taxas,213988590754649.0,195345110309172.0
108,Rondônia (4),Nº de Ocorrências,Taxas,299337708326456.0,254182661530589.0
109,Sergipe,Nº de Ocorrências,Taxas,,
110,Roraima (10),Nº de Ocorrências,Taxas,373374508244381.0,405636044972026.0
111,Tocantins,Nº de Ocorrências,Taxas,272033045817671.0,214811823552407.0


**Note**

The replacement was successful. Now, we can drop the rows.

In [14]:
# rows holding the nationwide (Brasil) figures
mask_brazil = homicides['unidade_federativa'] == 'Brasil'

# rows where at least one of the year columns is missing
mask_null = homicides[['2016', '2017']].isnull().any(axis=1)

# a row is dropped when either condition holds
mask_to_drop = mask_brazil | mask_null

# checking the values to drop
homicides[mask_to_drop]

Unnamed: 0,unidade_federativa,grandeza,medida,2016,2017
0,Brasil,Nº de Vítimas,Ns. Absolutos,"54.338,00","55.900,00"
28,Brasil,Nº de Vítimas,Taxas,26367246904612,269188817892652
56,Brasil,Nº de Ocorrências,Ns. Absolutos,45724,48003
60,Goiás,Nº de Ocorrências,Ns. Absolutos,,
68,Rio Grande do Norte,Nº de Ocorrências,Ns. Absolutos,,
81,Sergipe,Nº de Ocorrências,Ns. Absolutos,,
84,Brasil,Nº de Ocorrências,Taxas,221873458255084,231160479880161
88,Goiás,Nº de Ocorrências,Taxas,,
96,Rio Grande do Norte,Nº de Ocorrências,Taxas,,
109,Sergipe,Nº de Ocorrências,Taxas,,


In [15]:
# index labels of the rows selected by the mask
to_drop = homicides.loc[mask_to_drop].index.tolist()

# removing those rows from the dataset
homicides.drop(index=to_drop, inplace=True)

In [16]:
# exploring
homicides.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102 entries, 1 to 111
Data columns (total 5 columns):
unidade_federativa    102 non-null object
grandeza              102 non-null object
medida                102 non-null object
2016                  102 non-null object
2017                  102 non-null object
dtypes: object(5)
memory usage: 4.8+ KB


In [17]:
# exploring
homicides.sample(10)

Unnamed: 0,unidade_federativa,grandeza,medida,2016,2017
71,Amapá (5),Nº de Ocorrências,Ns. Absolutos,308,325
110,Roraima (10),Nº de Ocorrências,Taxas,373374508244381,405636044972026
1,Alagoas,Nº de Vítimas,Ns. Absolutos,1696,1703
2,Ceará (6),Nº de Vítimas,Ns. Absolutos,3331,5042
58,Ceará (6),Nº de Ocorrências,Ns. Absolutos,3163,4743
50,São Paulo,Nº de Vítimas,Taxas,821011108923883,777028586801877
92,Paraíba (4),Nº de Ocorrências,Taxas,312045636674364,299834209319553
111,Tocantins,Nº de Ocorrências,Taxas,272033045817671,214811823552407
59,Espírito Santo,Nº de Ocorrências,Ns. Absolutos,1177,1386
87,Espírito Santo,Nº de Ocorrências,Taxas,29619772217157,34508893135967


**Note**

We have a dataset free of null values.

Some values in the *2016* and *2017* columns are strings that use a comma as the decimal separator.

We need to transform and clean these columns.

#### Cleaning Columns 2016 and 2017

We will convert all data in these columns to string, replace the decimal comma with a dot, and convert the result to float.

In [18]:
# converting the year columns from comma-decimal strings to float
for col in ['2016', '2017']:
  homicides[col] = (
      homicides[col]
      # going through string first keeps the cell re-runnable: values that
      # are already numeric would otherwise break the .str accessor
      .astype(str)
      # decimal comma -> decimal point (literal replacement, not a regex)
      .str.replace(',', '.', regex=False)
      .astype(float)
  )

In [19]:
# checking result
homicides.sample(10)

Unnamed: 0,unidade_federativa,grandeza,medida,2016,2017
77,Rio Grande do Sul (4),Nº de Ocorrências,Ns. Absolutos,2646.0,2606.0
22,São Paulo,Nº de Vítimas,Ns. Absolutos,3674.0,3504.0
37,Pernambuco (4),Nº de Vítimas,Taxas,45.450024,54.247395
43,Amapá (5),Nº de Vítimas,Taxas,39.243508,44.125648
100,Amazonas,Nº de Ocorrências,Taxas,24.514783,24.854723
65,Pernambuco (4),Nº de Ocorrências,Ns. Absolutos,4116.0,4894.0
24,Rondônia (4),Nº de Vítimas,Ns. Absolutos,545.0,481.0
107,Mato Grosso do Sul (4) (8),Nº de Ocorrências,Taxas,21.398859,19.534511
66,Piauí (4) (7),Nº de Ocorrências,Ns. Absolutos,642.0,597.0
7,Pará,Nº de Vítimas,Ns. Absolutos,3649.0,3820.0


In [20]:
# checking resulting type
homicides.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102 entries, 1 to 111
Data columns (total 5 columns):
unidade_federativa    102 non-null object
grandeza              102 non-null object
medida                102 non-null object
2016                  102 non-null float64
2017                  102 non-null float64
dtypes: float64(2), object(3)
memory usage: 4.8+ KB


**Note**

We also need to clean the names of Federative Units, removing the "(x)" marks.

We can do this by performing a RegEx replacement.

#### Cleaning Federative Units Column

In [21]:
# column label
col_fu = 'unidade_federativa'

# removing the "(x)" marks; the pattern is a raw string so that the
# backslashes reach the regex engine without invalid-escape warnings
homicides[col_fu] = homicides[col_fu].str.replace(r'\(\d+\)', '', regex=True)

# stripping the white spaces left around the names
homicides[col_fu] = homicides[col_fu].str.strip()

In [22]:
# checking the result on a random sample of rows
homicides.sample(10)

Unnamed: 0,unidade_federativa,grandeza,medida,2016,2017
107,Mato Grosso do Sul,Nº de Ocorrências,Taxas,21.398859,19.534511
69,Santa Catarina,Nº de Ocorrências,Ns. Absolutos,860.0,943.0
49,Rio Grande do Sul,Nº de Vítimas,Taxas,25.304567,25.302716
9,Pernambuco,Nº de Vítimas,Ns. Absolutos,4277.0,5139.0
53,Sergipe,Nº de Vítimas,Taxas,57.64022,48.992271
111,Tocantins,Nº de Ocorrências,Taxas,27.203305,21.481182
104,Paraná,Nº de Ocorrências,Taxas,12.897235,18.320111
8,Paraíba,Nº de Vítimas,Ns. Absolutos,1280.0,1242.0
3,Espírito Santo,Nº de Vítimas,Ns. Absolutos,1181.0,1405.0
30,Ceará,Nº de Vítimas,Taxas,37.161147,55.895154


**Note**

We also need to check the other columns for possible inconsistencies.

#### Cleaning Other Columns

In [23]:
# checking value counts
homicides['grandeza'].value_counts(dropna=False)

 Nº de Vítimas       54
Nº de Ocorrências    48
Name: grandeza, dtype: int64

**Note**

For convenience and consistency, we can strip white spaces from the values.

In [24]:
# checking value counts
homicides['medida'].value_counts(dropna=False)

Taxas            51
Ns. Absolutos    51
Name: medida, dtype: int64

In [25]:
# removing leading and trailing white spaces from the categorical columns
for col in ('grandeza', 'medida'):
  stripped = homicides[col].str.strip()
  homicides[col] = stripped

In [26]:
# checking value counts
homicides['grandeza'].value_counts(dropna=False)

Nº de Vítimas        54
Nº de Ocorrências    48
Name: grandeza, dtype: int64

#### Exploring Value Counts

In [27]:
# applying function
check_value_counts(homicides, values_limit=15, print_size=10)

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Column [unidade_federativa]
27 unique values
0 NaN values

Sergipe                2
Rio Grande do Norte    2
Goiás                  2
Paraíba                4
Rondônia               4
Distrito Federal       4
Ceará                  4
Santa Catarina         4
Mato Grosso            4
Roraima                4
Name: unidade_federativa, dtype: int64
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Column [grandeza]

Nº de Ocorrências    48
Nº de Vítimas        54
Name: grandeza, dtype: int64
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Column [medida]

Ns. Absolutos    51
Taxas            51
Name: medida, dtype: int64
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Column [2016]
92 unique values
0 NaN values

1696.000000    1
39.040187      1
5964.000000    1
48.437568

### Homicides and Femicides per FU

#### Initial Exploration

In [28]:
# reading the dataset
femicides = read_csv_master( datasets['homi-feminicidios-por-uf'] )

# read_csv_master returns None when every source fails; stop here with a clear
# message instead of an AttributeError on the .copy() call below
if femicides is None:
  raise RuntimeError('could not read dataset homi-feminicidios-por-uf from any source')

# making a backup of the original dataset
femicides_raw = femicides.copy()

In [29]:
# exploring
femicides.head(5)

Unnamed: 0,un_federativa,grandeza,medida,2016,2017
0,Brasil,Homicidios (2),Ns. Absolutos,4245,4539
1,Acre,Homicidios (2),Ns. Absolutos,26,3400
2,Alagoas (4),Homicidios (2),Ns. Absolutos,54,7400
3,Amapá,Homicidios (2),Ns. Absolutos,20,2300
4,Amazonas,Homicidios (2),Ns. Absolutos,68,7300


In [30]:
# exploring
femicides.tail(5)

Unnamed: 0,un_federativa,grandeza,medida,2016,2017
107,Roraima,Feminicídios,Taxas,119672098450246,11768397928762
108,Santa Catarina,Feminicídios,Taxas,156749502973451,137533018669248
109,São Paulo (4),Feminicídios,Taxas,0264156371062039,471866922692338
110,Sergipe,Feminicídios,Taxas,...,512114932247195
111,Tocantins,Feminicídios,Taxas,502782504227342,418227942070202


In [31]:
# exploring
femicides.sample(10)

Unnamed: 0,un_federativa,grandeza,medida,2016,2017
78,Rondônia,Feminicídios,Ns. Absolutos,37,54
102,Piauí,Feminicídios,Taxas,188686824912017,157720191035548
108,Santa Catarina,Feminicídios,Taxas,156749502973451,137533018669248
3,Amapá,Homicidios (2),Ns. Absolutos,20,2300
79,Roraima,Feminicídios,Ns. Absolutos,3,3
36,Espírito Santo,Homicidios (2),Taxas,49721808989904,67066582212452
86,Alagoas (4),Feminicídios,Taxas,208540425561495,178474275236119
58,Alagoas (4),Feminicídios,Ns. Absolutos,36,31
5,Bahia,Homicidios (2),Ns. Absolutos,443,47400
71,Paraíba,Feminicídios,Ns. Absolutos,24,22


**Note**

We can follow the same steps used to clean the previous dataset.

#### Cleaning Dataset

The following code cells clean the dataset with the same steps applied to the previous one.

In [32]:
# marking the "..." placeholders as missing values (NaN)
femicides = femicides.replace('...', np.nan)

In [33]:
# mask to rows about Brazil
mask_brazil = femicides.un_federativa == 'Brasil'

# mask to rows with a null value in at least one of the year columns
mask_null = femicides['2016'].isnull() | femicides['2017'].isnull()

# merging the masks
mask_to_drop = mask_brazil | mask_null

# list of indexes to be dropped
to_drop = femicides[mask_to_drop].index.tolist()

# dropping the selected rows
femicides.drop(index=to_drop, inplace=True)

In [34]:
# converting the year columns from comma-decimal strings to float
for col in ['2016', '2017']:
  femicides[col] = (
      femicides[col]
      # going through string first keeps the cell re-runnable: values that
      # are already numeric would otherwise break the .str accessor
      .astype(str)
      # decimal comma -> decimal point (literal replacement, not a regex)
      .str.replace(',', '.', regex=False)
      .astype(float)
  )

In [35]:
# list of columns to remove marks
cols_clean_marks = ['un_federativa', 'grandeza', 'medida']

# cleaning marks for each column
for col in cols_clean_marks:

  # removing the "(x)" marks; the pattern is a raw string so that the
  # backslashes reach the regex engine without invalid-escape warnings
  femicides[col] = femicides[col].str.replace(r'\(\d+\)', '', regex=True)

  # stripping the white spaces left around the values
  femicides[col] = femicides[col].str.strip()

In [36]:
# checking the result
femicides.head(5)

Unnamed: 0,un_federativa,grandeza,medida,2016,2017
1,Acre,Homicidios,Ns. Absolutos,26.0,34.0
2,Alagoas,Homicidios,Ns. Absolutos,54.0,74.0
3,Amapá,Homicidios,Ns. Absolutos,20.0,23.0
4,Amazonas,Homicidios,Ns. Absolutos,68.0,73.0
5,Bahia,Homicidios,Ns. Absolutos,443.0,474.0


In [37]:
# checking the result
femicides.tail()

Unnamed: 0,un_federativa,grandeza,medida,2016,2017
106,Rondônia,Feminicídios,Taxas,4.22212,6.095125
107,Roraima,Feminicídios,Taxas,1.196721,1.17684
108,Santa Catarina,Feminicídios,Taxas,1.567495,1.37533
109,São Paulo,Feminicídios,Taxas,0.264156,0.471867
111,Tocantins,Feminicídios,Taxas,5.027825,4.182279


In [38]:
# checking the result
femicides.sample(10)

Unnamed: 0,un_federativa,grandeza,medida,2016,2017
5,Bahia,Homicidios,Ns. Absolutos,443.0,474.0
54,Sergipe,Homicidios,Taxas,4.399787,5.889322
40,Mato Grosso do Sul,Homicidios,Taxas,7.775027,6.206132
67,Mato Grosso,Feminicídios,Ns. Absolutos,49.0,76.0
9,Goiás,Homicidios,Ns. Absolutos,198.0,197.0
91,Distrito Federal,Feminicídios,Taxas,1.27588,1.186181
83,Tocantins,Feminicídios,Ns. Absolutos,38.0,32.0
71,Paraíba,Feminicídios,Ns. Absolutos,24.0,22.0
73,Pernambuco,Feminicídios,Ns. Absolutos,112.0,76.0
89,Bahia,Feminicídios,Taxas,0.232776,0.951572


**Note**

We conclude that we can easily clean our datasets by performing similar steps. This will be very useful when working with the other datasets.

#### Exploring Value Counts

In [39]:
# applying the function
check_value_counts(femicides)

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Column [un_federativa]

Maranhão               2
Ceará                  2
Amapá                  2
Sergipe                2
Santa Catarina         4
Paraíba                4
Rondônia               4
Distrito Federal       4
Rio Grande do Norte    4
Pará                   4
São Paulo              4
Mato Grosso            4
Roraima                4
Goiás                  4
Pernambuco             4
Paraná                 4
Piauí                  4
Minas Gerais           4
Amazonas               4
Mato Grosso do Sul     4
Bahia                  4
Rio de Janeiro         4
Acre                   4
Rio Grande do Sul      4
Alagoas                4
Espírito Santo         4
Tocantins              4
Name: un_federativa, dtype: int64
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Column [grandeza]

Feminicídios    46
Homicidios      54
Name: grandeza, dtype: int64
<<<<<<<<<<<<<<<<<<<<<<<<

# Basic Analysis

In [40]:
# random sample of rows from the homicides dataset
homicides.sample(8)

Unnamed: 0,unidade_federativa,grandeza,medida,2016,2017
16,Amazonas,Nº de Vítimas,Ns. Absolutos,1023.0,1119.0
10,Piauí,Nº de Vítimas,Ns. Absolutos,642.0,597.0
1,Alagoas,Nº de Vítimas,Ns. Absolutos,1696.0,1703.0
12,Rio Grande do Norte,Nº de Vítimas,Ns. Absolutos,1748.0,1863.0
30,Ceará,Nº de Vítimas,Taxas,37.161147,55.895154
105,Rio Grande do Sul,Nº de Ocorrências,Taxas,23.443937,23.015315
74,Distrito Federal,Nº de Ocorrências,Ns. Absolutos,591.0,498.0
36,Paraíba,Nº de Vítimas,Taxas,32.004681,30.852866


In [41]:
# random sample of rows from the femicides dataset
femicides.sample(8)

Unnamed: 0,un_federativa,grandeza,medida,2016,2017
4,Amazonas,Homicidios,Ns. Absolutos,68.0,73.0
54,Sergipe,Homicidios,Taxas,4.399787,5.889322
96,Mato Grosso do Sul,Feminicídios,Taxas,2.541836,1.994828
71,Paraíba,Feminicídios,Ns. Absolutos,24.0,22.0
105,Rio Grande do Sul,Feminicídios,Taxas,1.669756,1.438714
38,Maranhão,Homicidios,Taxas,3.499199,3.529754
22,Rondônia,Homicidios,Ns. Absolutos,44.0,66.0
39,Mato Grosso,Homicidios,Taxas,5.633998,5.135978


In [42]:
# mask to the rows of a single federative unit
# NOTE(review): the variable name says "bahia" but the mask selects
# 'Pernambuco' - confirm which state is intended and align name and filter
mask_homicides_bahia = homicides.unidade_federativa == 'Pernambuco'

# rows of the homicides dataset selected by the mask
homicides[mask_homicides_bahia]

Unnamed: 0,unidade_federativa,grandeza,medida,2016,2017
9,Pernambuco,Nº de Vítimas,Ns. Absolutos,4277.0,5139.0
37,Pernambuco,Nº de Vítimas,Taxas,45.450024,54.247395
65,Pernambuco,Nº de Ocorrências,Ns. Absolutos,4116.0,4894.0
93,Pernambuco,Nº de Ocorrências,Taxas,43.73914,51.661169


In [43]:
# mask to the rows of a single federative unit
# NOTE(review): the variable name says "bahia" but the mask selects
# 'Pernambuco' - confirm which state is intended and align name and filter
mask_femicides_bahia = femicides.un_federativa == 'Pernambuco'

# rows of the femicides dataset selected by the mask
femicides[mask_femicides_bahia]

Unnamed: 0,un_federativa,grandeza,medida,2016,2017
17,Pernambuco,Homicidios,Ns. Absolutos,280.0,316.0
45,Pernambuco,Homicidios,Taxas,5.768797,6.464756
73,Pernambuco,Feminicídios,Ns. Absolutos,112.0,76.0
101,Pernambuco,Feminicídios,Taxas,2.307519,1.554815


**Note**

The number of occurrences counts registrations at a police station and is expected to be equal to or less than the number of victims.

The rates are in number per 100 thousand inhabitants.

**Possible plots**

* Total of homicides per federative unit in absolute numbers for each year (from *homicides* dataset) [2 plots]
* Total of homicides against women and femicides per federative unit in absolute numbers for each year (from *femicides* dataset, doing a `groupby`) [2 plots]
* Total of femicides, specifically, per federative unit in absolute numbers for each year (from *femicides* dataset) [2 plots]
* Proportion between the total of homicides and the total of femicides and women homicides in absolute numbers for each year [2 plots]
* The same plots above in terms of variation from 2016 to 2017 [8 plots]
* The set of plots in terms of rates [16 plots]

In [44]:
# number of non-null entries per (grandeza, medida) combination
homicides.groupby(['grandeza','medida']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,unidade_federativa,2016,2017
grandeza,medida,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nº de Ocorrências,Ns. Absolutos,24,24,24
Nº de Ocorrências,Taxas,24,24,24
Nº de Vítimas,Ns. Absolutos,27,27,27
Nº de Vítimas,Taxas,27,27,27


In [45]:
# number of distinct federative units in the homicides dataset
homicides['unidade_federativa'].unique().size

27

In [46]:
# maximum of each year column per (federative unit, grandeza, medida) group
# NOTE(review): presumably each group holds a single row, so this acts as a
# hierarchical view of the values - confirm
homicides.groupby(['unidade_federativa','grandeza','medida']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,2016,2017
unidade_federativa,grandeza,medida,Unnamed: 3_level_1,Unnamed: 4_level_1
Acre,Nº de Ocorrências,Ns. Absolutos,341.000000,482.000000
Acre,Nº de Ocorrências,Taxas,41.754062,58.098959
Acre,Nº de Vítimas,Ns. Absolutos,354.000000,502.000000
Acre,Nº de Vítimas,Taxas,43.345860,60.509704
Alagoas,Nº de Ocorrências,Ns. Absolutos,1627.000000,1617.000000
Alagoas,Nº de Ocorrências,Taxas,48.437568,47.899431
Alagoas,Nº de Vítimas,Ns. Absolutos,1696.000000,1703.000000
Alagoas,Nº de Vítimas,Taxas,50.491774,50.446958
Amapá,Nº de Ocorrências,Ns. Absolutos,308.000000,325.000000
Amapá,Nº de Ocorrências,Taxas,39.371337,40.741010
