# EXPLORATORY DATA ANALYSIS - TIME SERIES OF WATER BAG INCIDENTS IN RIO DE JANEIRO

### Import modules and functions

In [2]:
import os, pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns; sns.set()
from IPython.display import clear_output as co

### Define data class to host data paths
class DATA:
    """Namespace of data locations used by this notebook (class attributes only)."""

    # Local data folder (absolute Windows path). The raw literal ends with two
    # backslash characters.
    path = r'C:\Users\luisr\Desktop\Repositories\Dados\Desafio COR-Rio IV\\'

    # HTTP endpoint; the URL points at a JSON resource named "chuvas"
    # (presumably rainfall readings — confirm against the service).
    AlertaAPI = 'http://websempre.rio.rj.gov.br/json/chuvas'

### Load data

In [3]:
# Load the incident catalog from a CSV path relative to the working directory.
# `infer_datetime_format` is no longer passed: pandas 2.0 deprecated it (strict
# format inference is now the default), so it only produced a warning.
# NOTE(review): per the pandas docs, `parse_dates=True` only attempts to parse
# the index; no date column is named here — confirm whether specific columns
# should be listed instead.
catalog = pd.read_csv('dados/incident_catalog.csv', parse_dates=True)

#### Extract records of incidents of water bag formation (copy data for cleaning)

In [4]:
# Keep only the rows whose POP_TITULO equals "Bolsão d'água em via",
# detached from the full catalog.
catalog = catalog.loc[catalog['POP_TITULO'].eq("Bolsão d'água em via")].copy()

# Separate deep copy used as the working frame for cleaning.
data = catalog.copy(deep=True)

---
## 0. Utility functions

#### Functions to format and correct the street number text variable

In [5]:
from text_formatter import text_transform_pipeline, get_not_number, drop_letters, drop_space, drop_chars, split_avg

---
# 1. Data Cleaning

#### Format and correct street number variable

In [9]:
# Select the street-number entries flagged by `get_not_number`
# (presumably values that are not plain numbers — confirm in text_formatter).
not_number = get_not_number(catalog['street_number'])

# Hand the flagged entries and the list of cleaning functions to
# `text_transform_pipeline`; how the functions are applied is defined in text_formatter.
not_number_corrected =  text_transform_pipeline(not_number, [drop_letters, drop_space, drop_chars, split_avg])

# Write the corrected values into the working frame at the row labels of `not_number`
# (assumes `not_number` keeps the row labels of `catalog`, from which `data` was copied).
data.loc[not_number.index, ['street_number']] = not_number_corrected

# Remove, in place, every row whose street number is missing.
data.dropna(subset=['street_number'], inplace=True) # drop rows where street number is missing

#### Data type conversion

In [22]:
# Columns cast to floating point: both coordinate pairs plus the street number.
float_cols = [
    'lat', 'lng',
    'search_lat', 'search_lng',
    'street_number',
]

# Cast the selected columns and assign them back to the same frame.
data[float_cols] = data.loc[:, float_cols].astype('float64')