Idea of this notebook: 

Time series prediction of daily new Covid-Cases in Chile, with CASEN data as features

Predict what?
* For each comuna or for the whole country, predict future development (next five or ten days) of Covid cases/deaths
* y = number of cases
* x = CASEN data + more features (see time series tips)

data: CASEN data
Covid data: from MINSAL


1. Read in data: CASEN data, data from MINSAL
2. Analyze 
3. Metrics: low RMSE (root mean squared error; good for time series)
4. Get features with high correlation
5. build model with those features (maybe LR)
6. GridSearch with different models (LinearRegression, KNeighborsRegressor, RandomForestRegressor)
7. Findings/conclusion


# Read in data

In [2]:
# Read in libraries
import pandas as pd
pd.options.mode.chained_assignment = None  

import numpy as np

from matplotlib import pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

import seaborn as sns
sns.set_context('poster')

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn import neighbors
from sklearn.linear_model import LinearRegression


# Data sources, named once so they are easy to find and change.
# NOTE(review): both CSVs are read from the repository's `master` branch, so
# their contents may differ between runs.
CASEN_PATH = 'Casen 2017.dta'
CASES_URL = (
    'https://raw.githubusercontent.com/MinCiencia/Datos-COVID19/master/output/producto1/Covid-19.csv')
DEATHS_URL = (
    'https://raw.githubusercontent.com/MinCiencia/Datos-COVID19/master/output/producto38/CasosFallecidosPorComuna.csv')

# Read in CASEN data about social vulnerability
casen = pd.read_stata(CASEN_PATH, convert_categoricals = False)

# Read in data about Covid-19 in Chilean districts
cases_districts = pd.read_csv(CASES_URL)
# Drop rows whose Comuna contains 'Desconocido'. na=False treats a missing
# Comuna as "no match" (row kept) instead of producing a NaN mask, which
# cannot be negated with `~`; regex=False matches the text literally.
cases_districts = cases_districts[
    ~cases_districts.Comuna.str.contains('Desconocido', regex=False, na=False)]

# Read in data about Covid-19 deaths in Chilean districts
deaths_districts = pd.read_csv(DEATHS_URL)
# Drop rows whose Comuna contains 'Desconocido' or 'Total' in a single pass.
deaths_districts = deaths_districts[
    ~(deaths_districts.Comuna.str.contains('Desconocido', regex=False, na=False)
      | deaths_districts.Comuna.str.contains('Total', regex=False, na=False))]

# Data exploration

Three dataframes: 
* casen: CASEN data
* cases_districts: data about cases per district
* deaths_districts: data about deaths per district

In [6]:
# Preview the first rows of the CASEN data
casen.head()

Unnamed: 0,folio,o,id_vivienda,hogar,region,provincia,comuna,zona,expr,expc,...,hh_d_estado,hh_d_servbas,hh_d_entorno,hh_d_accesi,hh_d_medio,hh_d_appart,hh_d_tsocial,hh_d_seg,pobreza_multi_4d,pobreza_multi_5d
0,110110000000.0,1,1101100000.0,1,1,11,1101,1,39,33,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,,
1,110110000000.0,1,1101100000.0,1,1,11,1101,1,39,33,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,110110000000.0,1,1101100000.0,1,1,11,1101,1,39,33,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,,
3,110110000000.0,2,1101100000.0,1,1,11,1101,1,39,33,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,,
4,110110000000.0,1,1101100000.0,1,1,11,1101,1,39,33,...,1.0,1.0,1.0,0.0,1.0,,1.0,0.0,0.0,


In [7]:
# Preview the first rows of the Covid-19 case data per district
cases_districts.head()

Unnamed: 0,Region,Codigo region,Comuna,Codigo comuna,Poblacion,2020-03-30,2020-04-01,2020-04-03,2020-04-06,2020-04-08,...,2020-07-13,2020-07-17,2020-07-20,2020-07-24,2020-07-27,2020-07-31,2020-08-03,2020-08-07,2020-08-10,Tasa
0,Arica y Parinacota,15,Arica,15101.0,247552.0,6.0,6.0,12.0,41.0,63.0,...,2721.0,3123.0,3372.0,3882.0,4211.0,4636.0,4874.0,5220.0,5543.0,2239.1
1,Arica y Parinacota,15,Camarones,15102.0,1233.0,0.0,0.0,0.0,0.0,0.0,...,8.0,13.0,13.0,13.0,15.0,16.0,21.0,23.0,23.0,1865.4
2,Arica y Parinacota,15,General Lagos,15202.0,810.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,6.0,740.7
3,Arica y Parinacota,15,Putre,15201.0,2515.0,0.0,0.0,0.0,0.0,0.0,...,28.0,28.0,30.0,36.0,37.0,39.0,40.0,50.0,55.0,2186.9
5,Tarapacá,1,Alto Hospicio,1107.0,129999.0,0.0,0.0,0.0,5.0,6.0,...,2191.0,2241.0,2313.0,2371.0,2414.0,2523.0,2595.0,2679.0,2740.0,2107.7


In [8]:
# Preview the first rows of the Covid-19 deaths data per district
deaths_districts.head()

Unnamed: 0,Region,Codigo region,Comuna,Codigo comuna,Poblacion,2020-06-12,2020-06-15,2020-06-19,2020-06-23,2020-06-28,...,2020-07-10,2020-07-13,2020-07-17,2020-07-20,2020-07-24,2020-07-27,2020-07-31,2020-08-03,2020-08-07,2020-08-10
0,Arica y Parinacota,15,Arica,15101.0,247552.0,10.0,11.0,12.0,12.0,22.0,...,25.0,29.0,33.0,39.0,43.0,52.0,59.0,69.0,75.0,79.0
1,Arica y Parinacota,15,Camarones,15102.0,1233.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,Arica y Parinacota,15,General Lagos,15202.0,810.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arica y Parinacota,15,Putre,15201.0,2515.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Tarapaca,1,Alto Hospicio,1107.0,129999.0,11.0,15.0,18.0,22.0,25.0,...,33.0,34.0,34.0,35.0,39.0,44.0,45.0,47.0,49.0,49.0


In [5]:
def first_look(lst):
    """
    Print a quick overview of each DataFrame in a list: name, shape, NaN share and describe().

    Input:
        lst(List): List of DataFrames

    Output:
        Prints name, shape, share of NaNs per column and describe() for every
        DataFrame in lst. Returns None.
    """
    def get_df_name(df):
        # Find the global variable name(s) bound to this exact object.
        # The items are snapshotted so the lookup iterates over a fixed list.
        names = [name for name, value in list(globals().items()) if value is df]
        # Prefer regular names over underscore-prefixed ones (e.g. IPython's
        # output-cache variables such as `_`), which may alias the same frame.
        public = [name for name in names if not name.startswith('_')]
        if public:
            return public[0]
        if names:
            return names[0]
        # The frame is not bound to any global name (e.g. it was passed as an
        # expression); use a placeholder instead of raising IndexError.
        return '<unnamed>'

    for item in lst:
        print('Information for DataFrame ' + get_df_name(item))
        print('The shape of this DataFrame is {}.'.format(item.shape))
        print('NaN share for each column:')
        # Column-wise mean of the boolean NaN mask = share of NaNs per column.
        print(item.isna().mean())
        print('Describe() for each column:')
        print(item.describe())
        print('---')

In [4]:
# Preview the first rows of the CASEN data
# NOTE(review): repeats the casen preview shown earlier in the notebook
casen.head()

Unnamed: 0,folio,o,id_vivienda,hogar,region,provincia,comuna,zona,expr,expc,...,hh_d_estado,hh_d_servbas,hh_d_entorno,hh_d_accesi,hh_d_medio,hh_d_appart,hh_d_tsocial,hh_d_seg,pobreza_multi_4d,pobreza_multi_5d
0,110110000000.0,1,1101100000.0,1,1,11,1101,1,39,33,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,,
1,110110000000.0,1,1101100000.0,1,1,11,1101,1,39,33,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,110110000000.0,1,1101100000.0,1,1,11,1101,1,39,33,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,,
3,110110000000.0,2,1101100000.0,1,1,11,1101,1,39,33,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,,
4,110110000000.0,1,1101100000.0,1,1,11,1101,1,39,33,...,1.0,1.0,1.0,0.0,1.0,,1.0,0.0,0.0,
