In [1]:
import pandas as pd
import sqlalchemy 
import psycopg2
import os
import numpy as np
import requests
import re
import copy
from pandas.api.types import is_numeric_dtype
from sklearn.linear_model import LinearRegression
from sqlalchemy import create_engine
import datetime
from datetime import datetime as dt
from sqlalchemy import inspect
import sys
import sqlalchemy.ext.declarative as sqld
import roman

In [2]:
sys.path.append('..')

In [3]:
from modules.transforms import *

In [4]:
DBname=os.environ['DB_NAME']
postgres_psswd=os.environ['POSTGRES_PASSWORD']
postgres_user=os.environ['POSTGRES_USER']
postgres_port=str(os.environ['POSTGRES_PORT'])

In [5]:
# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://'+postgres_user+':'+postgres_psswd+'@'+DBname+':'+postgres_port+'/superset')
# Create the connection
cnx = create_engine(postgres_str)

In [6]:
#guardar paso en csv
path='/data/ETLcache/'
now = dt.now()
timestamp = now.strftime("_%d%m%Y_%H%M%S")

In [7]:
inspector = inspect(cnx)
schemas = inspector.get_schema_names()

In [8]:
tables={}
for schema in schemas:
    #print("schema: %s" % schema)
    tables[schema]=[]
    for table_name in inspector.get_table_names(schema=schema):
        #print("table: %s" % table_name)
        #tables[schema][table_name]=list(inspector.get_columns(table_name, schema=schema)[0].keys())
        tables[schema].append(table_name)

In [9]:
INFO_COMUNAL=pd.read_sql_table('info_comunal@ivanMSC', con=cnx,schema='@ivanMSC') 
INFO_COMUNAL=INFO_COMUNAL.rename(columns={'Nombre':'Comuna'})
INFO_COMUNAL['Codigo region']=INFO_COMUNAL['Codigo region'].astype(int)

In [10]:
iso=pd.read_sql_table('iso', con=cnx,schema='geo') 
iso=iso.loc[iso['Country Name']=='Chile',('Country Code','Subdivision Name Used','Code','Number',
                                      'Latitude','Longitude')]
iso['Number']=iso['Number'].apply(lambda x: roman.fromRoman(x))
iso=iso.rename(columns={'Number':'Codigo region','Subdivision Name Used':'Region'})

In [11]:
popREG=pd.DataFrame(INFO_COMUNAL.groupby('Codigo region')['Poblacion'].sum().reset_index())

In [12]:
iso=iso.merge(popREG,on='Codigo region')

In [13]:
iso=iso.drop('Region',axis=1)

In [15]:
derived_tables={}
for s in tables.keys():
    for n in tables[s]:
        #print(n)
        if 'cumulativo' in n.lower():
            print('reading table '+n+'from schema '+s)
            table=pd.read_sql_table(n, con=cnx,schema=s) 
            print('generating diff table for cumulative table: '+n)
            nn=n.lower().replace('cumulativo','_diff_')
            if s not in derived_tables.keys():
                derived_tables[s]={}
            print('new name: '+nn)
            if 'std' not in nn:
                derived_tables[s][nn]=to_diff(table)
            else:
                print('standard form with linear regression')
                derived_tables[s][nn]=to_diff_std(table)
                derived_tables[s][nn]['Fecha']=derived_tables[s][nn]['Fecha'].dt.to_pydatetime()
        
        elif 'producto19_CasosActivosPorComuna_std' in n:
            print('reading table '+n+' from schema '+s)
            table=pd.read_sql_table(n, con=cnx,schema=s) 
            if s not in derived_tables.keys():
                derived_tables[s]={}
            nn=n
            derived_tables[s][nn]=table
            print('new name: '+nn)
            derived_tables[s][nn]=derived_tables[s][nn].merge(INFO_COMUNAL[['Region','Comuna','Superficie','Lat','Lon','CUT','Provincia']],
           on=['Region','Comuna'])
            #derived_tables[s][nn]=derived_tables[s][nn].drop('level_0',axis=1)
        
        elif 'producto53' in n.lower():
            print('reading table '+n+' from schema '+s)
            table=pd.read_sql_table(n, con=cnx,schema=s) 
            if s not in derived_tables.keys():
                derived_tables[s]={}
                
            print('adding descriptive stats to table: '+n)
            derived_tables[s][n]=casos_nuevos_desc(table,numeric_col_string='confirmados',group_col='Region')
            
            nn=n.lower()+'_casos_activos'
            #nn=n
            
            if ('producto53_confirmados_regionale' in n.lower()) & ('_casos_activos' not in n.lower()):
                print('calculando casos activos con rolling avg de 14 días')
                print('new name: '+nn)
                derived_tables[s][n]=derived_tables[s][n].merge(iso,on='Codigo region',how='left')
                derived_tables[s][nn]=casos_activos(table,numeric_col_string='confirmados',group_col='Region',window=14)
                derived_tables[s][nn]=derived_tables[s][nn].merge(iso,on='Codigo region',how='left')
                
            #elif ('producto53_confirmados_nacional' in n.lower()) & ('_casos_activos' not in n.lower()):
            #    print('calculando casos activos con rolling avg de 14 días')
            #    print('new name: '+nn)
            #    derived_tables[s][nn]=casos_activos(table,numeric_col_string='confirmados',group_col=None,window=14)
            #elif ('confirmados_provinciale' in n.lower()) & ('_casos_activos' not in n.lower()):
            #    print('calculando casos activos con rolling avg de 14 días')
            #    print('new name: '+nn)
            #    derived_tables[s][nn]=casos_activos(table,numeric_col_string='confirmados',group_col='Provincia',window=14)  
            #elif ('producto53_confirmados_' in n.lower()) & ('_casos_activos' not in n.lower()):
            #    print('calculando casos activos con rolling avg de 14 días')
            #    print('new name: '+nn)
            #    derived_tables[s][nn]=casos_activos(table,numeric_col_string='confirmados',group_col='servicio.salud',window=14)            

reading table producto13_CasosNuevosCumulativofrom schema producto13
generating diff table for cumulative table: producto13_CasosNuevosCumulativo
new name: producto13_casosnuevos_diff_
reading table producto13_CasosNuevosCumulativo_Tfrom schema producto13
generating diff table for cumulative table: producto13_CasosNuevosCumulativo_T
new name: producto13_casosnuevos_diff__t
reading table producto13_CasosNuevosCumulativo_stdfrom schema producto13
generating diff table for cumulative table: producto13_CasosNuevosCumulativo_std
new name: producto13_casosnuevos_diff__std
standard form with linear regression
reading table producto14_FallecidosCumulativofrom schema producto14


  result = getattr(ufunc, method)(*inputs, **kwargs)


generating diff table for cumulative table: producto14_FallecidosCumulativo
new name: producto14_fallecidos_diff_
reading table producto14_FallecidosCumulativo_Tfrom schema producto14
generating diff table for cumulative table: producto14_FallecidosCumulativo_T
new name: producto14_fallecidos_diff__t
reading table producto14_FallecidosCumulativo_stdfrom schema producto14
generating diff table for cumulative table: producto14_FallecidosCumulativo_std
new name: producto14_fallecidos_diff__std
standard form with linear regression


  result = getattr(ufunc, method)(*inputs, **kwargs)


reading table producto19_CasosActivosPorComuna_std from schema producto19
new name: producto19_CasosActivosPorComuna_std
reading table producto3_CasosTotalesCumulativofrom schema producto3
generating diff table for cumulative table: producto3_CasosTotalesCumulativo
new name: producto3_casostotales_diff_
reading table producto3_CasosTotalesCumulativo_Tfrom schema producto3
generating diff table for cumulative table: producto3_CasosTotalesCumulativo_T
new name: producto3_casostotales_diff__t
reading table producto3_CasosTotalesCumulativo_stdfrom schema producto3
generating diff table for cumulative table: producto3_CasosTotalesCumulativo_std
new name: producto3_casostotales_diff__std
standard form with linear regression
reading table producto53_confirmados_nacional from schema producto53


  result = getattr(ufunc, method)(*inputs, **kwargs)


adding descriptive stats to table: producto53_confirmados_nacional
reading table producto53_confirmados_provinciale from schema producto53


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nuevos['Promedio']=nuevos.mean(axis=1)


adding descriptive stats to table: producto53_confirmados_provinciale
reading table producto53_confirmados_regionale from schema producto53


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nuevos['Min']=nuevos.min(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nuevos['Max']=nuevos.max(axis=1)


adding descriptive stats to table: producto53_confirmados_regionale
calculando casos activos con rolling avg de 14 días
new name: producto53_confirmados_regionale_casos_activos
reading table producto53_confirmados_ from schema producto53
adding descriptive stats to table: producto53_confirmados_


In [16]:
derived_tables['producto53'].keys()

dict_keys(['producto53_confirmados_nacional', 'producto53_confirmados_provinciale', 'producto53_confirmados_regionale', 'producto53_confirmados_regionale_casos_activos', 'producto53_confirmados_'])

In [19]:
test=derived_tables['producto53']['producto53_confirmados_regionale_casos_activos']

In [20]:
test

Unnamed: 0,confirmados.1,confirmados.2,confirmados.3,confirmados.4,confirmados.5,confirmados.6,confirmados.7,confirmados.8,confirmados.9,confirmados.10,...,fecha,Region,Region_Metropolitana,index,Codigo region,Country Code,Code,Latitude,Longitude,Poblacion
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,2020-03-01,Tarapacá,Resto de Chile,0,1,CL,CL-TA,-20.202880,-69.287754,382773.0
1,2.0,1.0,0.0,0.0,0.0,3.0,1.0,0.0,2.0,1.0,...,2020-03-02,Tarapacá,Resto de Chile,1,1,CL,CL-TA,-20.202880,-69.287754,382773.0
2,2.0,1.0,0.0,0.0,0.0,3.0,1.0,0.0,2.0,1.0,...,2020-03-03,Tarapacá,Resto de Chile,2,1,CL,CL-TA,-20.202880,-69.287754,382773.0
3,2.0,1.0,0.0,0.0,0.0,4.0,3.0,0.0,2.0,1.0,...,2020-03-04,Tarapacá,Resto de Chile,3,1,CL,CL-TA,-20.202880,-69.287754,382773.0
4,2.0,2.0,0.0,0.0,0.0,4.0,5.0,1.0,2.0,1.0,...,2020-03-05,Tarapacá,Resto de Chile,4,1,CL,CL-TA,-20.202880,-69.287754,382773.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,2029.0,2071.0,2196.0,2095.0,2070.0,2061.0,2247.0,2150.0,2077.0,2092.0,...,2020-09-12,Magallanes,Resto de Chile,3195,12,CL,CL-MA,-52.368047,-70.986285,178362.0
3196,1985.0,2057.0,2161.0,2085.0,2048.0,2036.0,2189.0,2148.0,2011.0,2055.0,...,2020-09-13,Magallanes,Resto de Chile,3196,12,CL,CL-MA,-52.368047,-70.986285,178362.0
3197,1920.0,2062.0,2095.0,2026.0,2014.0,2025.0,2181.0,2085.0,1959.0,1988.0,...,2020-09-14,Magallanes,Resto de Chile,3197,12,CL,CL-MA,-52.368047,-70.986285,178362.0
3198,1872.0,2030.0,2008.0,1971.0,1974.0,1991.0,2089.0,1988.0,1915.0,1901.0,...,2020-09-15,Magallanes,Resto de Chile,3198,12,CL,CL-MA,-52.368047,-70.986285,178362.0


In [21]:
for schema in derived_tables.keys():
    for name in derived_tables[schema].keys():
        try:
            df=derived_tables[schema][name]
            name=name.replace('-','_')
            print("creating table "+name+' ,schema: '+schema)
            df.to_sql(name, schema=schema,con=cnx,if_exists='replace')
            print("saving table"+path+name+timestamp+'.csv in cache')
            df.to_csv(path+name+timestamp+'.csv',encoding='utf-8')
        except Exception as e: 
            print(str(e))
            pass

creating table producto13_casosnuevos_diff_ ,schema: producto13
saving table/data/ETLcache/producto13_casosnuevos_diff__20092020_225701.csv in cache
creating table producto13_casosnuevos_diff__t ,schema: producto13
saving table/data/ETLcache/producto13_casosnuevos_diff__t_20092020_225701.csv in cache
creating table producto13_casosnuevos_diff__std ,schema: producto13
saving table/data/ETLcache/producto13_casosnuevos_diff__std_20092020_225701.csv in cache
creating table producto14_fallecidos_diff_ ,schema: producto14
saving table/data/ETLcache/producto14_fallecidos_diff__20092020_225701.csv in cache
creating table producto14_fallecidos_diff__t ,schema: producto14
saving table/data/ETLcache/producto14_fallecidos_diff__t_20092020_225701.csv in cache
creating table producto14_fallecidos_diff__std ,schema: producto14
saving table/data/ETLcache/producto14_fallecidos_diff__std_20092020_225701.csv in cache
creating table producto19_CasosActivosPorComuna_std ,schema: producto19
saving table/dat

In [22]:
os.system('jupyter nbconvert --output /home/jovyan/work/ETLdocs/' + 'ETL_covid-chile.html' + ' --to html ' + 
          '/home/jovyan/work/ETL/covid-chile.ipynb')

65280