# lab_params.py

In [1]:
#!/usr/bin/env python

import os

# DB config
# NOTE(review): connect_db() in lab_utils passes only DB_NAME to
# sqlite3.connect(); the host/user/password/port values below look like
# leftovers from a PostgreSQL setup -- confirm whether they are still needed.
DB_HOST = 'localhost'
DB_NAME = 'postgres'
DB_USER = 'postgres'
DB_PASSWORD = 'postgres'
# Default is 5432; if some service was already using that port then
# PostgreSQL's port may be 5433 instead.
DB_PORT = 5432

# File paths
RAW_DATA_FOLDER_PATH = 'raw_data'
MOVE_RANGE_FILE_PATH = os.path.join(RAW_DATA_FOLDER_PATH, 'movement-range-2021-09-09.txt')
# Joined onto the raw-data folder: the previous value was joined onto
# MOVE_RANGE_FILE_PATH, which is a file, so the resulting path could not exist.
COUNTRY_REGION_FILE_PATH = os.path.join(RAW_DATA_FOLDER_PATH, 'country_regions.txt')

# Table names
# TEST_PREFIX is prepended to every table name defined here.
TEST_PREFIX = ''
MOVE_RANGE_TABLE_NAME = f'{TEST_PREFIX}rpl_move_range'
COVID_SURVEY_TABLE_NAME = f'{TEST_PREFIX}rpl_covid_survey'

# Time range (for COVID survey API)
FROM_DATE = '2021-01-01'
TO_DATE = '2021-06-30'

# Other parameters
SURVEY_INDICATORS = [
    'mask',
    'covid',
    'tested_positive_14d',
    'anosmia',
    'covid_vaccine',
]

# lab_utils.py

In [2]:
#!/usr/bin/env python

import pandas as pd
import luigi
import sqlite3
import csv

from lab_params import *


# Returns objects to interact with database
def connect_db():
    """Open the SQLite database named by ``DB_NAME``.

    Returns:
        A ``(connection, cursor)`` tuple. The caller owns the connection
        and is expected to close it.
    """
    connection = sqlite3.connect(DB_NAME)
    return connection, connection.cursor()


# Creates empty table in database
def create_table(table_name, table_schema, drop_if_exists=False):
    """Create ``table_name`` in the database if it does not exist yet.

    Args:
        table_name: Name of the table to create.
        table_schema: Iterable of ``(column_name, column_type)`` pairs.
        drop_if_exists: When true, drop any existing table of that name
            first, so the table is recreated empty.

    Every executed statement is printed before it runs. Table and column
    names are interpolated into the SQL text (identifiers cannot be bound
    as query parameters), so they must come from trusted code.
    """
    cols = ',\n'.join(f'{col_name} {col_type}' for col_name, col_type in table_schema)
    create_sql = f"""
        CREATE TABLE IF NOT EXISTS {table_name}(
            {cols}
        );
    """

    conn, cur = connect_db()
    try:
        if drop_if_exists:
            drop_sql = f'DROP TABLE IF EXISTS {table_name};'
            print(drop_sql)
            print('\n\n')
            cur.execute(drop_sql)

        print(create_sql)
        print('\n\n')
        cur.execute(create_sql)
        conn.commit()
    finally:
        # Release the connection even when a statement fails.
        conn.close()


# Loads data from file to table in database
def load_file_in_table(
    file_path,
    table_name,
    table_schema=None,
    sep='\t',
    skip_header=True,
    file_encoding='utf-8',
    run_create_table=False,
    overwrite_filter=None,
):
    """Append the rows of a delimited text file to a database table.

    Args:
        file_path: Path of the file to load.
        table_name: Destination table.
        table_schema: Optional iterable of ``(column_name, column_type)``
            pairs. Required when ``run_create_table`` is true; also used to
            name the columns when the file has no header row.
        sep: Field separator of the file.
        skip_header: True when the first line of the file is a header row
            (it then provides the column names). False when the first line
            is already data.
        file_encoding: Text encoding of the file.
        run_create_table: When true, create the table first if missing.
        overwrite_filter: Optional SQL condition; matching rows are deleted
            before loading so that re-running a load does not duplicate data.

    Raises:
        ValueError: If ``run_create_table`` is true but no schema is given.

    The delete and the insert share one connection and are committed
    together; if loading fails the connection is closed without commit, so
    the previously stored rows are kept.
    """
    if run_create_table:
        if table_schema is None:
            raise ValueError('table_schema is required when run_create_table=True')
        create_table(table_name, table_schema)

    column_names = [col[0] for col in table_schema] if table_schema else None

    conn, cur = connect_db()
    try:
        if overwrite_filter:
            sql = f'DELETE FROM {table_name} WHERE {overwrite_filter}'
            print(sql)
            cur.execute(sql)
            print('\n\n')

        print('-----------------------\nFile loading: STARTED\n-----------------------\n')
        if column_names:
            # Informational only: the actual insert is delegated to pandas.
            insert_statement = f"INSERT INTO {table_name} ({', '.join(column_names)}) VALUES({', '.join(['?' for _ in column_names])})"
            print(insert_statement)
        df = pd.read_csv(
            file_path,
            sep=sep,
            header=0 if skip_header else None,
            names=None if skip_header else column_names,
            encoding=file_encoding,
        )
        df.to_sql(table_name, con=conn, if_exists='append', index=False)
        conn.commit()
    finally:
        # Release the connection even when the delete or the load fails.
        conn.close()
    print('-----------------------\nFile loading: FINISHED\n-----------------------\n')


# Runs query on database, returns output as pandas.DataFrame
def query_db(query):
    """Run ``query`` against the database and return the result set.

    Args:
        query: SQL text of a statement that returns rows.

    Returns:
        A ``pandas.DataFrame`` holding the rows returned by the query.
    """
    conn, _ = connect_db()
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Close the connection even when the query itself fails.
        conn.close()


# Runs query, returns nothing
def run_query(query, verbose=False):
    """Execute a statement for its side effects and commit it.

    Args:
        query: SQL text to execute.
        verbose: When true, print the statement before running it.

    Returns:
        None.
    """
    conn, cur = connect_db()
    try:
        if verbose:
            print(query)
        cur.execute(query)
        conn.commit()
    finally:
        # Close the connection even when the statement fails.
        conn.close()


# Returns a dataframe listing all tables in database
def get_tables_list():
    """Return a DataFrame with one ``name`` row per table in the database."""
    tables_query = "SELECT name FROM sqlite_master WHERE type='table'"
    return query_db(tables_query)


# Returns True if table exists, False otherwise
def table_exists(table_name):
    """Return True when a table called ``table_name`` is in the database."""
    known_tables = {name for name in get_tables_list()["name"]}
    return table_name in known_tables


# ##########################
# Luigi auxiliary utilities
# ##########################

class TableExists(luigi.Target):
    """Luigi target that is satisfied once a given table is in the database."""

    def __init__(self, table_name):
        super().__init__()
        # Name of the table whose presence marks the target as complete.
        self.table_name = table_name

    def exists(self):
        """Return True when ``self.table_name`` is present in the database."""
        is_present = table_exists(self.table_name)
        return is_present


class DataExists(luigi.Target):
    """Luigi target satisfied once a table holds a row matching a condition."""

    def __init__(self, table_name, where_clause):
        super().__init__()
        # Table to probe and the SQL condition a row must satisfy.
        self.table_name = table_name
        self.where_clause = where_clause

    def exists(self):
        """Return True when the table exists and has at least one matching row."""
        # Guard: without the table there is nothing to query.
        if not TableExists(self.table_name).exists():
            return False

        print(f'LOGGING: Table exists: {self.table_name}')
        probe_query = f'SELECT * FROM {self.table_name} WHERE {self.where_clause} LIMIT 1'
        matching_rows = query_db(probe_query)
        return matching_rows.size > 0


class DummyOutput(luigi.Target):
    """Luigi target whose completion state is fixed by a constructor flag.

    Args:
        dummy_variable: Truthy to report the target as existing, falsy
            (the default) to report it as missing.
    """

    def __init__(self, dummy_variable=False):
        super().__init__()
        self.dummy_variable = dummy_variable

    def exists(self):
        """Print the flag and return it as a bool.

        ``exists`` is the method Luigi calls on a target to decide whether
        it is complete; the original spelling ``exits`` never provided it.
        """
        print(f'DummyOutput: {self.dummy_variable}')
        return bool(self.dummy_variable)

    # Backward-compatible alias for the original, misspelled method name.
    exits = exists


# #########################
# Other auxiliary functions
# #########################

def get_indicator_code(indicator):
    """Translate a survey indicator name into its short code.

    Indicators without a known short code are returned unchanged.
    """
    short_codes = dict(
        flu='ili',
        mask='mc',
        contact='dc',
        finance='hf',
        anosmia='anos',
        vaccine_acpt='vu',
    )
    if indicator in short_codes:
        return short_codes[indicator]
    return indicator


In [3]:
import luigi
import requests
import json
import os
import pandas as pd

# rpl_covid_survey.py

In [5]:
# ###################
# AUXILIARY FUNCTIONS
# ###################

# Renders the file path for the report to download
def get_file_path(indicator, country, date):
    """Build the raw-data file path for one indicator/country/date report.

    ``date`` is expected as a dash-separated string; the dashes are removed
    and the compact date appears twice in the file name.
    """
    compact_date = ''.join(date.split('-'))
    file_name = f'covid_survey__{indicator}__{country}__{compact_date}__{compact_date}.txt'
    return os.path.join(RAW_DATA_FOLDER_PATH, file_name)

# Renders the table name for each different indicator
def get_table_name(indicator, test_prefix):
    """Return the per-indicator table name: prefix + base name + '_' + indicator."""
    return '{}{}_{}'.format(test_prefix, COVID_SURVEY_TABLE_NAME, indicator)


# ##############
# PIPELINE TASKS
# ##############

class CreateTable(luigi.Task):
    """
    Creates the per-indicator survey table in the database.

    Complete once the table named by get_table_name() exists.
    """
    test_prefix = luigi.Parameter(default='')
    indicator = luigi.Parameter()

    def _target_table(self):
        # Table this task creates and checks for.
        return get_table_name(self.indicator, self.test_prefix)

    def run(self):
        code = get_indicator_code(self.indicator)
        # The 'covid' indicator uses the 'pct' column prefix; all others 'percent'.
        if self.indicator == 'covid':
            prefix = 'pct'
        else:
            prefix = 'percent'

        measure_columns = (
            (f'{prefix}_{code}', 'float'),
            (f'{code}_se', 'float'),
            (f'{prefix}_{code}_unw', 'float'),
            (f'{code}_se_unw', 'float'),
        )
        descriptor_columns = (
            ('sample_size', 'NUMERIC'),
            ('country', 'text'),
            ('iso_code', 'text'),
            ('gid_0', 'text'),
            ('survey_date', 'NUMERIC'),
        )
        create_table(self._target_table(), measure_columns + descriptor_columns)

    def output(self):
        return TableExists(self._target_table())


class DownloadAPIReport(luigi.Task):
    """
    Downloads one daily report from the survey API into a tab-separated file.

    Parameters:
        indicator: Survey indicator name, passed to the API as-is.
        country: Country name, passed to the API as-is.
        date: Report date as a dash-separated string (e.g. '2021-01-01').
    """
    indicator = luigi.Parameter()
    country = luigi.Parameter()
    date = luigi.Parameter()

    def run(self):
        # Create the folder if it does not exist yet. exist_ok avoids a
        # FileExistsError when several workers reach this point together.
        os.makedirs(RAW_DATA_FOLDER_PATH, exist_ok=True)
        file_path = get_file_path(self.indicator, self.country, self.date)
        # Get params for API call
        date_no_dash = self.date.replace('-', '')
        url = f"https://covidmap.umd.edu/api/resources?indicator={self.indicator}&type=daily&country={self.country}&daterange={date_no_dash}-{date_no_dash}"
        # Call API and save the response as a tab-separated file
        print("CALLING API: ", url)
        # A timeout keeps a stalled connection from blocking the worker forever.
        response = requests.get(url, timeout=60)
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()
        response_dict = json.loads(response.text)
        df = pd.DataFrame.from_dict(response_dict['data'])
        # Write to a temporary file and rename it, so an interrupted write
        # never leaves a partial file that output() would report as complete.
        tmp_path = f'{file_path}.tmp'
        df.to_csv(tmp_path, sep='\t', index=False, encoding='utf-8')
        os.replace(tmp_path, file_path)

    def output(self):
        file_path = get_file_path(self.indicator, self.country, self.date)
        return luigi.LocalTarget(file_path)
        

class LoadReportIntoDB(luigi.Task):
    """
    Loads a downloaded report file into the per-indicator database table.

    Requires the report file (DownloadAPIReport) and the destination table
    (CreateTable). Rows already stored for the same country and date are
    deleted before loading, so re-running the task does not duplicate data.
    """
    test_prefix = luigi.Parameter(default='')
    indicator = luigi.Parameter()
    country = luigi.Parameter()
    date = luigi.Parameter()

    def get_sql_filter(self):
        """Return the SQL condition selecting this task's country and date."""
        return f"country='{self.country}' AND survey_date = {self.date.replace('-','')}"

    def requires(self):
        yield DownloadAPIReport(indicator=self.indicator, country=self.country, date=self.date)
        yield CreateTable(indicator=self.indicator, test_prefix=self.test_prefix)

    def run(self):
        file_path = get_file_path(self.indicator, self.country, self.date)
        indicator_code = get_indicator_code(self.indicator)
        # Same column-prefix rule as CreateTable, so the schema described
        # here matches the table that was actually created.
        percent = 'pct' if self.indicator == 'covid' else 'percent'
        table_schema = (
            (f'{percent}_{indicator_code}', 'float'),
            (f'{indicator_code}_se', 'float'),
            (f'{percent}_{indicator_code}_unw', 'float'),
            (f'{indicator_code}_se_unw', 'float'),
            ('sample_size', 'NUMERIC'),
            ('country', 'text'),
            ('iso_code', 'text'),
            ('gid_0', 'text'),
            ('survey_date', 'NUMERIC'),
        )
        load_file_in_table(
            file_path,
            get_table_name(self.indicator, self.test_prefix),
            table_schema=table_schema,
            overwrite_filter=self.get_sql_filter(),
            skip_header=True,
        )

    def output(self):
        return DataExists(table_name=get_table_name(self.indicator, self.test_prefix), where_clause=self.get_sql_filter())


class MasterTask(luigi.WrapperTask):
    """
    Fans out one LoadReportIntoDB task per (indicator, country) pair for a
    single date.
    """
    test_prefix = luigi.Parameter(default='')
    date = luigi.Parameter()

    def requires(self):
        indicators = ('mask', 'covid')
        countries = ('Germany', 'Japan')
        # Indicators vary slowest, countries fastest.
        yield from (
            LoadReportIntoDB(
                indicator=indicator,
                country=country,
                date=self.date,
                test_prefix=self.test_prefix,
            )
            for indicator in indicators
            for country in countries
        )


if __name__ == '__main__':
    # Run the whole pipeline for one date with the in-process scheduler.
    master_task = MasterTask(date='2021-01-01', test_prefix='test_1_')
    luigi.build([master_task], workers=5, local_scheduler=True)


DEBUG: Checking if MasterTask(test_prefix=test_1_, date=2021-01-01) is complete
DEBUG: Checking if LoadReportIntoDB(test_prefix=test_1_, indicator=mask, country=Germany, date=2021-01-01) is complete
DEBUG: Checking if LoadReportIntoDB(test_prefix=test_1_, indicator=mask, country=Japan, date=2021-01-01) is complete
DEBUG: Checking if LoadReportIntoDB(test_prefix=test_1_, indicator=covid, country=Germany, date=2021-01-01) is complete
DEBUG: Checking if LoadReportIntoDB(test_prefix=test_1_, indicator=covid, country=Japan, date=2021-01-01) is complete
INFO: Informed scheduler that task   MasterTask_2021_01_01_test_1__53563b9853   has status   PENDING
DEBUG: Checking if DownloadAPIReport(indicator=covid, country=Japan, date=2021-01-01) is complete
DEBUG: Checking if CreateTable(test_prefix=test_1_, indicator=covid) is complete
INFO: Informed scheduler that task   LoadReportIntoDB_Japan_2021_01_01_covid_c26c5ff14b   has status   PENDING
INFO: Informed scheduler that task   CreateTable_covid_

LOGGING: Table exists: test_1_rpl_covid_survey_mask
LOGGING: Table exists: test_1_rpl_covid_survey_mask
LOGGING: Table exists: test_1_rpl_covid_survey_mask
LOGGING: Table exists: test_1_rpl_covid_survey_covid
LOGGING: Table exists: test_1_rpl_covid_survey_covid


DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 8
DEBUG: Asking scheduler for work...
INFO: [pid 13068] Worker Worker(salt=655874230, workers=5, host=cabanzon-mbp, username=cabanzon, pid=11910) running   DownloadAPIReport(indicator=covid, country=Japan, date=2021-01-01)
DEBUG: Pending tasks: 7


CALLING API:  https://covidmap.umd.edu/api/resources?indicator=covid&type=daily&country=Japan&daterange=20210101-20210101
CALLING API: 

DEBUG: Asking scheduler for work...
INFO: [pid 13069] Worker Worker(salt=655874230, workers=5, host=cabanzon-mbp, username=cabanzon, pid=11910) running   DownloadAPIReport(indicator=covid, country=Germany, date=2021-01-01)


 

DEBUG: Pending tasks: 6


https://covidmap.umd.edu/api/resources?indicator=covid&type=daily&country=Germany&daterange=20210101-20210101


objc[13068]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called.
objc[13068]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called. We cannot safely call it or ignore it in the fork() child process. Crashing instead. Set a breakpoint on objc_initializeAfterForkError to debug.
INFO: [pid 13070] Worker Worker(salt=655874230, workers=5, host=cabanzon-mbp, username=cabanzon, pid=11910) running   DownloadAPIReport(indicator=mask, country=Japan, date=2021-01-01)
DEBUG: Asking scheduler for work...
objc[13069]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called.
objc[13069]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called. We cannot safely call it or ignore it in the fork() child process. Crashing instead. Set a breakpoint on objc_initializeAfterForkError to debug.


CALLING API: 

DEBUG: Done


 

DEBUG: There are no more tasks to run at this time


https://covidmap.umd.edu/api/resources?indicator=mask&type=daily&country=Japan&daterange=20210101-20210101

DEBUG: DownloadAPIReport_Japan_2021_01_01_covid_ab4ec58261 is currently run by worker Worker(salt=655874230, workers=5, host=cabanzon-mbp, username=cabanzon, pid=11910)





INFO: [pid 13071] Worker Worker(salt=655874230, workers=5, host=cabanzon-mbp, username=cabanzon, pid=11910) running   DownloadAPIReport(indicator=mask, country=Germany, date=2021-01-01)
DEBUG: DownloadAPIReport_Germany_2021_01_01_covid_68845450a4 is currently run by worker Worker(salt=655874230, workers=5, host=cabanzon-mbp, username=cabanzon, pid=11910)
objc[13070]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called.
objc[13070]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called. We cannot safely call it or ignore it in the fork() child process. Crashing instead. Set a breakpoint on objc_initializeAfterForkError to debug.


CALLING API: 

DEBUG: DownloadAPIReport_Japan_2021_01_01_mask_376d5d59c5 is currently run by worker Worker(salt=655874230, workers=5, host=cabanzon-mbp, username=cabanzon, pid=11910)
DEBUG: DownloadAPIReport_Germany_2021_01_01_mask_90a754514d is currently run by worker Worker(salt=655874230, workers=5, host=cabanzon-mbp, username=cabanzon, pid=11910)


 https://covidmap.umd.edu/api/resources?indicator=mask&type=daily&country=Germany&daterange=20210101-20210101

INFO: Task DownloadAPIReport_Japan_2021_01_01_covid_ab4ec58261 died unexpectedly with exit code -6





INFO: Task DownloadAPIReport_Germany_2021_01_01_covid_68845450a4 died unexpectedly with exit code -6
INFO: Task DownloadAPIReport_Japan_2021_01_01_mask_376d5d59c5 died unexpectedly with exit code -6
objc[13071]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called.
objc[13071]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called. We cannot safely call it or ignore it in the fork() child process. Crashing instead. Set a breakpoint on objc_initializeAfterForkError to debug.
INFO: Informed scheduler that task   DownloadAPIReport_Japan_2021_01_01_covid_ab4ec58261   has status   FAILED
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
DEBUG: DownloadAPIReport_Germany_2021_01_01_covid_68845450a4 is currently run by worker Worker(salt=655874230, workers=5, host=cabanzon-mbp, username=cabanzon, pid=11910)
DEBUG: DownloadAPIReport_Japan_2021_01_01_m

# covid_survey_covid_mask.py

# covid_survey_covid_mask_2.py

# covid_survey_json.py