## Python 1 Project
#### IE MBD Apr 2024
#### Group 4

In [25]:
import pandas as pd
import sqlalchemy as sql

In [26]:
def read(path: str = None, db_config: dict = None) -> pd.DataFrame:
    """
    Load a dataset from a CSV file or from the "COVID_DATA" database table.

    Exactly one of the two sources must be supplied.

    Parameters
    ----------
    path : str, optional
        Location of a CSV file; passed straight to ``pd.read_csv``.
    db_config : optional
        Database connection passed to ``pd.read_sql_table`` as its ``con``
        argument. NOTE(review): annotated as ``dict``, but presumably a
        SQLAlchemy engine/connectable in practice -- confirm with callers.

    Returns
    -------
    pd.DataFrame
        The contents of the CSV file or of the "COVID_DATA" table.

    Raises
    ------
    ValueError
        If neither ``path`` nor ``db_config`` is given.
    TypeError
        If both ``path`` and ``db_config`` are given.
    """
    # Identity checks: `== None` can be hijacked by objects overloading `==`.
    has_path = path is not None
    has_db = db_config is not None

    if has_path and has_db:
        raise TypeError("What should we read???")
    if has_db:
        return pd.read_sql_table("COVID_DATA", db_config)
    if has_path:
        return pd.read_csv(path)

    # Previously this case fell through to pd.read_csv(None), which failed
    # with an opaque pandas error; fail early with a clear message instead.
    raise ValueError("Nothing to read: provide either `path` or `db_config`")

In [27]:
def processFiles(paths: dict = None, dbConfig: dict = None) -> list:
    """
    Read every configured file and drop its completely empty columns.

    Parameters
    ----------
    paths : dict, optional
        Mapping with two keys: ``'path_root'`` (a string prefix) and
        ``'lst'`` (an iterable of file names). Each file is read from
        ``paths['path_root'] + name`` via ``read``.
    dbConfig : dict, optional
        Currently unused; accepted only so the signature stays stable.

    Returns
    -------
    list
        One DataFrame per entry of ``paths['lst']``, in the same order, with
        all-NaN columns removed. Empty when ``paths`` is None.
    """
    if paths is None:
        return []

    frames = []
    for name in paths['lst']:
        raw_df = read(paths['path_root'] + name)
        # Columns with no values at all carry no information for the merge.
        frames.append(raw_df.dropna(axis=1, how="all"))

    return frames



In [37]:
def joinFrames(df_list: list) -> pd.DataFrame:
    """
    Outer-join the frames and reduce them to one row per country and week.

    The frames are merged left to right. A pair is joined on
    ``["location_key", "date"]`` when both sides have a ``date`` column and on
    ``"location_key"`` alone otherwise. Identifier columns are then dropped,
    ``date`` is parsed to datetimes, all-NaN columns are removed, and the rows
    are grouped by ``country_name`` and by week (``freq='W-MON'``), keeping the
    last non-null value of every column within each group.

    Parameters
    ----------
    df_list : list
        DataFrames to combine. Every frame needs a ``location_key`` column,
        and the merged result must contain ``date``, ``country_name`` and all
        of the identifier columns that are dropped below.

    Returns
    -------
    pd.DataFrame
        Frame indexed by ``(country_name, date)``.

    Raises
    ------
    ValueError
        If ``df_list`` is empty.
    KeyError
        If an expected column is missing from the merged frame.
    """
    if not df_list:
        # The old integer-sentinel approach crashed here with an unrelated
        # AttributeError ('int' object has no attribute 'drop').
        raise ValueError("joinFrames needs at least one DataFrame")

    merged, *others = df_list
    for frame in others:
        both_dated = 'date' in frame.columns and 'date' in merged.columns
        join_keys = ["location_key", "date"] if both_dated else ["location_key"]
        merged = merged.merge(frame, on=join_keys, how="outer")

    identifier_columns = [
        'place_id', 'wikidata_id',
        'datacommons_id', 'country_code', 'subregion1_code',
        'subregion1_name', 'subregion2_code', 'subregion2_name',
        'locality_code', 'locality_name', 'iso_3166_1_alpha_2',
        'iso_3166_1_alpha_3', 'aggregation_level', "location_key",
    ]
    # NOTE(review): 'aggregation_level' and the subregion columns are dropped
    # before grouping, so any sub-national rows end up grouped under their
    # country and `.last()` keeps whichever row happens to be last. Confirm
    # this is intended.
    merged = merged.drop(identifier_columns, axis=1)

    merged['date'] = pd.to_datetime(merged['date'])
    merged = merged.dropna(axis=1, how="all")

    weekly = pd.Grouper(key='date', freq='W-MON')
    return merged.groupby(["country_name", weekly]).last()

In [29]:
def write(df: pd.DataFrame, dbConfig: dict = None, path: str = None):
    """
    Persist the frame to a database table or to a CSV file.

    A truthy ``dbConfig`` takes precedence: the frame is stored with
    ``df.to_sql("COVID_DATA", dbConfig)`` and None is returned. Otherwise a
    truthy ``path`` makes the frame go to ``df.to_csv(path)`` and 1 is
    returned. With neither destination a KeyError is raised.
    """
    if dbConfig:
        # Database destination wins even when a path is supplied as well.
        df.to_sql("COVID_DATA", dbConfig)
        return None

    if path:
        df.to_csv(path)
        return 1

    raise KeyError("Nowhere to Write")


In [30]:
def plot(df: pd.DataFrame) -> None:
    """
    Plot relevant data from the DF.

    NOTE: not implemented yet -- the body consists solely of this docstring,
    so calling the function does nothing and returns None.
    """

In [38]:
def createData() -> pd.DataFrame:
    """
    Build the combined dataset from the raw files and save it as a CSV.

    Reads the six files named in ``files_config`` from ``./data/`` via
    ``processFiles``, merges them with ``joinFrames`` and writes the result to
    ``./data/combined_dataset.csv`` via ``write``.

    Returns
    -------
    pd.DataFrame
        The merged frame that was written to disk.
    """
    # TODO: optional database output -- create a SQLAlchemy engine and pass it
    # to write() as `dbConfig`. Keep credentials out of the source file.

    files_config = {
        'lst': [
            "demographics",
            "epidemiology",
            "health",
            "hospitalizations",
            "index",
            "vaccinations",
        ],
        'path_root': "./data/",
    }
    clean_dfs = processFiles(files_config)
    merged_dfs = joinFrames(clean_dfs)
    write(merged_dfs, path="./data/combined_dataset.csv")

    return merged_dfs

# Run the whole pipeline once; `df` is inspected in the cells that follow.
df = createData()

In [39]:
# Preview the first rows of `df`.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,population,population_male,population_female,population_density,population_age_00_09,population_age_10_19,population_age_20_29,population_age_30_39,population_age_40_49,population_age_50_59,...,cumulative_deceased,cumulative_recovered,cumulative_tested,life_expectancy,new_hospitalized_patients,cumulative_hospitalized_patients,current_hospitalized_patients,current_intensive_care_patients,new_persons_fully_vaccinated,cumulative_persons_fully_vaccinated
country_name,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Germany,2020-01-06,412120.0,202817.0,209303.0,390.5,36916.0,41026.0,45473.0,46678.0,50565.0,71914.0,...,0.0,1.0,,,,,,,,
Germany,2020-01-20,583109.0,283005.0,300104.0,2817.6,53673.0,51504.0,77365.0,77554.0,71304.0,88329.0,...,0.0,1.0,,,,,,,,
Germany,2020-01-27,329708.0,162003.0,167705.0,802.7,27365.0,28017.0,42946.0,41198.0,37300.0,53495.0,...,0.0,1.0,,,,,,,,
Germany,2020-02-03,132206.0,64915.0,67291.0,531.0,10460.0,11172.0,13912.0,15199.0,15136.0,23606.0,...,0.0,1.0,,,,,,,,
Germany,2020-02-10,451007.0,219214.0,231793.0,795.4,42978.0,43755.0,45263.0,53264.0,59425.0,76859.0,...,0.0,1.0,,,,,,,,


In [40]:
# List the column labels of `df`.
df.columns

Index(['population', 'population_male', 'population_female',
       'population_density', 'population_age_00_09', 'population_age_10_19',
       'population_age_20_29', 'population_age_30_39', 'population_age_40_49',
       'population_age_50_59', 'population_age_60_69', 'population_age_70_79',
       'population_age_80_and_older', 'new_confirmed', 'new_deceased',
       'new_recovered', 'new_tested', 'cumulative_confirmed',
       'cumulative_deceased', 'cumulative_recovered', 'cumulative_tested',
       'life_expectancy', 'new_hospitalized_patients',
       'cumulative_hospitalized_patients', 'current_hospitalized_patients',
       'current_intensive_care_patients', 'new_persons_fully_vaccinated',
       'cumulative_persons_fully_vaccinated'],
      dtype='object')