In [None]:
#@markdown <b>Run me to import underscore module</b><br/>   {display-mode: "form"}
#@markdown <small>Method signatures:</small><br/> 
#@markdown <small><small>&nbsp; &nbsp; &nbsp; _(source_path, target_path)</small></small><br/>
#@markdown <small><small>&nbsp; &nbsp; &nbsp; _set_gh_token(token)</small></small><br/>
#@markdown <small><small>&nbsp; &nbsp; &nbsp; _from_gh(user_name, repo_name, release_name) &nbsp; &nbsp; &nbsp; <b>Returns:</b> dictionary of arrays { 'array_name' : np.ndarray }</small></small><br/>
#@markdown <small><small>&nbsp; &nbsp; &nbsp; _to_gh(user_name, repo_name, release_name, split_size=600, **arr_kwargs)</small></small><br/>

!pip install -q githubrelease
import numpy as np
import os, glob, re, time
import github_release

# Names of log directories that _log_to_gh has already compressed.
# _log_to_gh checks this set so that a directory is not compressed again on a later call.
compressed_dirs = set()


def _compress(source_path, target_path, target_dir=None):
    if target_dir:
        !mkdir -p {target_dir}
    if target_path.endswith('.tar.gz'):
        !tar -czf {target_path} -C {source_path} .
    elif target_path.endswith('.tar'):
        !tar -cf {target_path} -C {source_path} .
    elif target_path.endswith('.zip'):
        !(cd {source_path} && zip -q -r {target_path} .)


def _extract(source_path, target_path):
    !mkdir -p {target_path}
    if source_path.endswith('.tar.gz'):
        !tar -xzf {source_path} -C {target_path}
    elif source_path.endswith('.tar'):
        !tar -xf {source_path} -C {target_path}
    elif source_path.endswith('.zip'):
        !unzip -qq {source_path} -d {target_path}


def _(source_path, target_path):
    """
    Use cases:
        Movement:
            - GCS -> GCS
            - GCS -> LOCAL
            - LOCAL -> GCS
            - LOCAL -> LOCAL
            
        Compression (e.g. from dir to .tar.gz):
            - GCS -> GCS
            - GCS -> LOCAL
            - LOCAL -> GCS
            - LOCAL -> LOCAL
            
        Extraction (e.g. from .zip to dir):
            - GCS -> GCS
            - GCS -> LOCAL
            - LOCAL -> GCS
            - LOCAL -> LOCAL
            
        Extraction & compression (e.g. from .zip to .tar.gz):
            - GCS -> GCS
            - GCS -> LOCAL
            - LOCAL -> GCS
            - LOCAL -> LOCAL
    """
    COMPRESSION_FORMATS = ('zip', 'tar', 'tar.gz')
    TEMP_DIR = "/tmp_"
    LOG_TEMPLATE = "{}    from    {}    to    {}"

    # Source
    source_dir, _, source_name = source_path.rpartition('/')
    source_isgcs = source_path.startswith("gs://")
    source_islocal = not source_isgcs
    source_isprefix, source_isfile, source_ext = source_name.partition('.')
    source_isdir = not source_isfile
    source_iscompression = source_ext in COMPRESSION_FORMATS

    # Target
    target_dir, _, target_name = target_path.rpartition('/')
    target_isgcs = target_path.startswith("gs://")
    target_islocal = not target_isgcs
    target_prefix, target_isfile, target_ext = target_name.partition('.')
    target_isdir = not target_isfile
    target_iscompression = target_ext in COMPRESSION_FORMATS

    # Flags
    MOVE_ONLY = source_ext == target_ext
    GCS_ONLY = source_isgcs and target_isgcs
    RENAME = source_isprefix != target_prefix
    COMPRESSION = source_isdir and target_iscompression
    EXTRACTION = source_iscompression and target_isdir
    EXTRACTION_COMPRESSION = source_iscompression and target_iscompression and source_ext != target_ext

    # Authenticate if writing to GCS
    if target_isgcs:
        from google.colab import auth
        auth.authenticate_user()

    # Assert that subdirectories exist if target is local
    if target_islocal:
        !mkdir -p {target_dir}

    # Movement commands
    if MOVE_ONLY:
        # GCS -> GCS
        if source_isgcs and target_isgcs:
            print(LOG_TEMPLATE.format("MOVING (1/1)", source_path, target_path))
            !gsutil -m -q mv {source_path} {target_path}
        
        # LOCAL -> LOCAL
        elif source_islocal and target_islocal:
            print(LOG_TEMPLATE.format("MOVING (1/1)", source_path, target_path))
            !mv {source_path} {target_path}
        
        # GCS -> LOCAL
        elif source_isgcs and target_islocal:
            if source_isdir:
                print(LOG_TEMPLATE.format("DOWNLOADING DIR (1/1)", source_path, target_dir))
                !gsutil -m -q cp -r {source_path} {target_dir}
                if RENAME:
                    print(LOG_TEMPLATE.format("\tRENAMING DIR", source_isprefix, target_prefix))
                    !mv {target_dir}/{source_isprefix} {target_dir}/{target_prefix}
            else:
                print(LOG_TEMPLATE.format("DOWNLOADING FILE (1/1)", source_path, target_path))
                !gsutil -m -q cp {source_path} {target_path}
        
        # LOCAL -> GCS
        if source_islocal and target_isgcs:
            if source_isdir:
                print(LOG_TEMPLATE.format("UPLOADING DIR (1/1)", source_path, target_path))
                !gsutil -m -q cp -r {source_path} {target_path}
            else:
                print(LOG_TEMPLATE.format("UPLOADING FILE (1/1)", source_path, target_path))
                !gsutil -m -q cp {source_path} {target_path}
        return


    # Create directory for intermediate storage if required
    if source_isgcs or target_isgcs or EXTRACTION_COMPRESSION:
        !mkdir -p {TEMP_DIR}
    

    # For remaining operations, download GCS source to temp and treat as local
    if source_isgcs:
        if source_isdir:
            print(LOG_TEMPLATE.format("\tDOWNLOADING DIR", source_path, TEMP_DIR))
            !gsutil -m -q cp -r {source_path} {TEMP_DIR}
        else:
            print(LOG_TEMPLATE.format("\tDOWNLOADING FILE", source_path, f"{TEMP_DIR}/{source_name}"))
            !gsutil -m -q cp {source_path} {TEMP_DIR}/{source_name}
        source_path = f"{TEMP_DIR}/{source_name}"
        source_dir = TEMP_DIR

    # Compression
    if COMPRESSION:
        if target_islocal:
            print(LOG_TEMPLATE.format("COMPRESSING (1/1)", source_path, target_path))
            _compress(source_path, target_path, target_dir=target_dir)
        else:
            print(LOG_TEMPLATE.format("COMPRESSING (1/2)", source_path, f"{TEMP_DIR}/{target_name}"))
            _compress(source_path, f"{TEMP_DIR}/{target_name}")
            print(LOG_TEMPLATE.format("UPLOADING FILE (2/2)", f"{TEMP_DIR}/{target_name}", target_path))
            !gsutil -m -q cp {TEMP_DIR}/{target_name} {target_path}

    # Extraction
    elif EXTRACTION:
        if target_islocal:
            print(LOG_TEMPLATE.format("EXTRACTING (1/1)", source_path, target_path))
            _extract(source_path, target_path)
        else:
            print(LOG_TEMPLATE.format("EXTRACTING (1/2)", source_path, f"{TEMP_DIR}/{target_name}"))
            _extract(source_path, f"{TEMP_DIR}/{target_name}")
            print(LOG_TEMPLATE.format("UPLOADING DIR (2/2)", f"{TEMP_DIR}/{target_name}", target_path))
            !gsutil -m -q cp -r {TEMP_DIR}/{target_name} {target_path}

    # Extraction & compression
    elif EXTRACTION_COMPRESSION:
        if target_islocal:
            print(LOG_TEMPLATE.format("EXTRACTING (1/2)", source_path, f"{TEMP_DIR}/{target_prefix}"))
            _extract(source_path, f"{TEMP_DIR}/{target_prefix}")
            print(LOG_TEMPLATE.format("COMPRESSING (2/2)", f"{TEMP_DIR}/{target_prefix}", target_path))
            _compress(f"{TEMP_DIR}/{target_prefix}", target_path, target_dir=target_dir)
        else:
            print(LOG_TEMPLATE.format("EXTRACTING (1/3)", source_path, f"{TEMP_DIR}/{target_prefix}"))
            _extract(source_path, f"{TEMP_DIR}/{target_prefix}")
            print(LOG_TEMPLATE.format("COMPRESSING (2/3)", f"{TEMP_DIR}/{target_prefix}", f"{TEMP_DIR}/{target_name}"))
            _compress(f"{TEMP_DIR}/{target_prefix}", f"{TEMP_DIR}/{target_name}")
            print(LOG_TEMPLATE.format("UPLOADING FILE (3/3)", f"{TEMP_DIR}/{target_name}", target_path))
            !gsutil -m -q cp {TEMP_DIR}/{target_name} {target_path}
    
    # Cleanup intermediate storage
    !rm -rf {TEMP_DIR}


def _set_gh_token(token):
    os.environ["GITHUB_TOKEN"] = token


def _export_array(array, release_name, prefix="", splits=3):
    dir_path = f"/tmp_/{release_name}"
    !mkdir -p {dir_path}
    n_digits = len(str(splits - 1))
    subarrays = np.array_split(array, splits)
    for i, subarray in enumerate(subarrays):
        filename = f"{prefix}__{str(i).zfill(n_digits)}.npy"
        np.save(f"{dir_path}/{filename}", subarray)


def _concat_arrays(paths):
    return np.concatenate([np.load(path, allow_pickle=True) for path in sorted(paths)])


def _to_gh(user_name, repo_name, release_name, split_size=600, **arr_kwargs):
    # Assert that GitHub Auth token is set
    if "GITHUB_TOKEN" not in os.environ:
        print("GitHub authentication token is not set.")
        print("Set token using the '_set_gh_token(token_string)' method.")
        print("Minimal required auth scope is 'repo/public_repo' for public repositories.")
        print("URL: https://github.com/settings/tokens/new")
        return

    # Split arrays
    for prefix, array in arr_kwargs.items():
        splits = int((array.nbytes/1_000_000) // split_size) + 1
        _export_array(array, release_name, prefix=prefix, splits=splits)

    # Upload arrays
    github_release.gh_release_create(
        f"{user_name}/{repo_name}", 
        release_name, 
        publish=True, 
        name=release_name, 
        asset_pattern=f"/tmp_/{release_name}/*"
    )
    !rm -rf /tmp_/*


def _from_gh(user_name, repo_name, release_name):
    # Download release to temporary directory
    print("Downloading dataset in parallell ... ", end='\t')
    t0 = time.perf_counter()
    assets = github_release.get_assets(f"{user_name}/{repo_name}", tag_name=release_name)
    download_urls = [asset['browser_download_url'] for asset in assets]
    urls_str = " ".join(download_urls)
    !echo {urls_str} | xargs -n 1 -P 8 wget -q -P /tmp_/{release_name}_dl/
    t1 = time.perf_counter()
    print(f"done! ({t1 - t0:.3f} seconds)")

    # Load data into numpy arrays
    paths = glob.glob(f"/tmp_/{release_name}_dl/*.npy")
    groups = {}
    for path in paths:
        match = re.match(r".*/(.*)__[0-9]*\.npy", path)
        if match:
            prefix = match.group(1)
            groups[prefix] = groups.get(prefix, []) + [path]
    arrays_dict = {name: _concat_arrays(paths) for name, paths in groups.items()}
    !rm -rf /tmp_/*
    return arrays_dict
    

def _log_to_gh(user, repo, tag, log_dir="/tmp/logs"):
    # Create temporary directory for compressed logs
    !mkdir -p /tmp/compressed_logs
    
    # Compress all directories in log dir
    for dirname in os.listdir(log_dir):
        # Skip files
        if "." in dirname or dirname in compressed_dirs:
            continue

        # Compress
        _(f"{log_dir}/{dirname}", f"/tmp/compressed_logs/{dirname}.tar.gz")
        compressed_dirs.add(dirname)

    # Upload compressed logs to GitHub
    github_release.gh_asset_upload(f"{user}/{repo}", tag, f"/tmp/compressed_logs/*.tar.gz")

    # Cleanup compressed logs
    !rm -rf /tmp/compressed_logs/*

#### Load dataset into DataFrames from .parquet files


In [None]:
import pandas as pd

# Fetch the directory of .parquet files from GCS to local disk
SOURCE_PATH = "gs://telenor-data-science/datasets/location_dataset_parquet"
TARGET_PATH = "/content/location_dataset_parquet"
_(SOURCE_PATH, TARGET_PATH)

# Read every .parquet file below TARGET_PATH; the file name up to its first '.' is the key
dataframes = {}     # Format: {location: pd.DataFrame}
for path in glob.glob(TARGET_PATH + "/**/*.parquet", recursive=True):
    file_name = path.rpartition(os.sep)[2]
    location = file_name.partition('.')[0]
    df = pd.read_parquet(path)
    dataframes[location] = df

In [None]:
# Column groupings. Format: (regex matching the numbered PRA columns, name of the aggregated column)
# Raw strings keep `\d` from being read as an (invalid) string escape sequence.
patterns = [
    (r"PRA_(\d+)__(upTo5_6)", "PRA__upTo5_6"),
    (r"PRA_(\d+)__(from5_6To7_6)", "PRA__from5_6To7_6"),
    (r"PRA_(\d+)__(from7_6To12_5)", "PRA__from7_6To12_5"),
    (r"PRA_(\d+)__(from12_5To16)", "PRA__from12_5To16"),
    (r"PRA_(\d+)__(from16To24)", "PRA__from16To24"),
    (r"PRA_(\d+)__(from24up)", "PRA__from24up"),
]

# Aggregate PRA columns for each location
processed_dfs = []
for location, df in dataframes.items():
    for pattern, target_column in patterns:
        col_group = [col for col in df.columns if re.match(pattern, col)]
        grouped_df = df[col_group]

        # Min-max normalization: (x - min) / (max - min).
        # A constant column has range 0; it becomes NaN and is skipped by the mean below.
        series_min = grouped_df.min(axis=0, skipna=True)
        series_max = grouped_df.max(axis=0, skipna=True)
        series_range = (series_max - series_min).replace(0, float("nan"))
        df_scaled = grouped_df.subtract(series_min, axis=1).divide(series_range, axis=1)

        # Calculate aggregated column (row-wise mean of the normalized group)
        aggregated = df_scaled.mean(axis=1, skipna=True).rename(target_column)

        # Remove grouped columns from original dataframe and append new column
        df = pd.concat([df.drop(col_group, axis=1), aggregated], axis=1)

    # NOTE(review): the merged frame is expected to carry a 'location' column, but the
    # per-location frames are never tagged here. Presumably they do not contain one
    # already; it is only filled in when missing. TODO confirm against the parquet schema.
    if 'location' not in df.columns:
        df = df.assign(location=location)

    processed_dfs.append(df)

# Combine all locations in one step (DataFrame.append no longer exists in pandas 2.x);
# columns missing from a location are filled with NaN by concat
if processed_dfs:
    merged_df = pd.concat(processed_dfs, axis=0)
else:
    merged_df = pd.DataFrame(columns=['location', 'year'])
merged_df

In [None]:
# Desired column order of the merged dataframe: identifier columns first,
# then the KV_, MET_, NEA_ and PRA__ column groups.
col_order = [
    "location",
    "year",
    *(
        f"KV_{kind}_{span}"
        for kind in ("moving", "stationary")
        for span in (
            "0m_to_100m",
            "100m_to_300m",
            "300m_to_1000m",
            "1000m_to_3000m",
            "3000m_to_10000m",
            "10000m_to_30000m",
        )
    ),
    *(
        f"MET_{suffix}"
        for suffix in (
            "air_temperature_2m",
            "air_temperature_10m",
            "air_temperature_25m",
            "air_temperature_30m",
            "cloud_area_fraction",
            "relative_humidity",
            "sea_surface_temperature",
            "surface_air_pressure",
            "surface_snow_thickness",
            "wind_from_direction_2m",
            "wind_from_direction_10m",
            "wind_speed_2m",
            "wind_speed_10m",
        )
    ),
    *(f"NEA_{suffix}" for suffix in ("NO", "NO2", "NOx", "PM1", "PM2_5", "PM10")),
    *(
        f"PRA__{suffix}"
        for suffix in (
            "upTo5_6",
            "from5_6To7_6",
            "from7_6To12_5",
            "from12_5To16",
            "from16To24",
            "from24up",
        )
    ),
]

In [None]:
# Reorder the columns as listed in col_order. Columns that are not listed keep
# their relative order and are placed last (col_order.index() would raise a
# ValueError for them).
column_rank = {col: rank for rank, col in enumerate(col_order)}
merged_df = merged_df.reindex(
    sorted(merged_df.columns, key=lambda col: column_rank.get(col, len(col_order))),
    axis=1,
)

In [None]:
# Display the column labels of merged_df to check the column order.
merged_df.columns

In [None]:
# Display merged_df for a visual check.
merged_df

In [None]:
def export_to_parquet(dataframes, target_dir):
    """Write every dataframe to ``{target_dir}/{name}.parquet``.

    Args:
        dataframes: Mapping ``{name: dataframe}``; each value must provide a
            ``to_parquet(path)`` method.
        target_dir: Directory to write to. Trailing slashes are ignored and
            the directory is created if it does not exist.
    """
    target_dir = target_dir.rstrip("/")
    # os.makedirs (instead of a shell `mkdir -p`) also handles paths that
    # contain spaces or shell metacharacters
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Export dataframes to PARQUET
    for location, df in dataframes.items():
        df.to_parquet(f"{target_dir}/{location}.parquet")

def bigquery_dataset_from_dataframes(dataframes, dataset_name):
    PROJECT_ID = 'telenor-data-science'
    TARGET_GCS_DIR =  f"gs://{PROJECT_ID}/datasets/{dataset_name}"
    
    # Export to local .parquet files
    export_to_parquet(dataframes, f"/tmp_/{dataset_name}")

    # Upload dataset to GCS
    _(f"/tmp_/{dataset_name}", TARGET_GCS_DIR)

    # Create new BigQuery dataset
    from google.colab import auth
    auth.authenticate_user()
    !gcloud config set project {PROJECT_ID}
    !bq mk --location europe-west1 --dataset {dataset_name}

    # Create new table for every parquet file
    for table_name in dataframes.keys():
        !bq load  --source_format=PARQUET --autodetect {dataset_name}.{table_name} {TARGET_GCS_DIR}/{table_name}.parquet

In [None]:
# Publish the merged frame as the single table "all_locations" of the "merged_dataset" dataset
merged_dataframes = dict(all_locations=merged_df)
bigquery_dataset_from_dataframes(merged_dataframes, "merged_dataset")

In [None]:

        col_groups = {provider: [col for col in df.columns if (col.startswith(provider) or col == 'year')]  for pattern, target in providers}

    # For each station
    for provider, col_group in col_groups.items():
        year_dfs = {x: pd.DataFrame(y) for x, y in df[col_group].groupby('year', as_index=False)}

for location, df in dataframes.items():
    # Group columns by provider
    col_groups = {provider: [col for col in df.columns if (col.startswith(provider) or col == 'year')]  for provider in providers}

    # For each provider
    for provider, col_group in col_groups.items():
        year_dfs = {x: pd.DataFrame(y) for x, y in df[col_group].groupby('year', as_index=False)}

        # For each year
        years_missing = {'location': location}      # Format: {year: missing_ratio]
        for year, year_df in year_dfs.items():
            year_df = year_df.drop('year', axis=1)
            missing_ratio = year_df.isnull().astype(int).sum().sum() / year_df.size
            #print(f"Year: {year},   Missing ratio: {missing_ratio}")
            years_missing[year] = [1 - missing_ratio]
        
        # Add this locations' missing ratios to provider 
        years_missing_df = pd.DataFrame(years_missing)
        provider_df = provider_dfs[provider]
        
        # Generate colummns in provider dataframe
        for column_name in set(years_missing_df.columns) - set(provider_df.columns):
            provider_df[column_name] = None

        # Add year/location data to provider
        provider_dfs[provider] = provider_df.append(years_missing_df)

# Rearrange columns
provider_dfs = {provider: df.reindex(sorted(df.columns), axis=1).fillna(0).sort_values('location') for provider, df in provider_dfs.items()}

# Create concatenated dataframe for stacked chart
concat_dfs = []
for provider, df in provider_dfs.items():
    df['provider'] = provider
    concat_dfs.append(df)
concat_df = pd.concat(concat_dfs, ignore_index=True)
concat_df = concat_df.set_index(['location', 'provider'])
concat_df = concat_df.sort_values(['location', 'provider'])

In [None]:
# Write to the absolute path that the upload cell reads from, so the export
# does not depend on the current working directory being /content.
merged_df.to_csv("/content/all_locations.csv")


In [None]:
# Upload the exported CSV to the dataset's GCS folder. Source and target share the
# extension "csv", so `_` takes its plain-transfer path and copies the file with gsutil.
_("/content/all_locations.csv", "gs://telenor-data-science/datasets/merged_dataset/all_locations.csv")