In [1]:
import glob
import pyarrow.csv as pv
import pandas as pd
import pathlib
import joblib
import os
from pydantic import BaseModel
from typing import *
from datetime import date

# Version tag of this code. It is stored as `code_version` in the hashed
# request built later in the notebook, so changing it changes the model hash
# and therefore the request/response/model file names.
PYMMM_VERSION = '0.0.1'


class RawRequest(BaseModel):
    """Schema of the user-supplied request, validated by pydantic.

    Only `country`, `use_files` and `file_types` are consumed by the code
    visible in this notebook; the remaining fields are carried along and
    contribute to the model hash. Their downstream meaning is defined
    elsewhere -- the notes on them below are assumptions to be confirmed.
    """
    # Substring used to select input file paths (see file_by_country).
    country: str
    # File stems whose hashed entries are kept in the filtered data info.
    use_files: List[str]
    # File extension that is globbed for input data, e.g. "csv".
    file_types: str
    # Column names; presumably they refer to the input tables -- TODO confirm.
    hierarchy_columns: List[str]
    shared_column: str
    brand_column: str
    date_column: str
    revenue_column: str
    vehicle_column: str
    sub_vehicle_column: str
    spend_column: str
    raw_power_columns: List[str]
    # Integer settings; their semantics are not shown in this notebook.
    n_simulated_environments: int
    n_train: int
    min_contiguous: int
    # NOTE(review): looks like a debug switch -- confirm against the consumer.
    dbg: bool
    # Declared as int (the example request passes 0), not bool.
    visualize: int
    # A date; the example request supplies it as a "YYYY-MM-DD" string.
    COVID_CUTOFF: date
    # One dict per layer; the example request uses the keys "name", "veh",
    # "curve_style" (strings) and "mix_cols" (list of strings).
    dynamics_layers: List[Dict[str,Union[str, List[str]]]]

class UpdatedRequest(BaseModel):
    """Raw request combined with data provenance and the code version.

    In this notebook the `.dict()` of an instance is passed to `joblib.hash`
    to derive the prefix of the request/response/model file names.
    """
    # The validated user request.
    raw_req: RawRequest
    # Nested mapping of str -> {str: str}; in this notebook it is filled with
    # the filtered result of compress_files ({file hash: {stem: parquet path}}).
    filtered_data_info: Dict[str,Dict[str,str]]
    # Version string of the code; the notebook passes PYMMM_VERSION.
    code_version: str

def file_by_country(country, csv_files):
    """Select the paths that mention both ``country`` and the literal ``'raw'``.

    Matching is a plain, case-sensitive substring test on the whole path
    string, so the two markers may appear in a directory name or in the
    file name.

    Args:
        country: Substring that must occur somewhere in the path.
        csv_files: Iterable of file path strings to filter.

    Returns:
        A new list with the matching paths, in their original order.
    """
    # Use substring membership rather than ``str.find(...) > 0``: ``find``
    # returns 0 for a match at the very start of the string, so the previous
    # test silently dropped paths such as "Brazil/raw/pos_data.csv".
    return [
        csv_file
        for csv_file in csv_files
        if country in csv_file and 'raw' in csv_file
    ]

def get_filtered_data_hashed_dict(data_info_hashed, use_data):
    """Keep only the entries whose inner key is listed in ``use_data``.

    Args:
        data_info_hashed: Mapping of outer key -> inner mapping.
        use_data: Container of inner keys that should be kept.

    Returns:
        A dict mapping every outer key that has at least one wanted inner key
        to a single-item dict ``{inner_key: value}``. If several inner keys of
        one entry qualify, the last one iterated wins.
    """
    # A repeated outer key in a dict comprehension overwrites the earlier
    # value, which gives the same "last match wins" outcome as a nested loop.
    return {
        outer_key: {inner_key: inner_value}
        for outer_key, inner_mapping in data_info_hashed.items()
        for inner_key, inner_value in inner_mapping.items()
        if inner_key in use_data
    }

def compress_files(country, file_type, use_data):
    """Write gzip-compressed parquet copies of a country's raw data files.

    Recursively globs the current working directory for ``*.{file_type}``,
    keeps the paths accepted by ``file_by_country``, reads each one with
    pyarrow's CSV reader (regardless of ``file_type``) and writes it next to
    the source file as ``<hash>_<stem>.parquet.gzip``, where ``<hash>`` is
    ``joblib.hash`` of the loaded DataFrame.

    Args:
        country: Substring identifying the country in the file paths.
        file_type: File extension to glob for, without the dot (e.g. "csv").
        use_data: File stems to keep in the filtered result.

    Returns:
        A tuple ``(data_info_hashed, filtered_data_info_hashed)``:
        ``data_info_hashed`` maps ``file_hash -> {file_stem: parquet_path}``
        for every converted file; ``filtered_data_info_hashed`` is the subset
        produced by ``get_filtered_data_hashed_dict`` for ``use_data``.
    """
    data_info_hashed = {}
    csv_files = glob.glob(f'**/*.{file_type}', recursive=True)
    csv_files = file_by_country(country=country, csv_files=csv_files)
    for csv_file in csv_files:
        df = pv.read_csv(csv_file).to_pandas()
        source_path = pathlib.Path(csv_file)
        file_name = source_path.stem
        file_hash = joblib.hash(df)
        # Rename only the final path component. The previous
        # ``str.replace(stem, ...)`` on the whole path also rewrote directory
        # names containing the stem, and the hard-coded ".csv" replacement
        # left the extension untouched for any other ``file_type``.
        compressed_file_name = str(
            source_path.with_name(f'{file_hash}_{file_name}.parquet.gzip')
        )
        df.to_parquet(compressed_file_name, compression='gzip', index=None)
        # NOTE: files with identical content share a hash, so a later file
        # overwrites the entry of an earlier one.
        data_info_hashed[file_hash] = {file_name: compressed_file_name}
    filtered_data_info_hashed = get_filtered_data_hashed_dict(data_info_hashed, use_data)
    return data_info_hashed, filtered_data_info_hashed

In [2]:
# Example request for Brazil. The keys mirror the fields of RawRequest; the
# dict is validated with RawRequest(**raw_req) in the following cell.
raw_req = {
    "country": "Brazil",
    "use_files":['holidays_data','macroeconomic_data','media_data','pos_data','power_data'],
    "file_types":"csv",
    "hierarchy_columns": [
      "ZONE",
      "SHARED",
      "BRAND",
      "PRODUCT"
    ],
    "shared_column": "SHARED",
    "brand_column": "BRAND",
    "date_column": "TIMEDESC",
    "revenue_column": "SALES IN DOLLAR VALUE",
    "vehicle_column": "VEHICLE",
    "sub_vehicle_column": "SUBTYPE",
    "spend_column": "SPEND",
    "raw_power_columns": [
      "time_looker_dimdate_ref",
      "display_brand_name",
      "measure_name_displayed",
      "score"
    ],
    "n_simulated_environments":2,
    "n_train":2,
    "min_contiguous": 6,
    "dbg": True,
    "visualize": 0,
    "COVID_CUTOFF": "2020-01-01",
    "dynamics_layers": [
      {
        "name": "socialmedia",
        "veh": "socialmedia",
        "curve_style": "sigmoid",
        "mix_cols": []
      },
      {
        "name": "radio",
        "veh": "radio",
        "curve_style": "sigmoid",
        "mix_cols": []
      },
      {
        "name": "opentv",
        "veh": "opentv",
        "curve_style": "sigmoid",
        "mix_cols": []
      },
      {
        "name": "ooh",
        "veh": "ooh",
        "curve_style": "sigmoid",
        "mix_cols": []
      }
    ]
  }

In [3]:
# Validate the request, then take the three values that drive file selection
# straight from the original dict.
raw_request = RawRequest(**raw_req)
country, use_files, file_types = (
    raw_req['country'],
    raw_req['use_files'],
    raw_req['file_types'],
)

# Convert the matching raw files to parquet and collect their content hashes.
data_info_hashed, filtered_data_info_hashed = compress_files(country, file_types, use_files)

# Combine the request, the hashes of the selected data files and the code
# version; the hash of that combination prefixes every artifact file name.
hashed_req = dict(
    raw_req=raw_request,
    filtered_data_info=filtered_data_info_hashed,
    code_version=PYMMM_VERSION,
)
updated_request = UpdatedRequest(**hashed_req)
model_hash = joblib.hash(updated_request.dict())
request_file_name = f"{model_hash}_request.json"
response_file_name = f"{model_hash}_response.json"
model_file_name = f"{model_hash}_model.pickle"

In [4]:
# Display the request file name derived from the model hash.
request_file_name

'f742843747c5aac543ca55eecb79bdfd_request.json'

In [5]:
# Display the response file name derived from the model hash.
response_file_name

'f742843747c5aac543ca55eecb79bdfd_response.json'

In [6]:
# Display the model pickle file name derived from the model hash.
model_file_name

'f742843747c5aac543ca55eecb79bdfd_model.pickle'