In [164]:
import glob
import pandas as pd
import pathlib
import joblib
import os
# Code version tag. It is embedded in `hashed_req` below, so changing it
# changes the `model_hash` derived from that request.
PYMMM_VERSION = '0.0.1'

def file_by_country(country, csv_files):
    """Select the raw-data files that belong to one country.

    A path is kept when it contains both ``country`` and the literal
    ``'raw'`` as substrings. The order of ``csv_files`` is preserved.

    Args:
        country: Substring identifying the country (e.g. a directory name).
        csv_files: Iterable of file path strings.

    Returns:
        list[str]: The paths that matched both substrings.
    """
    # Use `in` rather than `str.find(...) > 0`: find() returns 0 for a match
    # at the very start of the path, which the old comparison treated as a
    # miss (only -1 means "not found").
    return [
        csv_file
        for csv_file in csv_files
        if country in csv_file and 'raw' in csv_file
    ]

def get_filtered_data_hashed_dict(data_info_hashed, use_data):
    """Restrict a nested mapping to the inner keys listed in ``use_data``.

    Args:
        data_info_hashed: Mapping of outer key -> {inner key: value}.
        use_data: Container of inner keys to keep (membership is tested
            with ``in``).

    Returns:
        dict: outer key -> {inner key: value} containing only the inner keys
        found in ``use_data``. Outer keys with no matching inner key are
        omitted.
    """
    dict_collector = {}
    for outer_key, inner in data_info_hashed.items():
        # Collect every matching inner key. Assigning a fresh one-item dict
        # per match would silently drop all but the last match.
        kept = {name: value for name, value in inner.items() if name in use_data}
        if kept:
            dict_collector[outer_key] = kept
    return dict_collector

def compress_files(country, file_type, use_data):
    """Hash each raw data file of a country and save a hash-named copy.

    Recursively globs the current working directory for ``*.{file_type}``,
    keeps the paths accepted by ``file_by_country``, reads each one with
    ``pd.read_csv``, hashes the resulting frame with ``joblib.hash`` and
    writes the frame next to its source as ``<hash>_<stem>.gzip<suffix>``.

    NOTE(review): despite the ``.gzip`` marker in the name, the copy is
    written without compression when the suffix is ``.csv`` -- pandas infers
    compression from the file extension. Confirm whether real gzip output is
    wanted before changing this, since downstream readers would have to
    match.

    Args:
        country: Country substring used to select files.
        file_type: File extension to glob for (without the dot). Files are
            always parsed with ``pd.read_csv`` regardless of this value.
        use_data: Container of file stems to keep in the filtered result.

    Returns:
        tuple[dict, dict]: ``(data_info_hashed, filtered_data_info_hashed)``.
        Both map ``file hash -> {file stem: path of the hash-named copy}``;
        the second contains only stems present in ``use_data``.
    """
    data_info_hashed = {}
    candidates = glob.glob(f'**/*.{file_type}', recursive=True)
    for csv_file in file_by_country(country=country, csv_files=candidates):
        source = pathlib.Path(csv_file)
        file_name = source.stem
        # Copies written by an earlier run match the same glob pattern; skip
        # them so that re-running does not hash and re-copy its own output.
        if file_name.endswith('.gzip'):
            continue
        df = pd.read_csv(csv_file)
        file_hash = joblib.hash(df)
        # Rename only the final path component. A plain str.replace on the
        # whole path would also rewrite a directory that contains the stem.
        compressed_file_name = str(
            source.with_name(f'{file_hash}_{file_name}.gzip{source.suffix}')
        )
        # Write to the path that is reported back to the caller (this
        # previously referenced an undefined name, `compressed_path`).
        df.to_csv(compressed_file_name, index=False)
        data_info_hashed[file_hash] = {file_name: compressed_file_name}
    filtered_data_info_hashed = get_filtered_data_hashed_dict(data_info_hashed, use_data)
    return data_info_hashed, filtered_data_info_hashed

In [165]:
# Request payload for one run. In the cells visible here only "country",
# "use_files" and "file_types" are read directly; the whole dict is also
# embedded in `hashed_req`, so every value contributes to the model hash.
raw_req = {
    "country": "Brazil",
    "use_files": [
        "holidays_data",
        "macroeconomic_data",
        "media_data",
        "pos_data",
        "power_data",
    ],
    "file_types": "csv",
    "hierarchy_columns": ["ZONE", "SHARED", "BRAND", "PRODUCT"],
    "shared_column": "SHARED",
    "brand_column": "BRAND",
    "date_column": "TIMEDESC",
    "revenue_column": "SALES IN DOLLAR VALUE",
    "vehicle_column": "VEHICLE",
    "sub_vehicle_column": "SUBTYPE",
    "spend_column": "SPEND",
    "raw_power_columns": [
        "time_looker_dimdate_ref",
        "display_brand_name",
        "measure_name_displayed",
        "score",
    ],
    "n_simulated_environments": 2,
    "n_train": 2,
    "min_contiguous": 6,
    "dbg": True,
    "visualize": 0,
    "COVID_CUTOFF": "2020-01-01",
    # One layer per vehicle; each layer uses the vehicle name for both
    # "name" and "veh" and gets its own empty "mix_cols" list.
    "dynamics_layers": [
        {
            "name": vehicle,
            "veh": vehicle,
            "curve_style": "sigmoid",
            "mix_cols": [],
        }
        for vehicle in ("socialmedia", "radio", "opentv", "ooh")
    ],
}

In [166]:
# Pull out the request fields that drive file discovery.
country, use_files, file_types = (
    raw_req[field] for field in ("country", "use_files", "file_types")
)

In [167]:
# Hash the country's raw files and keep the subset named in the request.
data_info_hashed, filtered_data_info_hashed = compress_files(
    country=country,
    file_type=file_types,
    use_data=use_files,
)

In [170]:
# Full mapping returned by compress_files: file hash -> {file stem: copy path}.
data_info_hashed

{'3f67e6e30864ae8cbebe868ddd495122': {'media_data': 'data/Brazil/raw/3f67e6e30864ae8cbebe868ddd495122_media_data.gzip.csv'},
 '2c504f09daa7e79b2e844175a809af39': {'power_data': 'data/Brazil/raw/2c504f09daa7e79b2e844175a809af39_power_data.gzip.csv'},
 'e20cff04942cbe5b1bea079f8b01b4b1': {'holidays_data': 'data/Brazil/raw/e20cff04942cbe5b1bea079f8b01b4b1_holidays_data.gzip.csv'},
 'b3bb6e033fb9f003828c23d675b0d054': {'macroeconomic_data': 'data/Brazil/raw/b3bb6e033fb9f003828c23d675b0d054_macroeconomic_data.gzip.csv'},
 '172c61bf05bdfcea32f8546fd639c40d': {'pos_data': 'data/Brazil/raw/172c61bf05bdfcea32f8546fd639c40d_pos_data.gzip.csv'}}

In [171]:
# Same mapping restricted to the file stems listed in `use_files`.
filtered_data_info_hashed

{'3f67e6e30864ae8cbebe868ddd495122': {'media_data': 'data/Brazil/raw/3f67e6e30864ae8cbebe868ddd495122_media_data.gzip.csv'},
 '2c504f09daa7e79b2e844175a809af39': {'power_data': 'data/Brazil/raw/2c504f09daa7e79b2e844175a809af39_power_data.gzip.csv'},
 'e20cff04942cbe5b1bea079f8b01b4b1': {'holidays_data': 'data/Brazil/raw/e20cff04942cbe5b1bea079f8b01b4b1_holidays_data.gzip.csv'},
 'b3bb6e033fb9f003828c23d675b0d054': {'macroeconomic_data': 'data/Brazil/raw/b3bb6e033fb9f003828c23d675b0d054_macroeconomic_data.gzip.csv'},
 '172c61bf05bdfcea32f8546fd639c40d': {'pos_data': 'data/Brazil/raw/172c61bf05bdfcea32f8546fd639c40d_pos_data.gzip.csv'}}

In [168]:
# Everything that identifies a run: the request itself, the hashed input
# files it selected, and the code version.
hashed_req = {
    "raw_req": raw_req,
    "filtered_data_info": filtered_data_info_hashed,
    "code_version": PYMMM_VERSION,
}

In [169]:
# Inspect the combined request that is hashed to obtain `model_hash`.
hashed_req

{'raw_req': {'country': 'Brazil',
  'use_files': ['holidays_data',
   'macroeconomic_data',
   'media_data',
   'pos_data',
   'power_data'],
  'file_types': 'csv',
  'hierarchy_columns': ['ZONE', 'SHARED', 'BRAND', 'PRODUCT'],
  'shared_column': 'SHARED',
  'brand_column': 'BRAND',
  'date_column': 'TIMEDESC',
  'revenue_column': 'SALES IN DOLLAR VALUE',
  'vehicle_column': 'VEHICLE',
  'sub_vehicle_column': 'SUBTYPE',
  'spend_column': 'SPEND',
  'raw_power_columns': ['time_looker_dimdate_ref',
   'display_brand_name',
   'measure_name_displayed',
   'score'],
  'n_simulated_environments': 2,
  'n_train': 2,
  'min_contiguous': 6,
  'dbg': True,
  'visualize': 0,
  'COVID_CUTOFF': '2020-01-01',
  'dynamics_layers': [{'name': 'socialmedia',
    'veh': 'socialmedia',
    'curve_style': 'sigmoid',
    'mix_cols': []},
   {'name': 'radio', 'veh': 'radio', 'curve_style': 'sigmoid', 'mix_cols': []},
   {'name': 'opentv',
    'veh': 'opentv',
    'curve_style': 'sigmoid',
    'mix_cols': []

In [172]:
# Single hash over the request, the selected data-file info and the code
# version; it prefixes the artefact file names built from it.
model_hash = joblib.hash(hashed_req)

In [174]:
# Artefact file names, all prefixed with the run's model hash.
request_file_name = f"{model_hash}_request.json"
response_file_name = f"{model_hash}_response.json"
model_file_name = f"{model_hash}_model.pickle"