In [7]:
# Empty the Trash folder to free up disk space.
import glob
import os
import shutil
import subprocess  # no longer used by this cell; kept in case other cells rely on the name

TRASH_DIR = '../.local/share/Trash/files'


def clean_trash(trash_dir=TRASH_DIR):
    """Delete every entry directly inside ``trash_dir``.

    The earlier ``subprocess.call(['rm', '-rf', '<dir>/*'])`` never removed
    anything: without a shell the ``*`` reaches ``rm`` as a literal file name,
    and ``-f`` makes ``rm`` exit 0 for that non-existent path, so the call
    looked successful. The pattern is now expanded in Python instead.

    Args:
        trash_dir: Directory whose contents should be removed. The directory
            itself is kept. A missing directory is treated as already empty.

    Returns:
        The number of entries that were actually removed.
    """
    removed = 0
    # glob's '*' skips dot-files, exactly like the shell pattern it replaces.
    for entry in glob.glob(os.path.join(trash_dir, '*')):
        try:
            if os.path.isdir(entry) and not os.path.islink(entry):
                shutil.rmtree(entry)
            else:
                os.remove(entry)
        except OSError:
            # Best effort, like `rm -f`: skip entries that cannot be removed.
            continue
        removed += 1
    return removed


clean_trash()

0

In [None]:
import os
import tempfile

import pandas as pd
import gc
import dask.dataframe as dd

from custom_code.upload_file_to_gcs import upload_file_to_gcs
from custom_code.load_data import load_data
from custom_code.load_wa_forecast import load_wa_forecast
from custom_code.load_product import load_product
from custom_code.process_features import process_features
from custom_code.create_folds_and_slice_features import create_folds_and_slice_features
from custom_code.create_fold_aware_features import create_fold_aware_features
from custom_code.predict_and_save_results_per_fold import predict_and_save_results_per_fold
from custom_code.train_model_per_fold import train_model_per_fold
from custom_code.predict_and_save_results_per_fold import predict_and_save_results_per_fold
from custom_code.process_results import process_results
from custom_code.settings import RUNTAG, PROJECT, BUCKET, DATA_DIR, RESULTS_DIR, PARAMS


# Stage switches for the pipeline below. Each flag gates the `if <FLAG>:` block
# of the same name; set it to True to run that stage. Stages are evaluated in
# the order listed here.
DATA = False                 # load_data() and upload the frame to GCS as HDF5
WA_FORECAST = False          # load_wa_forecast() and upload the frame to GCS as CSV
PRODUCT = False              # load_product() and upload the frame to GCS as CSV
FEATURES = False             # read the actual-data CSV, process_features(), write partitioned CSVs
FOLDS_AND_SLICES = False     # create_folds_and_slice_features()
FOLD_AWARE_FEATURES = True   # create_fold_aware_features() -- the only stage currently enabled
TRAIN_PER_FOLD = False       # train_model_per_fold()
PREDICT_PER_FOLD = False     # predict_and_save_results_per_fold()
RESULTS = False              # read result/importance CSVs, process_results(), upload CSV


if DATA:
    # Load the raw data and publish it to GCS as an HDF5 file.
    data_df = load_data()
    print('Writing data to GCS')
    # Stage the file in a throwaway directory so nothing is left on local disk
    # after the upload. The earlier version wrapped the write in
    # open('./temp.h5', 'w'), which truncated the file and held an unused text
    # handle on the very path to_hdf was writing to, and never removed it.
    with tempfile.TemporaryDirectory() as staging_dir:
        file_location = os.path.join(staging_dir, 'actual.h5')
        # mode='w' always starts a fresh store instead of appending to an existing one.
        data_df.to_hdf(file_location, 'data_df', mode='w')
        upload_file_to_gcs(PROJECT, BUCKET, file_location, '{}/actual_{}.h5'.format(DATA_DIR, RUNTAG))
    gc.collect()

if WA_FORECAST:
    # Load the WA forecast and publish it to GCS as a CSV file.
    wa_forecast_df = load_wa_forecast()
    print('Writing WA to GCS')
    # Stage the CSV in a temporary directory that is removed once the upload is
    # done. The earlier open(tempfile.NamedTemporaryFile().name, 'w') pattern
    # reused the name of an already-discarded temp file, wrote the CSV to a
    # different '<name>.csv' path, and left both files behind in the temp dir.
    with tempfile.TemporaryDirectory() as staging_dir:
        local_csv = os.path.join(staging_dir, 'wa_forecast.csv')
        wa_forecast_df.to_csv(local_csv, index=False)
        upload_file_to_gcs(PROJECT, BUCKET, local_csv, '{}/wa_{}.csv'.format(DATA_DIR, RUNTAG))
    gc.collect()

if PRODUCT:
    # Load the product table and publish it to GCS as a CSV file.
    product_df = load_product()
    print('Writing Product to GCS')
    # Stage the CSV in a temporary directory that is removed once the upload is
    # done. The earlier open(tempfile.NamedTemporaryFile().name, 'w') pattern
    # reused the name of an already-discarded temp file, wrote the CSV to a
    # different '<name>.csv' path, and left both files behind in the temp dir.
    with tempfile.TemporaryDirectory() as staging_dir:
        local_csv = os.path.join(staging_dir, 'product.csv')
        product_df.to_csv(local_csv, index=False)
        upload_file_to_gcs(PROJECT, BUCKET, local_csv, '{}/product_{}.csv'.format(DATA_DIR, RUNTAG))
    gc.collect()

if FEATURES:
    # NOTE(review): the DATA stage uploads actual_<RUNTAG>.h5, but this stage
    # reads actual_<RUNTAG>.csv -- confirm which object is expected in the bucket.
    actual_csv_uri = 'gs://{}/{}/actual_{}.csv'.format(BUCKET, DATA_DIR, RUNTAG)
    # Read through dask, then materialise the result before feature processing.
    data_df = dd.read_csv(actual_csv_uri).compute()
    features_df = process_features(data_df)
    print('Writing features to GCS')
    # Re-partition so the feature matrix is written as many CSV objects; the
    # '*' in the target pattern is filled in by dask for each partition.
    features_uri_pattern = 'gs://{}/{}/features_{}/features_*.csv'.format(BUCKET, DATA_DIR, RUNTAG)
    features_dd = dd.from_pandas(features_df, npartitions=5000)
    features_dd.to_csv(features_uri_pattern, index=False)
    gc.collect()
    
# Stages that need no local arguments: run each enabled one in order, and
# collect garbage after it to release memory before the next stage.
for _stage_enabled, _run_stage in (
    (FOLDS_AND_SLICES, create_folds_and_slice_features),
    (FOLD_AWARE_FEATURES, create_fold_aware_features),
    (TRAIN_PER_FOLD, train_model_per_fold),
    (PREDICT_PER_FOLD, predict_and_save_results_per_fold),
):
    if _stage_enabled:
        _run_stage()
        gc.collect()

if RESULTS:
    # Gather every result and feature-importance CSV for this run from GCS.
    # (For a local run, read './results/results_*.csv' and
    # './results/importance_*.csv' instead.)
    results_df = dd.read_csv('gs://{}/{}/results_*_{}.csv'.format(BUCKET, RESULTS_DIR, RUNTAG))
    results_df = results_df.compute()
    features_importance_df = dd.read_csv('gs://{}/{}/importance_*_{}.csv'.format(BUCKET, RESULTS_DIR, RUNTAG))
    features_importance_df = features_importance_df.compute()
    results_df = process_results(results_df, features_importance_df, PARAMS)
    print('Writing results to GCS')
    # Stage the CSV in a temporary directory that is removed once the upload is
    # done. The earlier open(tempfile.NamedTemporaryFile().name, 'w') pattern
    # reused the name of an already-discarded temp file, wrote the CSV to a
    # different '<name>.csv' path, and left both files behind in the temp dir.
    with tempfile.TemporaryDirectory() as staging_dir:
        local_csv = os.path.join(staging_dir, 'results_with_wa.csv')
        results_df.to_csv(local_csv, index=False)
        upload_file_to_gcs(PROJECT, BUCKET, local_csv, '{}/{}_results_with_wa.csv'.format(RESULTS_DIR, RUNTAG))
    gc.collect()



Generating fold aware features for fold 8
Reading feature matrix


In [1]:
!pip install workalendar

Collecting workalendar
Collecting pyCalverter (from workalendar)
Collecting lunardate (from workalendar)
  Using cached https://files.pythonhosted.org/packages/4e/7e/377a3cbba646ec0cf79433ef858881d809a3b87eb887b0901cb83c66a758/lunardate-0.2.0-py3-none-any.whl
Collecting ephem (from workalendar)
Installing collected packages: pyCalverter, lunardate, ephem, workalendar
Successfully installed ephem-3.7.6.0 lunardate-0.2.0 pyCalverter-1.6.1 workalendar-3.1.1


In [2]:
!pip install lightgbm

Collecting lightgbm
  Using cached https://files.pythonhosted.org/packages/4c/3b/4ae113193b4ee01387ed76d5eea32788aec0589df9ae7378a8b7443eaa8b/lightgbm-2.2.2-py2.py3-none-manylinux1_x86_64.whl
Installing collected packages: lightgbm
Successfully installed lightgbm-2.2.2


In [3]:
!pip install shap

Collecting shap
Collecting tqdm (from shap)
  Using cached https://files.pythonhosted.org/packages/91/55/8cb23a97301b177e9c8e3226dba45bb454411de2cbd25746763267f226c2/tqdm-4.28.1-py2.py3-none-any.whl
Installing collected packages: tqdm, shap
Successfully installed shap-0.25.2 tqdm-4.28.1


In [4]:
!pip install gcsfs

Collecting gcsfs
Installing collected packages: gcsfs
Successfully installed gcsfs-0.2.0


In [5]:
!pip install google-cloud-storage

Collecting google-cloud-storage
  Using cached https://files.pythonhosted.org/packages/d7/62/a2e3111bf4d1eb54fe86dec694418644e024eb059bf1e66ebdcf9f98ad70/google_cloud_storage-1.13.0-py2.py3-none-any.whl
Installing collected packages: google-cloud-storage
Successfully installed google-cloud-storage-1.13.0


In [6]:
!pip install tables

Collecting tables
  Using cached https://files.pythonhosted.org/packages/a5/df/d70f5df27f72082b4c6977d202788904f613089667e791c4aca0986bb229/tables-3.4.4-cp35-cp35m-manylinux1_x86_64.whl
Collecting numexpr>=2.5.2 (from tables)
  Using cached https://files.pythonhosted.org/packages/0e/5b/f26e64e96dbd8e17f6768bc711096e83777ed057b2ffc663a8f61d02e1a8/numexpr-2.6.8-cp35-cp35m-manylinux1_x86_64.whl
Installing collected packages: numexpr, tables
Successfully installed numexpr-2.6.8 tables-3.4.4
