In [15]:
%matplotlib inline  
import os, sys, yaml, tempfile
# Directory added to the import path (presumably the checkout that provides
# data_preprocess and google_cloud_storage_util — confirm). Set the
# RNN_REGRESSOR_HOME environment variable to use the notebook on another machine.
RNN_REGRESSOR_DIR = os.environ.get("RNN_REGRESSOR_HOME", "/Users/matt.meng/dev/rnn_regressor")
if RNN_REGRESSOR_DIR not in sys.path:  # re-running the cell must not stack duplicates
    sys.path.append(RNN_REGRESSOR_DIR)
import tensorflow as tf
import pandas as pd

In [2]:
from data_preprocess import load_training_data_from_gcs
from google_cloud_storage_util import GCS_Bucket

In [3]:
# Folder inside the bucket that is passed to load_training_data_from_gcs.
GCS_path = 'test/ML'

# YAML file names: the configuration written for the processed data, and the
# configuration describing the raw training data.
processed_data_yaml_file = 'processed_data_configuration.yaml'
yaml_file_name = 'training_configuration.yaml'

In [4]:
# Yields the configuration dict and a path that pd.read_csv can open
# (presumably a local download of config_dict['data_file_name'] — confirm in data_preprocess).
config_dict, local_data_file = load_training_data_from_gcs(GCS_path, yaml_file_name)

In [5]:
# Inspect the loaded configuration.
config_dict

{'GCS_path': 'test/ML',
 'data_file_name': 'NYDN_240min_fullWindow_120min_exposure_600seconds_interval_target_24hr_data.csv',
 'index_column': 'articleId',
 'label_column': 'total_views',
 'static_columns': ['minLocalDateInWeek',
  'minLocalTime',
  'createTime',
  'articleInfo_type',
  'articleInfo_authorName',
  'articleInfo_section'],
 'time_interval_columns': ['views',
  'US_counts',
  'sessionReferrer_DIRECT',
  'sessionReferrer_SOCIAL',
  'sessionReferrer_SEARCH',
  'sessionReferrer_OTHER',
  'platform_PHON',
  'platform_DESK',
  'platform_TBLT',
  'platform_OTHR',
  'pageReferrer_OTHER',
  'pageReferrer_SEARCH',
  'pageReferrer_DIRECT',
  'pageReferrer_EMPTY_DOMAIN',
  'pageReferrer_SOCIAL'],
 'time_step_list': ['0min_to_10min',
  '10min_to_20min',
  '20min_to_30min',
  '30min_to_40min',
  '40min_to_50min',
  '50min_to_60min',
  '60min_to_70min',
  '70min_to_80min',
  '80min_to_90min',
  '90min_to_100min']}

In [6]:
# Name of the label column as given by the configuration.
target_name = config_dict['label_column']

In [7]:
# Read the data file, using the configured index column as the row index.
data = pd.read_csv(local_data_file, index_col=config_dict['index_column'])

In [8]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0_level_0,total_views,views_0min_to_10min,views_10min_to_20min,views_20min_to_30min,views_30min_to_40min,views_40min_to_50min,views_50min_to_60min,views_60min_to_70min,views_70min_to_80min,views_80min_to_90min,...,pageReferrer_SOCIAL_80min_to_90min,pageReferrer_SOCIAL_90min_to_100min,pageReferrer_SOCIAL_100min_to_110min,pageReferrer_SOCIAL_110min_to_120min,minLocalDateInWeek,minLocalTime,createTime,articleInfo_type,articleInfo_authorName,articleInfo_section
articleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002aa1aeac470688,22220,33.0,131.0,87.0,182.0,273.0,267.0,279.0,252.0,230.0,...,34.0,27.0,30.0,34.0,0.0,1263.0,1272.0,19802.648827,21364.080645,14080.652253
0041d850a2d98cc5,8368,52.0,116.0,117.0,102.0,83.0,87.0,90.0,88.0,87.0,...,1.0,0.0,0.0,0.0,6.0,1044.0,1048.0,18123.160839,13408.365079,20232.00264
004d3c3d30b769f3,10927,63.0,205.0,242.0,258.0,215.0,217.0,177.0,186.0,196.0,...,9.0,7.0,13.0,4.0,2.0,1196.0,1199.0,19802.648827,9356.027586,14080.652253
00553f953ae05383,9344,0.0,0.0,0.0,399.0,678.0,535.0,679.0,555.0,363.0,...,13.0,10.0,13.0,5.0,2.0,644.0,678.0,19802.648827,18252.402778,23475.380183
0062fedc0669a575,9741,117.0,211.0,184.0,178.0,174.0,161.0,193.0,154.0,185.0,...,43.0,34.0,38.0,34.0,0.0,666.0,671.0,19802.648827,24216.654546,14080.652253


In [32]:
def normalized_columns_by_column_mean(data):
    """Return a copy of ``data`` in which every column is divided by its own mean.

    Args:
        data: pandas DataFrame with numeric columns. It is not modified.

    Returns:
        (norm_dict, processed_data) where ``norm_dict`` maps each column name
        to the divisor that was applied and ``processed_data`` is the scaled
        copy. Multiplying a processed column by its ``norm_dict`` entry
        restores the original values.

    Notes:
        * A column whose mean is zero or NaN cannot be used as a divisor (it
          would turn the column into NaN/inf). Such a column is left unscaled
          and recorded with a divisor of 1.0.
        * Divisors are stored as plain Python floats rather than numpy
          scalars, so that ``yaml.dump`` (see upload_content_to_GCS) writes
          them as ordinary numbers.
    """
    norm_dict = {}
    processed_data = data.copy()
    for column in data.columns:
        column_mean = float(data[column].mean())
        unusable = pd.isnull(column_mean) or column_mean == 0
        divisor = 1.0 if unusable else column_mean
        norm_dict[column] = divisor
        # Replace the column outright (instead of writing through .loc) so an
        # integer column simply becomes a float column.
        processed_data[column] = data[column] / divisor
    return norm_dict, processed_data

def update_config_dict(config_dict, norm_dict, data_prefix='mean_normalized'):
    """Derive the configuration that describes the processed data.

    A shallow copy of ``config_dict`` is returned in which 'data_file_name'
    is prefixed with ``data_prefix`` (joined by an underscore) and
    ``norm_dict`` is stored under the 'norm_dict' key. ``config_dict`` itself
    is left unchanged.
    """
    prefixed_file_name = '{}_{}'.format(data_prefix, config_dict['data_file_name'])
    updated = config_dict.copy()
    updated.update({'data_file_name': prefixed_file_name, 'norm_dict': norm_dict})
    return updated

def upload_content_to_GCS(config_dict, data, processed_data_yaml_file='processed_data_configuration.yaml'):
    """Stage ``data`` as CSV and ``config_dict`` as YAML, then push both through ``GCS_Bucket.put``.

    Destinations passed to ``put``:
        CSV  -> ``<config_dict['GCS_path']>/<config_dict['data_file_name']>``
        YAML -> ``<config_dict['GCS_path']>/<processed_data_yaml_file>``

    Args:
        config_dict: mapping with at least the 'GCS_path' and 'data_file_name'
            keys; the whole mapping is dumped to the YAML file.
        data: object exposing ``to_csv(path)`` (a pandas DataFrame in this notebook).
        processed_data_yaml_file: file name used for the YAML destination.

    The local staging files are removed in every case, including when writing
    or uploading raises; the exception itself still propagates.
    """
    staged_paths = []
    try:
        # mkstemp creates each file atomically and leaves it on disk. The
        # previous NamedTemporaryFile(delete=True).name pattern only borrowed
        # the name of a file that is deleted once its handle is collected.
        for _ in range(2):
            handle, path = tempfile.mkstemp()
            staged_paths.append(path)
            os.close(handle)
        local_data_file, local_yaml_file = staged_paths

        data.to_csv(local_data_file)
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print(local_yaml_file)
        print(local_data_file)
        with open(local_yaml_file, 'w') as output:
            yaml.dump(config_dict, output)

        bucket = GCS_Bucket()
        bucket.put(local_data_file, "{}/{}".format(config_dict['GCS_path'], config_dict['data_file_name']))
        bucket.put(local_yaml_file, "{}/{}".format(config_dict['GCS_path'], processed_data_yaml_file))
    finally:
        # Clean up whatever was staged, even after a failure part-way through.
        for path in staged_paths:
            if os.path.exists(path):
                os.unlink(path)

In [10]:
# Scale every column of the loaded frame; norm_dict keeps the per-column factor each column was divided by.
norm_dict, processed_data = normalized_columns_by_column_mean(data)

In [18]:
#processed_data.head()

In [12]:
# Configuration for the processed data: prefixed data file name plus the normalization factors.
processed_dict = update_config_dict(config_dict, norm_dict)

In [19]:
#processed_dict

In [33]:
# Pass the YAML file name defined in the configuration cell explicitly, so that
# changing it there actually takes effect (it equals the function default today).
upload_content_to_GCS(processed_dict, processed_data,
                      processed_data_yaml_file=processed_data_yaml_file)

/var/folders/k5/nkzf3kxj2v721gq7p2nytvv48ppbxr/T/tmpSFDLIY
/var/folders/k5/nkzf3kxj2v721gq7p2nytvv48ppbxr/T/tmps_ACaq


In [35]:
%%bash
# NOTE(review): hard-coded path taken from one particular run (the second path
# printed by the upload cell, i.e. the CSV staging file). upload_content_to_GCS
# unlinks its staging files, so this path will not exist on a fresh run.
du -ah /var/folders/k5/nkzf3kxj2v721gq7p2nytvv48ppbxr/T/tmps_ACaq

 20M	/var/folders/k5/nkzf3kxj2v721gq7p2nytvv48ppbxr/T/tmps_ACaq


In [None]:
# Median of the target column.
data[target_name].median()

In [None]:
# Mean of the target column; used as the scaling divisor in the cells below.
data_mean = data[target_name].mean()

In [None]:
# Display the computed mean.
data_mean

In [None]:
# Scale the target by its mean. The column is assigned directly rather than
# through data.loc[:, ...]: direct assignment replaces the column, so the
# integer-valued target becomes a float column without an in-place upcast.
# NOTE: this mutates `data` and divides by the previously computed data_mean,
# so it must be run only once per freshly loaded frame.
data[target_name] = data[target_name] / data_mean

In [None]:
# Fixed: `data.loc[target_name] = ...` addressed a ROW labelled with the target
# name (adding a bogus row to the frame) instead of the target column.
# Dividing by the column's *current* mean keeps this cell safe to run even if
# the target was already mean-normalized (its mean is then ~1, so this is a no-op).
data[target_name] = data[target_name] / data[target_name].mean()

In [None]:
# Histogram (50 bins) of the target, restricted to rows whose target value exceeds 10.
# NOTE(review): the threshold presumably refers to the mean-normalized target
# (i.e. more than 10x the mean) — confirm.
data.loc[data[target_name] > 10, target_name].hist(bins=50)