In [16]:
%matplotlib inline  
import os, sys, yaml, tempfile
sys.path.append("/Users/matt.meng/dev/rnn_regressor")
import tensorflow as tf
import pandas as pd

In [17]:
from data_preprocess import load_training_data_from_gcs
from google_cloud_storage_util import GCS_Bucket

In [18]:
# GCS location handed to load_training_data_from_gcs in the next cell.
GCS_path = 'test/MachineLearning'
# Name of the YAML training configuration handed to load_training_data_from_gcs.
yaml_file_name = 'training_configuration.yaml'
# NOTE(review): not referenced by any later cell shown here -- the upload call passes its own
# file name explicitly. Confirm whether this constant is still needed.
processed_data_yaml_file = 'processed_data_configuration.yaml'

In [22]:
# Yields the training configuration dict (displayed in the next cell) and the path of a
# local data file, which is later read with pd.read_csv.
config_dict, local_data_file = load_training_data_from_gcs(GCS_path, yaml_file_name)

In [23]:
config_dict

{'GCS_path': 'test/MachineLearning',
 'data_file_name': 'NYDN_240min_fullWindow_120min_exposure_600seconds_interval_target_24hr_data.csv',
 'index_column': 'articleId',
 'label_column': 'total_views',
 'static_columns': ['minLocalDateInWeek',
  'minLocalTime',
  'createTime',
  'articleInfo_type',
  'articleInfo_authorName',
  'articleInfo_section'],
 'time_interval_columns': ['views',
  'US_counts',
  'sessionReferrer_DIRECT',
  'sessionReferrer_SOCIAL',
  'sessionReferrer_SEARCH',
  'sessionReferrer_OTHER',
  'platform_PHON',
  'platform_DESK',
  'platform_TBLT',
  'platform_OTHR',
  'pageReferrer_OTHER',
  'pageReferrer_SEARCH',
  'pageReferrer_DIRECT',
  'pageReferrer_EMPTY_DOMAIN',
  'pageReferrer_SOCIAL'],
 'time_step_list': ['0min_to_10min',
  '10min_to_20min',
  '20min_to_30min',
  '30min_to_40min',
  '40min_to_50min',
  '50min_to_60min',
  '60min_to_70min',
  '70min_to_80min',
  '80min_to_90min',
  '90min_to_100min',
  '100min_to_110min',
  '110min_to_120min']}

In [24]:
target_name = config_dict['label_column']

In [25]:
print data[target_name].median(), data[target_name].mean()

10744.0 19809.9554376


#### load the local data

In [26]:
data = pd.read_csv(local_data_file, index_col=config_dict['index_column'])

In [27]:
data.head()

Unnamed: 0_level_0,total_views,views_0min_to_10min,views_10min_to_20min,views_20min_to_30min,views_30min_to_40min,views_40min_to_50min,views_50min_to_60min,views_60min_to_70min,views_70min_to_80min,views_80min_to_90min,...,pageReferrer_SOCIAL_80min_to_90min,pageReferrer_SOCIAL_90min_to_100min,pageReferrer_SOCIAL_100min_to_110min,pageReferrer_SOCIAL_110min_to_120min,minLocalDateInWeek,minLocalTime,createTime,articleInfo_type,articleInfo_authorName,articleInfo_section
articleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001cd0fd58dc13b2,21226,58.0,195.0,451.0,433.0,441.0,385.0,160.0,172.0,199.0,...,0.0,0.0,0.0,0.0,2.0,1005.0,1013.0,19888.094993,13263.348485,13257.382353
0035dc34e6c4dbc8,16272,117.0,381.0,406.0,406.0,243.0,253.0,229.0,248.0,235.0,...,0.0,0.0,0.0,0.0,6.0,1089.0,1096.0,19888.094993,15372.921212,10002.850746
0104a12e6d11db7d,19759,20.0,54.0,50.0,55.0,68.0,143.0,153.0,192.0,232.0,...,0.0,0.0,0.0,0.0,1.0,311.0,320.0,19888.094993,30518.52,19490.012658
0131a9d62b685261,68481,5.0,18.0,22.0,29.0,32.0,31.0,25.0,26.0,26.0,...,0.0,0.0,0.0,0.0,4.0,245.0,257.0,19888.094993,20405.309803,33636.359375
014122408131c2a0,8636,158.0,279.0,212.0,177.0,138.0,162.0,119.0,105.0,89.0,...,0.0,0.0,0.0,0.0,4.0,1378.0,1382.0,19888.094993,8556.717105,10002.850746


In [28]:
def normalized_columns_by_column_mean(data):
    """Scale every column of ``data`` by that column's own mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all numeric. It is not modified.

    Returns
    -------
    (norm_dict, processed_data)
        ``norm_dict`` maps each column name to the divisor that was applied to
        it, stored as a built-in ``float`` so that ``yaml.dump`` writes a plain
        number rather than a numpy-specific object tag. ``processed_data`` is a
        new frame holding ``column / divisor`` for every column.

    Notes
    -----
    A column whose mean is exactly zero (for example an all-zero counter) is
    left unscaled and recorded with a divisor of ``1.0``; dividing by the zero
    mean would otherwise fill the column with NaN/inf. Multiplying by the
    recorded divisor always restores the original values.
    """
    norm_dict = {}
    processed_data = data.copy()
    for column in processed_data.columns:
        divisor = float(processed_data[column].mean())
        if divisor == 0.0:
            # Avoid 0/0 -> NaN and x/0 -> inf; keep the column as it is.
            divisor = 1.0
        norm_dict[column] = divisor
        # Replace the column instead of writing through .loc so that an
        # integer column can become float without an in-place dtype upcast.
        processed_data[column] = processed_data[column] / divisor
    return norm_dict, processed_data


def normalized_columns_by_target_max(data, target_column_name):
    """Divide every column of ``data`` by the maximum of the target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame that contains ``target_column_name``. It is not modified.
    target_column_name : str
        Column whose maximum is used as the single divisor for the whole frame.

    Returns
    -------
    (norm_dict, processed_data)
        ``norm_dict`` is ``{'all_data_target_max': <divisor>}`` with the divisor
        stored as a built-in ``float`` (a numpy scalar would be written by
        ``yaml.dump`` as a Python-specific object tag). ``processed_data`` is a
        new frame equal to ``data / divisor``.
    """
    target_max = float(data[target_column_name].max())
    norm_dict = {'all_data_target_max': target_max}
    # The division already allocates a new frame, so no explicit copy is needed.
    processed_data = data / target_max
    return norm_dict, processed_data


def normalized_columns_by_target_median(data, target_column_name):
    """Divide every column of ``data`` by the median of the target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame that contains ``target_column_name``. It is not modified.
    target_column_name : str
        Column whose median is used as the single divisor for the whole frame.

    Returns
    -------
    (norm_dict, processed_data)
        ``norm_dict`` is ``{'all_data_target_median': <divisor>}`` with the
        divisor stored as a built-in ``float`` (a numpy scalar would be written
        by ``yaml.dump`` as a Python-specific object tag). ``processed_data`` is
        a new frame equal to ``data / divisor``.
    """
    target_median = float(data[target_column_name].median())
    norm_dict = {'all_data_target_median': target_median}
    # The division already allocates a new frame, so no explicit copy is needed.
    processed_data = data / target_median
    return norm_dict, processed_data


def normalized_columns_by_target_mean(data, target_column_name):
    """Divide every column of ``data`` by the mean of the target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame that contains ``target_column_name``. It is not modified.
    target_column_name : str
        Column whose mean is used as the single divisor for the whole frame.

    Returns
    -------
    (norm_dict, processed_data)
        ``norm_dict`` is ``{'all_data_target_mean': <divisor>}`` with the
        divisor stored as a built-in ``float`` (a numpy scalar would be written
        by ``yaml.dump`` as a Python-specific object tag). ``processed_data`` is
        a new frame equal to ``data / divisor``.
    """
    target_mean = float(data[target_column_name].mean())
    norm_dict = {'all_data_target_mean': target_mean}
    # The division already allocates a new frame, so no explicit copy is needed.
    processed_data = data / target_mean
    return norm_dict, processed_data


def update_config_dict(config_dict, norm_dict, data_prefix='mean_normalized'):
    """Derive the configuration that describes a normalized data file.

    A shallow copy of ``config_dict`` is returned in which ``data_file_name``
    is prefixed with ``data_prefix`` (joined by an underscore) and
    ``norm_dict`` is stored under the ``'norm_dict'`` key. The dictionary
    passed in is left as it was.
    """
    updated = config_dict.copy()
    prefixed_name = '{}_{}'.format(data_prefix, updated['data_file_name'])
    updated.update({
        'data_file_name': prefixed_name,
        'norm_dict': norm_dict,
    })
    return updated

def upload_content_to_GCS(config_dict, data, processed_data_yaml_file='processed_data_configuration.yaml'):
    """Write ``data`` and ``config_dict`` to temporary files and upload both to GCS.

    Parameters
    ----------
    config_dict : dict
        Must contain ``'GCS_path'`` and ``'data_file_name'``; they form the
        destination of the CSV (``<GCS_path>/<data_file_name>``). The whole
        dict is dumped as YAML.
    data : pandas.DataFrame
        Frame written with ``to_csv``.
    processed_data_yaml_file : str
        File name of the YAML configuration under ``GCS_path``.

    The two temporary paths are printed (YAML first, then CSV). Both files are
    removed before returning, including when writing or uploading fails.
    """
    temp_paths = []
    try:
        # mkstemp creates the files atomically and leaves them in place, unlike
        # taking ``.name`` from a NamedTemporaryFile that is deleted as soon as
        # the object is garbage-collected (which leaves a reusable, racy path).
        for _ in range(2):
            handle, path = tempfile.mkstemp()
            os.close(handle)
            temp_paths.append(path)
        local_data_file, local_yaml_file = temp_paths

        data.to_csv(local_data_file)
        print(local_yaml_file)
        print(local_data_file)
        with open(local_yaml_file, 'w') as output:
            yaml.dump(config_dict, output)

        bucket = GCS_Bucket()
        bucket.put(local_data_file, "{}/{}".format(config_dict['GCS_path'], config_dict['data_file_name']))
        bucket.put(local_yaml_file, "{}/{}".format(config_dict['GCS_path'], processed_data_yaml_file))
    finally:
        # Always clean up, even when writing or uploading raised.
        for path in temp_paths:
            if os.path.exists(path):
                os.remove(path)

In [30]:
# Exactly one normalization is active; the alternatives are kept commented out for reference.
#norm_dict, processed_data = normalized_columns_by_column_mean(data) ## use mean for each column
#norm_dict, processed_data = normalized_columns_by_target_mean(data, target_name) ## use the target mean
#norm_dict, processed_data = normalized_columns_by_target_max(data, target_name) ## use the target max
norm_dict, processed_data = normalized_columns_by_target_median(data, target_name) ## use the target median

In [32]:
processed_data.head()

Unnamed: 0_level_0,total_views,views_0min_to_10min,views_10min_to_20min,views_20min_to_30min,views_30min_to_40min,views_40min_to_50min,views_50min_to_60min,views_60min_to_70min,views_70min_to_80min,views_80min_to_90min,...,pageReferrer_SOCIAL_80min_to_90min,pageReferrer_SOCIAL_90min_to_100min,pageReferrer_SOCIAL_100min_to_110min,pageReferrer_SOCIAL_110min_to_120min,minLocalDateInWeek,minLocalTime,createTime,articleInfo_type,articleInfo_authorName,articleInfo_section
articleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001cd0fd58dc13b2,1.975614,0.005398,0.01815,0.041977,0.040302,0.041046,0.035834,0.014892,0.016009,0.018522,...,0.0,0.0,0.0,0.0,0.000186,0.093541,0.094285,1.851089,1.234489,1.233934
0035dc34e6c4dbc8,1.51452,0.01089,0.035462,0.037789,0.037789,0.022617,0.023548,0.021314,0.023083,0.021873,...,0.0,0.0,0.0,0.0,0.000558,0.101359,0.10201,1.851089,1.430838,0.931017
0104a12e6d11db7d,1.839073,0.001862,0.005026,0.004654,0.005119,0.006329,0.01331,0.014241,0.01787,0.021593,...,0.0,0.0,0.0,0.0,9.3e-05,0.028946,0.029784,1.851089,2.840517,1.814037
0131a9d62b685261,6.373883,0.000465,0.001675,0.002048,0.002699,0.002978,0.002885,0.002327,0.00242,0.00242,...,0.0,0.0,0.0,0.0,0.000372,0.022803,0.02392,1.851089,1.899228,3.130711
014122408131c2a0,0.803797,0.014706,0.025968,0.019732,0.016474,0.012844,0.015078,0.011076,0.009773,0.008284,...,0.0,0.0,0.0,0.0,0.000372,0.128258,0.12863,1.851089,0.796418,0.931017


#### create a new `config_dict`

In [33]:
# data_prefix is prepended to the data file name, so it should name the normalization that
# actually produced processed_data.
#processed_dict = update_config_dict(config_dict, norm_dict, data_prefix='target_mean_normalized')
#processed_dict = update_config_dict(config_dict, norm_dict, data_prefix='target_max_normalized')
processed_dict = update_config_dict(config_dict, norm_dict, data_prefix='target_median_normalized')

In [34]:
#processed_dict['norm_dict'][target_name]
processed_dict

{'GCS_path': 'test/MachineLearning',
 'data_file_name': 'target_median_normalized_NYDN_240min_fullWindow_120min_exposure_600seconds_interval_target_24hr_data.csv',
 'index_column': 'articleId',
 'label_column': 'total_views',
 'norm_dict': {'all_data_target_median': 10744.0},
 'static_columns': ['minLocalDateInWeek',
  'minLocalTime',
  'createTime',
  'articleInfo_type',
  'articleInfo_authorName',
  'articleInfo_section'],
 'time_interval_columns': ['views',
  'US_counts',
  'sessionReferrer_DIRECT',
  'sessionReferrer_SOCIAL',
  'sessionReferrer_SEARCH',
  'sessionReferrer_OTHER',
  'platform_PHON',
  'platform_DESK',
  'platform_TBLT',
  'platform_OTHR',
  'pageReferrer_OTHER',
  'pageReferrer_SEARCH',
  'pageReferrer_DIRECT',
  'pageReferrer_EMPTY_DOMAIN',
  'pageReferrer_SOCIAL'],
 'time_step_list': ['0min_to_10min',
  '10min_to_20min',
  '20min_to_30min',
  '30min_to_40min',
  '40min_to_50min',
  '50min_to_60min',
  '60min_to_70min',
  '70min_to_80min',
  '80min_to_90min',
  '90

#### save both the data and `config_dict` to local temp files and upload them to GCS

In [35]:
# Writes the normalized data and its configuration to temporary files, uploads both to GCS
# and prints the two temporary paths (shown below).
#upload_content_to_GCS(processed_dict, processed_data, processed_data_yaml_file='target_mean_norm_configuration.yaml')
#upload_content_to_GCS(processed_dict, processed_data, processed_data_yaml_file='target_max_norm_configuration.yaml')
upload_content_to_GCS(processed_dict, processed_data, processed_data_yaml_file='target_median_norm_configuration.yaml')

/var/folders/k5/nkzf3kxj2v721gq7p2nytvv48ppbxr/T/tmpbkAIg0
/var/folders/k5/nkzf3kxj2v721gq7p2nytvv48ppbxr/T/tmpWsEbsi


In [None]:
%%bash
du -ah /var/folders/k5/nkzf3kxj2v721gq7p2nytvv48ppbxr/T/tmpQ22XFy

In [None]:
data[target_name].median()

In [None]:
data_mean = data[target_name].mean()

In [None]:
data_mean

In [None]:
data.loc[:, target_name] = data[target_name] / data_mean

In [None]:
data.loc[target_name] = data[target_name] / data_mean

In [None]:
data.loc[data[target_name] > 10, target_name].hist(bins=50)