# TODO:

- Ignore NaN values when calculating moving averages
- Clean dfs that have column headers but don't have data
- Create mechanism that will cut dataframe into hourly, 12 hour, daily, etc slices


# Imports and Function Declarations

In [1]:
import urllib
import requests
import pickle
from pandas.io.json import json_normalize
import json
import pandas as pd
pd.options.mode.chained_assignment = None
import datetime
import time
import pytz
import math
import numpy as np
from scipy.stats import zscore
import sys


def generate_item_lookup(file_name):
    """
        Reads in item name/ids from file
        
            @param file_name: file containing item name/ids
    """
    return pd.read_csv(file_name, skiprows=[])

def get_item_records_from_pickle(file_name):
    """
        This method loads data from the Moophan provided Pickle files instead of from 
        the deprecated RSBuddy API
        
            @param file_name: the name of the pickle file from which to load data
    """
    # Open file and read into data structure
    with open(file_name, 'rb') as row:
        ge_data = pickle.load(row)
        
    # Get all item name/ids for lookup
    item_lookup = generate_item_lookup('./item_key.csv')
        
    # Create data frame from loaded data by iterating through all contained items
    item_records = pd.DataFrame(columns=['id', 'name', 'data'])
    item_keys = list(ge_data.keys())
    curr = 0
    for key in item_keys:
        #Get name of item
        name = item_lookup.loc[item_lookup['id'] == int(key)]['name'].item()
        sys.stdout.write('\rProcessing Item: {0} / {1}'.format(curr, len(item_keys)))
        sys.stdout.flush()
        #Grab data
        data = pd.DataFrame(ge_data[key])
        item_records = item_records.append({'id':int(key), 'name': name, 'data':data}, ignore_index=True)
        curr += 1
        
    # Print Complettion
    completion_time = datetime.datetime.now().strftime('%H:%M:%S')
    sys.stdout.write('\r{0} -- Loading From Pickle Complete.'.format(completion_time))
    sys.stdout.flush()
        
    return item_records

def drop_empty_items(item_records):
    """
        Will check to see if item data is empty, if so, drop item
            
            @param item_records: pandas dataframe representing dataset
    """
    # Find all indices to drop
    drop_indices = []
    for idx, row in item_records.iterrows():
        #Drop row if item data empty
        if(row['data'].empty):
            drop_indices.append(idx)
        else:            
            #Drop row if data doesn't have all columns
            columns = set(['ts', 'buyingPrice', 'buyingCompleted', 'sellingPrice', 'sellingCompleted', 'overallPrice', 'overallCompleted'])
            if not columns.issubset(set(row['data'].columns)):
                drop_indices.append(idx)

            # Drop row if data has no rows
            if(row['data'].shape[0] == 0):
                drop_indices.append(idx)
              
    #Make drop_indices unique
    drop_indices = list(set(drop_indices)) 
    #Drop rows with empty data
    item_records = item_records.drop(drop_indices)    
    #Reset index
    item_records.reset_index(drop=True, inplace=True)
    
    # Print Completion
    print('Dropped {0} Rows from Dataset'.format(len(drop_indices)))
    completion_time = datetime.datetime.now().strftime('%H:%M:%S')
    print('{0} -- Data Cleaning Complete.'.format(completion_time))
    
    return item_records
    

def format_item_record_dates(item_records):
    """ 
        Converts timestamp to Unix seconds, tacks on formatted date field, and creates Unix 
        seconds from most recent datapoint, and Unix seconds position from Jan-1.
        
            @param item_records: the master item data structure
    """
    #Get most recent timestamp in all the data (highest value) 
    most_recent_ts = 0
    for _, row in item_records.iterrows():
        max_ts = row['data']['ts'].max()
        if max_ts > most_recent_ts:
            most_recent_ts = max_ts / 1000 #Convert to seconds
    
    #Get the timestamp of Jan-1 for 2015, 2016, 2017, and 2018
    jan1_timestamps = {}
    time_adjust = 18000 #Seconds ahead of EST
    for year in [2015, 2016, 2017, 2018]:
        jan1 = datetime.date(year, 1, 1)
        jan1_ts = time.mktime(jan1.timetuple())
        jan1_timestamps[str(year)] = jan1_ts - time_adjust
        
    for idx, row in item_records.iterrows():
        #Convert timestamp from milliseconds to seconds
        row['data']['ts'] = row['data']['ts'] / 1000
        #Append column to indicate time delta from most recent record in dataset
        row['data']['tsFromCurrent'] = most_recent_ts - row['data']['ts']
        #Append column that converts ts to datetime object
        row['data']['date'] = pd.to_datetime(row['data']['ts'], unit='s')
        row['data']['year'] = row['data']['date'].dt.strftime('%Y')
        #Append a column representing the YTD seconds
        row['data']['tsYtd'] = row['data']['ts'] - pd.Series([jan1_timestamps[year] for year in row['data']['year']])
        row['data'].drop(columns=['year'], inplace=True)
        
        sys.stdout.write('\rProcessing Item: {0}/{1}'.format(idx, item_records.shape[0]))
        sys.stdout.flush()
        
    completion_time = datetime.datetime.now().strftime('%H:%M:%S')
    sys.stdout.write('\r{0} -- Date Formatting Complete.'.format(completion_time))
    sys.stdout.flush()
    print('')
            
    return item_records

def fill_nans_with_ma(item_records):
    '''
        Fill NaN values in basic cols (buyingCompletd, sellingPrice, etc.) with rolling average of previous 5
        entries
        
            @param item_records: dataframe containing all item data
    '''
    # Iterate through each item
    for idx, row in item_records.iterrows():
        # Progress 
        sys.stdout.write('\rProcessing Item: {0}/{1}'.format(idx, item_records.shape[0]))
        sys.stdout.flush()        
        # Fill NaN values in columns below with rolling average of past 5 entries (excluding other NaNs)
        cols_to_fill = ['buyingCompleted', 'buyingPrice', 'sellingCompleted', 'sellingPrice', 'overallCompleted', 'overallPrice']
        row['data'][cols_to_fill] = row['data'][cols_to_fill].fillna(row['data'][cols_to_fill].rolling(window=6, min_periods=1).mean())
        
        # Drop any rows that were unable to be filled 
        row['data'] = row['data'].dropna()
        
    # Print complete 
    completion_time = datetime.datetime.now().strftime('%H:%M:%S')
    sys.stdout.write('\r{0} -- NaN Replacement Complete.'.format(completion_time))
    sys.stdout.flush()
    
    return item_records

def generate_train_and_test_deltas(item_records):    
    """
        Calculate percent difference from previous record for each relevant column. 
            
            @param item_records: the master item data structure
    """
    #Iterate through each item, and calculate percentages for their data
    for idx, row in item_records.iterrows():
        sys.stdout.write('\rProcessing Item: {0}/{1}'.format(idx, item_records.shape[0]))
        sys.stdout.flush()
        #Create temp dataframe for percent changes
        pct_change_df = row['data'][['buyingPrice', 'buyingCompleted', 'sellingPrice', 'sellingCompleted', 'overallPrice', 'overallCompleted']].copy()
        pct_change_df = pct_change_df.rename(index=int, 
                                         columns={'buyingPrice':'buyingPricePer', 'buyingCompleted':'buyingCompletedPer',
                                        'sellingPrice':'sellingPricePer', 'sellingCompleted':'sellingCompletedPer',
                                        'overallPrice':'overallPricePer', 'overallCompleted':'overallCompletedPer'})
        #Calculate percent change
        pct_change_df = pct_change_df.pct_change()
        
        #Join pct_change_df to existing row
        row['data'] = row['data'].join(pct_change_df)
        
    # Print Completion
    completion_time = datetime.datetime.now().strftime('%H:%M:%S')
    sys.stdout.write('\r{0} -- Delta Creation Complete.'.format(completion_time))
    sys.stdout.flush()
        
    return item_records
        
def get_next_period_overall_price(item_records):
    """
        Append the next period's overall price to each entry
        
            @param item_records: the master item data structure
    """
    #Iterate through each item in item_records
    for idx, row in item_records.iterrows():
        #Progress
        sys.stdout.write('\rProcessing Item: {0}/{1}'.format(idx, item_records.shape[0]))
        sys.stdout.flush()
        #Append new column for next period value
        new_col_name = 'overallPriceNext'
        row['data'][new_col_name] = row['data']['overallPricePer'].shift(-1)
        #Drop any rows that didn't have a next period price
        row['data'] = row['data'].dropna(subset=[new_col_name])
        
    # Print completion
    completion_time = datetime.datetime.now().strftime('%H:%M:%S')
    sys.stdout.write('\r{0} -- Next Period Price Calculation Complete.'.format(completion_time))
    sys.stdout.flush()
    
    return item_records
        
        
def generate_moving_averages(item_records, columns, window):
    """
        Calculate the moving average for each field with specified window size
        
            @param item_records: the master item data structure
            @param columns: a list of which columns (names) we are calculating average for
            @param window: the number of periods to average over -- e.g. 60 if doing 12 hour item records for 30 days
    """
    #Iterate through each item and calculate the moving average for each field
    for idx, row in item_records.iterrows():
        #Progress
        sys.stdout.write('\rProcessing Item: {0}/{1}'.format(idx, item_records.shape[0]))
        sys.stdout.flush()
        for col in columns:
            ma_col_name = col + 'MA'
            row['data'][ma_col_name] = row['data'][col].rolling(window=window, min_periods=int(window/2)).mean()
            
            #Drop all rows where MA is NaN
            row['data'] = row['data'].dropna(subset=[ma_col_name])
            
    # Print completion
    completion_time = datetime.datetime.now().strftime('%H:%M:%S')
    sys.stdout.write('\r{0} -- Moving Average Calculation Complete.'.format(completion_time))
    sys.stdout.flush()
            
    return item_records
            
def generate_z_scores(item_records, columns, outlier_thresh=3):
    """
        Generates z-scores for passed columns and appends to dataframe
        
            @param item_records: the master item data structure
            @param columns: the columns to calculate z-score of
            @param outlier_thresh: the zscore threshold after which we will remove a row
    """
    for idx, row in item_records.iterrows():
        #Iterate through each item and calculate z-scores for columns
        zscore_df = row['data'][columns].apply(zscore)
        #Rename columns
        rename_cols ={'buyingPricePer':'buyingPricePerZScore', 'buyingCompletedPer':'buyingCompletedPerZScore',
                                        'sellingPricePer':'sellingPricePerZScore', 'sellingCompletedPer':'sellingCompletedPerZScore',
                                        'overallPricePer':'overallPricePerZScore', 'overallCompletedPer':'overallCompletedPerZScore'}
        zscore_df.rename(columns=rename_cols, inplace=True)
        #Join zscore df to original item df
        row['data'] = row['data'].join(zscore_df)
        #Remove any outliers
        init_rows = row['data'].shape[0]
        for col in columns:
            col_name = col + 'ZScore'
            row['data'] = row['data'][abs(row['data'][col_name]) < outlier_thresh]
            
        #Display rows removed
        new_rows = row['data'].shape[0]
        sys.stdout.write('\rRemoved {0} Outlier Rows for {1}'.format(init_rows - new_rows, row['name']))
        sys.stdout.flush()
        
        #Reset Index
        row['data'].reset_index(drop=True, inplace=True)
        
    # Print completion
    completion_time = datetime.datetime.now().strftime('%H:%M:%S')
    sys.stdout.write('\r{0} -- Z-Score Calculation Complete.'.format(completion_time))
    sys.stdout.flush()
    
    return item_records
        
def remove_excess_columns(item_records):
    '''
        Removes columns that aren't necessary to train or evaluate the ANN
        
            @param item_records: the master item data structure
    '''
    cols_to_keep = ['buyingPricePer', 'sellingPricePer', 'overallPricePer',
                    'buyingCompletedPer', 'sellingCompletedPer', 'overallCompletedPer',
                    'overallPriceNext', 'tsFromCurrent', 'tsYtd',
                    'buyingPricePerMA', 'sellingPricePerMA', 'overallPricePerMA',
                    'buyingCompletedPerMA', 'sellingPricePerMA', 'overallPricePerMA']
    #Iterate through each item and drop excess cols
    for idx, row in item_records.iterrows():
        #Progress
        sys.stdout.write('\rProcessing Item: {0}/{1}'.format(idx, item_records.shape[0]))
        sys.stdout.flush()
        row['data'] = row['data'][cols_to_keep]
        
    # Print completion
    completion_time = datetime.datetime.now().strftime('%H:%M:%S')
    sys.stdout.write('\r{0} -- Column Removal Complete.'.format(completion_time))
    sys.stdout.flush()

def append_names_to_item_records(item_records):
    """
        Appends the item name to each row in item record data
        
            @param item_record: the master item data structure
    """
    for idx, row in item_records.iterrows():
        #Progress
        sys.stdout.write('\rProcessing Item: {0}/{1}'.format(idx, item_records.shape[0]))
        sys.stdout.flush()
        row['data']['item_name'] = row['name']
        
    # Print completion
    completion_time = datetime.datetime.now().strftime('%H:%M:%S')
    sys.stdout.write('\r{0} -- Name Appension Complete.'.format(completion_time))
    sys.stdout.flush()
    
def aggregate_cleaned_item_data(item_records):
    """
        Takes in cleaned master item data structure and returns a dataframe with all
        item data aggregated
        
            @param item_records: the master item data structure (cleaned)
            @returns: a new dataframe with aggregated item data
    """
    #Apply item names to all item records
    append_names_to_item_records(item_records)
    aggregate_df = pd.DataFrame()
    #Iterate through each item and append to aggregate df
    for idx, row in item_records.iterrows():
        #Progress
        sys.stdout.write('\rAggregating Item: {0}/{1}'.format(idx, item_records.shape[0]))
        sys.stdout.flush()
        aggregate_df = aggregate_df.append(row['data'], ignore_index=True)
    
    #Reset index in place
    aggregate_df.reset_index(drop=True, inplace=True)
    
    # Print completion
    completion_time = datetime.datetime.now().strftime('%H:%M:%S')
    sys.stdout.write('\r{0} -- Item Aggregation Complete.'.format(completion_time))
    sys.stdout.flush()
    
    return aggregate_df

# Generate Item Records

In [2]:
#----- GENERATE ITEM RECORDS -----#
item_records = get_item_records_from_pickle('ge_data_1.pickle')

17:22:51 -- Loading From Pickle Complete.

# Initial Cleaning

In [3]:
item_records = drop_empty_items(item_records)
item_records = format_item_record_dates(item_records)
item_records = fill_nans_with_ma(item_records)

Dropped 9 Rows from Dataset
17:29:44 -- Data Cleaning Complete.
17:33:16 -- Date Formatting Complete.
17:33:30 -- NaN Replacement Complete.

# Convert Data to Deltas

In [4]:
#----- CONVERT DATA TO DELTAS -----#
item_records = generate_train_and_test_deltas(item_records)

17:56:58 -- Delta Creation Complete.

# Get Period's Overall Price for Each Entry
These next period values are what we will be attempting to predict with the ANN. 

In [5]:
#----- GET NEXT PERIOD OVERALL PRICE FOR EACH ENTRY -----#
item_records = get_next_period_overall_price(item_records)

17:57:08 -- Next Period Price Calculation Complete.

# Generate 30-Day Averages For Columns

In [6]:
#------ CREATE 30 DAY AVERAGES FOR COLUMNS -----#
columns = ['overallPricePer', 'overallCompletedPer', 'buyingPricePer', 'buyingCompletedPer',\
           'sellingPricePer', 'sellingCompletedPer']
window = 1440 #24/.5 hour periods * 30 days = 1440 window
item_records = generate_moving_averages(item_records, columns, window)

17:57:53 -- Moving Average Calculation Complete.

# Generate Z-Scores and Remove Outliers

In [7]:
#----- Generate Z-Scores for Percentage columns and remove outliers -----#
item_records = drop_empty_items(item_records)
item_records = generate_z_scores(item_records, columns)

Dropped 92 Rows from Dataset
18:09:50 -- Data Cleaning Complete.
Removed 54350 Outlier Rows for Priest gowndetet))unf)

  x = asanyarray(arr - arrmean)
  return (a - mns) / sstd


18:10:17 -- Z-Score Calculation Complete.ts (e)ompss

# Remove Columns Not Required for ANN

In [8]:
remove_excess_columns(item_records)

18:10:31 -- Column Removal Complete.

# Aggregate all Cleaned Item Data

We set item_records equal to the aggregation to free up memory that would be wasted if they were both populated

In [9]:
item_records = aggregate_cleaned_item_data(item_records)

18:14:11 -- Item Aggregation Complete.

In [None]:
aggregate_df.iloc[0]