# Imports and Function Declarations

In [320]:
import urllib
import requests
from pandas.io.json import json_normalize
import json
import pandas as pd
pd.options.mode.chained_assignment = None
import datetime
import time
import pytz
import math
import numpy as np
from scipy.stats import zscore

from neural_network import NeuralNetwork

"""
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers import Activation, Dense
from keras.models import Sequential
"""

def generate_item_records_from_summary():
    """ 
        Generate item records using the summary.json link from the OSBuddy API
    """
    df = pd.read_json(path_or_buf='https://rsbuddy.com/exchange/summary.json',orient='index', convert_axes=True)
    df = df[['id','name','buy_average','buy_quantity','sell_average','sell_quantity','overall_average','overall_quantity']]
    data = df.sort_values(by=['id']).reset_index()
    data = data.drop(labels='index',axis=1)
    
    #Output item id/name pairs to a csv file
    item_key = data[['id', 'name']]
    file_name = './item_key.csv'
    item_key.to_csv(path_or_buf=file_name, columns=('id','name'), index=False)

def generate_input_data_by_item_number(start_num, num_items): 
    """
        Pull items by range of Index values
        
            @param start_num: the starting index 
            @param num_items: the number of items to generate data for, starting from
                              the start_num
    """
    file_name = './item_key.csv'
    all_items = pd.read_csv(file_name, skiprows=[])
    items = all_items[start_num: start_num + num_items]
    return items

def generate_input_data_by_item_name(names): 
    """
        Pull specific items by name
        
            @param names: a list containing the item names to generate data for
    """
    file_name = './item_key.csv'
    data = pd.read_csv(file_name, skiprows=[])
    items = pd.DataFrame()
    for name in names:
        items = items.append(data.loc[data['name'] == name])
    items = items.reset_index().drop(labels='index',axis=1)
    return items

def get_item_records_from_url(input_data, frequency):
    """ 
        Iterate through all items, grab data from api and append to dataframe 
        
            @param input_data: a list of items with their associated index
            @param frequency: the period at which data is sampled, in minutes
    """
    i = 0
    item_lookup_failed = False
    item_records = pd.DataFrame(columns=['id', 'name', 'data'])
    while(i < input_data['id'].count()):
        key = input_data.iloc[i]['id']
        name = input_data.iloc[i]['name']
        #Print item information only on first data retrieval attempt 
        if not item_lookup_failed:
            print('Querying API for ' + name + ' data. ' + 'Item Id: ' + str(key))
        try:
            #Attempt to get data from API, retry if HTTP error
            url = f'https://api.rsbuddy.com/grandExchange?a=graph&g={frequency}&start=1474615279000&i={key}'
            temp_df = pd.DataFrame()
            temp_df = pd.read_json(path_or_buf=url, orient='records', convert_axes=False)
            item_records = item_records.append({'id':key, 'name': name, 'data':temp_df}, ignore_index=True)
            item_lookup_failed = False
            i+=1
        except:
            print("Retrying...")
            time.sleep(1) #Avoid getting blacklisted by API
            item_lookup_failed = True
            
    print('Item retrieval complete!')

    #Add correct formatting to item record dates/times
    item_records = format_item_record_dates(item_records)

    return item_records

def format_item_record_dates(item_records):
    """ 
        Converts timestamp to Unix seconds, tacks on formatted date field, and creates Unix 
        seconds from most recent datapoint, and Unix seconds position from Jan-1.
        
            @param item_records: the master item data structure
    """
    #Get most recent timestamp in all the data (highest value) 
    most_recent_ts = 0
    for row in item_records['data']:
        max_ts = row['ts'].max()
        if max_ts > most_recent_ts:
            most_recent_ts = max_ts
    
    #Get the timestamp of Jan-1 for 2015, 2016, 2017, and 2018
    jan1_timestamps = {}
    time_adjust = 18000 #Seconds ahead of EST
    for year in [2016, 2017, 2018]:
        jan1 = datetime.date(year, 1, 1)
        jan1_ts = time.mktime(jan1.timetuple())
        jan1_timestamps[str(year)] = jan1_ts - time_adjust

    for row in item_records['data']:
        #Convert timestamp from milliseconds to seconds
        row['ts'] = row['ts'] / 1000
        #Append column to indicate time delta from most recent record in dataset
        row['tsFromCurrent'] = most_recent_ts - row['ts']
        #Append column that converts ts to datetime object
        row['date'] = pd.to_datetime(row['ts'], unit='s')
        row['year'] = row['date'].dt.strftime('%Y')
        #Append a column representing the YTD seconds
        row['tsYtd'] = row['ts'] - pd.Series([jan1_timestamps[year] for year in row['year']])
        row.drop(columns=['year'], inplace=True)

            
    return item_records

def generate_train_and_test_deltas(item_records):    
    """
        Calculate percent difference from previous record for each relevant column. 
            
            @param item_records: the master item data structure
    """
    #Iterate through each item, and calculate percentages for their data
    for _, row in item_records.iterrows():
        #Create temp dataframe for percent changes
        pct_change_df = row['data'][['buyingPrice', 'buyingCompleted', 'sellingPrice', 
                            'sellingCompleted', 'overallPrice', 'overallCompleted']].copy()
        pct_change_df = pct_change_df.rename(index=int, 
                                         columns={'buyingPrice':'buyingPricePer', 'buyingCompleted':'buyingCompletedPer',
                                        'sellingPrice':'sellingPricePer', 'sellingCompleted':'sellingCompletedPer',
                                        'overallPrice':'overallPricePer', 'overallCompleted':'overallCompletedPer'})
        #Calculate percent change
        pct_change_df = pct_change_df.pct_change()
        
        #Join percent change df back to original dataframe
        row['data'] = row['data'].join(pct_change_df)
        
def generate_moving_averages(item_records, columns, window):
    """
        Calculate the moving average for each field with specified window size
        
            @param item_records: the master item data structure
            @param columns: a list of which columns (names) we are calculating average for
            @param window: the number of periods to average over -- e.g. 60 if doing 12 hour item records for 30 days
    """
    #Iterate through each item and calculate the moving average for each field
    for _, row in item_records.iterrows():
        for col in columns:
            ma_col_name = col + 'MA'
            row['data'][ma_col_name] = row['data'][col].rolling(window=window).mean()
            
            #Drop all rows where MA is NaN
            row['data'] = row['data'].dropna(subset=[ma_col_name])
            
def generate_z_scores(item_records, columns, outlier_thresh=3):
    """
        Generates z-scores for passed columns and appends to dataframe
        
            @param item_records: the master item data structure
            @param columns: the columns to calculate z-score of
            @param outlier_thresh: the zscore threshold after which we will remove a row
    """
    for _, row in item_records.iterrows():
        #Iterate through each item and calculate z-scores for columns
        zscore_df = row['data'][columns].apply(zscore)
        #Rename columns
        rename_cols ={'buyingPricePer':'buyingPricePerZScore', 'buyingCompletedPer':'buyingCompletedPerZScore',
                                        'sellingPricePer':'sellingPricePerZScore', 'sellingCompletedPer':'sellingCompletedPerZScore',
                                        'overallPricePer':'overallPricePerZScore', 'overallCompletedPer':'overallCompletedPerZScore'}
        zscore_df.rename(columns=rename_cols, inplace=True)
        #Join zscore df to original item df
        row['data'] = row['data'].join(zscore_df)
        #Remove any outliers
        init_rows = row['data'].shape[0]
        for col in columns:
            col_name = col + 'ZScore'
            row['data'] = row['data'][abs(row['data'][col_name]) < outlier_thresh]
            
        #Display rows removed
        new_rows = row['data'].shape[0]
        print('Removed {0} Outlier Rows for {1}'.format(init_rows - new_rows, row['name']))
        
        #Reset Index
        row['data'].reset_index(drop=True, inplace=True)

def append_names_to_item_records(item_records):
    """
        Appends the item name to each row in item record data
        
            @param item_record: the master item data structure
    """
    for _, row in item_records.iterrows():
        row['data']['item_name'] = row['name']
    
def aggregate_cleaned_item_data(item_records):
    """
        Takes in cleaned master item data structure and returns a dataframe with all
        item data aggregated
        
            @param item_records: the master item data structure (cleaned)
            @returns: a new dataframe with aggregated item data
    """
    #Apply item names to all item records
    append_names_to_item_records(item_records)
    aggregate_df = pd.DataFrame()
    #Iterate through each item and append to aggregate df
    for ind, row in item_records.iterrows():
        aggregate_df = aggregate_df.append(row['data'], ignore_index=True)
    
    #Reset index in place
    aggregate_df.reset_index(drop=True, inplace=True)
    
    return aggregate_df

# Generate Input Data

In [321]:
#------ GENERATE INPUT DATA ------#
#Get a list of all items
generate_item_records_from_summary()

#Pull items by range of Index values
#input_data = generate_input_data_by_item_number(0,3)

#Pull specific items by name
input_data = generate_input_data_by_item_name(['Leather','Dragon boots', 'Rune arrow'])

# Generate Item Records

In [323]:
#----- GENERATE ITEM RECORDS -----#
period_length = 240 #in minutes
item_records = get_item_records_from_url(input_data, period_length)

Querying API for Leather data. Item Id: 1741
Retrying...
Retrying...
Retrying...
Retrying...
Retrying...
Retrying...
Querying API for Dragon boots data. Item Id: 11840
Retrying...
Retrying...
Querying API for Rune arrow data. Item Id: 892
Retrying...
Item retrieval complete!


# Convert Data to Deltas

In [324]:
#----- CONVERT DATA TO DELTAS -----#
generate_train_and_test_deltas(item_records)

# Generate 30-Day Averages For Columns

In [325]:
#------ CREATE 30 DAY AVERAGES FOR COLUMNS -----#
columns = ['overallPricePer', 'overallCompletedPer', 'buyingPricePer', 'buyingCompletedPer',\
           'sellingPricePer', 'sellingCompletedPer']
window = 180 #24/4 hour periods * 30 days = 180 window
generate_moving_averages(item_records, columns, window)

# Generate Z-Scores and Remove Outliers

In [326]:
#----- Generate Z-Scores for Percentage columns and remove outliers -----#
generate_z_scores(item_records, columns)

Removed 132 Outlier Rows for Leather
Removed 53 Outlier Rows for Dragon boots
Removed 69 Outlier Rows for Rune arrow


# Aggregate all Cleaned Item Data

In [327]:
aggregate_df = aggregate_cleaned_item_data(item_records)

In [328]:
print(aggregate_df)

      buyingCompleted  buyingPrice  overallCompleted  overallPrice  \
0             14632.0        236.0             21693           228   
1             22706.0        262.0             57787           269   
2             35096.0        264.0             73215           263   
3             50840.0        288.0             83081           284   
4             40249.0        293.0             65436           289   
5             27168.0        298.0             39808           297   
6             17375.0        262.0             37935           279   
7             32909.0        246.0             58176           252   
8             37473.0        237.0             68883           241   
9             39190.0        256.0             81779           252   
10            20238.0        287.0             44751           291   
11            10559.0        247.0             25244           248   
12            15125.0        267.0             29731           258   
13            22909.