# Imports and Function Declarations

In [70]:
import urllib
import requests
from pandas.io.json import json_normalize
import json
import pandas as pd
pd.options.mode.chained_assignment = None
import datetime
import time
import pytz
import math
import numpy as np

from neural_network import NeuralNetwork

"""
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers import Activation, Dense
from keras.models import Sequential
"""


def generate_item_records_from_summary():
    """ 
        Generate item records using the summary.json link from the OSBuddy API
    """
    df = pd.read_json(path_or_buf='https://rsbuddy.com/exchange/summary.json',orient='index', convert_axes=True)
    df = df[['id','name','buy_average','buy_quantity','sell_average','sell_quantity','overall_average','overall_quantity']]
    data = df.sort_values(by=['id']).reset_index()
    data = data.drop(labels='index',axis=1)
    
    #Output item id/name pairs to a csv file
    item_key = data[['id', 'name']]
    file_name = './item_key.csv'
    item_key.to_csv(path_or_buf=file_name, columns=('id','name'), index=False)


def generate_input_data_by_item_number(start_num, num_items): 
    """
        Pull items by range of Index values
        
            @param start_num: the starting index 
            @param num_items: the number of items to generate data for, starting from
                              the start_num
    """
    file_name = './item_key.csv'
    all_items = pd.read_csv(file_name, skiprows=[])
    items = all_items[start_num: start_num + num_items]
    return items

def generate_input_data_by_item_name(names): 
    """
        Pull specific items by name
        
            @param names: a list containing the item names to generate data for
    """
    file_name = './item_key.csv'
    data = pd.read_csv(file_name, skiprows=[])
    items = pd.DataFrame()
    for name in names:
        items = items.append(data.loc[data['name'] == name])
    items = items.reset_index().drop(labels='index',axis=1)
    return items

def get_item_records_from_url(input_data, frequency):
    """ 
        Iterate through all items, grab data from api and append to dataframe 
        
            @param input_data: a list of items with their associated index
            @param frequency: the period at which data is sampled, in minutes
    """
    i = 0
    item_lookup_failed = False
    item_records = pd.DataFrame(columns=['id', 'name', 'data'])
    while(i < input_data['id'].count()):
        key = input_data.iloc[i]['id']
        name = input_data.iloc[i]['name']
        #Print item information only on first data retrieval attempt 
        if not item_lookup_failed:
            print('Querying API for ' + name + ' data. ' + 'Item Id: ' + str(key))
        try:
            #Attempt to get data from API, retry if HTTP error
            url = f'https://api.rsbuddy.com/grandExchange?a=graph&g={frequency}&start=1474615279000&i={key}'
            temp_df = pd.DataFrame()
            temp_df = pd.read_json(path_or_buf=url, orient='records', convert_axes=False)
            item_records = item_records.append({'id':key, 'name': name, 'data':temp_df}, ignore_index=True)
            item_lookup_failed = False
            i+=1
        except:
            print("Retrying...")
            time.sleep(1) #Avoid getting blacklisted by API
            item_lookup_failed = True
            
    print('Item retrieval complete!')

    #Add correct formatting to item record dates/times
    item_records = format_item_record_dates(item_records)

    return item_records

def format_item_record_dates(item_records):
    """ 
        Converts timestamp to Unix seconds, tacks on formatted date field, and creates Unix 
        seconds from most recent datapoint, and Unix seconds position from Jan-1.
        
            @param item_records: the master item data structure
    """
    #Get most recent timestamp in all the data (highest value) 
    most_recent_ts = 0
    for row in item_records['data']:
        max_ts = row['ts'].max()
        if max_ts > most_recent_ts:
            most_recent_ts = max_ts
    
    #Get the timestamp of Jan-1 for 2015, 2016, 2017, and 2018
    jan1_timestamps = {}
    time_adjust = 18000 #Seconds ahead of EST
    for year in [2016, 2017, 2018]:
        jan1 = datetime.date(year, 1, 1)
        jan1_ts = time.mktime(jan1.timetuple())
        jan1_timestamps[str(year)] = jan1_ts - time_adjust


    for row in item_records['data']:
        #Convert timestamp from milliseconds to seconds
        row['ts'] = row['ts'] / 1000
        #Append column to indicate time delta from most recent record in dataset
        row['tsFromCurrent'] = most_recent_ts - row['ts']
        #Append column that converts ts to datetime object
        row['date'] = pd.to_datetime(row['ts'], unit='s')
        #Iterate through each year, and append tsYtd for all values within that year
        for year, jan_1_ts in jan1_timestamps.items():
            # Select sub-dataframe for just this year
            year_df = row[row['date'].dt.strftime('%Y-%m-%d %HH:%MM:%SS').str.contains(year)]
            year_df['tsYtd'] = year_df['ts'] - jan_1_ts
            
    return item_records

def generate_train_and_test_deltas(item_records):    
    """
        Calculate percent difference from previous record for each relevant column. 
            
            @param item_records: the master item data structure
    """
    #Iterate through each item, and calculate percentages for their data
    for _, row in item_records.iterrows():
        #Create temp dataframe for percent changes
        pct_change_df = row['data'][['buyingPrice', 'buyingCompleted', 'sellingPrice', 
                            'sellingCompleted', 'overallPrice', 'overallCompleted']].copy()
        pct_change_df = pct_change_df.rename(index=int, 
                                         columns={'buyingPrice':'buyingPricePer', 'buyingCompleted':'buyingCompletedPer',
                                        'sellingPrice':'sellingPricePer', 'sellingCompleted':'sellingCompletedPer',
                                        'overallPrice':'overallPricePer', 'overallCompleted':'overallCompletedPer'})
        #Calculate percent change
        pct_change_df = pct_change_df.pct_change()
        
        #Join percent change df back to original dataframe
        row['data'] = row['data'].join(pct_change_df)


def createTrainAndTestSet(item_records, start_stamp, epoch_size, periods_per_epoch_unit):
    """
        Create the training and testing data sets for a particular epoch starting at start_stamp, and extending
        epoch_size units into the future.
        
            @param item_records: master item data structure
            @param start_stamp: the starting time stamp of this epoch
            @param epoch_size: the epoch size in days
            @param periods_per_epoch_unit: the number of periods ahead of the last training record
                that we will use this training data to predict
    """
    print("Creating Test and Training Datasets...")
    # Create dataframe to hold training/test data for the epoch
    epoch_dataframe = pd.DataFrame(columns=['id', 'name', 'train_data', 'test_data', 'pred_data'])
    record_index = 0
    end_stamp = start_stamp + math.ceil(epoch_size * periods_per_epoch_unit) 
    
    # Iterate through the each item in item_records
    while(record_index < item_records['id'].count()): 
        #Get item id
        item_id = item_records.iloc[record_index]['id'] 
        
        # Find all item records where ID matchest item_id
        test_set = item_records.loc[item_records['id'] == item_id]
        
        # The time stamp of the value we'll use this training data to predict
        predict_stamp = end_stamp + periods_per_epoch_unit
        
        #Ensure that the value we are attempting to predict exists for testing purposes
        if predict_stamp < len(test_set.iloc[0]['data']):
            # Create training dataset
            train_df = test_set.iloc[0]['data'].iloc[start_stamp:end_stamp]
            train_data = train_df[train_df.columns[-9:]][1:]          
            
            # Get test value -- the price 6 periods from the end of the training set (24 hours) 
            price_col = 14
            test_data = test_set.iloc[0]['data'].iloc[predict_stamp][price_col]

            # Append this item's data to the overall dataset for this epoch
            epoch_dataframe = epoch_dataframe.append({'id':item_id, 'name': test_set.iloc[0]['name'], 'train_data': train_data, 'test_data': test_data}, ignore_index=True)
            
        record_index += 1

    return epoch_dataframe

# Generate Input Data

In [89]:
#------ GENERATE INPUT DATA ------#
#Get a list of all items
generate_item_records_from_summary()

#Pull items by range of Index values
#input_data = generate_input_data_by_item_number(0,3)

#Pull specific items by name
input_data = generate_input_data_by_item_name(['Leather','Dragon boots', 'Rune arrow'])

# Generate Item Records

In [90]:
#----- GENERATE ITEM RECORDS -----#
item_records = get_item_records_from_url(input_data, 240)

Querying API for Leather data. Item Id: 1741
Querying API for Dragon boots data. Item Id: 11840
Retrying...
Querying API for Rune arrow data. Item Id: 892
Retrying...
Retrying...
Retrying...
Retrying...
Retrying...
Retrying...
Retrying...
Retrying...
Retrying...
Retrying...
Item retrieval complete!


# Convert Data to Deltas

In [91]:
#----- CONVERT DATA TO DELTAS -----#
generate_train_and_test_deltas(item_records)

# Create Training and Test Datasets

In [74]:
#------- CREATE TRAINING AND TEST DATASETS -----#
#In Days - define the number of epochs to include in the test data
periods_per_epoch_unit = 6 #Each value is a 4 hour record
start_stamp = 0
epoch_size = 30
num_epochs = 3

#Create test_item object to log training and test sets
test_train_set = pd.DataFrame(columns=['epoch_id', 'item_data'])

#Calculate number of epochs required

#Get item data for each epoch
for i in range(num_epochs):
    curr_epoch = pd.DataFrame(columns=['epoch_id', 'item_data'])
    test_data = createTrainAndTestSet(item_records, start_stamp, epoch_size, periods_per_epoch_unit)
    curr_epoch = curr_epoch.append([{'epoch_id': i, 'item_data': createTrainAndTestSet(item_records, start_stamp, epoch_size, periods_per_epoch_unit)}])
    #print(curr_epoch)
    test_train_set = test_train_set.append(curr_epoch)

Creating Test and Training Datasets...
Creating Test and Training Datasets...
Creating Test and Training Datasets...
Creating Test and Training Datasets...
Creating Test and Training Datasets...
Creating Test and Training Datasets...


# Test Creation of Neural Net

In [None]:
#----- TEST CREATION OF NEURAL NET -----#
nn = NeuralNetwork()
nn.generateInitialNetwork([1,1])
nn.calculateNodeActivations()
print(nn.outputNode.getActivation())

# Train the ANN

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
#import keras.backend as K
train_x = test_item.iloc[2]['train_x'] # test data. not real
 
def create_model(layesr,activation):
    model = Sequential()
    for i, nodes in enumerate(layers):
        if i==0:
            model.add(Dense(nodes,input_dim=train_x.shape[1]))
            model.add(Activation(activation))
        else: 
            model.add(Dense(nodes))
            model.add(Activation(activation))
    model.add(Dense(1)) #Note: no activations present beyond this point

    model.compile(optimizer='adadelta', loss='mse')
    return model

model = KerasRegressor(build_fn=create_model, verbose = 0)

In [None]:
layers = [[16], [4,2], [4], [16,4]]
activations = [tanh, relu]
param_grid = dict(layers=layers, activation=activations, batch_size = [42, 180], epochs[6])
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error')

In [61]:
test_train_set.iloc[0].item_data.iloc[0].train_data

Unnamed: 0,ts,tsFromCurrent,date,buyingPricePer,buyingCompletedPer,sellingPricePer,sellingCompletedPer,overallPricePer,overallCompletedPer
1,1.474618e+09,1.534242e+12,2016-09-23 08:00:00,-0.007143,-0.319024,0.000000,0.302844,-0.007143,-0.081604
2,1.474632e+09,1.534242e+12,2016-09-23 12:00:00,0.007194,0.997851,-0.035714,1.837941,-0.014388,1.452847
3,1.474646e+09,1.534242e+12,2016-09-23 16:00:00,0.007143,0.194765,0.029630,-0.352518,0.021898,-0.148181
4,1.474661e+09,1.534242e+12,2016-09-23 20:00:00,-0.007092,-0.209741,-0.028777,-0.135278,-0.014286,-0.174273
5,1.474675e+09,1.534242e+12,2016-09-24 00:00:00,0.000000,0.068014,0.022222,0.118857,0.007246,0.093375
6,1.474690e+09,1.534242e+12,2016-09-24 04:00:00,-0.007143,-0.522650,-0.021739,-0.760666,-0.014388,-0.644142
7,1.474704e+09,1.534242e+12,2016-09-24 08:00:00,0.007194,0.153371,-0.007407,0.528641,0.000000,0.282199
8,1.474718e+09,1.534242e+12,2016-09-24 12:00:00,0.000000,0.217063,0.007463,-0.740026,0.014599,-0.174653
9,1.474733e+09,1.534242e+12,2016-09-24 16:00:00,-0.014286,0.521836,-0.007407,12.477232,-0.021583,2.063095
10,1.474747e+09,1.534242e+12,2016-09-24 20:00:00,-0.014493,0.532062,0.007463,-0.540252,0.000000,-0.076177


In [94]:
#Get the timestamp of Jan-1 for 2015, 2016, 2017, and 2018
jan1_timestamps = {}
time_adjust = 18000 #Seconds ahead of EST
for year in [2016, 2017, 2018]:
    jan1 = datetime.date(year, 1, 1)
    jan1_ts = time.mktime(jan1.timetuple())
    jan1_timestamps[str(year)] = jan1_ts - time_adjust

for row in item_records['data']:
    #Append column that converts ts to datetime object
    row['date'] = pd.to_datetime(row['ts'], unit='s')
    row['year'] = row['date'].dt.strftime('%Y')
    #Iterate through each year, and append tsYtd for all values within that year
    row['tsYtd'] = row['ts'] - pd.Series([jan1_timestamps[year] for year in row['year']]) #Series of jan-1 timestamps
    row = row.drop(columns=['year'])
    
print(item_records.iloc[0]['data'])
        

      buyingCompleted  buyingPrice  overallCompleted  overallPrice  \
0               20500          140             33160           140   
1               13960          139             30454           139   
2               27890          140             74699           137   
3               33322          141             63630           140   
4               26333          140             52541           138   
5               28124          140             57447           139   
6               13425          139             20443           137   
7               15484          140             26212           137   
8               18845          140             21634           139   
9               28679          138             66267           136   
10              43938          136             61219           136   
11              35010          137             64623           137   
12              24732          135             44148           136   
13              1263