# Imports and Function Declarations

In [47]:
import urllib
import requests
from pandas.io.json import json_normalize
import json
import pandas as pd
pd.options.mode.chained_assignment = None
import datetime
import time
import math
import numpy as np

from neural_network import NeuralNetwork

"""
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers import Activation, Dense
from keras.models import Sequential
"""


def generate_item_records_from_summary():
    """ 
        Generate the item key file using the summary.json link from the OSBuddy API.

        Downloads the summary listing and writes the id/name pair of every item to
        './item_key.csv' (no index column). Nothing is returned; the other
        generate_input_data_* functions read the csv file back.
    """
    summary = pd.read_json(path_or_buf='https://rsbuddy.com/exchange/summary.json', orient='index', convert_axes=True)
    #Selecting the full column set up front raises a KeyError early if the API layout changed
    summary = summary[['id','name','buy_average','buy_quantity','sell_average','sell_quantity','overall_average','overall_quantity']]

    #Output item id/name pairs to a csv file
    item_key = summary[['id', 'name']]
    file_name = './item_key.csv'
    item_key.to_csv(path_or_buf=file_name, columns=['id', 'name'], index=False)


def generate_input_data_by_item_number(start_num, num_items): 
    """
        Select a contiguous run of rows from the item key file.

            @param start_num: position of the first row to return
            @param num_items: how many rows to return, counted from start_num
            @return: the selected rows of './item_key.csv' (original row labels kept)
    """
    item_table = pd.read_csv('./item_key.csv', skiprows=[])
    stop_num = start_num + num_items
    return item_table[start_num:stop_num]

def generate_input_data_by_item_name(names): 
    """
        Pull specific items by name from './item_key.csv'.

        Rows are returned in the order of the requested names (not file order) and the
        result is re-indexed from 0. A name with no exact match contributes no rows.

            @param names: a list containing the item names to generate data for
            @return: DataFrame with the matching rows of the item key file
    """
    file_name = './item_key.csv'
    data = pd.read_csv(file_name, skiprows=[])

    #One (possibly empty) slice per requested name, kept in request order
    matches = [data.loc[data['name'] == name] for name in names]
    if not matches:
        #pd.concat rejects an empty list, so hand back an empty frame with the file's columns
        return data.iloc[0:0].reset_index(drop=True)

    #DataFrame.append was removed in pandas 2.0; concat is the supported replacement
    return pd.concat(matches).reset_index(drop=True)

def get_item_records_from_url(input_data, max_retries=None):
    """ 
        Iterate through all items, grab data from api and collect it in a dataframe 
        
            @param input_data: DataFrame of items with 'id' and 'name' columns
            @param max_retries: maximum number of retries per item before the last error
                                is re-raised; None (the default) retries indefinitely
            @return: DataFrame with columns 'id', 'name' and 'data', where each 'data' cell
                     holds the DataFrame returned by the API for that item, after
                     format_item_record_dates has been applied
    """
    rows = []
    for key, name in zip(input_data['id'], input_data['name']):
        #Print item information only once per item, not on every retry
        print('Querying API for ' + name + ' data. ' + 'Item Id: ' + str(key))
        url = f'https://api.rsbuddy.com/grandExchange?a=graph&g=240&start=1474615279000&i={key}'
        attempts = 0
        while True:
            try:
                #Attempt to get data from API, retry on failure
                item_data = pd.read_json(path_or_buf=url, orient='records', convert_axes=False)
            except Exception:
                #Exception (not a bare except) so that Ctrl-C / SystemExit can still stop the retry loop
                attempts += 1
                if max_retries is not None and attempts > max_retries:
                    raise
                print("Retrying...")
                time.sleep(1) #Avoid getting blacklisted by API
            else:
                break
        rows.append({'id': key, 'name': name, 'data': item_data})

    print('Item retrieval complete!')

    #Build the frame in one step; DataFrame.append was removed in pandas 2.0
    item_records = pd.DataFrame(rows, columns=['id', 'name', 'data'])

    #Add correct formatting to item record dates/times
    item_records = format_item_record_dates(item_records)

    return item_records

def format_item_record_dates(item_records):
    """ 
        Converts each item's 'ts' column from Unix milliseconds to Unix seconds and adds a
        'tsFromCurrent' column holding the number of seconds between each record and the
        most recent record found across all items.

        The DataFrames stored in the 'data' column are modified in place. The formatted
        'date' field and the seconds-since-Jan-1 ('tsYtd') field are only implemented in
        the disabled legacy block kept at the end of this function.
        
            @param item_records: the master item data structure
            @return: the same item_records object
    """
    #Get most recent timestamp in all the data (highest value); still in milliseconds here
    most_recent_ts = 0
    for row in item_records['data']:
        max_ts = row['ts'].max()
        if max_ts > most_recent_ts:
            most_recent_ts = max_ts

    #Bring the reference timestamp into seconds so it shares units with the converted 'ts' column
    most_recent_ts = most_recent_ts / 1000
            
    #Convert timestamp from milliseconds to seconds, append column to indicate time since most recent
    #timestamp
    for row in item_records['data']:
        row['ts'] = row['ts'] / 1000
        row['tsFromCurrent'] = most_recent_ts - row['ts']
    
    #Disabled legacy implementation (never executed: it is a bare string literal). Kept because it is
    #the only place the 'date' and 'tsYtd' columns are derived.
    """
    #Iterate through each item and convert timestamps from milliseconds to seconds
    for _, row in item_records.iterrows():
        print("Formatting item record dates for: " +  row['name'] + "...")
        row['data'] = row['data'][['ts','buyingPrice','buyingCompleted','sellingPrice','sellingCompleted', 'overallPrice','overallCompleted']]
        row['data'] = row['data'].sort_values(by=['ts'],ascending=1).reset_index()
        for ind, r in row['data'].iterrows():
            r['ts'] = int(r['ts']/1000)
            row['data'].loc[ind, 'ts'] = int(r['ts'])
            
        #Refactor unix timestamps where the most recent record is 0
        timestamps = pd.DataFrame(columns=['ts','tsFromCurrent', 'tsYtd'],data=row['data']['ts'])
        lastTs = timestamps['ts'][len(timestamps)-1]
        i = 0
        print ('Finding timestamps from most recent datapoint.')
        while(i<len(timestamps)):
            timestamps['tsFromCurrent'][i] = abs(timestamps['ts'][i] - lastTs)    
            i += 1
        
        #Create a list of all years and corresponding unix since 1970 (Unix = 0)
        years = pd.DataFrame(columns=['year', 'unix'])    
        i = 0
        for year in range(2015,2050):
            unix = (year - 1970) * 31557600
            years.loc[i] = [year,unix]
            i+=1

        #Refactor unix timestamps to show time since Jan-1
        i = 0
        j = 0
        newYear = True
        while(i<len(timestamps)):
            #Iterate through each year to find all timestamps wihtin that date range
            unixYear = years.iloc[j]['unix']
            if newYear == True:
                print("Checking entries for year: " + str(years.iloc[j]['year']))
            ytd = abs(timestamps['ts'][i] - unixYear)
            if(ytd < 31557600):
                ytd = abs(timestamps.iloc[i]['ts'] - unixYear) #this may be redundant. possibly deprecate
                timestamps['tsYtd'][i] = int(ytd)
                i += 1
                newYear = False
            else:
                j += 1
                newYear = True
        #Append the new timestamps to the master item DataFrame
        for ind, r in row['data'].iterrows():
            row['data'].loc[ind, 'date'] = datetime.datetime.fromtimestamp(r['ts']).isoformat()
            row['data'].loc[ind, 'tsFromCurrent'] = timestamps.iloc[ind]['tsFromCurrent']
            row['data'].loc[ind, 'tsYtd'] = timestamps.iloc[ind]['tsYtd']            
        row['data'] = row['data'].drop(labels='index',axis=1)
        
    """
    return item_records

def generate_train_and_test_deltas(item_records):    
    """
        Calculate percent difference from previous record for each relevant column. 

        For every item, six '<column>Per' columns are added to the DataFrame stored in
        its 'data' cell. The stored DataFrames are modified in place and nothing is
        returned. The first record of each item has no predecessor, so its '*Per'
        values are NaN.
            
            @param item_records: the master item data structure
    """
    #Source column -> name of the derived percent-change column
    per_names = {'buyingPrice':'buyingPricePer', 'buyingCompleted':'buyingCompletedPer',
                 'sellingPrice':'sellingPricePer', 'sellingCompleted':'sellingCompletedPer',
                 'overallPrice':'overallPricePer', 'overallCompleted':'overallCompletedPer'}

    #Iterate over the stored frames themselves. Assigning to the row yielded by iterrows() is not
    #guaranteed to write back to item_records (the row can be a copy), which silently lost the result.
    for item_data in item_records['data']:
        #Calculate percent change on the relevant columns only
        pct_change_df = item_data[list(per_names)].pct_change().rename(columns=per_names)

        #Attach the new columns; assignment aligns on the index like the former left join did
        for column in pct_change_df.columns:
            item_data[column] = pct_change_df[column]
                

def createTrainAndTestSet(item_records, start_stamp, epoch_size, num_steps_ahead_to_predict):
    """
        Create the training and testing data sets for a particular epoch starting at start_stamp, and extending
        epoch_size units into the future.

        start_stamp and the derived end/predict stamps are row positions within each item's
        data, not timestamps. Items whose data is too short to contain the row to predict
        are left out of the result.
        
            @param item_records: master item data structure
            @param start_stamp: the starting row position of this epoch
            @param epoch_size: the epoch size in days
            @param num_steps_ahead_to_predict: the number of periods ahead of the last training record
                that we will use this training data to predict
            @return: DataFrame with columns 'id', 'name', 'train_data', 'test_data' and 'pred_data'
                ('pred_data' is not filled in here and is left as NaN)
    """
    print("Creating Test and Training Datasets...")
    end_stamp = start_stamp + math.ceil(epoch_size*6) #6 entries per epoch unit

    # The row position of the value we'll use this training data to predict
    predict_stamp = end_stamp + num_steps_ahead_to_predict

    # Rows are collected in a list and turned into a frame once (DataFrame.append was removed in pandas 2.0)
    epoch_rows = []
    for item_id in item_records['id']:
        # First record whose ID matches item_id
        item_record = item_records.loc[item_records['id'] == item_id].iloc[0]
        item_data = item_record['data']

        #Ensure that the value we are attempting to predict exists for testing purposes
        if predict_stamp >= len(item_data):
            continue

        # Create training dataset: last 9 columns of the epoch window, minus the window's first row
        train_df = item_data.iloc[start_stamp:end_stamp]
        train_data = train_df[train_df.columns[-9:]][1:]

        # Get test value by column name; the former positional lookup (column 14) depended on the
        # exact column layout and is out of range when fewer than 15 columns are present
        test_data = item_data['overallPricePer'].iloc[predict_stamp]

        epoch_rows.append({'id': item_id, 'name': item_record['name'], 'train_data': train_data, 'test_data': test_data})

    return pd.DataFrame(epoch_rows, columns=['id', 'name', 'train_data', 'test_data', 'pred_data'])

# Generate Input Data

In [2]:
#------ GENERATE INPUT DATA ------#
#Get a list of all items (downloads the summary listing and rewrites ./item_key.csv)
generate_item_records_from_summary()

#Alternative selection: pull items by range of Index values
#input_data = generate_input_data_by_item_number(0,3)

#Pull specific items by name; input_data is the id/name table consumed by the later cells
input_data = generate_input_data_by_item_name(['Leather','Dragon boots', 'Rune arrow'])

Item List generated by name:
      id          name
0   1741       Leather
1  11840  Dragon boots
2    892    Rune arrow


# Generate Item Records

In [31]:
#----- GENERATE ITEM RECORDS -----#
#Query the API once per item in input_data (failed requests are retried after a 1 second pause)
item_records = get_item_records_from_url(input_data)

Querying API for Leather data. Item Id: 1741
Querying API for Dragon boots data. Item Id: 11840
Retrying...
Querying API for Rune arrow data. Item Id: 892
Item retrieval complete!
Formatting item record dates for: Leather...
Finding timestamps from most recent datapoint.
Checking entries for year: 2015
Checking entries for year: 2016
Checking entries for year: 2017
Checking entries for year: 2018
Formatting item record dates for: Dragon boots...
Finding timestamps from most recent datapoint.
Checking entries for year: 2015
Checking entries for year: 2016
Checking entries for year: 2017
Checking entries for year: 2018
Formatting item record dates for: Rune arrow...
Finding timestamps from most recent datapoint.
Checking entries for year: 2015
Checking entries for year: 2016
Checking entries for year: 2017
Checking entries for year: 2018


# Convert Data to Deltas

In [32]:
#----- CONVERT DATA TO DELTAS -----#
#Computes the percent-change ('...Per') columns for each item's data; the function has no return value
generate_train_and_test_deltas(item_records)

# Create Training and Test Datasets

In [48]:
#------- CREATE TRAINING AND TEST DATASETS -----#
num_pred_vals = 6 #Number of records ahead to predict; each value is a 4 hour record
start_stamp = 0   #Row position at which each epoch window starts
epoch_size = 30   #In days (6 records per day)
num_epochs = 3

#Get item data for each epoch. createTrainAndTestSet is called exactly once per epoch; the previous
#version called it twice and threw the first result away.
#NOTE(review): start_stamp is never advanced, so every epoch is built from the same window -- confirm
#whether each epoch was meant to start where the previous one ended.
epoch_rows = []
for i in range(num_epochs):
    epoch_rows.append({'epoch_id': i, 'item_data': createTrainAndTestSet(item_records, start_stamp, epoch_size, num_pred_vals)})

#Create test_train_set object to log training and test sets (one row per epoch).
#Built in one step because DataFrame.append was removed in pandas 2.0.
test_train_set = pd.DataFrame(epoch_rows, columns=['epoch_id', 'item_data'])

Creating Test and Training Datasets...
                    date  tsFromCurrent       tsYtd  buyingPricePer  \
0    2016-09-23T00:00:00     61056000.0  22953600.0             NaN   
1    2016-09-23T04:00:00     61041600.0  22968000.0       -0.007143   
2    2016-09-23T08:00:00     61027200.0  22982400.0        0.007194   
3    2016-09-23T12:00:00     61012800.0  22996800.0        0.007143   
4    2016-09-23T16:00:00     60998400.0  23011200.0       -0.007092   
5    2016-09-23T20:00:00     60984000.0  23025600.0        0.000000   
6    2016-09-24T00:00:00     60969600.0  23040000.0       -0.007143   
7    2016-09-24T04:00:00     60955200.0  23054400.0        0.007194   
8    2016-09-24T08:00:00     60940800.0  23068800.0        0.000000   
9    2016-09-24T12:00:00     60926400.0  23083200.0       -0.014286   
10   2016-09-24T16:00:00     60912000.0  23097600.0       -0.014493   
11   2016-09-24T20:00:00     60897600.0  23112000.0        0.007353   
12   2016-09-25T00:00:00     60883200.

                    date  tsFromCurrent       tsYtd  buyingPricePer  \
0    2016-09-23T00:00:00     61056000.0  22953600.0             NaN   
1    2016-09-23T04:00:00     61041600.0  22968000.0       -0.001276   
2    2016-09-23T08:00:00     61027200.0  22982400.0        0.001285   
3    2016-09-23T12:00:00     61012800.0  22996800.0        0.007489   
4    2016-09-23T16:00:00     60998400.0  23011200.0       -0.001474   
5    2016-09-23T20:00:00     60984000.0  23025600.0        0.004299   
6    2016-09-24T00:00:00     60969600.0  23040000.0        0.001655   
7    2016-09-24T04:00:00     60955200.0  23054400.0        0.003155   
8    2016-09-24T08:00:00     60940800.0  23068800.0        0.003572   
9    2016-09-24T12:00:00     60926400.0  23083200.0        0.001401   
10   2016-09-24T16:00:00     60912000.0  23097600.0        0.009510   
11   2016-09-24T20:00:00     60897600.0  23112000.0        0.013249   
12   2016-09-25T00:00:00     60883200.0  23126400.0       -0.005464   
13   2

# Test Creation of Neural Net

In [None]:
#----- TEST CREATION OF NEURAL NET -----#
#Smoke test of the project-local NeuralNetwork class imported from neural_network
nn = NeuralNetwork()
#NOTE(review): [1,1] presumably describes the network layout -- confirm against neural_network.py
nn.generateInitialNetwork([1,1])
nn.calculateNodeActivations()
#Print the activation reported by the network's output node
print(nn.outputNode.getActivation())

In [100]:
#Inspect the test value stored for the first item of the first epoch
test_train_set.iloc[0]['item_data'].iloc[0]['test_data']

0.042553191489361764

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
#import keras.backend as K
#NOTE: Sequential, Dense, Activation and KerasRegressor must be imported before this cell runs; the
#keras imports at the top of the file are currently wrapped in a string literal and do not execute.
train_x = test_item.iloc[2]['train_x'] # test data. not real
 
def create_model(layers,activation):
    """
        Build and compile a fully connected regression network.

        One Dense layer followed by an Activation layer is added per entry of layers; the
        first Dense layer takes its input width from the module-level train_x. A final
        Dense(1) output layer is added without an activation.

            @param layers: iterable with the number of nodes of each hidden layer
            @param activation: activation passed to every hidden Activation layer
            @return: the compiled model (optimizer 'adadelta', loss 'mse')
    """
    model = Sequential()
    for i, nodes in enumerate(layers):
        if i==0:
            #Only the first layer needs to be told the input width
            model.add(Dense(nodes,input_dim=train_x.shape[1]))
            model.add(Activation(activation))
        else: 
            model.add(Dense(nodes))
            model.add(Activation(activation))
    model.add(Dense(1)) #Note: no activations present beyond this point

    model.compile(optimizer='adadelta', loss='mse')
    return model

model = KerasRegressor(build_fn=create_model, verbose = 0)

In [None]:
#Hyper-parameter grid: each entry of layers is one candidate list of hidden-layer sizes
layers = [[16], [4,2], [4], [16,4]]
#Activations are given by name; the bare identifiers tanh / relu are not defined anywhere
activations = ['tanh', 'relu']
param_grid = dict(layers=layers, activation=activations, batch_size=[42, 180], epochs=[6])
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error')

In [None]:
#Scratch check of DataFrame.rename: the renamed frame is returned but not assigned,
#so the dft displayed on the last line still has the columns 'A', 'B', 'C'
dft = pd.DataFrame(columns=['A','B','C'])
dft.rename(index=str, columns={'A':'a', 'B':'b', 'C':'c'})
dft

In [87]:
#Rescale every item's 'ts' column in place by a factor of 10000.
#NOTE(review): not idempotent -- each run of this cell multiplies 'ts' again; the reason for this
#particular factor is not visible in the notebook.
for row in item_records['data']:
    row['ts'] = row['ts'] * 10000

#Print each item's full data frame for inspection
for row in item_records['data']:
    print(row)

                ts  buyingPrice  buyingCompleted  sellingPrice  \
0     1.474603e+09          140            20500         140.0   
1     1.474618e+09          139            13960         140.0   
2     1.474632e+09          140            27890         135.0   
3     1.474646e+09          141            33322         139.0   
4     1.474661e+09          140            26333         135.0   
5     1.474675e+09          140            28124         138.0   
6     1.474690e+09          139            13425         135.0   
7     1.474704e+09          140            15484         134.0   
8     1.474718e+09          140            18845         135.0   
9     1.474733e+09          138            28679         134.0   
10    1.474747e+09          136            43938         135.0   
11    1.474762e+09          137            35010         137.0   
12    1.474776e+09          135            24732         137.0   
13    1.474790e+09          135            12631         143.0   
14    1.47

In [67]:
#Display timestamp_df. NOTE(review): this name is not assigned in any visible cell -- presumably it
#is left over from an earlier session; verify before re-running the notebook top to bottom.
timestamp_df

0                  ts  buyingPrice  buyingCompleted...
1                  ts  buyingPrice  buyingCompleted...
2                  ts  buyingPrice  buyingCompleted...
Name: data, dtype: object