# Imports and Function Declarations

In [155]:
import urllib
import requests
from pandas.io.json import json_normalize
import json
import pandas as pd
pd.options.mode.chained_assignment = None
import datetime
import time
import pytz
import math
import numpy as np

from neural_network import NeuralNetwork

"""
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers import Activation, Dense
from keras.models import Sequential
"""


def generate_item_records_from_summary():
    """ 
        Generate item records using the summary.json link from the OSBuddy API
    """
    df = pd.read_json(path_or_buf='https://rsbuddy.com/exchange/summary.json',orient='index', convert_axes=True)
    df = df[['id','name','buy_average','buy_quantity','sell_average','sell_quantity','overall_average','overall_quantity']]
    data = df.sort_values(by=['id']).reset_index()
    data = data.drop(labels='index',axis=1)
    
    #Output item id/name pairs to a csv file
    item_key = data[['id', 'name']]
    file_name = './item_key.csv'
    item_key.to_csv(path_or_buf=file_name, columns=('id','name'), index=False)


def generate_input_data_by_item_number(start_num, num_items): 
    """
        Pull items by range of Index values
        
            @param start_num: the starting index 
            @param num_items: the number of items to generate data for, starting from
                              the start_num
    """
    file_name = './item_key.csv'
    all_items = pd.read_csv(file_name, skiprows=[])
    items = all_items[start_num: start_num + num_items]
    return items

def generate_input_data_by_item_name(names): 
    """
        Pull specific items by name
        
            @param names: a list containing the item names to generate data for
    """
    file_name = './item_key.csv'
    data = pd.read_csv(file_name, skiprows=[])
    items = pd.DataFrame()
    for name in names:
        items = items.append(data.loc[data['name'] == name])
    items = items.reset_index().drop(labels='index',axis=1)
    return items

def get_item_records_from_url(input_data, frequency):
    """ 
        Iterate through all items, grab data from api and append to dataframe 
        
            @param input_data: a list of items with their associated index
            @param frequency: the period at which data is sampled, in minutes
    """
    i = 0
    item_lookup_failed = False
    item_records = pd.DataFrame(columns=['id', 'name', 'data'])
    while(i < input_data['id'].count()):
        key = input_data.iloc[i]['id']
        name = input_data.iloc[i]['name']
        #Print item information only on first data retrieval attempt 
        if not item_lookup_failed:
            print('Querying API for ' + name + ' data. ' + 'Item Id: ' + str(key))
        try:
            #Attempt to get data from API, retry if HTTP error
            url = f'https://api.rsbuddy.com/grandExchange?a=graph&g={frequency}&start=1474615279000&i={key}'
            temp_df = pd.DataFrame()
            temp_df = pd.read_json(path_or_buf=url, orient='records', convert_axes=False)
            item_records = item_records.append({'id':key, 'name': name, 'data':temp_df}, ignore_index=True)
            item_lookup_failed = False
            i+=1
        except:
            print("Retrying...")
            time.sleep(1) #Avoid getting blacklisted by API
            item_lookup_failed = True
            
    print('Item retrieval complete!')

    #Add correct formatting to item record dates/times
    item_records = format_item_record_dates(item_records)

    return item_records

def format_item_record_dates(item_records):
    """ 
        Converts timestamp to Unix seconds, tacks on formatted date field, and creates Unix 
        seconds from most recent datapoint, and Unix seconds position from Jan-1.
        
            @param item_records: the master item data structure
    """
    #Get most recent timestamp in all the data (highest value) 
    most_recent_ts = 0
    for row in item_records['data']:
        max_ts = row['ts'].max()
        if max_ts > most_recent_ts:
            most_recent_ts = max_ts
    
    #Get the timestamp of Jan-1 for 2015, 2016, 2017, and 2018
    jan1_timestamps = {}
    time_adjust = 18000 #Seconds ahead of EST
    for year in [2016, 2017, 2018]:
        jan1 = datetime.date(year, 1, 1)
        jan1_ts = time.mktime(jan1.timetuple())
        jan1_timestamps[str(year)] = jan1_ts - time_adjust

    for row in item_records['data']:
        #Convert timestamp from milliseconds to seconds
        row['ts'] = row['ts'] / 1000
        #Append column to indicate time delta from most recent record in dataset
        row['tsFromCurrent'] = most_recent_ts - row['ts']
        #Append column that converts ts to datetime object
        row['date'] = pd.to_datetime(row['ts'], unit='s')
        row['year'] = row['date'].dt.strftime('%Y')
        #Append a column representing the YTD seconds
        row['tsYtd'] = row['ts'] - pd.Series([jan1_timestamps[year] for year in row['year']])
        row.drop(columns=['year'], inplace=True)

            
    return item_records

def generate_train_and_test_deltas(item_records):    
    """
        Calculate percent difference from previous record for each relevant column. 
            
            @param item_records: the master item data structure
    """
    #Iterate through each item, and calculate percentages for their data
    for _, row in item_records.iterrows():
        #Create temp dataframe for percent changes
        pct_change_df = row['data'][['buyingPrice', 'buyingCompleted', 'sellingPrice', 
                            'sellingCompleted', 'overallPrice', 'overallCompleted']].copy()
        pct_change_df = pct_change_df.rename(index=int, 
                                         columns={'buyingPrice':'buyingPricePer', 'buyingCompleted':'buyingCompletedPer',
                                        'sellingPrice':'sellingPricePer', 'sellingCompleted':'sellingCompletedPer',
                                        'overallPrice':'overallPricePer', 'overallCompleted':'overallCompletedPer'})
        #Calculate percent change
        pct_change_df = pct_change_df.pct_change()
        
        #Join percent change df back to original dataframe
        row['data'] = row['data'].join(pct_change_df)
        
def generate_moving_averages(item_records, columns, window):
    """
        Calculate the moving average for each field with specified window size
        
            @param item_records: the master item data structure
            @param columns: a list of which columns (names) we are calculating average for
            @param window: the number of periods to average over -- e.g. 60 if doing 12 hour item records for 30 days
    """
    #Iterate through each item and calculate the moving average for each field
    for _, row in item_records.iterrows():
        for col in columns:
            ma_col_name = col + 'MA'
            row['data'][ma_col_name] = row['data'][col].rolling(window=window).mean()
            
            #Drop all rows where MA is NaN
            row['data'] = row['data'].dropna(subset=[ma_col_name])


def createTrainAndTestSet(item_records, start_stamp, epoch_size, periods_per_epoch_unit):
    """
        Create the training and testing data sets for a particular epoch starting at start_stamp, and extending
        epoch_size units into the future.
        
            @param item_records: master item data structure
            @param start_stamp: the starting time stamp of this epoch
            @param epoch_size: the epoch size in days
            @param periods_per_epoch_unit: the number of periods ahead of the last training record
                that we will use this training data to predict
    """
    print("Creating Test and Training Datasets...")
    # Create dataframe to hold training/test data for the epoch
    epoch_dataframe = pd.DataFrame(columns=['id', 'name', 'train_data', 'test_data', 'pred_data'])
    record_index = 0
    end_stamp = start_stamp + math.ceil(epoch_size * periods_per_epoch_unit) 
    
    # Iterate through the each item in item_records
    while(record_index < item_records['id'].count()): 
        #Get item id
        item_id = item_records.iloc[record_index]['id'] 
        
        # Find all item records where ID matchest item_id
        test_set = item_records.loc[item_records['id'] == item_id]
        
        # The time stamp of the value we'll use this training data to predict
        predict_stamp = end_stamp + periods_per_epoch_unit
        
        #Ensure that the value we are attempting to predict exists for testing purposes
        if predict_stamp < len(test_set.iloc[0]['data']):
            # Create training dataset
            train_df = test_set.iloc[0]['data'].iloc[start_stamp:end_stamp]
            train_data = train_df[train_df.columns[-12:]][1:]          
            
            # Get test value -- the price 6 periods from the end of the training set (24 hours) 
            price_col = 14
            test_data = test_set.iloc[0]['data'].iloc[predict_stamp][price_col]

            # Append this item's data to the overall dataset for this epoch
            epoch_dataframe = epoch_dataframe.append({'id':item_id, 'name': test_set.iloc[0]['name'], 'train_data': train_data, 'test_data': test_data}, ignore_index=True)
            
        record_index += 1

    return epoch_dataframe

# Generate Input Data

In [127]:
#------ GENERATE INPUT DATA ------#
#Get a list of all items
generate_item_records_from_summary()

#Pull items by range of Index values
#input_data = generate_input_data_by_item_number(0,3)

#Pull specific items by name
input_data = generate_input_data_by_item_name(['Leather','Dragon boots', 'Rune arrow'])

# Generate Item Records

In [149]:
#----- GENERATE ITEM RECORDS -----#
item_records = get_item_records_from_url(input_data, 240)

Querying API for Leather data. Item Id: 1741
Querying API for Dragon boots data. Item Id: 11840
Querying API for Rune arrow data. Item Id: 892
Retrying...
Retrying...
Item retrieval complete!


# Convert Data to Deltas

In [150]:
#----- CONVERT DATA TO DELTAS -----#
generate_train_and_test_deltas(item_records)

# Generate 30-Day Averages For Columns

In [151]:
#------ CREATE 30 DAY AVERAGES FOR COLUMNS -----#
columns = ['overallPricePer', 'overallCompletedPer', 'buyingPricePer', 'buyingCompletedPer',\
           'sellingPricePer', 'sellingCompletedPer']
window = 180 #24/4 hour periods * 30 days = 180 window
generate_moving_averages(item_records, columns, window)


In [152]:
print(item_records.iloc[0]['data'])

      buyingCompleted  buyingPrice  overallCompleted  overallPrice  \
1075            14632          236             21693           228   
1076            22706          262             57787           269   
1077            35096          264             73215           263   
1078            50840          288             83081           284   
1079            40249          293             65436           289   
1080            27168          298             39808           297   
1081            17375          262             37935           279   
1082            32909          246             58176           252   
1083            37473          237             68883           241   
1084            39190          256             81779           252   
1085            11029          320             23629           350   
1086            20238          287             44751           291   
1087            10559          247             25244           248   
1088            1512

# Create Training and Test Datasets

In [156]:
#------- CREATE TRAINING AND TEST DATASETS -----#
#In Days - define the number of epochs to include in the test data
periods_per_epoch_unit = 6 #Each value is a 4 hour record
start_stamp = 0
epoch_size = 30
num_epochs = 3

#Create test_item object to log training and test sets
test_train_set = pd.DataFrame(columns=['epoch_id', 'item_data'])

#Calculate number of epochs required

#Get item data for each epoch
for i in range(num_epochs):
    curr_epoch = pd.DataFrame(columns=['epoch_id', 'item_data'])
    test_data = createTrainAndTestSet(item_records, start_stamp, epoch_size, periods_per_epoch_unit)
    curr_epoch = curr_epoch.append([{'epoch_id': i, 'item_data': createTrainAndTestSet(item_records, start_stamp, epoch_size, periods_per_epoch_unit)}])
    #print(curr_epoch)
    test_train_set = test_train_set.append(curr_epoch)

Creating Test and Training Datasets...
Creating Test and Training Datasets...
Creating Test and Training Datasets...
Creating Test and Training Datasets...
Creating Test and Training Datasets...
Creating Test and Training Datasets...


# Test Creation of Neural Net

In [None]:
#----- TEST CREATION OF NEURAL NET -----#
nn = NeuralNetwork()
nn.generateInitialNetwork([1,1])
nn.calculateNodeActivations()
print(nn.outputNode.getActivation())

# Train the ANN

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
#import keras.backend as K
train_x = test_item.iloc[2]['train_x'] # test data. not real
 
def create_model(layesr,activation):
    model = Sequential()
    for i, nodes in enumerate(layers):
        if i==0:
            model.add(Dense(nodes,input_dim=train_x.shape[1]))
            model.add(Activation(activation))
        else: 
            model.add(Dense(nodes))
            model.add(Activation(activation))
    model.add(Dense(1)) #Note: no activations present beyond this point

    model.compile(optimizer='adadelta', loss='mse')
    return model

model = KerasRegressor(build_fn=create_model, verbose = 0)

In [None]:
layers = [[16], [4,2], [4], [16,4]]
activations = [tanh, relu]
param_grid = dict(layers=layers, activation=activations, batch_size = [42, 180], epochs[6])
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error')

In [157]:
test_train_set.iloc[0].item_data.iloc[0].train_data

Unnamed: 0,buyingPricePer,buyingCompletedPer,sellingPricePer,sellingCompletedPer,overallPricePer,overallCompletedPer,overallPricePerMA,overallCompletedPerMA,buyingPricePerMA,buyingCompletedPerMA,sellingPricePerMA,sellingCompletedPerMA
1076,0.110169,0.551804,0.292453,3.968276,0.179825,1.663855,0.006833,0.144960,0.008637,0.142913,0.006776,0.522253
1077,0.007634,0.545671,-0.043796,0.086600,-0.022305,0.266980,0.006709,0.145973,0.008541,0.145763,0.006672,0.521775
1078,0.090909,0.448598,0.057252,-0.154201,0.079848,0.134754,0.007256,0.141957,0.009181,0.148094,0.006954,0.509262
1079,0.017361,-0.208320,0.018051,-0.218790,0.017606,-0.212383,0.007354,0.143637,0.009381,0.147219,0.007054,0.512187
1080,0.017065,-0.325002,0.042553,-0.498154,0.027682,-0.391650,0.007367,0.142718,0.009335,0.147109,0.007114,0.509863
1081,-0.120805,-0.360461,-0.003401,0.626582,-0.060606,-0.047051,0.006859,0.144128,0.008424,0.145817,0.007164,0.516361
1082,-0.061069,0.894043,-0.112628,0.228940,-0.096774,0.533571,0.006688,0.144351,0.008512,0.146568,0.006677,0.518829
1083,-0.036585,0.138685,-0.053846,0.243123,-0.043651,0.184045,0.006303,0.144300,0.008273,0.147971,0.006128,0.508897
1084,0.080169,0.045820,0.012195,0.355906,0.045643,0.187216,0.006521,0.143487,0.008542,0.147735,0.006298,0.506639
1085,0.250000,-0.718576,0.514056,-0.704149,0.388889,-0.711063,0.008578,0.141032,0.009793,0.145731,0.009050,0.503689
