In [0]:
import tensorflow.keras.backend as K
import numpy as np
import sys
import pickle
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

from constants import * 
from heuristic import *
from io_help import *
from solver import *

In [0]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
import keras.backend as K
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Conv2D, Flatten, Input
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from keras.models import load_model
import keras.losses

Using TensorFlow backend.


In [0]:
def load_data(file_name):
    """
    Read training data from a text file and return the flattened boards
    together with their labels.

    Every line of the file is parsed with string_to_board_and_dist into a
    (board, dist) pair.  The board is flattened into a length-16 vector.

    NOTE(review): a later cell of this notebook defines another function
    that is also called load_data (it returns one-hot encoded boards), so
    whichever of the two cells ran last is the definition in effect.

    Parameters
    ----------
    file_name : str
        Path of the text file holding one sample per line.

    Returns
    -------
    tuple
        (X_train, Y_train): X_train is an array with one flattened board
        per row, Y_train is an array with the matching distances.
    """
    X = []
    Y = []

    # The context manager closes the file even when a line fails to parse.
    with open(file_name, "r") as file:
        for string in file:
            (board, dist) = string_to_board_and_dist(string)
            # Flatten the board into a length-16 vector.
            X.append(np.asarray(board).reshape(16))
            Y.append(dist)

    X_train = np.asarray(X)
    Y_train = np.asarray(Y)
    return (X_train, Y_train)

In [0]:
# Read the full training set from disk.
# X_train: board inputs, Y_train: true output for each board.
X_train, Y_train = load_data('All_Data.txt')
print(X_train.shape)

(395715, 16)


In [0]:
def gen_features (X, X_train, knn_model):
    """
    Build engineered features for every board vector in X.

    Parameters
    ----------
    X : array
        Boards to featurise, one length-16 vector per row.
    X_train : array
        Untransformed training boards that knn_model's neighbour indices
        refer to.
    knn_model : object
        Fitted model whose kneighbors(X) returns (distances, indices) with
        one row per query (the notebook fits it with n_neighbors=151).

    Returns
    -------
    array
        One row per board.  Column layout:
          0-31    nan-mean, over the neighbours, of the 16 ratios
                  row / neighbour followed by the 16 differences
                  row - neighbour
          32-63   nan-std of those same 32 columns
          64      nan-std of the neighbour distances
          65      nan-mean of the neighbour distances
          66-353  get_rep_2 of the board reshaped to 4x4 (288 values)
          354     manhattan(board, None)
          355     hamming(board, None)
        The original columns of X are not part of the result.
    """
    n_rows = len(X)
    knn_stats = np.zeros([n_rows, (16) * 2 * 2 + 2])
    rep_2_feats = np.zeros([n_rows, 256 + 32])
    heuristic_feats = np.zeros([n_rows, 2])

    # One query for the whole input: [0] holds distances, [1] holds indices.
    neighbor_info = knn_model.kneighbors(X)
    all_distances = neighbor_info[0]
    all_indices = neighbor_info[1]

    for i in range(n_rows):
        row = X[i]
        # Rows of X_train that are the nearest neighbours of X[i].
        neighbors = X_train[all_indices[i]]

        # Element-wise ratio and difference against every neighbour; each
        # is (n_neighbors x 16), stacked side by side to (n_neighbors x 32).
        comparisons = np.concatenate([row / neighbors, row - neighbors], axis = 1)

        # Collapse over the neighbour axis: two length-32 vectors.
        means = np.nanmean(comparisons, axis = 0)
        stds = np.nanstd(comparisons, axis = 0)
        width = len(means)
        knn_stats[i, :width] = means
        knn_stats[i, width:width + len(stds)] = stds

        # Last two columns summarise how far away the neighbours are.
        knn_stats[i, -1] = np.nanmean(all_distances[i])
        knn_stats[i, -2] = np.nanstd(all_distances[i])

        # Manhattan and Hamming heuristics of the 4x4 board.
        heuristic_feats[i, 0] = manhattan(row.reshape(4,4), None)
        heuristic_feats[i, 1] = hamming(row.reshape(4,4), None)

        # One-hot encoding plus displacements of the 4x4 board.
        rep_2_feats[i] = get_rep_2(row.reshape(4,4))

    # Join the three feature groups column-wise.
    return np.concatenate([knn_stats, rep_2_feats, heuristic_feats], axis=1)

In [0]:
# Fit the nearest-neighbour index on the raw training boards; every query
# made through this model returns 151 neighbours.
knn_model = NearestNeighbors(
    n_neighbors=151,
    n_jobs=-1,
).fit(X_train, Y_train)


AttributeError: ignored

In [0]:
# Featurise the training boards against themselves: the same array is both
# the query set and the pool that the neighbour indices point into.
X_train_2 = gen_features(X=X_train, X_train=X_train, knn_model=knn_model)

AttributeError: ignored

In [0]:
def one_hot_encode(board):
    """
    One-hot encode the position of every tile on the board.

    The result is a flat array of SIZE ** 4 entries (256 for a 4x4 board)
    made of SIZE ** 2 consecutive groups of SIZE ** 2 entries.  Group k
    (0-based) belongs to tile k + 1 and holds a single 1 at the row-major
    position where that tile sits on the board; all other entries are 0.
    For example, the first 16 of the 256 numbers indicate where on a 4x4
    board the 1 tile is.

    Parameters
    ----------
    board : array
        Board holding each of the values 1 .. SIZE ** 2 exactly once.

    Returns
    -------
    array
        Float array of length SIZE ** 4.

    Raises
    ------
    ValueError
        If one of the tiles 1 .. SIZE ** 2 is missing from the board.
    """
    n_cells = SIZE ** 2
    flat = board.reshape(n_cells).tolist()

    # Row k is the one-hot position vector of tile k + 1.  The tile range
    # follows SIZE so the encoding stays consistent with the board size.
    encoding = np.zeros((n_cells, n_cells))
    for tile in range(1, n_cells + 1):
        encoding[tile - 1, flat.index(tile)] = 1

    return encoding.reshape(SIZE ** 4)

In [0]:
def calc_displacements(board):
    """
    Return, for every square of the board, how far the tile sitting there
    is from the location get_proper_loc reports for it.

    The result has 2 * SIZE ** 2 entries: first the SIZE ** 2 offsets along
    the first board axis, then the SIZE ** 2 offsets along the second axis.
    Squares are visited in row-major order, and each offset is the proper
    coordinate minus the current coordinate.
    """
    n_cells = SIZE ** 2
    first_axis_offsets = np.zeros(n_cells)
    second_axis_offsets = np.zeros(n_cells)

    for cell in range(n_cells):
        # Row-major square index -> (i, j) board coordinates.
        i, j = divmod(cell, SIZE)
        (proper_i, proper_j) = get_proper_loc(board[i, j])
        first_axis_offsets[cell] = proper_i - i
        second_axis_offsets[cell] = proper_j - j

    return np.concatenate((first_axis_offsets, second_axis_offsets))

In [0]:
def get_rep_2(board):
    """
    Return the one-hot encoding of the board (one_hot_encode) followed by
    the displacement vector of the board (calc_displacements), joined into
    a single flat array.
    """
    parts = (one_hot_encode(board), calc_displacements(board))
    return np.concatenate(parts)

In [0]:
def load_data_2(file_name):
    """
    Read training data from a text file and return the get_rep_2
    representation of every board together with its label.

    Every line is parsed with string_to_board_and_dist into a
    (board, dist) pair; the board is converted with get_rep_2 (one-hot
    encoding followed by the displacement entries).

    Parameters
    ----------
    file_name : str
        Path of the text file holding one sample per line.

    Returns
    -------
    tuple
        (X, Y): X has one get_rep_2 vector per row, Y holds the matching
        distances.
    """
    X = []
    Y = []

    # The context manager closes the file even when a line fails to parse.
    with open(file_name, "r") as file:
        for line in file:
            (board, dist) = string_to_board_and_dist(line)
            Y.append(dist)
            X.append(get_rep_2(board))

    return (np.asarray(X), np.asarray(Y))

In [0]:
def load_data(file_name):
    """
    Read training data from a text file and return the one-hot encoded
    boards X and their labels Y as a tuple.

    Every line is parsed with string_to_board_and_dist into a
    (board, dist) pair; the board is converted with one_hot_encode.

    NOTE(review): an earlier cell of this notebook defines a function with
    the same name that returns the flattened raw boards; running this cell
    replaces that definition.

    Parameters
    ----------
    file_name : str
        Path of the text file holding one sample per line.

    Returns
    -------
    tuple
        (X, Y): X has one one-hot encoded board per row, Y holds the
        matching distances.
    """
    X = []
    Y = []

    # The context manager closes the file even when a line fails to parse.
    with open(file_name, "r") as file:
        for string in file:
            (board, dist) = string_to_board_and_dist(string)
            X.append(one_hot_encode(board))
            Y.append(dist)

    return (np.asarray(X), np.asarray(Y))

In [0]:
def load_data_csv(file_name):
    """
    Read a CSV file into a pandas DataFrame, using the file's first column
    as the row index.
    """
    frame = pd.read_csv(file_name, index_col=0)
    return frame

In [0]:
def shift_mse_2(y_true, y_pred):
    """
    Asymmetric squared-error loss.

    The squared error is multiplied by 1 + 100 * sigmoid(y_pred - y_true),
    so the weight approaches 101 when the prediction lies far above the
    true value and approaches 1 when it lies far below.  The weighted
    errors are averaged over axis 1.
    """
    error = y_pred - y_true
    weight = 1 + 100/ (1 + K.exp(-error))
    return K.mean(weight * K.square(error), axis = 1)

In [0]:
def exp_loss_2(y_true, y_pred):
    """
    Custom loss function: half the exponential of the signed error
    (y_pred - y_true) plus the squared error, averaged over axis 1.
    """
    error = y_pred - y_true
    per_entry = K.exp(error) / 2 + K.square(error)
    return K.mean(per_entry, axis = 1)

In [0]:
def train(X_train, Y_train):
    """
    Fit an XGBClassifier on the supplied features and labels and return
    the fitted model.

    Parameters
    ----------
    X_train : array
        Feature matrix, one sample per row.
    Y_train : array
        Label for every row of X_train.

    Returns
    -------
    XGBClassifier
        The model after fit(X_train, Y_train) has been called on it.
    """
    # NOTE(review): 'mse_shift_2' does not match the name of any loss
    # defined in this notebook (the custom loss is shift_mse_2) -- confirm
    # that XGBoost accepts this objective.
    params = {
        "verbose_eval": True,
        "tree_method": 'gpu_hist',
        "learning_rate": 0.3,
        "max_depth": 6,
        "min_child_weight": 4,
        "n_estimators": 200,
        "objective": 'mse_shift_2',
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "verbosity": 2,
        "gamma": 0.1,
    }
    model = XGBClassifier(**params)
    model.fit(X_train, Y_train)
    return model

In [0]:
# train() takes the feature matrix and the labels as required arguments;
# calling it with none raises TypeError.  The fitted model is queried
# through xgboost_heuristic_2 with get_rep_2 features, so it is trained on
# that same representation, which load_data_2 produces.
# NOTE(review): confirm this is the intended training set -- the engineered
# X_train_2 features have a different width from what xgboost_heuristic_2
# feeds the model at prediction time.
X_rep_2, Y_rep_2 = load_data_2('All_Data.txt')
model = train(X_rep_2, Y_rep_2)

In [0]:
# Persist the fitted model.  The context manager flushes and closes the
# file as soon as the dump finishes instead of leaving the handle open.
with open("xg_model_penalize_fe_200_6_4", "wb") as model_file:
    pickle.dump(model, model_file)

In [0]:
def find_over_estimate(file_name, model):
    """
    Report how often a model's heuristic over- or under-shoots.

    Every line of file_name is parsed with string_to_board_and_dist into a
    (board, dist) pair.  For each board the prediction of
    xgboost_heuristic_2(board, model) is compared with the actual distance
    and with manhattan(board, None).  Two percentages are printed: how
    often the prediction is less than the Manhattan distance, and how
    often it is greater than the actual distance.

    Parameters
    ----------
    file_name : str
        Path of the text file holding one sample per line.
    model : object
        Trained model, passed through to xgboost_heuristic_2.

    Returns
    -------
    None
        Results are printed.  If the file has no lines, a short notice is
        printed instead of the percentages.
    """
    over = []
    under = []

    # The context manager guarantees the data file is closed, including
    # when parsing or prediction raises part-way through.
    with open(file_name, "r") as data:
        for count, line in enumerate(data, start=1):
            (board, dist) = string_to_board_and_dist(line)
            man_dist = manhattan(board, None)
            pred = xgboost_heuristic_2(board, model)
            over.append(pred > dist)
            under.append(pred < man_dist)

            # Progress indicator: prints 1, 1001, 2001, ...
            if (count % 1000 == 1):
                print(count)

    # Without any data points the percentages would divide by zero.
    if not over:
        print("no data points found in", file_name)
        return

    print("prediction less than manhattan percent of the time", sum(under) * 100 / len(under))
    print("prediction greater than actual distance precent of the time", sum(over) * 100 / len(over))

In [0]:
# Evaluate the trained model on the held-out boards.
# find_over_estimate calls xgboost_heuristic_2, which this notebook defines
# in a later cell, so that definition has to be run before this call.
find_over_estimate(file_name="Test_Data.txt", model=model)

In [0]:
def xgboost_heuristic_2(board, model):
    """
    Return the heuristic that a trained model predicts for a board.

    The board is converted with get_rep_2 and handed to model.predict as a
    single-row, two-dimensional feature matrix.

    Parameters
    ----------
    board : array
        Board to evaluate, in the form get_rep_2 accepts.
    model : object
        Trained model exposing predict(features).

    Returns
    -------
    object
        Whatever model.predict returns for the one-row input.
    """
    # reshape(1, -1) forms the single sample row without hard-coding the
    # feature count, so it follows whatever length get_rep_2 produces.
    features = get_rep_2(board).reshape(1, -1)
    return model.predict(features)