In [1]:
'''
Worked on by: Meena Hari.

Generated KNearestNeighbors model for "All_Data.txt"
with K=35.

From "knn_find_best.ipynb", found that taking 
the arithmetic mean of neighbors to make 
predictions with K = 35 generalizes best. 

Ran gen_portfolio_data function (copied from
"gen_portfolio.py"), which records predictions
of the knn model on a dataset to use in 
training of an ensemble model (to be done).

'''
from numba import njit, jit
from scipy import stats
import matplotlib.pyplot as plt
import random
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
import keras.backend as K
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Conv2D, Flatten, Input
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from keras.models import load_model
import keras.losses

from constants import *
import heuristic as h
import io_help as io
import neural_net as nn
import solver as s

def load_data(file_name):
    """
    Reads training data from a file and returns the boards in X and
    their labels in Y as a tuple.
    (Function copied from "neural_net.py").

    Every line of the file is parsed with io.string_to_board_and_dist;
    the board is flattened to a 16-entry vector and the distance is kept
    as its label.

    file_name: path of the training-data file, one sample per line.

    Returns (X_train, Y_train) as numpy arrays: X_train holds one
    flattened board per row and Y_train holds the matching distances.
    """
    X = []
    Y = []

    # The context manager closes the file even when a line fails to parse.
    with open(file_name, "r") as file:
        for string in file:
            (board, dist) = io.string_to_board_and_dist(string)
            # reshape(16) already yields the flat 16-entry vector, so no
            # extra concatenate/flatten step is needed.
            X.append(board.reshape(16))
            Y.append(dist)

    X_train = np.asarray(X)
    Y_train = np.asarray(Y)
    return (X_train, Y_train)

Using TensorFlow backend.


In [2]:
def knn_test(X_test, X_train, Y_train, model):
    '''
    Turns the neighbors found for each test board into predictions.

    X_test: array of test boards; only its length is used here.
    X_train: array of training boards (accepted but not read by this function).
    Y_train: array of true outputs for the training boards, indexed by
             the neighbor indices.
    model: result of the knn model's neighbor query for X_test; model[1]
           must hold, for each test sample, the indices of its neighbors
           in the training set.

    Returns a numpy array of size (# testing samples) x 3 where
    column 0 is the arithmetic mean of the neighbors' labels,
    column 1 is their geometric mean, and
    column 2 is their harmonic mean.
    '''
    n_samples = len(X_test)
    preds = np.zeros((n_samples, 3))

    for row in range(n_samples):
        # Labels of the training samples closest to this test board.
        neighbor_labels = Y_train[model[1][row]]
        preds[row] = (
            np.mean(neighbor_labels),
            stats.gmean(neighbor_labels),
            stats.hmean(neighbor_labels),
        )

    return preds

In [3]:
def string_to_test_info(string):
    """
    Parses one line of test info in the standard "!"-separated form.
    (Function copied from "run_testing.py").

    The four fields are, in order: the board (decoded with
    io.string_to_board), the number of states to solution, the time,
    and the length of the solution.

    Returns the tuple (board, n_states, time, sol_len).
    """
    fields = string.split("!")
    return (
        io.string_to_board(fields[0]),
        int(fields[1]),
        float(fields[2]),
        int(fields[3]),
    )

def load_boards(filename):
    """
    Given name of file containing test boards, loads all test boards.
    (Function copied from "run_testing.py").

    Each line is parsed with string_to_test_info.

    filename: path of the test-board file, one board record per line.

    Returns (boards, n_states, times, dists): four parallel lists holding,
    per line, the board, the recorded number of states, the recorded time
    and the recorded solution length.
    """
    boards = []
    n_states = []
    times = []
    dists = []

    # The original never closed this handle; the context manager closes it
    # on both normal exit and parse errors.
    with open(filename, "r") as file:
        for line in file:
            (board, c_states, c_time, sol_len) = string_to_test_info(line)
            boards.append(board)
            n_states.append(c_states)
            times.append(c_time)
            dists.append(sol_len)

    return (boards, n_states, times, dists)

In [4]:
# Load the full dataset once: X_train holds the flattened board inputs,
# Y_train the true output for each board.
X_train, Y_train = load_data('All_Data.txt')
print(X_train.shape)

(395715, 16)


In [5]:
# Load the test boards and their recorded solution lengths; the recorded
# state counts and times are not needed in this notebook.
X_test_mt, _, _, Y_test = load_boards('Test_boards.txt')
# Flatten every board into a row of 16 entries, matching X_train's layout.
X_test = np.asarray(X_test_mt).reshape(len(X_test_mt), 16)

In [6]:
# K-values to try during model training.
# NOTE(review): this rebinds `K`, which the import cell bound to
# `keras.backend` (`import keras.backend as K`); after this cell `K` is a
# plain list.
K = [35]

# Store some statistics from model training to use for plotting.
# One entry is appended per K value by the training/statistics loop, so
# re-running that loop without re-running this cell keeps growing these lists.
avg_dist_over_mean = []
avg_e_admiss_mean = []
perc_over_man_mean = []
perc_under_man_mean = []
perc_over_ham_mean = []


In [7]:
# Only training with best K = 35, so the model is kept in this notebook-level
# name for later use. The empty list is just a placeholder: the training loop
# rebinds `knn_model` to the fitted NearestNeighbors object.
knn_model = []

In [8]:
# For every K value: fit a nearest-neighbors model on the training boards,
# predict on the test boards, and compare the truncated arithmetic-mean
# prediction against the true solution length and against the Manhattan and
# Hamming heuristics.
for g in range(len(K)):
    print("Starting next K...")
    NEIGHBORS = K[g]
    
    # Generate knn model for NEIGHBORS neighbors on X_train.
    # NOTE(review): NearestNeighbors is an unsupervised index; per the
    # scikit-learn docs the second argument to fit() is ignored.
    knn_model = NearestNeighbors(n_neighbors=NEIGHBORS, n_jobs = -1).fit(X_train,Y_train)

    # Get neighbors of test samples. `pred` is the raw kneighbors() result;
    # knn_test reads the neighbor indices from pred[1].
    pred = knn_model.kneighbors(X_test)
    # Get predictions on test samples. Despite its name, `model` is the
    # (# test samples) x 3 prediction array returned by knn_test.
    model = knn_test(X_test, X_train, Y_train, pred)
    
    #### Calculating heuristics/statistics. ####
    
    # Stats for knn that truncates prediction and uses arithmetic mean.
    dist_over_mean_i = []
    misclass_mean_i = 0
    dist_under_mean_i = []
    dist_over_man_mean_i = []
    dist_under_man_mean_i = []

    dist_over_ham_mean_i = []
    dist_under_ham_mean_i = []


    print("Starting stats for K: {} ...".format(NEIGHBORS))
    for i in range(len(X_test)):
        # Calculate knn prediction with mean (column 0), truncated to an int.
        nn_heur_mean_i = int(model[i, 0])
        
        # Calculate Manhattan dist.
        man_heur = h.manhattan(X_test[i].reshape(4,4), None)
        # Calculate Hamming dist.
        ham_heur = h.hamming(X_test[i].reshape(4,4), None)
        # True solution length recorded for this test board.
        y = Y_test[i]

        # Overestimate of the true distance.
        if (nn_heur_mean_i > y):
            dist_over_mean_i.append(nn_heur_mean_i - y)

        # NOTE(review): `<=` means exact predictions are recorded here as
        # underestimates of 0, which lowers the average underestimate; the
        # Manhattan/Hamming comparisons below use strict `<` instead.
        if (nn_heur_mean_i <= y):
            dist_under_mean_i.append(y - nn_heur_mean_i)

        # Any prediction that is not exactly the true distance counts as an error.
        if (nn_heur_mean_i != y):
            misclass_mean_i += 1

        if (nn_heur_mean_i > man_heur):
            dist_over_man_mean_i.append(nn_heur_mean_i - man_heur)

        if (nn_heur_mean_i < man_heur):
            dist_under_man_mean_i.append(man_heur - nn_heur_mean_i)

        if (nn_heur_mean_i > ham_heur):
            dist_over_ham_mean_i.append(nn_heur_mean_i - ham_heur)

        if (nn_heur_mean_i < ham_heur):
            dist_under_ham_mean_i.append(ham_heur - nn_heur_mean_i)        
        
        
    # Averages for truncation + mean.
    # NOTE(review): np.mean of an empty list yields nan (with a warning), so
    # a category with no members shows up as nan below.
    avg_dist_over_mean_i = np.mean(np.asarray(dist_over_mean_i))
    avg_dist_under_mean_i = np.mean(np.asarray(dist_under_mean_i))
    out_sample_error_mean_i = misclass_mean_i / len(X_test)
    avg_dist_over_man_mean_i = np.mean(np.asarray(dist_over_man_mean_i))
    avg_dist_under_man_mean_i = np.mean(np.asarray(dist_under_man_mean_i))

    avg_dist_over_ham_mean_i = np.mean(np.asarray(dist_over_ham_mean_i))
    avg_dist_under_ham_mean_i = np.mean(np.asarray(dist_under_ham_mean_i))

    
    # Append stats for later plotting. These are stored as percentages
    # (scaled by 100), unlike the fractions printed below.
    avg_e_admiss_mean.append(len(dist_over_mean_i)/len(X_test) * 100)
    avg_dist_over_mean.append(avg_dist_over_mean_i)
    perc_over_man_mean.append(len(dist_over_man_mean_i)/len(X_test) * 100)
    perc_under_man_mean.append(len(dist_under_man_mean_i)/len(X_test) * 100)
    perc_over_ham_mean.append(len(dist_over_ham_mean_i)/len(X_test) * 100)
    
    
    # Report. The values labelled "Percent" and "E_admiss" are printed as
    # fractions of the test set (0..1), not multiplied by 100.
    print("------ TRUCATION - KNN with Mean: ------")
    print("Avg distance overestimated: ", avg_dist_over_mean_i)
    print("Avg distance underestimated: ", avg_dist_under_mean_i)
    print("E_admiss: ", len(dist_over_mean_i)/len(X_test))
    print("E_out: ", out_sample_error_mean_i)
    print("Avg distance over Manhattan: ", avg_dist_over_man_mean_i)
    print("Avg distance under Manhattan: ", avg_dist_under_man_mean_i)
    print("Percent over Manhattan: ", len(dist_over_man_mean_i)/len(X_test))
    print("Percent under Manhattan: ", len(dist_under_man_mean_i)/len(X_test))

    print("Avg distance over Hamming: ", avg_dist_over_ham_mean_i)
    print("Avg distance under Hamming: ", avg_dist_under_ham_mean_i)
    print("Percent over Hamming: ", len(dist_over_ham_mean_i)/len(X_test))
    print("Percent under Hamming: ", len(dist_under_ham_mean_i)/len(X_test))
    print("\n")

Starting next K...
Starting stats for K: 35 ...
------ TRUCATION - KNN with Mean: ------
Avg distance overestimated:  1.1333333333333333
Avg distance underestimated:  9.186802030456853
E_admiss:  0.015
E_out:  0.987
Avg distance over Manhattan:  2.3013698630136985
Avg distance under Manhattan:  3.441860465116279
Percent over Manhattan:  0.292
Percent under Manhattan:  0.602
Avg distance over Hamming:  7.3058702368692074
Avg distance under Hamming:  2.111111111111111
Percent over Hamming:  0.971
Percent under Hamming:  0.018




In [35]:
def gen_portfolio_data(out_file, model, h_func, data_file="All_Data.txt"):
    """
    For all datapoints in data_file, writes a new file containing the
    difference between the actual distance and the distance predicted by
    the model.

    out_file: path of the file to write; one line per datapoint, produced
              by io.board_and_dist_to_string(board, actual - predicted).
    model: 2-D array of precomputed predictions; row r, column 0 is used
           (truncated to int) as the prediction for the r-th line of
           data_file, so it needs at least one row per line.
    h_func: unused; kept so existing callers keep working.
    data_file: path of the input data, one datapoint per line. Defaults to
               "All_Data.txt", the file the original version hard-coded.
    """
    # Open the input first so a missing data file does not truncate out_file,
    # and let the context manager close both handles even on errors.
    with open(data_file, "r") as data, open(out_file, "w") as output:
        for row, line in enumerate(data):
            # Progress report on datapoints 1, 1001, 2001, ...
            if row % 1000 == 0:
                print(row + 1)

            (board, dist) = io.string_to_board_and_dist(line)
            pred = int(model[row, 0])
            diff = dist - pred
            output.write(io.board_and_dist_to_string(board, diff) + '\n')

In [39]:
# Load the samples to be used for generating portfolio data. This reads the
# same 'All_Data.txt' file that X_train / Y_train were loaded from.
(X_port_test,Y_port_test) = load_data('All_Data.txt')

# Get neighbors of these samples from the fitted knn model.
# NOTE(review): the query set is the training set itself, so each sample is
# presumably found among its own neighbors and its true label leaks into its
# prediction; confirm this is intended for the ensemble training data.
port_pred = knn_model.kneighbors(X_port_test)

# Get predictions on these samples. `port_model` is the prediction array
# returned by knn_test (one row per sample, three columns), not a model.
port_model = knn_test(X_port_test, X_train, Y_train, port_pred)

  log_a = np.log(a)


In [38]:
# Write the (actual - predicted) differences for every datapoint to
# "portfolio_knn_try2.txt", using the precomputed predictions in port_model.
# NOTE(review): `knn_heuristic` is only defined further down in this notebook,
# so this cell fails on a fresh top-to-bottom run; gen_portfolio_data does not
# call the function it is passed.
gen_portfolio_data("portfolio_knn_try2.txt", port_model, knn_heuristic)

1
1001
2001
3001
4001
5001
6001
7001
8001
9001
10001
11001
12001
13001
14001
15001
16001
17001
18001
19001
20001
21001
22001
23001
24001
25001
26001
27001
28001
29001
30001
31001
32001
33001
34001
35001
36001
37001
38001
39001
40001
41001
42001
43001
44001
45001
46001
47001
48001
49001
50001
51001
52001
53001
54001
55001
56001
57001
58001
59001
60001
61001
62001
63001
64001
65001
66001
67001
68001
69001
70001
71001
72001
73001
74001
75001
76001
77001
78001
79001
80001
81001
82001
83001
84001
85001
86001
87001
88001
89001
90001
91001
92001
93001
94001
95001
96001
97001
98001
99001
100001
101001
102001
103001
104001
105001
106001
107001
108001
109001
110001
111001
112001
113001
114001
115001
116001
117001
118001
119001
120001
121001
122001
123001
124001
125001
126001
127001
128001
129001
130001
131001
132001
133001
134001
135001
136001
137001
138001
139001
140001
141001
142001
143001
144001
145001
146001
147001
148001
149001
150001
151001
152001
153001
154001
155001
156001
157001
158001


In [89]:
def run_testing(data_file, model, h_func):
    """
    Given a data_file containing testing data, a model, and a heuristic
    function for said model, solves every test board with s.solve and
    prints the average number of states explored, the percentage of boards
    whose solution length differs from the recorded one, and the average
    solution lengths.

    data_file: path of the test-board file, read with load_boards.
    model: passed through unchanged to s.solve together with h_func.
    h_func: heuristic function handed to s.solve.

    Returns nothing; all results are printed.
    """
    (boards, n_states, times, dists) = load_boards(data_file)
    n_boards = len(boards)

    cust_states = []
    cust_wrong = 0
    cust_distance = []

    # tqdm already reports progress, so no separate counter is printed.
    for i in tqdm(range(n_boards)):
        (c_states, c_time, sol_path) = s.solve(boards[i], h_func, model)
        cust_states.append(c_states)
        # The solution length is taken as one less than the path length.
        sol_len = len(sol_path) - 1
        if sol_len != dists[i]:
            cust_wrong += 1
        cust_distance.append(sol_len)

    # Divide by the number of boards actually loaded rather than an external
    # constant, so the percentage stays right for any test file; guard the
    # empty-file case against division by zero.
    pct_wrong = cust_wrong / n_boards * 100 if n_boards else 0.0

    print("average number of states explored to find solution:")
    print("\tfor learned model: " + str(np.mean(cust_states)))
    print("\tfor manhattan distance: " + str(np.mean(n_states)))
    print("----------------------------------------------------")
    print("solution was non-optimal " + str(pct_wrong) + "% of the time")
    print("----------------------------------------------------")
    print("average length of solution path was:")
    print("\tfor learned model: " + str(np.mean(cust_distance)))
    print("\tfor manhattan distance: " + str(np.mean(dists)))

In [90]:
def knn_heuristic(board, knnmodel):
    """
    Heuristic for the solver: predicts a board's distance from its nearest
    neighbors in the training data.

    board: the board to evaluate (originally 4x4); it is flattened to 16
           entries before the neighbor query.
    knnmodel: fitted nearest-neighbors model to query. If the object passed
              in has no `kneighbors` method, the notebook-level `knn_model`
              is used instead, which is what the original always did.

    Returns the prediction truncated to an int. Reads the notebook-level
    X_train and Y_train.
    """
    # Prefer the model handed in by the caller over hidden notebook state.
    fitted = knnmodel if hasattr(knnmodel, "kneighbors") else knn_model

    # board is originally 4x4
    flat_board = board.reshape(16)
    run_pred = fitted.kneighbors(flat_board.reshape(1, -1))
    prediction = knn_test(np.array([flat_board]), X_train, Y_train, run_pred)

    # NOTE(review): column 1 is the geometric mean, while the notebook header
    # and the statistics cell use the arithmetic mean (column 0); confirm
    # which one the solver should use.
    return int(prediction[0, 1])

In [91]:
# Load test samples to be used for generating portfolio data.
#(X_run_test,_,_,Y_run_test) = load_boards('baby_test.txt')
#X_run_test = np.asarray(X_run_test).reshape(len(X_run_test),16)

# Get neighbors of test samples.
#run_pred = knn_model.kneighbors(X_run_test)

# Get predictions on test samples.
#run_model = knn_test(X_run_test, X_train, Y_train, run_pred)

In [92]:
# Evaluate the knn heuristic on the test boards using the run_testing
# function defined in this notebook: every board in "Test_boards.txt" is
# solved with knn_heuristic and summary statistics are printed.
run_testing("Test_boards.txt", knn_model, knn_heuristic)





  0%|          | 0/1000 [00:00<?, ?it/s][A[A[A[A

0


  log_a = np.log(a)
  log_a = np.log(a)




  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)




  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)




  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a 

  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)




  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log

  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)


  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)


  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)
  log_a = np.log(a)


KeyboardInterrupt: 