In [1]:
# Load necessary packages

import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import time
%matplotlib inline

import random
from datetime import datetime

In [2]:
# Read data and present
tr_url = "https://raw.githubusercontent.com/maxuw/CardGame/master/trainingData.csv"
va_url = "https://raw.githubusercontent.com/maxuw/CardGame/master/validationData.csv"

train = pd.read_csv(tr_url)
valid = pd.read_csv(va_url)

In [3]:
# Helper functions to preprocess data to bag-of-cards format

def unnest(df, col):
    unnested = (df.apply(lambda x: pd.Series(x[col]), axis=1)
                .stack()
                .reset_index(level=1, drop=True))
    unnested.name = col
    return df.drop(col, axis=1).join(unnested)

def to_bag_of_cards(df):
    df['ind'] = np.arange(df.shape[0])# + 1
    df_orig = df.copy()
    df['deck'] = df['deck'].apply(lambda d: d.split(';'))
    df = unnest(df, 'deck')
    df['value'] = 1
    df_bag = df.pivot(index='ind', columns='deck', values='value')
    df_bag[df_bag.isna()] = 0
    df_bag = df_bag.astype('int')
    return pd.concat([df_orig.set_index('ind'), df_bag], axis=1)

In [4]:
bag_train = to_bag_of_cards(train)
bag_valid = to_bag_of_cards(valid)

In [5]:
valid.head()

Unnamed: 0,deck,nofGames,nOfPlayers,winRate,ind
0,"[archers, arrows, baby-dragon, bandit, elixir-...",130,32,0.48496,0
1,"[archers, arrows, baby-dragon, elixir-collecto...",495,75,0.47289,1
2,"[archers, arrows, baby-dragon, golem, lightnin...",271,25,0.52372,2
3,"[archers, arrows, baby-dragon, golem, lightnin...",125,20,0.58594,3
4,"[archers, arrows, balloon, barbarians, firebal...",130,23,0.5,4


In [6]:
# Specify example model fitting function and R squared metric

from sklearn.svm import SVR

def R2(x, y):
    return 1 - np.sum(np.square(x - y)) / np.sum(np.square(y - np.mean(y)))

def fit_svm(data):
    svr = SVR(kernel='rbf', gamma=1.0/90, C=1.0, epsilon=0.02, shrinking=False)
    svr.fit(data.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1), data['winRate'])
    return svr



In [7]:
sizes = (np.arange(10) + 6) * 100
sizes

array([ 600,  700,  800,  900, 1000, 1100, 1200, 1300, 1400, 1500])

In [8]:
# Fit and predict on models of various training sizes

fit_list = list(map(lambda size: fit_svm(train.iloc[:size]), sizes))
pred_list = list(map(lambda fit: fit.predict(valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1)),
                     fit_list))

In [9]:
# Calculate R squared scores

r2 = list(map(lambda p: R2(p, valid['winRate']), pred_list))
r2


[-0.04232767917971958,
 -0.03963921576228091,
 -0.05090293576042337,
 -0.06619693197263166,
 -0.06891790332198067,
 -0.0635866399958771,
 -0.08254572245137948,
 -0.07526826771478623,
 -0.08499150250986709,
 -0.0792418181864003]

In [10]:
# Save hyperparameteres and selected indices in submission format

with open('example_sub_python.txt', 'a') as f:
    for size in sizes:
        ind_text = ','.join(list(map(str, train.index.values[:size])))
        text = ';'.join(['0.02', '1.0', str(1.0 / 90), ind_text])
        f.write(text + '\n')

In [11]:
def train_predict(deck):
    model = fit_svm(deck)
    pred = model.predict(bag_valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1))
    r2 = R2(pred, valid['winRate'])
    
    return r2

In [12]:
# Sort games by something

train = train.sort_values('nofGames', ascending=False)

In [13]:
train_subset = pd.DataFrame()

In [14]:
train_subset = train_subset.append(train.sort_values('nofGames', ascending=False)[:200])
train_subset = train_subset.append(train.sort_values('nOfPlayers', ascending=False)[:200])
train_subset = train_subset.append(train.sort_values('winRate', ascending=False)[:200])

In [15]:
train.sort_values('nofGames', ascending=False)[:200].shape

(200, 5)

In [16]:
r2_scores = []

start = time.time()
for i in range(len(bag_train)):
    pred = train_predict(train.loc[i:i])
    r2_scores.append(pred)
end = time.time()
print (end - start)

ValueError: X.shape[1] = 90 should be equal to 1, the number of features at training time

In [None]:
def forward_search(data, num_initial_elements, step_size, desired_size):
    start_time = datetime.now()
    
    elements = set()
    
    while len(elements) < num_initial_elements:
        random_element = random.randint(0, len(data)-1)
        elements.add(random_element)

    initial_data = data.loc[elements]
    
    data_working = initial_data
    
    start = 0
    
    element_ended = 0
    
    while len(elements) < desired_size:
        
        elem_tentative = elements
        
        r2_current = train_predict(data_working)
        
        r2_temp_dict = {}
        
        for i in range(step_size):
            j = i + element_ended
#             print(j)
            
            if j not in elements:

                data_tentative = data_working
                data_tentative = data_tentative.append(data.loc[j:j])
#                 print(data.loc[j:j])
                
                r2_temp = train_predict(data_tentative)
                r2_temp_dict[j] = r2_temp
#                 print(r2_temp)
            
        max_elem = max(zip(r2_temp_dict.values(), r2_temp_dict.keys()))[1]
        
        elements.add(max_elem)
        print("adding {}".format(max_elem))
        data_working = data_working.append(data.loc[max_elem:max_elem])
        
        element_ended = element_ended + step_size
        
    end_time = datetime.now()
    time_taken = end_time - start_time
    print('Time: ', time_taken)
    
    return elements
        

In [42]:
def forward_search_smaller(data, index_subset, num_initial_elements, desired_size, step_size = 0):
    
    data = data.loc[index_subset]
    
    if step_size == 0:
        step_size = int(float(len(index_subset)/desired_size))
#         print("desired_size :", desired_size)
#         print("num index_subset :", index_subset)
        print("counted stepsize is :", step_size)
    
    start_time = datetime.now()
    
#     elements = set()
    
    elements = random.sample(index_subset, num_initial_elements)
    print(type(elements))

    initial_data = data.loc[elements]
    
    data_working = initial_data
    
    start = 0
    
    element_ended = 0
    
    while len(elements) < desired_size:
        
        elem_tentative = elements
        
        r2_current = train_predict(data_working)
        
        r2_temp_dict = {}
        
        
        
        for i in index_subset:
            if i not in elements:
                
                bucket_set = set()

                data_tentative = data_working
                data_tentative = data_tentative.append(data.loc[i:i])
    #                 print(data.loc[j:j])


                r2_temp = train_predict(data_tentative)
                r2_temp_dict[i] = r2_temp
    #                 print(r2_temp)
                if len(r2_temp_dict) >= step_size:
                    
                
                
                    max_elem = max(zip(r2_temp_dict.values(), r2_temp_dict.keys()))[1]

                    elements.append(max_elem)
                    
                    data_working = data_working.append(data.loc[max_elem:max_elem])
                    
                    print("adding {}".format(max_elem))
                    
                    r2_temp_dict = {}


        
                
        
    end_time = datetime.now()
    time_taken = end_time - start_time
    print('Time: ', time_taken)
    
    list_elements = list(elements)
    list_elements.sort()
    
    return elements
        

In [None]:
def forward_search_continue(data, initial_indexes, step_size, desired_size):
    start_time = datetime.now()
    
    elements = set()
    
    while len(elements) < num_initial_elements:
        random_element = random.randint(0, len(data)-1)
        elements.add(random_element)

    initial_data = data.loc[elements]
    
    data_working = initial_data
    
    start = 0
    
    element_ended = 0
    
    while len(elements) < desired_size:
        
        elem_tentative = elements
        
        r2_current = train_predict(data_working)
        
        r2_temp_dict = {}
        
        for i in range(step_size):
            j = i + element_ended
#             print(j)
            
            if j not in elements:

                data_tentative = data_working
                data_tentative = data_tentative.append(data.loc[j:j])
#                 print(data.loc[j:j])
                
                r2_temp = train_predict(data_tentative)
                r2_temp_dict[j] = r2_temp
#                 print(r2_temp)
            
        max_elem = max(zip(r2_temp_dict.values(), r2_temp_dict.keys()))[1]
        
        elements.add(max_elem)
        print("adding {}".format(max_elem))
        data_working = data_working.append(data.loc[max_elem:max_elem])
        
        element_ended = element_ended + step_size
        
    end_time = datetime.now()
    time_taken = end_time - start_time
    print('Time: ', time_taken)
    
    return data_working
        

In [None]:
# forward_search_2000 = forward_search(bag_train, 10, 50, 2000)

In [None]:
forward_search_try.shape

In [None]:
indexes = forward_search_try.index.values

In [None]:
def s(filename, obj):
    with open(filename, 'w') as f:
        for item in obj:
            f.write("%s\n % item)
    
    print("file saved")

In [None]:
def forward_search_seconds(data, num_initial_elements, step_size, desired_size):
    start_time = datetime.now()
    
    elements_best = set()
    seconds = set()
    thirds = set()
    fourths = set()
    
    while len(elements_best) < num_initial_elements:
        random_element = random.randint(0, len(data)-1)
        elements_best.add(random_element)

    initial_data = data.loc[elements_best]
    
    data_working = initial_data
    
    start = 0
    
    element_ended = 0
    
    while len(elements_best) < desired_size:
        
        elem_tentative = elements_best
        
        r2_current = train_predict(data_working)
        
        r2_temp_dict = {}
        
        for i in range(step_size):
            j = i + element_ended
#           
            if j == len(data):
                break
            
            if j not in elements_best:

                data_tentative = data_working
                data_tentative = data_tentative.append(data.loc[j:j])
#                 print(data.loc[j:j])
                
                r2_temp = train_predict(data_tentative)
                r2_temp_dict[j] = r2_temp
#                 print(r2_temp)
            
        max_elem = max(zip(r2_temp_dict.values(), r2_temp_dict.keys()))[1] 
        
        elements_best.add(max_elem)
        print("adding {}".format(max_elem))
        
        print("max elem", max_elem, "trying to delete")
        del r2_temp_dict[max_elem]
        
        if max_elem not in elements_best:
            print("dupa: nie ma elementu")
            
        print("drugi największy")
        max_elem = max(zip(r2_temp_dict.values(), r2_temp_dict.keys()))[1]
        seconds.add(max_elem)
        
        print("max elem", max_elem, "trying to delete")
        del r2_temp_dict[max_elem]
        
        if max_elem not in seconds:
            print("dupa: nie ma elementu")
        
        max_elem = max(zip(r2_temp_dict.values(), r2_temp_dict.keys()))[1]
        thirds.add(max_elem)
        
        print("max elem", max_elem, "trying to delete")
        del r2_temp_dict[max_elem]
        
        if max_elem not in thirds:
            print("dupa: nie ma elementu")
        
        max_elem = max(zip(r2_temp_dict.values(), r2_temp_dict.keys()))[1]
        fourths.add(max_elem)
        
        element_ended = element_ended + step_size
        
    end_time = datetime.now()
    time_taken = end_time - start_time
    print('Time: ', time_taken)
    
    dict_lists = {}
    dict_lists[1] = elements_best
    dict_lists[2] = seconds
    dict_lists[3] = thirds
    dict_lists[4] = fourths
    
    return dict_lists
        

In [38]:
def save_file(obj):
    if type(obj) == dict:
        print("It's a dict")
        
        obj_pd = pandas.DataFrame(obj)
        
    elif type(obj)  == list:
        obj_

In [39]:
save_file(forward_seconds_10000)

It's a dict


In [55]:
s("file1.txt", forward_seconds_10000[1])

file saved


In [66]:
def r(filename):
    list_obj = []
    with open(filename, 'r') as f:
        for line in f:
            list_obj.append(line)
    
    if filename:
        print("file read")
        
    return list_obj

In [25]:
jan1 = np.loadtxt("Jan_dumps/random_ind.out")

In [51]:
jan1 = jan1.astype(int)

In [61]:
type(jan1)

list

In [60]:
jan1 = jan1.tolist()

In [None]:
jan1

In [27]:
jan2 = np.loadtxt("Jan_dumps/random_ind_2.out")

In [28]:
jan3 = np.loadtxt("Jan_dumps/random_ind_3.out")

In [29]:
jan2

array([67494., 25351., 44297., ..., 98186., 98211., 98328.])

In [30]:
jan3

array([24865., 79172., 73581., ..., 98178., 98208., 98279.])

In [34]:
train_predict(bag_train.loc[jan1])

0.32406572266974865

In [35]:
train_predict(bag_train.loc[jan2])

0.30734180608557127

In [36]:
train_predict(bag_train.loc[jan3])

0.3202908916237952

In [None]:
#jan1

In [57]:
def make_all_sets(ind_in):
    sizes_without_last = [600,  700,  800,  900, 1000, 1100, 1200, 1300, 1400]
    dict_sets = dict()
    for size in sizes:
        indexes_found = forward_search_smaller(bag_train, ind_in, num_initial_elements=5, desired_size=size)

        dict_sets[size] = indexes_found
    dict_sets[1500] = ind_in
    return dict_sets

In [62]:
jan1_sets = make_all_sets(jan1)

counted stepsize is : 2
<class 'list'>
adding 81380
adding 74377
adding 22035
adding 18585
adding 90558
adding 89
adding 176
adding 384
adding 396
adding 624
adding 686
adding 842
adding 932
adding 1135
adding 1217
adding 1345
adding 1553
adding 1689
adding 1827
adding 1924
adding 2073
adding 2132
adding 2317
adding 2377
adding 2571
adding 2770
adding 2837
adding 2942
adding 3143
adding 3242
adding 3428
adding 3557
adding 3735
adding 3848
adding 4022
adding 4032
adding 4162
adding 4306
adding 4540
adding 4586
adding 4812
adding 4875
adding 4995
adding 5119
adding 5279
adding 5458
adding 5559
adding 5741
adding 5872
adding 5897
adding 6136
adding 6267
adding 6367
adding 6471
adding 6626
adding 6738
adding 6878
adding 7048
adding 7192
adding 7257
adding 7378
adding 7532
adding 7654
adding 7830
adding 7941
adding 8100
adding 8220
adding 8381
adding 8510
adding 8542
adding 8749
adding 8844
adding 8970
adding 9116
adding 9304
adding 9403
adding 9509
adding 9581
adding 9809
adding 9894
addin

In [39]:
sizes_without_last = sizes[0:-1]

In [40]:
sizes_without_last

array([ 600,  700,  800,  900, 1000, 1100, 1200, 1300, 1400])

In [None]:
make_all_sets