In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

from datetime import datetime

%matplotlib inline

In [3]:
from random import randint
from random import getrandbits
from random import sample
from random import choice
from random import random

In [9]:
best_results = pd.DataFrame()

In [10]:
scores = pd.DataFrame()

In [68]:
### Variables
ammount_genes_on = 1500
length_features = 100000
amount_chromosomes = 20
generation_number = 0
number_generations = 10
chromosome_mutation_rate = 0.4
gene_mutation_probability = 0.6
run_number = 0
# cycle = randint(0, 1000)

#bests
chromosome_best = []
r2_score_best = []
top_score = 0.16

### Save State
history = []

In [1]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [12]:
import time
import multiprocessing 
import smtplib

In [13]:
from multiprocessing import Process, Lock
from multiprocessing.sharedctypes import Array, Value

In [15]:
# Read data and present
tr_url = "https://raw.githubusercontent.com/maxuw/CardGame/master/trainingData.csv"
va_url = "https://raw.githubusercontent.com/maxuw/CardGame/master/validationData.csv"

train = pd.read_csv(tr_url)
valid = pd.read_csv(va_url)

In [16]:
# Helper functions to preprocess data to bag-of-cards format

def unnest(df, col):
    unnested = (df.apply(lambda x: pd.Series(x[col]), axis=1)
                .stack()
                .reset_index(level=1, drop=True))
    unnested.name = col
    return df.drop(col, axis=1).join(unnested)

def to_bag_of_cards(df):
    df['ind'] = np.arange(df.shape[0])# + 1
    df_orig = df.copy()
    df['deck'] = df['deck'].apply(lambda d: d.split(';'))
    df = unnest(df, 'deck')
    df['value'] = 1
    df_bag = df.pivot(index='ind', columns='deck', values='value')
    df_bag[df_bag.isna()] = 0
    df_bag = df_bag.astype('int')
    return pd.concat([df_orig.set_index('ind'), df_bag], axis=1)

In [17]:
bag_train = to_bag_of_cards(train)
bag_valid = to_bag_of_cards(valid)

In [18]:
# Specify example model fitting function and R squared metric

from sklearn.svm import SVR

def R2(x, y):
    return 1 - np.sum(np.square(x - y)) / np.sum(np.square(y - np.mean(y)))

def fit_svm(data):
    svr = SVR(kernel='rbf', gamma=1.0/90, C=1.0, epsilon=0.02, shrinking=False)
    svr.fit(data.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1), data['winRate'])
    return svr

sizes = (np.arange(10) + 6) * 100

In [19]:
def create_chromosome(genes_on = 0, length = 0, indexes = []):
#     print(indexes)
    
    if length == 0:
        length = length_features

    if genes_on == 0:
        genes_on = ammount_genes_on
    
    if indexes == []:
    


        N = length # 
        K = genes_on # zeros N-K ones

        arr = np.array([False] * (N-K) + [True] * K, dtype=np.bool)

        np.random.shuffle(arr)
        #         print(np.sum(arr))
        
    else:
        sample_index = sample(indexes, ammount_genes_on)
        arr = np.array([False] * length)
        arr[sample_index] = True
#         print("created sample equals", np.where(arr==True)[:15])
        
    return arr

In [20]:
def create_population(indexes = []):
    
        
    population = []

    for i in range(amount_chromosomes):
        chromosome_new = create_chromosome(indexes=indexes)

        population.append(chromosome_new)

    history.append(population)
    return population

In [2]:
# def crossover_old(chrom0, chrom1):
#     crossover_start_time = datetime.now()
#     length_chrom = len(chrom0)
    

#     new_child = np.full(length_chrom, False, dtype=bool)
#     crossover_beginning_time = datetime.now() - crossover_start_time
    
#     loop_time_start = datetime.now()
#     for i in range(len(chrom0)):
#         if chrom0[i] == chrom1[i]:
#             new_child[i] = chrom0[i]
#         elif randint(0,1) == 0:
#             new_child[i] = chrom0[i]
#         else:
#             new_child[i] = chrom1[i]
    
#     loop_time = datetime.now() - loop_time_start
    
    
#     fixing_time_start = datetime.now()
# #     new_child = fix_child(new_child)
#     fixing_time = datetime.now() - fixing_time_start
    
#     print("Crossover: beginning: ", crossover_beginning_time, "Loop time: ", loop_time, "Fixing time: ", fixing_time)
    
#     return new_child

In [22]:
def crossover(chrom1, chrom2, genes_on=0):
    
    if genes_on == 0:
        genes_on = ammount_genes_on

    
    # where they are the same
    chrom_child = np.logical_and(chrom1, chrom2)
#     print(np.where(chrom_child))
    
    if np.sum(chrom_child) < genes_on:
        
        # potential genes
        list_potential_genes = np.where(chrom1 != chrom2)
        list_potential_genes = list(list_potential_genes[0])
    #     print(list_potential_genes)

        length_needed_fill_in = genes_on - np.sum(chrom_child)
#         print("length needed: ", length_needed_fill_in, "Genes_on_needed : ", genes_on, "on in child: ", np.sum(chrom_child))

        if length_needed_fill_in > 0:
            sample_to_change = sample(list_potential_genes, length_needed_fill_in)
    #         print(sample_to_change)

            chrom_child[sample_to_change] = True
    
    return chrom_child

In [47]:
def crossover_step(population):
    children = []
#     children.append(population[0])
    for i in range(amount_chromosomes):
        child = crossover(population[0], population[1])
        children.append(child)

    return children

In [25]:
def sort_population(population, population_rates):

    print("best chromosome in the population rate: ", max(population_rates))
    sorted_y_idx_list = sorted(range(len(population_rates)),key=lambda x:population_rates[x])
    sorted_y_idx_list.reverse()
    Xs = [population[i] for i in sorted_y_idx_list ]
    
    
    
    global chromosome_best
    chromosome_best = Xs[0]
    
    global r2_score_best
    r2_score_best = max(population_rates)  
    
#     print("top chromosome in the population: ", Xs[0])
    
    return Xs
    
#     sorted_population = [x for _,x in sorted(zip(population_rates, population))]
#     print(sorted_population)
# #     sorted_population.reverse()
# #     while population_rates != []:

# #         max_elem = max(population_rates)
# #         max_index = population_rates.index(max_elem)
# #         print("appending", max_index)
# #         sorted_population.append(population[max_index])
# #         print("deleting", max_index)
# #         del population_rates[max_index]
#     print(sorted_population)
#     return sorted_population
    



In [26]:
def mutate(chrom, mutation_amount=0):
    
    if mutation_amount == 0:
        mutation_amount = int(ammount_genes_on * gene_mutation_probability)
    
#     print(np.where(chrom))
#     print(np.where(chromosome_best))
    
    chrom_true = np.where(chrom==True)
    chrom_true = chrom_true[0].tolist()
    sample_true = sample(chrom_true, mutation_amount)

    chrom_false = np.where(chrom==False)
    chrom_false = chrom_false[0].tolist()
    sample_false = sample(chrom_false, mutation_amount)

    chrom[sample_true] = False
    chrom[sample_false] = True
    
    return chrom

In [3]:
# def mutate_old(chrom):
# #         print("were mutating something")
#     mutation_genes_amount = int(length_features * gene_mutation_probability)
# #         print(mutation_genes_amount)

#     N = length_features # 
#     K = mutation_genes_amount # zeros N-K ones

#     arr = np.array([False] * (N-K) + [True] * K, dtype=np.bool)

#     np.random.shuffle(arr)

#     indexes = np.where(arr == True)
# #         print(indexes)

#     changes = np.invert(chrom[indexes])

#     chrom[indexes] = changes

#     return chrom

In [28]:
def mutation_cycle(population):
    mutated_population = []
    
    mutation_count = 0
    
    for i in range(len(population)):
        
#         if i == 0:
#             mutated_population.append(population[i])
#             print("we don't mutate the best chromosome")
        
#         else:
        
        if random() < chromosome_mutation_rate:
#             print("a chromosome was selected to be mutated")
            mutated_chrom = mutate(population[i])            
            mutated_population.append(mutated_chrom)
            mutation_count += 1

        else:
            mutated_population.append(population[i])
    
    print("Number of chromosomes selected to be mutated this turn :", mutation_count)
    return mutated_population

In [29]:
def get_indexes(self, chrom=""):
    if chrom != "":
        indexes = np.where(chrom == True)
    else:
        indexes = np.where(population[0] == True)
    return indexes

In [30]:
from multiprocessing import Process, Lock
from multiprocessing.sharedctypes import Array


def multiprocess_rank(i, chromosome, sharedarray):
    
    indexes = np.where(chromosome == True)
#     print("indexes in the multi loop", indexes[0][:5])
#     print(indexes[:5])

    df = bag_train.loc[indexes]
#     print(df)

#         df = self.create_df(chromosome, data)
    model = fit_svm(df)
#     print(model)
    pred = model.predict(bag_valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1))
#     print("predictions: ", pred)
    r2 = R2(pred, bag_valid['winRate'])
#     print(r2)
#         sharedarray[i] = r2
    sharedarray[i] = r2

In [70]:
def main(list_chromosomes):
    lock = Lock()
    array = Array('d', len(list_chromosomes), lock=lock)
    if __name__ == '__main__':
        for i in range(len(list_chromosomes)):
            chromosome = list_chromosomes[i]
#             print(chromosome)
            p = Process(target=multiprocess_rank, args=(i, chromosome, array))
            p.start()
            p.join()
    return list(array)


def evaluate_population(list_chromosomes):
    list_multi = main(list_chromosomes)
    
#     print("self population rates in encapsulate evaluate", list_multi)
        
#     max_list = max(list_multi)
#     print("max list", max_list)

#     max_index = list_multi.index(max_list)
#         best_now = max(zip(self.population_rates.values(), self.population_rates.keys()))
#     print("max_index", max_index)
#     print(list_multi)
    return list_multi




In [32]:
def save_best_chromosome():
    print("saving best chromosome")
    
    header = "genesOn: " + str(ammount_genes_on), "score ", str(r2_score_best), "countedGenes:", str(get_count_on(chromosome_best)), "chromosomes", str(amount_chromosomes), "generations", str(number_generations), "__genes", "\n\n",  "\n\n\n"
    header = str(header)
    content = get_indexes(chromosome_best)
    print("length of the saved best chromosome: ", len(content))
    
    file_characteristics = "genesOn: ", str(ammount_genes_on),  "score ", str(r2_score_best), "countedGenes:", str(get_count_on(chromosome_best)), "__"
    file_characteristics = str(file_characteristics)
    
    random_number = str(randint(0, 1000))
    directory = "Saves_jan1/"
#     print(type(file_characteristics))
    filename = directory + file_characteristics + random_number + ".csv"
    
    content_series = pd.Series(content)
    index = content
#     np.savetxt("file.txt", content)
    content_series.to_csv(filename, header=False)

In [33]:
def save_population(population):
    print("saving population")
    file_characteristics = "Population_genesOn: ", str(ammount_genes_on),  "score ", str(r2_score_best), "countedGenes:", str(get_count_on(population_best)), "__"
    file_characteristics = str(file_characteristics)
    
    directory = "Saves/"
    filename_np = directory + file_characteristics
#     filename_pd = directory + file_characteristics + ".csv"
#     filename_pd_indexes = directory + file_characteristics + "index.csv"
    
    
    np.savetxt(filename_np, population)
    
    
#     df.to_csv(filename_pd)

#     indexes = np.where(population[0] == True)
#     df.to_csv(filename_pd)

In [34]:
def load_population(file):
    
    population = np.loadtxt(file)
    
    return population

In [35]:
def train_predict_old(deck):
    model = fit_svm(deck)
    pred = model.predict(bag_valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1))
    r2 = R2(pred, valid['winRate'])
    
    return r2

In [36]:
def train_predict(deck):
    start_time = datetime.now()
    model = fit_svm(deck)
    pred = model.predict(valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1))
    r2 = R2(pred, valid['winRate'])
    end_time = datetime.now()
    time_taken = end_time - start_time
    print("training and predicting model:", time_taken)
    return(r2)

In [37]:
def evaluate_alone_sp(chromosome):
    indexes = np.where(chromosome == True)
    df = bag_train.loc[indexes]
    model = fit_svm(df)
    pred = model.predict(bag_valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1))
    r2 = R2(pred, bag_valid['winRate'])
    print(r2)

In [38]:
def cycle(best_indexes = []):
    global generation_number
    generation_number = 0
    
    global run_number
    run_number = randint(0, 1000)
    
    total_start_time = datetime.now()
    generation_number = 0
    
    population = create_population(best_indexes)
    
    while generation_number < number_generations:
            
        population = gen_cycle(population)
        print("finished cycle")
        
#         print(get_indexes(chromosome_best))
        
    evaluation = evaluate_population(population)
    
    print("finished evaluating population")
    population = sort_population(population, evaluation)
    print(np.sum(population[0] == True))
    
    total_end_time = datetime.now()
    total_time_taken = total_end_time - total_start_time

    print('Total time: ', total_time_taken)
    
    return population

In [39]:
def email_details():

    server = smtplib.SMTP('smtp.gmail.com', 587)
    server.starttls()
    server.login("maxpythonuw@gmail.com", "")
    
    
    msg = "genesOn: ", ammount_genes_on, "score ", r2_score_best, "countedGenes:", get_count_on(chromosome_best), "chromosomes", amount_chromosomes, "generations", number_generations, "__genes", "\n\n",  "\n\n\n", get_indexes(chromosome_best)
    msg = str(msg)
    
    print(msg)
    server.sendmail("maxpythonuw@gmail.com", "m.jackl@student.uw.edu.pl", msg)
    server.quit()

In [40]:
def save_email():
    if rank_best > 0.19:
        save_best_chromosome()
        email_details()

In [41]:
def gen_cycle(population):
    cycle_start_time = datetime.now()
    
    evaluation = evaluate_population(population)
    evaluation_time = datetime.now() - cycle_start_time
    
    best_chromosome_to_list_best()
    
    sort_population_time_start = datetime.now()
    population = sort_population(population, evaluation)
    sort_population_time = datetime.now() - sort_population_time_start
    
    crossover_time_start = datetime.now()
    population = crossover_step(population)
    crossover_time = datetime.now() - crossover_time_start
    
    mutation_cycle_time_start = datetime.now()
    population = mutation_cycle(population)
    mutation_cycle_time = datetime.now() - mutation_cycle_time_start
    
    ending_start = datetime.now()
    history.append(population)
    
    global generation_number
    generation_number += 1
    ending_time = datetime.now() - ending_start

    cycle_end_time = datetime.now()
    time_taken = cycle_end_time - cycle_start_time
    
    best_chromosome_to_list_best()
    
    if r2_score_best > 0.18:
        save_best_chromosome()

    print("cycle: ", generation_number, time_taken, "Evaluation:", evaluation_time, "sortPop: ", sort_population_time, "Crossover: ", crossover_time, "mutationTime: ", mutation_cycle_time, "Ending: ", ending_time)
    
    return population

In [42]:
def get_indexes(chromosome):
    
    indexes = []
    list_ch = chromosome
    for i in range(len(list_ch)):
        if list_ch[i] == True:
            indexes.append(i)

    return indexes

In [43]:
def get_count_on(chromosome):
    # assumes it's sorted
#     chromosome = population[0]
    sum = np.sum(chromosome == True)
    return sum

In [44]:
def best_chromosome_to_list_best():
    global best_results
    global scores
#     if type(best_results) != pandas.core.frame.DataFrame:
#         best_results = pd.DataFrame()
    
    chromosome_indexes = np.where(chromosome_best==True)
    chromosome_indexes = chromosome_indexes[0]
    chromosome_indexes = chromosome_indexes.tolist()
    
    if chromosome_indexes != []:
        if r2_score_best > top_score:
            best_results = best_results.append(pd.Series({"Genes On":ammount_genes_on, "Score": r2_score_best, "Chromosome" : chromosome_indexes, "Run": run_number}), ignore_index=True)
        
        else:
            scores = scores.append(pd.Series({"Genes On":ammount_genes_on, "Score": r2_score_best, "Chromosome" : chromosome_indexes, "Run": run_number}), ignore_index=True)

In [45]:
def loan_numpy(file):
    load = np.loadtxt(file)
    load = load.astype(int)
    load = load.tolist()
    
    return load

In [69]:
best_results.sort_values("Score")

Unnamed: 0,Chromosome,Genes On,Run,Score
0,"[43, 83, 122, 186, 187, 354, 499, 522, 583, 69...",1500.0,883.0,0.160483
1,"[43, 83, 122, 186, 187, 354, 499, 522, 583, 69...",1500.0,883.0,0.160483
14,"[146, 248, 278, 431, 498, 508, 522, 621, 622, ...",1500.0,616.0,0.16118
13,"[146, 248, 278, 431, 498, 508, 522, 621, 622, ...",1500.0,616.0,0.16118
12,"[43, 83, 122, 186, 187, 354, 499, 522, 583, 69...",1500.0,883.0,0.161772
11,"[43, 83, 122, 186, 187, 354, 499, 522, 583, 69...",1500.0,883.0,0.161772
10,"[43, 83, 122, 186, 187, 354, 499, 522, 583, 69...",1500.0,883.0,0.161772
8,"[43, 83, 122, 186, 187, 354, 499, 522, 583, 69...",1500.0,883.0,0.161772
9,"[43, 83, 122, 186, 187, 354, 499, 522, 583, 69...",1500.0,883.0,0.161772
6,"[43, 83, 122, 186, 187, 354, 499, 522, 583, 69...",1500.0,883.0,0.161772


In [71]:
# genetic = cycle()

In [72]:
# jan1 = loan_numpy("Jan_dumps/random_ind.out")

In [97]:
# best_results["Chromosome"].index[1]

1

In [93]:
# best = best_results["Chromosome"]

In [107]:
# best.loc[1]

In [4]:
def dataframe_extract_all_sets(row_numbers, df):
    lists = []
    for row in row_numbers:
        list_ = dataframe_extract(row, df)
    lists.append(list_)
    return lists

In [None]:
def dataframe_extract(row_number, df_results):
    
    list = df_results["Chromosome"]
    list = list.loc[row_number]

    
    return list

In [106]:
# dataframe_extract(1, best_results)

In [None]:
best_chromosome_to_list_best()

In [None]:
train_predict(bag_train.loc[jan2])

In [None]:
# bag_train = bag_train.loc[jan2]

In [None]:
# population4 = cycle()

In [73]:
targets = [600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500]

In [74]:
for target in targets:
    ammount_genes_on = target
    
    cycle(jan1)

best chromosome in the population rate:  0.21578044370841054
Number of chromosomes selected to be mutated this turn : 9
saving best chromosome
length of the saved best chromosome:  600
cycle:  1 0:00:10.582233 Evaluation: 0:00:10.511472 sortPop:  0:00:00.000226 Crossover:  0:00:00.017401 mutationTime:  0:00:00.053071 Ending:  0:00:00.000006
finished cycle
best chromosome in the population rate:  0.21496419824595114
Number of chromosomes selected to be mutated this turn : 6
saving best chromosome
length of the saved best chromosome:  600
cycle:  2 0:00:10.838840 Evaluation: 0:00:10.792045 sortPop:  0:00:00.000669 Crossover:  0:00:00.013492 mutationTime:  0:00:00.027710 Ending:  0:00:00.000006
finished cycle
best chromosome in the population rate:  0.21941597613771735
Number of chromosomes selected to be mutated this turn : 7
saving best chromosome
length of the saved best chromosome:  600
cycle:  3 0:00:10.591285 Evaluation: 0:00:10.539236 sortPop:  0:00:00.000190 Crossover:  0:00:00.01

In [63]:
# list_df

In [54]:
list_df = genetic_algorithm

In [55]:
targets =  [600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500]

In [77]:

def create_random_sample():
    random_sample = []
    for target in targets:
        sample_chromosome = []
        is_target = list_df["Genes On"]==target
#         print(target)
        justTarget = list_df[is_target]
#         len(sample_chromosome = justTarget["Chromosome"].sample().tolist()[0])
        
        sample_chromosome = justTarget["Chromosome"].sample().tolist()[0]
        while len(sample_chromosome) != target:
            print(len(sample_chromosome))
            sample_chromosome = justTarget["Chromosome"].sample().tolist()[0]

        print(len(set(sample_chromosome)))
        print(len(set(sample_chromosome)))
        random_sample.append(sample_chromosome)
        
    return random_sample
    



In [78]:
def save_file_competition(sample, filename, directory="genetic/"):
    rand = randint(0,1000)
    filename = directory+filename+str(rand)+".txt"
    with open(filename, 'a') as f:
        for list_ in sample:
            ind_text = ','.join(list(map(str, list_)))
            text = ';'.join(['0.02', '1.0', str(1.0 / 90), ind_text])
            f.write(text + '\n')

In [79]:
def create_write_random():
    sample = create_random_sample()
    save_file_competition(sample, "genetic")

In [65]:
create_write_random()

NameError: name 'create_write_random' is not defined

In [161]:
def fit_svm(data):
    svr = SVR(kernel='rbf', gamma=1.0/90, C=1.0, epsilon=0.02, shrinking=False)
    svr.fit(data.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1), data['winRate'])
    return svr

In [162]:
def train_predict_old(deck):
    model = fit_svm(deck)
    pred = model.predict(bag_valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1))
    r2 = R2(pred, valid['winRate'])
    
    return r2

In [None]:
# {'C': [0.001, 0.01, 0.1, 1, 10, 100],
#           'gamma': [0.0001, 0.001, 0.01, 0.1],

In [None]:
def parameters_test(indexes):
    gamma = [1.0/90, 1.1/90, 1.2/90, 1.3/90, 1.4/90, 1.5/90, 2/90, 0.9/90, 0.8/90, 0.7/90, 0.5/90, 1.0/90]
    epsilon = [0, 0.01, 0.1, 0.02, 0.5, 1, 2, 4]
    C = npdata.tolist() #[0.1, 0.2, 0.3]

    for gam in gamma:
        for eps in epsilon:
            for c in C:
                
                r2 = train_varying_parameters(indexes, gam, c, eps)
                print(str(r2), "    Gamma: ", gam, "Eps: ", eps, "C ", c)

In [None]:
def train_varying_parameters(indexes, gamma=1.0/90, C=1.0, epsilon=0.02):    
    
    df = bag_train.loc[indexes]
    svr = SVR(kernel='rbf', gamma=gamma, C=C, epsilon=epsilon, shrinking=False)
    model = svr.fit(df.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1), df['winRate'])
    pred = model.predict(bag_valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1))
    r2 = R2(pred, valid['winRate'])
    
    return r2

In [None]:
train_varying_parameters(best_results["Chromosome"].loc[best_results["Score"].idxmax()])

In [None]:
parameters_test(best_results["Chromosome"].loc[best_results["Score"].idxmax()])

In [None]:
best_results.sort_values("Score", ascending=False).head()

In [None]:
best_results["Chromosome"].loc[best_results["Score"].idxmax()]

In [None]:
best_results.max(by="Score")

In [None]:
# Fit and predict on models of various training sizes

fit_list = list(map(lambda size: fit_svm(train.iloc[:size]), sizes))
pred_list = list(map(lambda fit: fit.predict(valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1)),
                     fit_list))

In [75]:
import pickle

In [None]:
dict_best10000 = pickle.load( open( "pickles/best10000.pkl", "rb" ) )

In [None]:
for genes_on, value in dict_best10000.items():
    ammount_genes_on = genes_on
    cycle(value)
    print(list_best_chromosomes_best100000)

In [76]:
with open('pickles/gen_from_jan1.pkl', 'wb') as output:
    pickle.dump(best_results, output, -1)

In [None]:
results_best

In [None]:
scores