In [22]:
#Extracts each daily archive of game logs, scans the individual log files it contains,
#and deletes the extracted files again once the archive has been processed
import logging
import os
import pdb
import tarfile
from bs4 import BeautifulSoup
import re
import shutil
import time
import math

In [23]:
#directories for the run: compressed daily logs are read from dir_name, and
#output_dir is the matching location for processed output
year_dir_name = '2012-all/'
dir_name = f'DominionPlayerLogs/{year_dir_name}'
output_dir = f'ProcessedLogs/{year_dir_name}'

In [25]:
#report how many entries the input directory contains
print(f"No of directories to loop through: {len(os.listdir(dir_name))}")

No of directories to loop through: 366


In [27]:
#find all games with a specific supply card set and specified number of players
no_required_players = 2
tgt_supply = ['Cellar','Market','Militia','Mine','Moat','Remodel','Smithy','Village','Woodcutter','Workshop']

#store filenames of logs with the correct supply card set
filename_list = []

#counters: a log counts as a failure when it has no <pre> block or the game was
#aborted/resigned, and as a success when it could be parsed as a completed game
failures = 0
successes = 0
directories_processed = 0

total_start_time = time.time()

for tar_day_file in sorted(os.listdir(dir_name)):
    #only the compressed daily archives are processed, anything else is skipped
    if not tar_day_file.endswith('tar.bz2'):
        continue

    #time how long it takes to process directory
    start_time = time.time()

    #extract tar file to a directory named after the archive (minus '.tar.bz2')
    logging.debug("Extracting " + dir_name + tar_day_file)
    extract_path = dir_name + tar_day_file[:-8]
    with tarfile.open(dir_name + tar_day_file, 'r') as tar:
        tar.extractall(extract_path)

    try:
        #then loop through individual games
        for game_log in os.listdir(extract_path):
            #read and parse html; the with-block closes the handle straight away so
            #that thousands of logs per archive do not accumulate open files
            filename = extract_path + '/' + game_log
            with open(filename, mode = 'r') as log_file:
                soup = BeautifulSoup(log_file, 'html.parser')

            #the game transcript lives inside the <pre> tag
            pre_tag = soup.find('pre')

            #check that the file isnt empty
            if pre_tag is None:
                failures += 1
                continue

            #check that game wasn't aborted
            if check_aborted_game(pre_tag.contents):
                failures += 1
                continue

            successes += 1

            #number of players is taken as the number of <b> tags minus one
            no_players = len(soup.find_all('b')) - 1

            #for now we focus only on games with a set number of players
            if no_players != no_required_players:
                continue

            #get cards in supply and keep the log if it matches the target set exactly
            supply_cards = get_supply_cards_with_no_of_players(no_players, pre_tag.contents)
            if supply_cards == tgt_supply:
                filename_list.append(filename)
                print(filename)

        elapsed_time = time.time() - start_time
    finally:
        #delete extracted directory even when the run is interrupted or a log fails
        #to parse, so no stale extracted directory is left inside dir_name
        shutil.rmtree(extract_path)

    #output timings
    print("Processed directory " + str(tar_day_file) + f" in {elapsed_time/60} minutes")
    directories_processed +=1
    print("Number of directories processed " + str(directories_processed))

print("Directories processed: " + str(directories_processed))
print("Successes: " + str(successes))
print("Failures: " + str(failures))  

#write filename list to file
output_filename = 'files_with_tgt_supply_' + str(no_required_players) + 'players.txt'
with open(output_filename, 'w') as file:
    file.write(','.join(filename_list) + '\n')

# Calculate the elapsed time
total_end_time = time.time()
elapsed_time = total_end_time - total_start_time

print(f"Total elapsed time: {elapsed_time/60} minutes")


Processed directory 20120101.tar.bz2 in 10.967873966693878 minutes
Number of directories processed 1
Processed directory 20120102.tar.bz2 in 12.112566065788268 minutes
Number of directories processed 2
Processed directory 20120103.tar.bz2 in 10.988518667221069 minutes
Number of directories processed 3
Processed directory 20120104.tar.bz2 in 13.308150831858317 minutes
Number of directories processed 4
Processed directory 20120105.tar.bz2 in 13.222851884365081 minutes
Number of directories processed 5
Processed directory 20120106.tar.bz2 in 12.508947384357452 minutes
Number of directories processed 6
Processed directory 20120107.tar.bz2 in 12.1419912815094 minutes
Number of directories processed 7
Processed directory 20120108.tar.bz2 in 12.967961315313975 minutes
Number of directories processed 8
Processed directory 20120109.tar.bz2 in 13.867734066645305 minutes
Number of directories processed 9
Processed directory 20120110.tar.bz2 in 13.73196804523468 minutes
Number of directories proce

KeyboardInterrupt: 

In [26]:
#function to determine number of players
def get_players(html_soup):
    """Return the player names found in the <b> tags of a parsed game log.

    Two layouts are handled:
      1. '#n <name>' (optionally followed by ':'), where n is a positive integer.
      2. '<name>: x points' with no '#n ' prefix; this is only tried when the
         first layout produced no names at all.

    Parameters:
        html_soup: object exposing find_all('b'), returning tags with a .text
            attribute (and .previous_sibling for the second layout).

    Returns:
        list of player-name strings in the order the tags appear.
    """
    player_list = []
    list_b_tags = html_soup.find_all('b')
    for tag in list_b_tags:
        s = tag.text
        #extract name (assume string of the form '#n ' comes before the name)
        match = re.search(r'#\d+ ', s)
        if match is not None:
            #we need to find the right most ':' (some names may contain a ':')
            colon_index = s.rfind(':')
            if colon_index == -1:
                player = s[match.end():] #sometimes the colon is in the next sibling
            else:
                player = s[match.end():colon_index]
            #record the name in both cases, not only when a colon was found
            player_list.append(player)

    #deal with second case where name is not preceded by a '#n ', in this case we need to look for a ': n points'
    if len(player_list) == 0:
        for tag in list_b_tags:
            s = tag.text
            match = re.search(r': (-?\d+) point', s) #we drop the s at the end of points to deal with the case where someone scores 1
            if match is not None:
                #in this case the name is contained in the text preceding the ':'
                colon_position = match.start()
                if colon_position == 0:
                    #sometimes the name is in the previous node, so read it from the tag's sibling
                    player = tag.previous_sibling.text
                else:
                    player = s[:colon_position]
                player_list.append(player)

    return player_list
    

#function to log and catalogue a supply card set 
def log_supply_set(cards, filename, supply_cards_key, supply_cards_counter, supply_cards_filenames):
    """Record one occurrence of a supply card set in the three catalogue dicts.

    The dicts share integer indices: supply_cards_key maps index -> card list,
    supply_cards_counter maps index -> number of times seen, and
    supply_cards_filenames maps index -> list of filenames. All three are
    updated in place. Always returns None.

    NOTE: the card list is compared as-is (order-sensitive). This relies on the
    logs listing cards in a consistent order; sorting was dropped to keep the
    run time reasonable.
    """
    card_set = cards

    #known set: bump its counter, remember the file, and stop
    for seen_index, seen_cards in supply_cards_key.items():
        if card_set == seen_cards:
            supply_cards_counter[seen_index] += 1
            supply_cards_filenames[seen_index].append(filename)
            return None

    #unseen set: allocate the next index (1-based count of unique sets so far)
    new_index = len(supply_cards_key) + 1
    supply_cards_key[new_index] = card_set
    supply_cards_counter[new_index] = 1
    supply_cards_filenames[new_index] = [filename]
    return None

def check_aborted_game(html_contents):
    """Return True if any node's text mentions 'game aborted' or 'resigned'.

    html_contents is an iterable of nodes exposing a .text attribute.
    Returns False when no node matches (including for an empty iterable).
    """
    markers = ('game aborted', 'resigned')
    return any(
        marker in node.text
        for node in html_contents
        for marker in markers
    )

#following function parses a single line which consists of number of cards (unless equal to one) and card types
#each separated by a colon and ended with a full stop or a horizontal dashed line. It returns a list of cards 
#with card names duplicated according to the number of them in the row. Also the index of the full stop or 
#dashed line will be returned
def parse_row_into_cards(html_contents):
    """Parse one row of a game log into a list of card names.

    Parameters:
        html_contents: sequence of nodes starting at the row to parse. Text
            nodes are string-like with name None and a .next_sibling; the node
            following a text node carries the card name in its .text.

    Returns:
        (index, card_list): index is the position within html_contents of the
        node that ends the row (contains '.' or the dashed rule). card_list
        holds each card name repeated by the count found in the last two
        characters of the preceding text node (1 when they are not a number).
        If no terminator is found, index is len(html_contents) so callers that
        unpack the result still get a tuple instead of None.
    """
    row_end_marker = '----------------------'
    card_list = []
    for (k, r) in enumerate(html_contents):
        has_stop = '.' in r
        has_rule = row_end_marker in r
        #Need to be careful that the first content item doesn't contain a '....', causing 
        #the code to pickup a full stop, hence the k == 0 exemption
        if (not has_stop or k == 0) and r.name is None and not has_rule:
            #the count of cards, when present, is in the last two characters of the text
            try:
                no_cards = int(r[-2:])
            except ValueError:
                no_cards = 1
            #take next entry along which should be the card name
            sibling = r.next_sibling
            if sibling is None:
                #trailing text with nothing after it cannot name a card
                continue
            #and add that many copies of the card to the list
            card_list.extend([sibling.text] * no_cards)
        elif (has_stop or has_rule) and k != 0:
            return (k, card_list)

    #no terminator found: report the end of the supplied contents
    return (len(html_contents), card_list)

#function to get players and scores
def players_and_scores(html_soup):
    """Return a dict mapping each player name to their final score.

    Two layouts are handled:
      1. '#n <name>' in a <b> tag, with the score in the tag's next sibling,
         which is assumed to start with ': ' followed by the score.
      2. '<name>: x points' inside the <b> tag itself (no '#n ' prefix); only
         tried when the first layout produced nothing.

    Parameters:
        html_soup: object exposing find_all('b'), returning tags with .text,
            .next_sibling and .previous_sibling.

    Returns:
        dict of {player name: int score}.

    Raises:
        AttributeError: in the first layout, if no number is found where the
            score is expected in the next sibling.
    """
    player_scores = {}
    list_b_tags = html_soup.find_all('b')
    for tag in list_b_tags:
        s = tag.text
        #extract name (assume string of the form '#n ' comes before the name)
        match = re.search(r'#\d+ ', s)
        if match is not None:
            #we need to find the right most ':' (some names may contain a ':')
            colon_index = s.rfind(':')
            if colon_index == -1:
                player = s[match.end():] #sometimes the colon is in the next sibling
            else:
                player = s[match.end():colon_index]
            #next sibling contains the score, assumed to follow a leading ': ';
            #only the three characters after that prefix are inspected
            score_text = tag.next_sibling
            score = int(re.search(r'-?\d+', score_text[2:5]).group())
            player_scores[player] = score

    #deal with second case where name is not preceded by a '#n ', in this case we need to look for a ': n points'
    if len(player_scores) == 0:
        for tag in list_b_tags:
            s = tag.text
            match = re.search(r': (-?\d+) point', s) #we drop the s at the end of points to deal with the case where someone scores 1
            if match is not None:
                #in this case the name is contained in the text preceding the ':'
                colon_position = match.start()
                if colon_position == 0:
                    #sometimes the name is in the previous node, so read it from the tag's sibling
                    player = tag.previous_sibling.text
                else:
                    player = s[:colon_position]
                #the regex already captured the full score, so use it directly rather
                #than re-parsing a fixed-width slice (which truncated e.g. '-12' to '-1')
                score = int(match.group(1))
                player_scores[player] = score

    return player_scores

#extract number of turns for this game for each player
def turns_for_each_player(player_list, html_contents):
    """Map each player to the number of turns they took.

    Scans html_contents (nodes exposing .text) for the word 'turns' and reads
    the number from the three characters immediately before it. Scanning stops
    once as many counts as there are players have been collected. Counts are
    paired with player_list by position, i.e. turn counts are assumed to
    appear in the same order as the player names.

    Returns:
        dict of {player name: int turns}.
    """
    expected = len(player_list)
    turn_counts = []
    for node in html_contents:
        text = node.text
        if 'turns' not in text:
            continue
        position = text.index('turns')
        #number of turns is assumed to sit in the 3 characters before 'turns'
        digits = re.search(r'\d+', text[position-3:position])
        turn_counts.append(int(digits.group()))
        if len(turn_counts) == expected:
            break

    #turns are in same order as player names in html file
    return {name: turn_counts[slot] for slot, name in enumerate(player_list)}

def get_supply_cards_with_no_of_players(no_of_players, html_contents):
    """Return the supply cards for a game, given only the number of players.

    Two layouts are handled. If the log contains 'chosen cards are', the
    players picked the supply and each may veto one card: the chosen row is
    parsed, then up to no_of_players 'vetoes' entries after it are collected
    and their cards removed. Otherwise the supply is read from the row that
    contains 'cards in supply'.

    Parameters:
        no_of_players: number of veto entries to look for.
        html_contents: sequence of log nodes (string-like text nodes and tags).

    Returns:
        list of card names; an empty list when neither marker is present
        (previously this raised UnboundLocalError).
    """
    #default so that logs with neither marker yield an empty supply
    supply_cards = []

    #start with the case where players can veto
    vetoes_used = False
    index_end = 0
    for (i, s) in enumerate(html_contents):
        if 'chosen cards are' in s:
            #loop over chosen supply cards
            (index, cards) = parse_row_into_cards(html_contents[i:])
            index_end = i + index
            supply_cards = cards
            vetoes_used = True
            break

    if vetoes_used:
        #next each player can veto a card; the vetoed card is in the node after the text
        found_player_veto = 0
        vetoed_cards = []
        for s in html_contents[index_end:]:
            if 'vetoes' in s:
                vetoed_cards.append(s.next_sibling.text)
                found_player_veto += 1
                if found_player_veto == no_of_players:
                    break
        #remove vetoed cards from list of supply cards
        supply_cards = [card for card in supply_cards if card not in vetoed_cards]
    else:
        #if this didnt occur move onto the second case
        for (i, s) in enumerate(html_contents):
            if 'cards in supply' in s:
                (index, cards) = parse_row_into_cards(html_contents[i:])
                supply_cards = cards
                break

    return supply_cards


def get_supply_cards(players_list, html_contents):
    """Return the supply cards for a game, given the list of player names.

    Two layouts are handled. If the log contains 'chosen cards are', the
    players picked the supply and each may veto one card: the chosen row is
    parsed, then for every player the first '<player> vetoes' entry after it
    is located and that card removed. Otherwise the supply is read from the
    row that contains 'cards in supply'.

    Parameters:
        players_list: player names used to build the '<player> vetoes' search text.
        html_contents: sequence of log nodes (string-like text nodes and tags).

    Returns:
        list of card names; an empty list when neither marker is present
        (previously this raised UnboundLocalError).
    """
    #default so that logs with neither marker yield an empty supply
    supply_cards = []

    #start with the case where players can veto
    vetoes_used = False
    index_end = 0
    for (i, s) in enumerate(html_contents):
        if 'chosen cards are' in s:
            #loop over chosen supply cards
            (index, cards) = parse_row_into_cards(html_contents[i:])
            index_end = i + index
            supply_cards = cards
            vetoes_used = True
            break

    if vetoes_used:
        #next each player can veto a card; the vetoed card is in the node after the text
        vetoed_cards = []
        for player in players_list:
            check_string = player + ' vetoes'
            for s in html_contents[index_end:]:
                if check_string in s:
                    vetoed_cards.append(s.next_sibling.text)
                    break
        #remove vetoed cards from list of supply cards
        supply_cards = [card for card in supply_cards if card not in vetoed_cards]
    else:
        #if this didnt occur move onto the second case
        for (i, s) in enumerate(html_contents):
            if 'cards in supply' in s:
                (index, cards) = parse_row_into_cards(html_contents[i:])
                supply_cards = cards
                break

    return supply_cards

def process_player_actions(player_list, turns_by_player, html_contents):
    """Collect, per player and per turn, the cards bought, trashed and gained.

    Parameters:
        player_list: player names.
        turns_by_player: dict of {player name: number of turns}.
        html_contents: sequence of log nodes; each must support substring tests
            with `in` and expose .text.

    Returns:
        dict of {player: {turn: {command: [card, ...]}}} where command is one of
        'buys', 'trashing', 'gaining' (the player's own actions in their turn)
        and 'gains', 'trashes' (entries of the form '<name> gains' /
        '<name> trashes' seen during a turn). Card lists come from
        parse_row_into_cards and are flattened into one list per command.

    NOTE(review): 'gains'/'trashes' for every player are written under the
    *current* player's turn number. This raises KeyError when another player
    has fewer turns than that number, and a later player's pass overwrites
    what an earlier pass stored for the same turn - confirm intended semantics.
    """
    cmd_types = ('buys', 'trashing', 'gaining', 'gains', 'trashes')

    #use a dictionary of dictionaries to track gained cards by turn by player, and initialise keys
    incremental_cards_by_turn = {}
    for player in player_list:
        incremental_cards_by_turn[player] = {}
        for turn in range(1, turns_by_player[player]+1):
            incremental_cards_by_turn[player][turn] = {cmd: [] for cmd in cmd_types}

    #string to check if current player is trashing a card
    check_trashing = 'trashing'
    #string to check if current player is gaining a card
    check_gaining = 'gaining'
    #strings to check if a player trashes a card out of turn (the current player can
    #both 'trashes' and 'trashing'; it is also possible for a player to trash nothing)
    check_player_trashes = [x + ' trashes' for x in player_list]
    #strings to check if a player gains a card (gains is used rather than
    #gaining when a player gets a card out of turn)
    check_player_gains = [x + ' gains' for x in player_list]

    for player in player_list:
        turn_counter = 1
        #string to check for a buy action
        check_buy = player + ' buys'
        #string to check end of turn
        check_turn_end = '(' + player + ' draws:'
        for (i, p) in enumerate(html_contents):
            check_turn = player + '\'s' + ' turn ' + str(turn_counter)
            if check_turn in p:
                buy_card_list = []
                trashing_card_list = []
                gaining_card_list = []
                #dictionaries, as multiple players may gain/trash in another player's round
                gains_card_list = {player_: [] for player_ in player_list}
                trashes_card_list = {player_: [] for player_ in player_list}
                for (j, q) in enumerate(html_contents[i:]):
                    player_gains_list = [x in q for x in check_player_gains]
                    player_trashes_list = [x in q for x in check_player_trashes]
                    if check_buy in q:
                        (index, cards) = parse_row_into_cards(html_contents[i + j:])
                        buy_card_list.append(cards)
                    elif check_trashing in q:
                        #nothing to parse; skip this line but keep scanning the turn
                        if 'trashing nothing' in q:
                            continue
                        (index, cards) = parse_row_into_cards(html_contents[i + j:])
                        trashing_card_list.append(cards)
                    elif check_gaining in q:
                        #also we need to check for a 'gaining nothing' case
                        if 'gaining nothing' in q:
                            continue
                        (index, cards) = parse_row_into_cards(html_contents[i + j:])
                        gaining_card_list.append(cards)
                    elif any(player_gains_list):
                        #next we check if an opponent gains a card, e.g. a curse card
                        #need to loop through opponents
                        for (index, player_gains_check) in enumerate(player_gains_list):
                            if player_gains_check:
                                player_ = player_list[index]
                                (index_, cards) = parse_row_into_cards(html_contents[i + j:])
                                gains_card_list[player_].append(cards)
                    elif any(player_trashes_list):
                        #next we check if an opponent trashes a card
                        for (index, player_trashes_check) in enumerate(player_trashes_list):
                            if player_trashes_check:
                                player_ = player_list[index]
                                (index_, cards) = parse_row_into_cards(html_contents[i + j:])
                                trashes_card_list[player_].append(cards)
                    elif check_turn_end in q.text:
                        #end of turn: store everything collected and move to the next turn
                        incremental_cards_by_turn[player][turn_counter]['buys'] = buy_card_list
                        incremental_cards_by_turn[player][turn_counter]['trashing'] = trashing_card_list
                        incremental_cards_by_turn[player][turn_counter]['gaining'] = gaining_card_list
                        for player_ in gains_card_list.keys():
                            incremental_cards_by_turn[player_][turn_counter]['gains'] = gains_card_list[player_]
                        for player_ in trashes_card_list.keys():
                            incremental_cards_by_turn[player_][turn_counter]['trashes'] = trashes_card_list[player_]
                        turn_counter += 1
                        break

    #finally we flatten incremental_cards_by_turn into a dictionary of lists
    for player in player_list:
        for turns in range(1, turns_by_player[player]+1):
            for cmd_type in incremental_cards_by_turn[player][turns].keys():
                flattened_list = [item for sublist in incremental_cards_by_turn[player][turns][cmd_type] for item in sublist]
                incremental_cards_by_turn[player][turns][cmd_type] = flattened_list

    return incremental_cards_by_turn