In [1]:
#extracts each daily tar archive of player logs, processes every individual log file it contains with the
#parsing functions defined in this notebook, then deletes the extracted directory again
import logging
import os
import pdb
import tarfile
from bs4 import BeautifulSoup
import re
import shutil
import time
import math

In [2]:
#directory of compressed files to process and the matching output directory
#both keep their trailing '/' because later cells build paths by plain string concatenation
year_dir_name = '2010-all/'
dir_name = ''.join(['DominionPlayerLogs/', year_dir_name])
output_dir = ''.join(['ProcessedLogs/', year_dir_name])
#game log names listed here are skipped by the processing loop
files_to_ignore = list()

In [3]:
#report how many entries the archive directory holds before the long-running pass starts
print("No of directories to loop through: ", len(os.listdir(dir_name)), sep="")

No of directories to loop through: 83


In [6]:
#first we loop through all the files and identify the most commonly used supply card sets for two players
#we store the file names for these for processing into play traces in our second step
#NOTE: this cell calls check_aborted_game, players_and_scores, get_supply_cards and log_supply_set, so the
#cell that defines those functions has to be run before this one

#counter for number of files that could and couldn't be processed
failures = 0
successes = 0
directories_processed = 0

#we only pick out games with a certain number of players
no_required_players = 2

#we need to group supply card sets as we pass through the logs
supply_cards_key = {} # first we have an integer to list map, labelling each supply card set which has the required
#number of players with an integer.
supply_cards_counter = {} # next we store the count of how many times each supply set is observed with the
#required number of players (using the supply_cards_key)
supply_cards_filenames = {} # this stores the filenames associated with logs that have the required number of
#players (using the supply_cards_key)

#temporary variable
processed_files = []

#set-up log
logger = logging.getLogger()
#only attach the file handler once: re-running this cell would otherwise stack a second handler on the root
#logger and every message would be written to processor.log once per run of the cell
log_path = os.path.abspath('processor.log')
if not any(isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path
           for handler in logger.handlers):
    fhandler = logging.FileHandler(filename='processor.log', mode='a')
    logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

total_start_time = time.time()

#process individual log files
logging.debug("Starting parsing of player logs.....")
for tar_day_file in os.listdir(dir_name):
    #time how long it takes to process directory
    start_time = time.time()

    #only the daily 'tar.bz2' archives are handled, any other directory entry is skipped
    if not tar_day_file.endswith('tar.bz2'):
        continue

    #extract tar file to a directory named after the archive minus its '.tar.bz2' suffix
    logging.debug("Extracting " + dir_name  + tar_day_file)
    extract_path = dir_name + tar_day_file[:-8]
    with tarfile.open(dir_name + tar_day_file, 'r') as tar:
        tar.extractall(extract_path)

    #then loop through individual games
    for game_log in os.listdir(extract_path):
        if game_log in files_to_ignore:
            continue

        #output file to be processed in log file
        filename = extract_path + '/' + game_log
        logging.debug("Processing file: " + filename)

        #store processed file name in a list, just helps with debugging
        processed_files.append(filename)

        #read and parse html; the context manager closes the handle straight away, which matters because
        #this loop opens every game log of every archive
        with open(filename, mode = 'r') as file:
            soup = BeautifulSoup(file, 'html.parser')

        #convert html contents into a list of tabs, navigable strings etc
        pre_tag = soup.find('pre')

        #check that the file isnt empty
        if pre_tag is None:
            logging.debug(game_log + ' was empty.')
            failures += 1
            continue

        #check that game wasn't aborted
        aborted = check_aborted_game(pre_tag.contents)
        if aborted:
            logging.debug(game_log + ' was aborted.')
            failures += 1
            continue

        #get player names and scores
        player_scores = players_and_scores(soup)
        player_list = list(player_scores.keys())

        #track successes
        successes += 1

        #for now we focus only on games with a set number of players
        if len(player_list) == no_required_players:
            #get cards in supply
            supply_cards = get_supply_cards(player_list, pre_tag.contents)

            #log this instance of the supply card set, including the filename
            log_supply_set(supply_cards, filename, supply_cards_key,
                           supply_cards_counter,
                           supply_cards_filenames)

    #delete extracted directory and output timings
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Processed directory " + str(tar_day_file) + f" in {elapsed_time/60} minutes")
    shutil.rmtree(extract_path)
    directories_processed += 1
    print("Number of directories processed " + str(directories_processed))

print("Directories processed: " + str(directories_processed))
print("Successes: " + str(successes))
print("Failures: " + str(failures))
print("No unique kingdom card sets: " + str(len(supply_cards_key)))

# Calculate the elapsed time
total_end_time = time.time()
elapsed_time = total_end_time - total_start_time

print(f"Total elapsed time: {elapsed_time/60} minutes")

Processed directory 20101212.tar.bz2 in 2.06843288342158 minutes
Processed directory 20101202.tar.bz2 in 1.931572178999583 minutes
Processed directory 20101116.tar.bz2 in 1.736232582728068 minutes
Processed directory 20101106.tar.bz2 in 1.1436418175697327 minutes
Processed directory 20101023.tar.bz2 in 0.8531520167986552 minutes
Processed directory 20101129.tar.bz2 in 1.918907352288564 minutes
Processed directory 20101015.tar.bz2 in 0.7130263010660808 minutes
Processed directory 20101130.tar.bz2 in 2.0522701501846314 minutes
Processed directory 20101120.tar.bz2 in 1.6816781361897786 minutes
Processed directory 20101224.tar.bz2 in 2.207562534014384 minutes
Processed directory 20101014.tar.bz2 in 0.6370308001836141 minutes
Processed directory 20101121.tar.bz2 in 1.8350844820340475 minutes
Processed directory 20101225.tar.bz2 in 1.9411231319109599 minutes
Processed directory 20101203.tar.bz2 in 2.1121778964996336 minutes
Processed directory 20101213.tar.bz2 in 2.80079532066981 minutes
Pro

In [19]:
#keep only the supply card sets that were seen at least lower_threshold times (for the required number of
#players) and write, for each of them, the card set, its count and the contributing filenames to a text file
lower_threshold = 20
popular_supply_cards = [
    (set_index, supply_cards_counter[set_index])
    for set_index in supply_cards_key
    if supply_cards_counter[set_index] >= lower_threshold
]

output_filename = 'useful_log_files_' + str(no_required_players) + 'players.txt'
with open(output_filename, 'w') as file:
    #three lines per supply card set: cards, count, filenames
    for (index, count) in popular_supply_cards:
        cards_line = 'Supply cards: ' + ','.join(supply_cards_key[index]) + '\n'
        file.write(cards_line)
        count_line = 'Count: ' + str(count) + '\n'
        file.write(count_line)
        filenames_line = 'Filenames: ' + ','.join(supply_cards_filenames[index]) + '\n'
        file.write(filenames_line)

print(popular_supply_cards)

[(457, 43), (35730, 28)]


In [22]:
#read in data from a results file in the format written by the previous cell: every supply card set is
#stored as three lines, 'Supply cards: ...', 'Count: ...' and 'Filenames: ...'
#NOTE(review): the previous cell writes 'useful_log_files_2players.txt' in the working directory, so the file
#is presumably moved/renamed to the path below by hand - confirm
filename = 'ResultsFiles/2010/useful_log_files_2players_2010.txt'
supply_cards_from_file_key = {}
supply_cards_from_file_counter = {}
supply_cards_from_file_filenames = {}
#the context manager closes the file as soon as its lines have been read
with open(filename, 'r') as file:
    contents = file.readlines()

index = 0
found_supply_cards = False
found_count = False
found_files = False
for entry in contents:
    #split on the FIRST ': ' so that a ':' inside a filename or card name cannot shift the value, and strip
    #only the newline so a final line without one does not lose its last character
    label, separator, value = entry.rstrip('\n').partition(': ')
    if label == 'Supply cards':
        supply_cards_from_file_key[index] = value.split(',')
        found_supply_cards = True
    elif label == 'Count':
        count = int(re.search(r'\d+', value).group())
        supply_cards_from_file_counter[index] = count
        found_count = True
    elif label == 'Filenames':
        supply_cards_from_file_filenames[index] = value.split(',')
        found_files = True
    #once all three lines of a set have been read, move on to the next index
    if (found_supply_cards and found_count and found_files):
        index += 1
        found_supply_cards = False
        found_count = False
        found_files = False

print(supply_cards_from_file_key)
print(supply_cards_from_file_counter)
print(supply_cards_from_file_filenames)

{0: ['Cellar', 'Market', 'Militia', 'Mine', 'Moat', 'Remodel', 'Smithy', 'Village', 'Woodcutter', 'Workshop'], 1: ['Apprentice', 'Bishop', 'Contraband', 'Expand', 'Golem', 'Hoard', "Philosopher's Stone", 'Potion', 'Rabble', 'Smugglers', 'Warehouse']}
{0: 43, 1: 28}
{0: ['DominionPlayerLogs/2010-all/20101212/game-20101212-191619-b5ccefe1.html', 'DominionPlayerLogs/2010-all/20101225/game-20101225-041818-14ec2ba0.html', 'DominionPlayerLogs/2010-all/20101203/game-20101203-155543-2fdd2a42.html', 'DominionPlayerLogs/2010-all/20101213/game-20101213-213952-c80a34bd.html', 'DominionPlayerLogs/2010-all/20101213/game-20101213-160920-b720ceb1.html', 'DominionPlayerLogs/2010-all/20101107/game-20101107-154349-c5cd288d.html', 'DominionPlayerLogs/2010-all/20101107/game-20101107-160205-fafc6b28.html', 'DominionPlayerLogs/2010-all/20101107/game-20101107-153141-36a72944.html', 'DominionPlayerLogs/2010-all/20101022/game-20101022-182755-1cac9b62.html', 'DominionPlayerLogs/2010-all/20101022/game-20101022-18

In [None]:
#testing on a single file
#filename = processed_files[len(processed_files)-1]
filename = 'DominionPlayerLogs/2010-all/20101011/game-20101011-220812-c650e1b4.html'
#NOTE(review): these two dictionaries are created but never used below - the log_supply_set call further down
#writes into the shared supply_cards_* accumulators instead, so running this cell adds the test file to the
#real counts. Confirm whether the scratch dictionaries were meant to be passed.
observed_supply_cards_key = {}
observed_supply_cards_counter = {}

#read and parse html (the context manager closes the file once it has been parsed)
with open(filename, mode = 'r') as file:
    soup = BeautifulSoup(file, 'html.parser')

#convert html contents into a list of tabs, navigable strings etc
pre_tag = soup.find('pre')

#check that the file isnt empty
if pre_tag is not None:
    #first check that game wasn't aborted (the result is only stored, it does not stop the test)
    aborted = check_aborted_game(pre_tag.contents)

    #get player names and scores
    player_scores = players_and_scores(soup)
    print(player_scores)

    #get turns by player
    player_list = list(player_scores.keys())
    turns_by_player = turns_for_each_player(player_list, pre_tag.contents)
    print(turns_by_player)

    #get cards in supply
    supply_cards = get_supply_cards(player_list, pre_tag.contents)

    #log supply card set
    log_supply_set(supply_cards, filename, supply_cards_key,
                   supply_cards_counter,
                   supply_cards_filenames)

    print(supply_cards)
    print(supply_cards_key)
    print(supply_cards_counter)
    print(supply_cards_filenames)

In [5]:
#function to log and catalogue a supply card set 
def log_supply_set(cards, filename, supply_cards_key, supply_cards_counter, supply_cards_filenames):
    """Record one observation of a supply card set in the three bookkeeping dictionaries.

    cards                  -- iterable of card names; compared in sorted order
    filename               -- log file in which the set was observed
    supply_cards_key       -- dict: integer label -> sorted list of card names
    supply_cards_counter   -- dict: integer label -> number of observations
    supply_cards_filenames -- dict: integer label -> list of filenames

    All three dictionaries are updated in place. Always returns None.
    """
    canonical_set = list(cards)
    canonical_set.sort()

    #a set seen before only needs its counter bumped and the filename appended
    for label, known_set in supply_cards_key.items():
        if canonical_set == known_set:
            supply_cards_counter[label] = supply_cards_counter[label] + 1
            supply_cards_filenames[label].append(filename)
            return None

    #otherwise register it under the next free label (labels start at 1)
    new_label = len(supply_cards_key) + 1
    supply_cards_key[new_label] = canonical_set
    supply_cards_counter[new_label] = 1
    supply_cards_filenames[new_label] = [filename]
    return None

def check_aborted_game(html_contents):
    """Return True if the text of any item contains 'game aborted' or 'resigned', otherwise False.

    html_contents -- iterable of items exposing a .text attribute
    """
    abort_markers = ('game aborted', 'resigned')
    return any(marker in item.text for item in html_contents for marker in abort_markers)

#following function parses a single line which consists of number of cards (unless equal to one) and card types
#each separated by a colon and ended with a full stop or a horizontal dashed line. It returns a list of cards 
#with card names duplicated according to the number of them in the row. Also the index of the full stop or 
#dashed line will be returned
def parse_row_into_cards(html_contents):
    """Parse one row of log items into a list of card names.

    Every text item (an item whose .name is None) is taken as the lead-in to a card: if its last two
    characters form an integer that is the number of copies, otherwise one copy is assumed. The card name is
    the .text of the item's next sibling. Parsing stops at the first item after position 0 that contains a
    '.' or a horizontal dashed line.

    html_contents -- sequence of parsed items (text items and tags)

    Returns a tuple (k, card_list): k is the index of the terminating item and card_list holds each card name
    repeated according to its count. If no terminator is found, k is the number of items that were scanned,
    so callers can always unpack the result.
    """
    row_separator = '----------------------'
    card_list = []
    items_scanned = 0
    for (k, r) in enumerate(html_contents):
        items_scanned = k + 1
        #Need to be careful that the first content item doesn't contain a '....', causing
        #the code to pickup a full stop
        if (('.' not in r) or (k == 0)) and r.name is None and (row_separator not in r):
            #the number of cards, when given, is contained in the last two characters of the text
            try:
                no_cards = int(r[-2:])
            except ValueError:
                no_cards = 1
            #take next entry along which should be the card name
            card_tag = r.next_sibling
            if card_tag is None:
                #a trailing text item with nothing after it names no card
                continue
            #and add that many copies of the card to the list
            card_list.extend([card_tag.text] * no_cards)
        elif (('.' in r) or (row_separator in r)) and (k != 0):
            return (k, card_list)

    #no terminator found: return a tuple rather than falling through to an implicit None
    return (items_scanned, card_list)

#function to get players and scores
def players_and_scores(html_soup):
    """Extract the players and their final scores from a parsed game log.

    Two layouts are handled, both based on the 'b' tags of the document:
      1. the tag text contains '#n <name>' and the score is the first integer at the start of the tag's
         next sibling (after an optional ':');
      2. only tried when layout 1 yields nothing: the tag text contains ': x point', with the name either in
         front of the ':' or, when the text starts with the ':', in the previous sibling.

    html_soup -- parsed document exposing find_all('b')

    Returns a dict mapping player name -> integer score. Tags for which no score (or no name) can be found
    are skipped.
    """
    player_scores = {}
    list_b_tags = html_soup.find_all('b')

    for tag in list_b_tags:
        s = tag.text
        #extract name (assume string of the form '#n ' comes before the name)
        match = re.search(r'#\d+ ', s)
        if match is None:
            continue
        #we need to find the right most ':' (some names may contain a ':')
        colon_index = s.rfind(':')
        if colon_index == -1:
            player = s[match.end():] #sometimes the colon is in the next sibling
        else:
            player = s[match.end():colon_index]
        #next sibling contains the score; only plain text siblings can be parsed
        score_text = tag.next_sibling
        if not isinstance(score_text, str):
            continue
        #read the whole leading integer instead of a fixed three character window, so the score is found
        #whether or not the ':' is part of the sibling and however many digits it has
        score_match = re.match(r'\s*:?\s*(-?\d+)', score_text)
        if score_match is None:
            continue
        player_scores[player] = int(score_match.group(1))

    #deal with second case where name is not preceeded by a '#n ', in this case we need to look for a ': n points'
    if len(player_scores) == 0:
        for tag in list_b_tags:
            s = tag.text
            #we drop the s at the end of points to deal with the case where someone scores 1
            match = re.search(r': (-?\d+) point', s)
            if match is None:
                continue
            #in this case the name is contained in the text preceeded by a ':'
            colon_position = match.start()
            if colon_position == 0:
                #sometimes the name is in the previous tag
                previous = tag.previous_sibling
                if previous is None:
                    continue
                player = previous.text
            else:
                player = s[:colon_position]
            #the regex already captured the complete score, including a sign and any number of digits
            player_scores[player] = int(match.group(1))

    return player_scores

#extract number of turns for this game for each player
def turns_for_each_player(player_list, html_contents):
    """Extract the number of turns each player took.

    Scans the items in order and reads the integer directly in front of the word 'turns' from each item
    that has one, stopping once one count per player has been collected. Counts are paired with players by
    position (turns are in the same order as the player names in the html file).

    player_list   -- player names in the order they appear in the log
    html_contents -- iterable of items exposing a .text attribute

    Returns a dict mapping player name -> number of turns.
    Raises IndexError if fewer turn counts than players are found.
    """
    total_turns = []
    for item in html_contents:
        if len(total_turns) == len(player_list):
            break
        #match the digits that directly precede 'turns'; a fixed-width window before the first 'turns'
        #substring fails on words such as 'returns' and when 'turns' sits at the very start of the text
        match = re.search(r'(\d+)\s*turns', item.text)
        if match is not None:
            total_turns.append(int(match.group(1)))

    if len(total_turns) < len(player_list):
        raise IndexError('found ' + str(len(total_turns)) + ' turn counts for '
                         + str(len(player_list)) + ' players')

    return dict(zip(player_list, total_turns))

def get_supply_cards(players_list, html_contents):
    """Determine the supply cards of a game from the parsed log items.

    Two layouts are handled. If an item contains 'chosen cards are', the players chose the supply: the cards
    on that row are read and every card named after a '<player name> vetoes' item is removed. Otherwise the
    cards are read from the row that contains 'cards in supply'.

    players_list  -- player names, used to find the '<player name> vetoes' items
    html_contents -- list of parsed log items

    Returns the list of supply card names.
    Raises ValueError if the log contains neither marker.
    """
    supply_cards = None
    veto_search_start = None

    #start with the case where players can veto
    for (i, s) in enumerate(html_contents):
        if 'chosen cards are' in s:
            #read the chosen supply cards and remember where that row ended
            (index, cards) = parse_row_into_cards(html_contents[i:])
            veto_search_start = i + index
            supply_cards = cards
            break

    if veto_search_start is not None:
        #next each player can veto a card
        vetoed_cards = []
        for player in players_list:
            check_string = player + ' vetoes'
            for s in html_contents[veto_search_start:]:
                if check_string in s:
                    vetoed_cards.append(s.next_sibling.text)
                    break
        #remove vetoed cards from list of supply cards
        supply_cards = [card for card in supply_cards if card not in vetoed_cards]
    else:
        #second case: search the contents that were passed in (not the notebook-level pre_tag, which may
        #belong to a different file)
        for (i, s) in enumerate(html_contents):
            if 'cards in supply' in s:
                (index, cards) = parse_row_into_cards(html_contents[i:])
                supply_cards = cards
                break

    if supply_cards is None:
        raise ValueError("log contains neither 'chosen cards are' nor 'cards in supply'")

    return supply_cards

def process_player_actions(player_list, turns_by_player, html_contents):
    """Collect the cards each player bought, trashed and gained on every turn.

    For each player the log items are scanned for "<player>'s turn <n>"; from there the items are read until
    '(<player> draws:' is found, which ends the turn. Within a turn:
      '<player> buys'  -> 'buys'
      'trashing'       -> 'trashing'
      'gaining'        -> 'gaining'
      '<name> gains'   -> 'gains' of the named player
      '<name> trashes' -> 'trashes' of the named player
    The cards of each matching row are read with parse_row_into_cards.

    player_list     -- player names
    turns_by_player -- dict: player name -> number of turns
    html_contents   -- list of parsed log items (strings/tags exposing .text)

    Returns a dict of dicts: result[player][turn][category] is a flat list of card names, with category one
    of 'buys', 'trashing', 'gaining', 'gains', 'trashes'.

    NOTE(review): 'gains' and 'trashes' are assigned, not accumulated, at the end of every turn, so what a
    player gains during one player's turn n is replaced by what they gain during a later-processed player's
    turn n. This is kept as it was; confirm whether accumulation is intended.
    """
    #use a dictionary of dictionaries to track gained cards by turn by player, and initialise keys
    incremental_cards_by_turn = {}
    for player in player_list:
        incremental_cards_by_turn[player] = {}
        for turn in range(1, turns_by_player[player] + 1):
            incremental_cards_by_turn[player][turn] = {
                'buys': [],
                'trashing': [],
                'gaining': [],
                'gains': [],
                'trashes': [],
            }

    #string to check if current player is trashing a card
    check_trashing = 'trashing'
    #string to check if current player is gaining a card
    check_gaining = 'gaining'
    #strings to check if a named player trashes a card (the current player can both 'trashes' and
    #'trashing', and it is possible for a player to trash nothing)
    check_player_trashes = [x + ' trashes' for x in player_list]
    #strings to check if a named player gains a card (gains is used rather than gaining when a player
    #gets a card out of turn)
    check_player_gains = [x + ' gains' for x in player_list]

    for player in player_list:
        turn_counter = 1
        #string to check for a buy action
        check_buy = player + ' buys'
        #string to check end of turn
        check_turn_end = '(' + player + ' draws:'
        for (i, p) in enumerate(html_contents):
            check_turn = player + '\'s' + ' turn ' + str(turn_counter)
            if check_turn in p:
                buy_card_list = []
                trashing_card_list = []
                gaining_card_list = []
                #dictionaries, as several players may gain or trash during another player's turn
                gains_card_list = {}
                trashes_card_list = {}
                for player_ in player_list:
                    gains_card_list[player_] = []
                    trashes_card_list[player_] = []
                for (j, q) in enumerate(html_contents[i:]):
                    player_gains_list = [x in q for x in check_player_gains]
                    player_trashes_list = [x in q for x in check_player_trashes]
                    if check_buy in q:
                        (index, cards) = parse_row_into_cards(html_contents[i + j:])
                        buy_card_list.append(cards)
                    elif check_trashing in q:
                        #nothing to parse, but keep scanning so the rest of the turn is still recorded
                        if 'trashing nothing' in q:
                            continue
                        (index, cards) = parse_row_into_cards(html_contents[i + j:])
                        trashing_card_list.append(cards)
                    elif check_gaining in q:
                        #same for the 'gaining nothing' case
                        if 'gaining nothing' in q:
                            continue
                        (index, cards) = parse_row_into_cards(html_contents[i + j:])
                        gaining_card_list.append(cards)
                    elif any(player_gains_list):
                        #a named player gains a card, e.g. a curse card
                        (index, cards) = parse_row_into_cards(html_contents[i + j:])
                        for (player_index, matched) in enumerate(player_gains_list):
                            if matched:
                                gains_card_list[player_list[player_index]].append(cards)
                    elif any(player_trashes_list):
                        #a named player trashes a card
                        (index, cards) = parse_row_into_cards(html_contents[i + j:])
                        for (player_index, matched) in enumerate(player_trashes_list):
                            if matched:
                                trashes_card_list[player_list[player_index]].append(cards)
                    elif check_turn_end in q.text:
                        incremental_cards_by_turn[player][turn_counter]['buys'] = buy_card_list
                        incremental_cards_by_turn[player][turn_counter]['trashing'] = trashing_card_list
                        incremental_cards_by_turn[player][turn_counter]['gaining'] = gaining_card_list
                        #another player may have had fewer turns than turn_counter, so create the turn
                        #record on demand instead of raising a KeyError
                        for player_ in gains_card_list.keys():
                            turn_record = incremental_cards_by_turn[player_].setdefault(turn_counter, {})
                            turn_record['gains'] = gains_card_list[player_]
                        for player_ in trashes_card_list.keys():
                            turn_record = incremental_cards_by_turn[player_].setdefault(turn_counter, {})
                            turn_record['trashes'] = trashes_card_list[player_]
                        turn_counter += 1
                        break

    #finally we flatten every list of per-row card lists into a single list of cards
    for player in player_list:
        for turn_record in incremental_cards_by_turn[player].values():
            for cmd_type in turn_record.keys():
                flattened_list = [item for sublist in turn_record[cmd_type] for item in sublist]
                turn_record[cmd_type] = flattened_list

    return incremental_cards_by_turn

In [None]:
#get turns by player
#player_list = []
#for player in player_scores.keys():
#    player_list.append(player)
#turns_by_player = turns_for_each_player(player_list, pre_tag.contents)

In [None]:
#process player actions to determine cards gained per turn
#incremental_cards_by_turn = process_player_actions(player_list, turns_by_player, pre_tag.contents)

#first output high level information, including number of players and the set of supply cards 
#we store this in a folder structure that matches observed_supply_cards_key and 
#observed_supply_cards_counter
#high_level_output_file = game_log[:-5] + '_summary.txt'

#determine output dir name for the summary file
#summary_output_dir = output_dir + 'SupplyIndex' + str(supply_index) + '/NoPlayers' + str(len(player_list)) 
#output_filename = summary_output_dir + '/' + high_level_output_file

#check to see if directory exists and if it does not then create one
#if not os.path.exists(summary_output_dir):
#    os.makedirs(summary_output_dir)

#with open(output_filename, 'w') as file:
#    file.write('Input log: ' + game_log + '\n')
#    for player in player_list:
#        file.write('Player: ' + str(player) + ' Score: ' 
#                              + str(player_scores[player]) 
#                              + str(' Turns: ') + str(turns_by_player[player])
#                              + '\n')
#    file.write('Supply cards: ' + ','.join(supply_cards)+ '\n')