In [28]:
#unzips and re-zips individual log files and processes each one by calling out to the single file player 
#log processor
import logging
import os
import pdb
import tarfile
from bs4 import BeautifulSoup
import re
import shutil
import time
import math

In [29]:
#directory of compressed daily log archives to process, and where processed output goes
#(both are keyed by the same year sub-directory, trailing slash included)
year_dir_name = '2013-all/'
dir_name = f'DominionPlayerLogs/{year_dir_name}'
output_dir = f'ProcessedLogs/{year_dir_name}'

In [30]:
#report how many entries the archive directory holds before starting the long-running scan
print(f"No of directories to loop through: {len(os.listdir(dir_name))}")

No of directories to loop through: 74


In [32]:
#find all games with a specific supply card set and specified number of players
no_required_players = 2
supply_SD1E = ['Cellar', 'Chapel', 'Feast', 'Gardens', 'Laboratory', 'Thief', 'Village', 'Witch', 'Woodcutter',
               'Workshop']
tgt_supply = sorted(supply_SD1E)

#store filenames of logs with the correct supply card set
filename_list = []

#counters: 'failures' are logs that are empty or aborted/resigned, 'successes' are logs that
#could be parsed as a completed game (whether or not they match the target supply)
failures = 0
successes = 0
directories_processed = 0

total_start_time = time.time()

for tar_day_file in sorted(os.listdir(dir_name)):
    #only the daily 'tar.bz2' archives are processed; anything else in the directory is skipped
    if not tar_day_file.endswith('tar.bz2'):
        continue

    #time how long it takes to process directory
    start_time = time.time()

    #extraction target is the archive name without its 8-character '.tar.bz2' suffix
    extract_path = dir_name + tar_day_file[:-8]
    try:
        #extract tar file to directory
        logging.debug("Extracting " + dir_name  + tar_day_file)
        with tarfile.open(dir_name + tar_day_file, 'r') as tar:
            tar.extractall(extract_path)

        #then loop through individual games
        for game_log in os.listdir(extract_path):
            #read and parse html; the handle is closed as soon as parsing is done so that
            #we do not leak one open file per game log
            filename = extract_path + '/' + game_log
            with open(filename, mode = 'r') as game_file:
                soup = BeautifulSoup(game_file, 'html.parser')

            #the game log itself lives inside the <pre> tag
            pre_tag = soup.find('pre')

            #check that the file isnt empty
            if pre_tag is None:
                failures += 1
                continue

            #check that game wasn't aborted
            aborted = check_aborted_game(pre_tag.contents)
            if aborted:
                failures += 1
                continue

            successes += 1

            #number of players is taken as the number of <b> tags minus one
            list_b_tags = soup.find_all('b')
            no_players = len(list_b_tags) - 1

            #for now we focus only on games with a set number of players
            if no_players != no_required_players:
                continue

            #get cards in supply and compare (order-insensitively) with the target set
            supply_cards = get_supply_cards_with_no_of_players(no_players, pre_tag.contents)
            sorted_supply_cards = sorted(supply_cards)

            if sorted_supply_cards == tgt_supply:
                filename_list.append(filename)
                print(filename)

        #output timings (measured before the extracted directory is deleted)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print("Processed directory " + str(tar_day_file) + f" in {elapsed_time/60} minutes")
    finally:
        #delete extracted directory even if processing failed part-way through
        if os.path.isdir(extract_path):
            shutil.rmtree(extract_path)

    directories_processed += 1
    print("Number of directories processed " + str(directories_processed))

print("Directories processed: " + str(directories_processed))
print("Successes: " + str(successes))
print("Failures: " + str(failures))

#write filename list to file (single comma-separated line)
output_filename = 'files_with_tgt_supply_' + str(no_required_players) + 'players.txt'
with open(output_filename, 'w') as file:
    file.write(','.join(filename_list) + '\n')

# Calculate the elapsed time
total_end_time = time.time()
elapsed_time = total_end_time - total_start_time

print(f"Total elapsed time: {elapsed_time/60} minutes")


Processed directory 20130101.tar.bz2 in 15.856157779693604 minutes
Number of directories processed 1
Processed directory 20130102.tar.bz2 in 17.46605072816213 minutes
Number of directories processed 2
Processed directory 20130103.tar.bz2 in 16.965998764832815 minutes
Number of directories processed 3
Processed directory 20130104.tar.bz2 in 16.972508414586386 minutes
Number of directories processed 4
Processed directory 20130105.tar.bz2 in 18.421869496504467 minutes
Number of directories processed 5
Processed directory 20130106.tar.bz2 in 18.807342247168222 minutes
Number of directories processed 6
Processed directory 20130107.tar.bz2 in 17.67398198843002 minutes
Number of directories processed 7
Processed directory 20130108.tar.bz2 in 17.641586152712502 minutes
Number of directories processed 8
Processed directory 20130109.tar.bz2 in 14.451813352108001 minutes
Number of directories processed 9
Processed directory 20130110.tar.bz2 in 15.232057348887126 minutes
Number of directories proc

In [31]:
#function to determine the players in a game
def get_players(html_soup):
    """Return the list of player names found in the <b> tags of a parsed game log.

    Two layouts are handled:
      1. '#n <name>: ...' where n is a positive integer (the colon may be missing from
         the tag text, in which case everything after '#n ' is taken as the name).
      2. '<name>: x points' (older logs without the '#n ' prefix). This layout is only
         tried when the first one yields no players.

    html_soup: object exposing find_all('b') (e.g. a BeautifulSoup document).
    Returns a list of names in document order; empty if nothing matched.
    """
    player_list = []
    list_b_tags = html_soup.find_all('b')
    for tag in list_b_tags:
        s = tag.text
        #extract name (assume string of the form '#n ' comes before the name)
        match = re.search(r'#\d+ ', s)
        if match is None:
            continue
        #we need to find the right most ':' (some names may contain a ':')
        colon_index = s.rfind(':')
        if colon_index == -1:
            player = s[match.end():] #sometimes the colon is in the next sibling
        else:
            player = s[match.end():colon_index]
        #record the player in both cases (with or without a colon in the tag text)
        player_list.append(player)

    #deal with second case where name is not preceded by a '#n ', in this case we need to look for a ': n points'
    if len(player_list) == 0:
        for tag in list_b_tags:
            s = tag.text
            match = re.search(r': (-?\d+) point', s) #we drop the s at the end of points to deal with the case where someone scores 1
            if match is None:
                continue
            #in this case the name is contained in the text preceding the ':'
            colon_position = match.start()
            if colon_position == 0:
                #sometimes the name is in the previous node rather than in this tag
                previous = tag.previous_sibling
                if previous is None:
                    #no node to take a name from; skip rather than crash
                    continue
                player = previous.text
            else:
                player = s[:colon_position]
            player_list.append(player)

    return player_list
    
def check_aborted_game(html_contents):
    """Return True if any node's text mentions 'game aborted' or 'resigned', else False.

    html_contents: iterable of nodes exposing a .text attribute.
    """
    markers = ('game aborted', 'resigned')
    return any(marker in node.text for node in html_contents for marker in markers)

#following function parses a single line which consists of number of cards (unless equal to one) and card types
#each separated by a colon and ended with a full stop or a horizontal dashed line. It returns a list of cards
#with card names duplicated according to the number of them in the row. Also the index of the full stop or
#dashed line will be returned
def parse_row_into_cards(html_contents):
    """Parse one row of a game log into a list of card names.

    html_contents: sequence of nodes starting at the row to parse. Text nodes are
        string-like objects with name == None and a next_sibling attribute; the
        node following a text node is expected to expose the card name as .text.

    Returns (index, card_list) where index is the position in html_contents of the
    terminating node (one containing '.' or a dashed line) and card_list holds each
    card name repeated by its count. If no terminator is found, index is
    len(html_contents) so that callers can always unpack the result.
    """
    dashed_line = '----------------------'
    card_list = []
    for (k, r) in enumerate(html_contents):
        has_full_stop = '.' in r
        has_dashed_line = dashed_line in r
        #Need to be careful that the first content item doesn't contain a '....', causing
        #the code to pickup a full stop
        if ((not has_full_stop) or (k == 0)) and r.name is None and (not has_dashed_line):
            #check to see if the text contains the number of cards
            #number is contained in last two characters of text
            try:
                no_cards = int(r[-2:])
            except ValueError:
                no_cards = 1
            #take next entry along which should be the card name
            card_node = r.next_sibling
            if card_node is None:
                #trailing text with nothing after it: there is no card to record
                continue
            #and add that many copies of the card to the list
            card_list.extend([card_node.text] * no_cards)
        elif (has_full_stop or has_dashed_line) and (k != 0):
            return (k, card_list)

    #no terminating full stop / dashed line: report what was collected instead of returning None
    return (len(html_contents), card_list)

#function to get players and scores
def players_and_scores(html_soup):
    """Return a dict mapping player name -> final score, read from the <b> tags of a game log.

    Two layouts are handled:
      1. '#n <name>' in the tag, with the score at the start of the tag's next sibling
         (the sibling is assumed to begin with ': ' followed by the score, which is read
         from characters 2-4 of that text).
      2. '<name>: x points' entirely inside the tag (older logs without the '#n ' prefix).
         This layout is only tried when the first one yields no players.

    html_soup: object exposing find_all('b') (e.g. a BeautifulSoup document).
    """
    player_scores = {}
    list_b_tags = html_soup.find_all('b')
    for tag in list_b_tags:
        s = tag.text
        #extract name (assume string of the form '#n ' comes before the name)
        match = re.search(r'#\d+ ', s)
        if match is None:
            continue
        #we need to find the right most ':' (some names may contain a ':')
        colon_index = s.rfind(':')
        if colon_index == -1:
            player = s[match.end():] #sometimes the colon is in the next sibling
        else:
            player = s[match.end():colon_index]
        #next sibling contains the score, assume score is after a string of the form ': '
        score_text = tag.next_sibling
        score = int(re.search(r'-?\d+', score_text[2:5]).group())
        player_scores[player] = score

    #deal with second case where name is not preceded by a '#n ', in this case we need to look for a ': n points'
    if len(player_scores) == 0:
        for tag in list_b_tags:
            s = tag.text
            match = re.search(r': (-?\d+) point', s) #we drop the s at the end of points to deal with the case where someone scores 1
            if match is None:
                continue
            #in this case the name is contained in the text preceding the ':'
            colon_position = match.start()
            if colon_position == 0:
                #sometimes the name is in the previous node rather than in this tag
                previous = tag.previous_sibling
                if previous is None:
                    #no node to take a name from; skip rather than crash
                    continue
                player = previous.text
            else:
                player = s[:colon_position]
            #the regex already captured the whole (possibly negative, multi-digit) score
            player_scores[player] = int(match.group(1))

    return player_scores

#extract number of turns for this game for each player
def turns_for_each_player(player_list, html_contents):
    """Return a dict mapping each player to their number of turns.

    Scans html_contents for nodes whose text contains 'turns' and reads the number from
    the three characters immediately before that word. Scanning stops once as many turn
    counts as players have been collected. Counts are paired with players by position,
    i.e. turns appear in the same order as player names in the html file.
    """
    wanted = len(player_list)
    turn_counts = []
    for node in html_contents:
        text = node.text
        if 'turns' not in text:
            continue
        word_start = text.index('turns')
        #assume number of turns is at most a few digits directly before the word 'turns'
        digits = re.search(r'\d+', text[word_start-3:word_start]).group()
        turn_counts.append(int(digits))
        if len(turn_counts) == wanted:
            break

    #pair players with counts by position
    return {player: turn_counts[position] for position, player in enumerate(player_list)}

def get_supply_cards_with_no_of_players(no_of_players, html_contents):
    """Return the list of supply cards for a game, or [] if the log does not list them.

    Two layouts are handled:
      1. A 'chosen cards are' row followed by '<player name> vetoes' entries: the chosen
         cards minus every vetoed card are returned. At most no_of_players vetoes are read.
      2. Otherwise, the row following the first 'cards in supply' entry.

    no_of_players: number of players in the game (upper bound on vetoes to look for).
    html_contents: sequence of nodes from the game log, in document order.
    """
    #default when neither marker is present, so the function always returns a list
    supply_cards = []

    #start with the case where players can veto
    vetoes_used = False
    for (i, s) in enumerate(html_contents):
        if 'chosen cards are' in s:
            #parse the chosen supply cards and remember where that row ends
            (index, cards) = parse_row_into_cards(html_contents[i:])
            index_end = i + index
            supply_cards = cards
            vetoes_used = True
            break

    if vetoes_used:
        #next each player can veto a card; the vetoed card is the node after the 'vetoes' text
        vetoed_cards = []
        for s in html_contents[index_end:]:
            if 'vetoes' in s:
                vetoed_cards.append(s.next_sibling.text)
                if len(vetoed_cards) == no_of_players:
                    break
        #remove vetoed cards from list of supply cards
        supply_cards = [card for card in supply_cards if card not in vetoed_cards]
    else:
        #if this didnt occur move onto the second case
        for (i, s) in enumerate(html_contents):
            if 'cards in supply' in s:
                (index, cards) = parse_row_into_cards(html_contents[i:])
                supply_cards = cards
                break

    return supply_cards