In [1]:
#unzips and re-zips individual log files and processes each one by calling out to the single file player 
#log processor
import logging
import os
import pdb
import tarfile
from bs4 import BeautifulSoup
import re
import shutil
import time
import math

In [8]:
#locations of the compressed log archives to process and of the processed output,
#both keyed by the same per-year sub-directory name (which carries its own trailing slash)
year_dir_name = '2012-all_from_latest_unprocessed_20120210/'
dir_name = f'DominionPlayerLogs/{year_dir_name}'
output_dir = f'ProcessedLogs/{year_dir_name}'

In [3]:
#report how many entries (archives and anything else) sit in the input directory
print(f"No of directories to loop through: {len(os.listdir(dir_name))}")

No of directories to loop through: 327


In [10]:
def parse_row_into_cards(html_contents):
    """Parse a single log row into a flat list of card names.

    The row is walked node by node. Each text node (a node whose ``name`` is
    None) is expected to be followed by a card tag: the text of the node's next
    sibling is taken as the card name. If the last two characters of the text
    node parse as an integer, the card name is repeated that many times,
    otherwise it is added once. Nodes with a ``name`` (tags) are skipped.

    The row ends at the first node after the first one that contains a full
    stop or a horizontal dashed line. The first node is never treated as the
    end of the row, because it may itself contain '....'.

    Parameters
    ----------
    html_contents : sequence of BeautifulSoup nodes
        Nodes beginning at the row to parse (e.g. a slice of ``pre.contents``).

    Returns
    -------
    tuple (int, list of str)
        The index, relative to ``html_contents``, of the node that ended the
        row, and the card names collected before it. When no terminating node
        exists the index is ``len(html_contents)`` so callers can always
        unpack the result.
    """
    dashed_line = '----------------------'
    card_list = []
    for (k, r) in enumerate(html_contents):
        has_dashed_line = dashed_line in r

        #a full stop or dashed line anywhere but the first node ends the row
        if k != 0 and (('.' in r) or has_dashed_line):
            return (k, card_list)

        #skip tags, and a first node that is itself a dashed line
        if has_dashed_line or r.name is not None:
            continue

        #the count, when present, sits in the last two characters of the text
        #NOTE(review): a count of 10 or more followed by a space (e.g. "10 ")
        #would be read from "0 " and parse as 0 - confirm against the log format
        try:
            no_cards = int(r[-2:])
        except ValueError:
            no_cards = 1

        #the next entry along should be the card name; a trailing text node
        #with nothing after it carries no card
        sibling = r.next_sibling
        if sibling is None:
            continue
        card_list.extend([sibling.text] * no_cards)

    #ran out of nodes without meeting a terminator
    return (len(html_contents), card_list)

def get_supply_cards_with_no_of_players(no_of_players, html_contents):
    """Work out which cards are in the supply for one game log.

    Two log layouts are handled:

    * Veto games: a 'chosen cards are' row lists the candidate cards and is
      followed by '<player name> vetoes' entries. The vetoed cards (at most
      one entry per player is read) are removed from the candidates.
    * Otherwise: the supply is the row introduced by 'cards in supply'.

    Parameters
    ----------
    no_of_players : int
        Number of players; scanning for vetoes stops once this many are found.
    html_contents : sequence of BeautifulSoup nodes
        The nodes of the game log (e.g. ``pre.contents``).

    Returns
    -------
    list of str
        The supply card names, or an empty list when the log contains neither
        a 'chosen cards are' nor a 'cards in supply' row.
    """
    #default so that a log with neither marker yields an empty supply
    supply_cards = []

    #start with the case where players can veto
    veto_scan_start = None
    for (i, s) in enumerate(html_contents):
        if 'chosen cards are' in s:
            (index, supply_cards) = parse_row_into_cards(html_contents[i:])
            #index is relative to the slice, so shift it back to the full list
            veto_scan_start = i + index
            break

    if veto_scan_start is not None:
        #next each player can veto a card
        vetoed_cards = []
        for s in html_contents[veto_scan_start:]:
            if 'vetoes' in s:
                sibling = s.next_sibling
                if sibling is None:
                    #no card tag follows this text, so there is nothing to veto
                    continue
                vetoed_cards.append(sibling.text)
                if len(vetoed_cards) == no_of_players:
                    break
        #remove vetoed cards from list of supply cards
        return [card for card in supply_cards if card not in vetoed_cards]

    #if this didnt occur move onto the second case
    for (i, s) in enumerate(html_contents):
        if 'cards in supply' in s:
            (_, supply_cards) = parse_row_into_cards(html_contents[i:])
            break

    return supply_cards

def check_aborted_game(html_contents):
    """Return True if any node's text mentions 'game aborted' or 'resigned'.

    Nodes are checked in order and the scan stops at the first match; False is
    returned when no node matches (including for an empty sequence).
    """
    return any(
        ('game aborted' in node.text) or ('resigned' in node.text)
        for node in html_contents
    )

In [11]:
#find all games with a specific supply card set and specified number of players
no_required_players = 2
supply_SD1E = ['Cellar', 'Chapel', 'Feast', 'Gardens', 'Laboratory', 'Thief', 'Village', 'Witch', 'Woodcutter',
               'Workshop']
tgt_supply = sorted(supply_SD1E)

#store filenames of logs with the correct supply card set
filename_list = []

#counters: a game log is a failure when it has no <pre> block or the game was
#aborted/resigned, and a success otherwise
failures = 0
successes = 0
directories_processed = 0

#only archives with this suffix are processed; stripping it gives the extraction directory name
archive_suffix = '.tar.bz2'

total_start_time = time.time()

for tar_day_file in sorted(os.listdir(dir_name)):
    #skip anything that is not a daily archive
    if not tar_day_file.endswith(archive_suffix):
        continue

    #time how long it takes to process directory
    start_time = time.time()
    extract_path = dir_name + tar_day_file[:-len(archive_suffix)]

    try:
        #extract tar file to directory
        #NOTE(review): extractall trusts the member paths stored in the archive,
        #so this should only be run on archives from a trusted source
        logging.debug("Extracting " + dir_name  + tar_day_file)
        with tarfile.open(dir_name + tar_day_file, 'r') as tar:
            tar.extractall(extract_path)

        #then loop through individual games
        for game_log in os.listdir(extract_path):
            #read and parse html, closing the log as soon as it has been parsed
            filename = extract_path + '/' + game_log
            with open(filename, mode = 'r') as game_file:
                soup = BeautifulSoup(game_file, 'html.parser')

            #the game log itself lives in the <pre> tag
            pre_tag = soup.find('pre')

            #check that the file isnt empty and that the game wasn't aborted
            if pre_tag is None or check_aborted_game(pre_tag.contents):
                failures +=1
                continue
            successes +=1

            #number of players is taken as one less than the number of <b> tags
            list_b_tags = soup.find_all('b')
            no_players = len(list_b_tags)-1

            #for now we focus only on games with a set number of players
            if no_players != no_required_players:
                continue

            #get cards in supply and keep the game if they match the target set
            supply_cards = get_supply_cards_with_no_of_players(no_players, pre_tag.contents)
            if sorted(supply_cards) == tgt_supply:
                filename_list.append(filename)
                print(filename)

        #output timings (excludes the time taken to delete the extracted directory)
        elapsed_time = time.time() - start_time
        print("Processed directory " + str(tar_day_file) + f" in {elapsed_time/60} minutes")
    finally:
        #delete extracted directory even when processing fails or is interrupted,
        #so that no extracted files are left behind in the input directory
        if os.path.isdir(extract_path):
            shutil.rmtree(extract_path)

    directories_processed +=1
    print("Number of directories processed " + str(directories_processed))

print("Directories processed: " + str(directories_processed))
print("Successes: " + str(successes))
print("Failures: " + str(failures))

#write filename list to file
output_filename = 'files_with_tgt_supply_' + str(no_required_players) + 'players.txt'
with open(output_filename, 'w') as file:
    file.write(','.join(filename_list) + '\n')

# Calculate the elapsed time
total_end_time = time.time()
elapsed_time = total_end_time - total_start_time

print(f"Total elapsed time: {elapsed_time/60} minutes")


Processed directory 20120210.tar.bz2 in 16.14483303229014 minutes
Number of directories processed 1


KeyboardInterrupt: 