In [1]:
#unzips and re-zips individual log files and processes each one by calling out to the single file player 
#log processor

#this version of the code produces play traces suitable to be used with n-gram analysis

#note we are ignoring:
#1. actions from second player caused by first player, e.g. discarding two
#cards as other player has played a militia.
#2. actions that result from a player following instructions on a card, e.g. 
#we note that a player plays a mine, but we don't include the fact that they then trash a copper
#and gain a silver 
#we are effectively ignoring extended action sequences, which aligns with how we are registering
#actions in TAG

#we also define an 'End Current Phase' action, when a player either just plays treasure cards
#in the action phase or does not buy anything in the buy phase

#as a consequence of the above we only track 'plays' and 'buys' actions and not trashing, trashes
#or gains. In this case we are not trying to determine deck composition, only the main choices
#taken by a player.

In [2]:
import logging
import os
import pdb
import tarfile
from bs4 import BeautifulSoup
import re
import shutil
import time
import math
import pandas as pd

In [3]:
#set name of input file that contains a list of logs to build play traces from
input_filename = 'ResultsFiles/2Player_FG1E_Supply_AllYears_logfilenames.txt'
output_filename = 'ResultsFiles/2Player_FG1E_Supply_AllYears_ActionTraces.csv'

#supply set associated with log files
supply_cards = ['Cellar','Market','Militia','Mine','Moat','Remodel','Smithy','Village',
                'Woodcutter','Workshop','Curse']

base_cards = ['Gold', 'Silver', 'Copper', 'Estate', 'Duchy', 'Province']

#treasure card names in both singular and plural spelling
treasure_cards_list = ['Gold', 'Silver', 'Copper', 'Golds', 'Silvers', 'Coppers']

total_card_set = supply_cards + base_cards

#get a list of log files to process; the context manager makes sure the
#file handle is closed again once the lines have been read
with open(input_filename, 'r') as log_list_file:
    log_files = log_list_file.readlines()

#tidy up by removing part of path
remove_start = 'gdrive/My Drive/Colab Notebooks/DominionPlayerLogProcessing/'
remove_end = '\n'

for (i, log_path) in enumerate(log_files):
    #only strip the line terminator when it is really there: the last line of
    #the input file may not end with a newline and must not lose a character
    if log_path.endswith(remove_end):
        log_path = log_path[:-len(remove_end)]
    #likewise only strip the path prefix from entries that actually carry it
    if log_path.startswith(remove_start):
        log_path = log_path[len(remove_start):]
    log_files[i] = log_path

print("No of log files: " +str(len(log_files)))

#check for duplicates
set_without_duplicates = set(log_files)
print("No of duplciates: " + str(len(log_files) - len(set_without_duplicates)))

target_directory_for_log_files = 'DominionPlayerLogs/2Player_FG1E_Logs'

No of log files: 149
No of duplciates: 0


In [4]:
#only needs to be run once to extract log files from tar files and store separately

#NOTE: we assume all log files in the input file are in the following
#directory
#path = 'DominionPlayerLogs/2010_And_2011'

#for log_file in log_files:
#    #determine name of tar file to unzip
#    substr_path = 'DominionPlayerLogs/2010-all/' #year here is irrelevant just using length, so could 
#    #also use 2011-all etc
#    tar_file = log_file[len(substr_path): len(substr_path) + 8] + '.tar.bz2'
    
#    #determine game-log file needed 
#    game_log = log_file[len(substr_path) + 9:]
    
#    #next extract tar file
#    with tarfile.open(path_string + tar_file, 'r') as tar:
#        tar.extract(game_log, target_directory_for_log_files)

In [5]:
#count the entries found in the directory holding the extracted log files
list_of_log_files = os.listdir(target_directory_for_log_files)
#print(list_of_log_files)
#note: one extra compared to the count above, due to the .DS_Store file being picked up
print("No of log files: {}".format(len(list_of_log_files)))

No of log files: 150


In [9]:
#extract number of turns for this game for each player
def turns_for_each_player(player_list, html_contents):
    """Return a dictionary mapping each player to their number of turns.

    The entries of html_contents are scanned in order. For every entry whose
    text contains an integer directly followed (optionally after whitespace)
    by the word 'turns', that integer is recorded. Scanning stops as soon as
    one count per player has been collected.

    Args:
        player_list: list of player names. Turn counts are assigned to the
            names in the order in which the counts are found, i.e. the names
            are assumed to be in the same order as in the html file.
        html_contents: sequence of parsed nodes, each exposing a .text attribute.

    Returns:
        dict of player name -> number of turns (int).

    Raises:
        ValueError: if fewer turn counts than players were found.
    """
    total_turns = []
    for node in html_contents:
        if len(total_turns) == len(player_list):
            break
        #read the whole number in front of 'turns'; a fixed-width window would
        #truncate counts with three or more digits
        match = re.search(r'(\d+)\s*turns', node.text)
        if match is not None:
            total_turns.append(int(match.group(1)))

    if len(total_turns) < len(player_list):
        raise ValueError("Found " + str(len(total_turns)) + " turn counts for "
                         + str(len(player_list)) + " players")

    #note, turns are in same order as player names in html file
    return dict(zip(player_list, total_turns))

#following function parses a single line which consists of number of cards (unless equal to one) and card types
#each separated by a colon and ended with a full stop or a horizontal dashed line. It returns a list of cards 
#with card names duplicated according to the number of them in the row. 
#finally, we stop parsing if we find a string that is in the cmd_list that is not equal to the cmd
#this prevents us having issues with statements like 'trashing a Silver and gaining a Gold'
#Also the index of the full stop or dashed line or start of a new command will be returned
def parse_row_into_cards(cmd, cmd_list, html_contents):
    """Collect the cards named in one log row.

    Args:
        cmd: the command being parsed (must be an element of cmd_list).
        cmd_list: all known commands; meeting any command other than cmd
            ends the row.
        html_contents: list of parsed nodes starting at the row to parse.
            Text nodes are expected to have .name equal to None and a
            .next_sibling whose .text is the card name.

    Returns:
        Tuple (index, card_list). index is the position in html_contents of
        the entry that ended the row (full stop, dashed line or another
        command), or len(html_contents) when the contents run out before any
        such entry is met. card_list holds each card name repeated as many
        times as the count written in front of it.

    Raises:
        ValueError: if cmd is not in cmd_list.
    """
    card_list = []
    cmd_list_copy = cmd_list.copy()
    cmd_list_copy.remove(cmd)
    for (k, r) in enumerate(html_contents):
        #Need to be careful that the first content item doesn't contain a '....', causing 
        #the code to pickup a full stop
        has_full_stop = '.' in r
        has_dashed_line = '----------------------' in r
        #check we aren't finding any other commands
        has_other_cmd = any((other_cmd in r) for other_cmd in cmd_list_copy)
        if ((not has_full_stop) or (k == 0)) and r.name is None and (not has_dashed_line) and (not has_other_cmd):
            #the text may end with the number of cards; read the complete trailing
            #number (a two character window would turn e.g. '10 ' into 0)
            count_match = re.search(r'(\d+)\s*$', r)
            no_cards = int(count_match.group(1)) if count_match is not None else 1
            #take next entry along which should be the card name
            card_node = r.next_sibling
            if card_node is None:
                #nothing follows this text, so there is no card to record
                continue
            card_type = card_node.text
            #and add copies of that to the list
            card_list.extend([card_type] * no_cards)
        elif (has_full_stop or has_dashed_line or has_other_cmd) and (k != 0):
            return (k, card_list)

    #contents ended without a terminator: still hand back what was collected
    #so that callers can always unpack the result
    return (len(html_contents), card_list)

#function to get players and scores
def players_and_scores(html_soup):
    """Return a dictionary mapping each player name to their final score.

    Two layouts are handled, both based on the 'b' tags of the document:

    1. The tag text contains '#n ' (n a positive integer) followed by the
       player name; the score is read from the sibling following the tag.
    2. Only used when layout 1 yields no players: the tag text contains
       ': x point', with the name in front of the colon or, when the text
       starts with the colon, in the sibling preceding the tag.

    Args:
        html_soup: parsed document offering find_all('b'); each returned tag
            must expose .text, .next_sibling and .previous_sibling.

    Returns:
        dict of player name -> score (int, may be negative).
    """
    #look for entries of the form #n <name>: x points' where n is a positive integer - these are tagged with a 'b'
    #in earlier log files the '#n' is dropped and hence we also need to consider this case
    player_scores = {}
    list_b_tags = html_soup.find_all('b')
    for tag in list_b_tags:
        s = tag.text
        #extract name (assume string of the form '#n ' comes before the name)
        match = re.search(r'#\d+ ', s)
        if match is not None:
            #we need to find the right most ':' (some names may contain a ':')
            colon_index = s.rfind(':')
            if colon_index == -1:
                player = s[match.end():] #sometimes the colon is in the next sibling
            else:
                player = s[match.end():colon_index]
            #next sibling contains the score, assume score comes after a string of the form ': '
            #and fits into the three characters that follow it
            score_text = tag.next_sibling
            score = int(re.search(r'-?\d+', score_text[2:5]).group())
            player_scores[player] = score

    #deal with second case where name is not preceded by a '#n ', in this case we need to look for a ': n points'
    if len(player_scores) == 0:
        for tag in list_b_tags:
            s = tag.text
            match = re.search(r': (-?\d+) point', s) #we drop the s at the end of points to deal with the case where someone scores 1
            if match is not None:
                #in this case the name is contained in the text preceding the ':'
                colon_position = match.start()
                if colon_position == 0:
                    #sometimes the name is in the previous tag; it has to be looked up
                    #on the tag itself as the plain text has no siblings
                    player = tag.previous_sibling.text
                else:
                    player = s[:colon_position]
                #the regular expression already captured the complete score, so use it
                #directly instead of re-reading a fixed width slice of the text
                score = int(match.group(1))
                player_scores[player] = score

    return player_scores

#this function creates a dictionary containing player actions as strings for each round and turn
def create_player_actions(player_list, turns_by_player, treasure_cards, html_contents):
    """Build the per-turn action trace of every player.

    Args:
        player_list: player names; the position in the list is the player
            number written into the action strings.
        turns_by_player: dict of player name -> number of turns.
        treasure_cards: card names that are not recorded when played.
        html_contents: list of parsed log nodes; each must expose .text and
            support the 'in' operator.

    Returns:
        dict of player -> {turn number (from 1) -> {tick (from 0) -> action string}}.
        Every player gets an entry for each turn up to the largest turn count
        of any player; turns without recorded actions stay empty.
        Action strings are '<CARD> : Player <n>' for a played non-treasure
        card, 'BuyCard: <CARD> by player <n>' for a bought card and
        'End Current Phase' when a phase is left without such an action.
    """
    #note sometimes a player has one more go than another player but
    #we still initialise our dictionary to the larger number of turns
    max_no_turns = max([0] + [turns_by_player[player] for player in player_list])

    actions = {}
    for player in player_list:
        actions[player] = {turn: {} for turn in range(1, max_no_turns + 1)}

    cmd_list = ['plays', 'buys']
    for (player_no, player) in enumerate(player_list):
        turn_counter = 1
        #string to check for a play action
        check_plays = player + ' plays'
        #string to check for a buy action
        check_buy = player + ' buys'
        #string to check end of round
        check_turn_end = '(' + player + ' draws:'
        for (i, p) in enumerate(html_contents):
            #the header being searched for depends on the turn reached so far
            check_turn = player + '\'s' + ' turn ' + str(turn_counter)
            if check_turn not in p:
                continue
            turn_actions = actions[player][turn_counter]
            tick_counter = 0
            action_cards_played = False
            cards_bought = False
            for (j, q) in enumerate(html_contents[i:]):
                if check_plays in q:
                    (_, cards) = parse_row_into_cards('plays', cmd_list, html_contents[i + j:])
                    #only record play of action cards and not treasure cards
                    for card in cards:
                        if card not in treasure_cards:
                            turn_actions[tick_counter] = card.upper() + ' : Player ' + str(player_no)
                            tick_counter += 1
                            action_cards_played = True
                if check_buy in q:
                    #first check if all played cards were treasure cards or no cards were 
                    #played at all, because we then need to add an 'End Current Phase' action.
                    #This is done for the first buy of the turn only: once something has been
                    #bought the action phase is already over, and adding the marker again
                    #would put a spurious 'End Current Phase' between two purchases
                    if (not action_cards_played) and (not cards_bought):
                        turn_actions[tick_counter] = 'End Current Phase'
                        tick_counter += 1
                    #next check bought cards
                    (_, cards) = parse_row_into_cards('buys', cmd_list, html_contents[i + j:])
                    for card in cards:
                        turn_actions[tick_counter] = 'BuyCard: ' + card.upper() + ' by player ' + str(player_no)
                        tick_counter += 1
                        cards_bought = True
                elif check_turn_end in q.text:
                    #again check if there were any plays involving cards other than treasure cards
                    #and add an 'End Current Phase'
                    if (not action_cards_played) and (not cards_bought):
                        #neither phase saw an action: one marker for each phase
                        turn_actions[tick_counter] = 'End Current Phase'
                        tick_counter += 1
                        turn_actions[tick_counter] = 'End Current Phase'
                    elif not cards_bought:
                        #then add an end current phase due to player not purchasing any cards
                        turn_actions[tick_counter] = 'End Current Phase'
                    turn_counter += 1
                    break

    return actions

#reformat player actions into a dataframe
def reformat_actions_into_df(player_actions_dict, player_scores_dict, game_id):
    """Flatten the nested action dictionary of one game into a dataframe.

    Args:
        player_actions_dict: dict of player -> {round -> {turn -> action string}}.
            The iteration order of the players determines their player number
            (0 for the first player, 1 for the second, ...).
        player_scores_dict: not used by this function; kept so that the
            signature stays unchanged.
        game_id: identifier written into the GameID column of every row.

    Returns:
        DataFrame with the columns GameID, Player, Round, Turn, AgentName and
        ActionDescription, one row per recorded action and a 0..n-1 index.
    """
    cols = ['GameID', 'Player', 'Round', 'Turn', 'AgentName', 'ActionDescription']

    #gather all rows first and build the dataframe once; concatenating inside
    #the loop copies the whole frame again for every single row
    records = []
    for (player_no, (player, round_dict)) in enumerate(player_actions_dict.items()):
        for round_, turn_dict in round_dict.items():
            for turn_, action_description in turn_dict.items():
                records.append({'GameID': game_id,
                                'Player': player_no,
                                'Round': round_,
                                'Turn': turn_,
                                'AgentName': player,
                                'ActionDescription': action_description})

    #passing the columns explicitly keeps their order and also yields the
    #right (empty) frame when there is nothing to report
    return pd.DataFrame(records, columns = cols)

In [10]:
#next need to generate a play trace file using data from each log file

#each row of this file describes one recorded action; the columns are those
#produced by reformat_actions_into_df: GameID, Player (0 or 1), Round, Turn,
#AgentName (player name) and ActionDescription

#set-up log
logger = logging.getLogger()
#re-running this cell must not attach a second handler for the same file,
#otherwise every message is written to processor.log several times
processor_log_path = os.path.abspath('processor.log')
fhandler = next((handler for handler in logger.handlers
                 if getattr(handler, 'baseFilename', None) == processor_log_path), None)
if fhandler is None:
    fhandler = logging.FileHandler(filename='processor.log', mode='a')
    logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

gameID = -1 #start counting games from zero
processed_files = []

#files_to_ignore = ['game-20110130-000030-482766c5.html', #ignored as one player buys a curse card
#                   'game-20110213-153439-e3c9416a.html', #ignored as one player buys a curse card
#                   'game-20110117-173643-e788bbed.html'] #ignored as one player buys a curse card

files_to_ignore = []

#one dataframe per game, joined in a single step after the loop
game_frames = []
#NOTE: games are numbered in the order in which os.listdir returns the files
for game_log in os.listdir(target_directory_for_log_files):
    #skip excluded files and anything that is not an html log (e.g. .DS_Store)
    if game_log in files_to_ignore or game_log[-4:] != 'html':
        continue

    #output file to be processed in log file
    logging.debug("Processing file: " + game_log)

    #start time of this file; not used further in this cell
    start_time_for_file = time.time()

    gameID+=1

    #store processed file name in a list, just helps with debugging
    processed_files.append(game_log)

    #read and parse html; the file is closed as soon as it has been parsed
    with open(target_directory_for_log_files + '/' + game_log, mode = 'r') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')

    #convert html contents into a list of tabs, navigable strings etc
    pre_tag = soup.find('pre')

    #get player names and scores                      
    player_scores = players_and_scores(soup)

    list_of_players = list(player_scores.keys())

    #get turns by player
    player_turns = turns_for_each_player(list_of_players, pre_tag.contents)

    #get action sequence string
    player_actions = create_player_actions(list_of_players, player_turns, treasure_cards_list, pre_tag.contents)

    #convert play-trace to dataframe
    tmp_df = reformat_actions_into_df(player_actions, player_scores, gameID)

    #keep for output
    game_frames.append(tmp_df)

#join all games at once; pd.concat does not accept an empty list
output = pd.concat(game_frames) if game_frames else pd.DataFrame()

#output to results file
output.to_csv(output_filename, sep = ',', index = False)