In [180]:
import pandas as pd
import numpy as np
import os
import shutil
from collections import OrderedDict
from datetime import datetime, timedelta
import time
import math

from tqdm import tqdm

# Create Results Files

In [134]:
def create_results_files(comp_chart_path, results_path):
    '''
        Copy the '.2' file(s) found under the comprehensive chart directories
        into a single results directory.
        
        comp_chart_path is walked recursively. Files are copied flat into
        results_path, so two '.2' files with the same name in different
        sub-directories overwrite each other. Files already present in
        results_path are not deleted.
        
        Args:
            comp_chart_path (string): path to the comprehensive chart directories
            results_path (string): path to directory in which to save results files
                (created if it does not exist)
            
        Returns:
            Nothing        
    '''
    # Collect every '.2' file first so the copy step below never runs while
    # the tree is still being walked.
    chart_files = [
        os.path.join(subdir, file)
        for subdir, _dirs, files in os.walk(comp_chart_path)
        for file in files
        # Match the extension, not just the last character, so names such as
        # 'x.12' or 'notes2' are not picked up.
        if file.endswith('.2')
    ]

    # Without the directory, shutil.copy would either fail or write a single
    # plain file named like results_path.
    os.makedirs(results_path, exist_ok=True)

    # Copy files
    for file in chart_files:
        shutil.copy(file, results_path)
            
# Gather the chart result files into ./results_files/
create_results_files(
    comp_chart_path='./comp_chart_files/',
    results_path='./results_files/',
)

In [135]:
# Disabled snippet: it is kept inside a bare string literal, so none of it runs
# and the cell only echoes the text. If enabled, it would copy every .DRF file
# from ./drf/ into ./temp/ with '2018' inserted before the extension.
# NOTE(review): the slice start 6 equals len('./drf/') and would have to change
# together with input_path.
'''
# Add date to input file name
input_path = './drf/'
input_files = [input_path + file for file in os.listdir(input_path) if file.endswith(".DRF")]


for file in input_files:
    new_name = file[6:-4] + '2018.DRF'
    shutil.copy(file, './temp/{}'.format(new_name))
'''

'\n# Add date to input file name\ninput_path = \'./drf/\'\ninput_files = [input_path + file for file in os.listdir(input_path) if file.endswith(".DRF")]\n\n\nfor file in input_files:\n    new_name = file[6:-4] + \'2018.DRF\'\n    shutil.copy(file, \'./temp/{}\'.format(new_name))\n'

# Find All Pairs of Input/Results Files

In [136]:
def find_file_pairs(input_path, results_path):
    '''
        Find matching input/output files
        
        An input file '<name>.DRF' and a results file '<name>.2' form a pair
        when they share the same <name>. Pairs are returned sorted by name.
        The directory arguments work with or without a trailing slash.
        
        Args:
            input_path (string): relative path to input files
            results_path (string): relative path to results files
            
        Returns:
            (list) list of (input_file,results_file) pairs
    '''
    input_ext = '.DRF'
    results_ext = '.2'

    # Names of files w/o directories/extensions
    input_names = sorted(
        file[:-len(input_ext)]
        for file in os.listdir(input_path)
        if file.endswith(input_ext)
    )
    # A set gives constant-time membership tests when matching below
    results_names = {
        file[:-len(results_ext)]
        for file in os.listdir(results_path)
        if file.endswith(results_ext)
    }

    # Create list of input/results file pairs -- [(input_file, results_file),(...),...]
    # os.path.join inserts a separator only when the directory lacks one.
    return [
        (
            os.path.join(input_path, name + input_ext),
            os.path.join(results_path, name + results_ext),
        )
        for name in input_names
        if name in results_names
    ]

# Pair every input file with the results file of the same name
file_pairs = find_file_pairs(
    input_path='./input_files/',
    results_path='./results_files/',
)

In [137]:
# Number of (input_file, results_file) pairs held in file_pairs
len(file_pairs)

402

# Add Number of Entrants to Files

In [138]:
def add_entrants(input_file, results_file):
    '''
        Count the entrants (rows) of each race in the input file and append
        that count as a new last column to both the input and results files.
        
        The race number is read from column index 2 of each headerless CSV.
        Both files are overwritten in place, so every call appends one more
        column -- calling it twice on the same pair adds the count twice.
        
        Args:
            input_file (string): path to input file
            results_file (string): path to results file
            
        Returns:
            Nothing
            
        Raises:
            KeyError: if the results file contains a race number that has no
                rows in the input file (neither file is modified in that case)
    '''
    race_col = 2

    # Load files 
    input_df = pd.read_csv(input_file, header=None)
    res_df = pd.read_csv(results_file, header=None)

    # Entrants per race = number of input rows carrying that race number
    race_entrants = input_df[race_col].value_counts()

    # Validate before anything is written: a results race without input rows
    # has no entrant count to copy over.
    res_races = res_df[race_col]
    unknown_races = res_races[res_races.notna() & ~res_races.isin(race_entrants.index)].unique()
    if len(unknown_races):
        raise KeyError('races {} in {} have no entrants in {}'.format(
            ', '.join(str(race) for race in unknown_races), results_file, input_file))

    # Append the count as last column of both frames. Mapping the whole column
    # at once keeps the integer dtype, so counts are saved as '8', not '8.0'.
    input_df[input_df.columns.max() + 1] = input_df[race_col].map(race_entrants)
    res_df[res_df.columns.max() + 1] = res_df[race_col].map(race_entrants)

    # Save back to file
    input_df.to_csv(input_file, header=False, index=False)
    res_df.to_csv(results_file, header=False, index=False)

    
# Append the entrant counts to every matched input/results pair
for pair in file_pairs:
    input_file, res_file = pair
    add_entrants(input_file, res_file)