In [None]:
# Execute the companion notebook so that the names it defines become available
# here; it is expected to provide `lst`, the list of column names used when
# importing the game-log files into a dataframe.
# NOTE(review): `pd` is used further down but no pandas import is visible in
# this notebook -- presumably it also comes from the companion notebook; confirm.
%run column_names_lst.ipynb

#### Function to create a complete dataframe from the 2010-2025 baseball seasons. Each row is one game from one of those seasons.

**Note:** 2020 was a shortened COVID season (60 games per team instead of the usual 162).

In [None]:
import os

def make_dataframe(data_dir=None, years=None):
    '''
    Create a complete dataframe from Retrosheet game log files.

    Every requested season is read from ``GL<year>.TXT`` inside ``data_dir``,
    using the notebook-level list ``lst`` as the column names. Seasons whose
    file is not present are skipped and reported, not raised as errors.
    A short progress report is printed while loading.

    Parameters:
    -----------
    data_dir : str, optional
        Directory containing the GL*.TXT files.
        If None, uses current directory.
    years : iterable of int, optional
        Seasons to load, in the order they should appear in the result.
        If None, loads 2010 through 2025.

    Returns:
    --------
    pd.DataFrame
        One row per game for all seasons that were found, with a fresh
        0..n-1 index. If no file was found, an empty dataframe that still
        carries the ``lst`` columns.
    '''
    if years is None:
        years = range(2010, 2026)  # 2010-2025 inclusive
    file_names = [f"GL{year}" for year in years]

    # Use provided directory or current directory
    if data_dir is None:
        data_dir = os.getcwd()

    # Collect one frame per season and concatenate once at the end: this
    # avoids re-copying all earlier rows on every iteration and avoids
    # concatenating onto an empty placeholder frame.
    season_frames = []
    files_missing = []

    for file_name in file_names:
        file_path = os.path.join(data_dir, f"{file_name}.TXT")

        # isfile (not exists) so a directory with a matching name is
        # reported as missing instead of being handed to read_csv.
        if os.path.isfile(file_path):
            # Passing names= tells pandas the file has no header row.
            data = pd.read_csv(file_path, names=lst)
            season_frames.append(data)
            print(f"Loaded {file_name}.TXT ({len(data)} games)")
        else:
            files_missing.append(file_name)
            print(f"Warning: {file_name}.TXT not found")

    if season_frames:
        df = pd.concat(season_frames, ignore_index=True)
    else:
        # Nothing loaded: keep the expected columns so callers can still
        # index by name.
        df = pd.DataFrame(columns=lst)

    files_loaded = len(season_frames)
    print(f"\nLoaded {files_loaded} files, {len(files_missing)} missing")
    print(f"Total games: {len(df)}")

    if files_missing:
        print(f"\nMissing files: {files_missing}")
        print("Download from: https://www.retrosheet.org/gamelogs/index.html")

    return df

In [None]:
# Build the combined game-log dataframe; data_dir=None means the files are
# looked up in the current directory.
df = make_dataframe(data_dir=None)

In [None]:
# Summarise the combined dataframe: overall shape, then number of games per year.
print(f"Shape: {df.shape}")
print("\nYears covered:")

# The first column holds the game date; expose it under the name 'Date'.
first_column = df.iloc[:, 0]
df['Date'] = first_column

# The year is the first four characters of the date's string form.
date_text = df['Date'].astype(str)
df['Year'] = date_text.str.slice(stop=4)

games_per_year = df['Year'].value_counts().sort_index()
print(games_per_year)

In [None]:
# Write the combined dataframe to disk, leaving out the row index
df.to_csv(path_or_buf='2010_to_2025_seasons.csv', index=False)
print("Saved to 2010_to_2025_seasons.csv")