In [15]:
import pandas as pd
import glob

# Step 1: List all CSV files.
# glob returns paths in an arbitrary, filesystem-dependent order; sorting makes
# the row order of the merged output reproducible across machines and re-runs.
files = sorted(glob.glob('../data/raw/*.csv'))

# Step 2: Columns you want to keep
columns_to_keep = [
    'Date', 'Div', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'FTR',
    'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR',
]


def collect_valid_frames(paths, required_columns):
    """Load each CSV and keep only those that contain every required column.

    Parameters
    ----------
    paths : iterable of str
        CSV file paths, read in the order given.
    required_columns : list of str
        Column names every kept file must provide; also the column order of
        each returned frame.

    Returns
    -------
    list of pandas.DataFrame
        One frame per accepted file, restricted to ``required_columns``.
        Files lacking any required column are reported on stdout and skipped.
    """
    valid_frames = []
    for path in paths:
        frame = pd.read_csv(path)
        missing_cols = [col for col in required_columns if col not in frame.columns]
        if missing_cols:
            print(f"{path} is missing columns: {missing_cols}")
        else:
            print(f"{path} contains all required columns.")
            # Select only required columns to ensure consistency
            valid_frames.append(frame[required_columns])
    return valid_frames


# Step 3: Keep track of valid DataFrames
dfs_to_merge = collect_valid_frames(files, columns_to_keep)

# Step 4: Merge all valid DataFrames
if dfs_to_merge:
    merged_df = pd.concat(dfs_to_merge, ignore_index=True)
    skipped_count = len(files) - len(dfs_to_merge)
    if skipped_count:
        # Do not claim that everything was merged when some files were rejected.
        print(
            f"Merged {len(dfs_to_merge)} of {len(files)} CSVs; "
            f"{skipped_count} skipped for missing columns."
        )
    else:
        print("All CSVs merged successfully!")
    # Optional: save merged CSV
    merged_df.to_csv('../data/merged_file.csv', index=False)
else:
    print("No CSVs to merge. Check your files.")


../data/raw\2000-01.csv contains all required columns.
../data/raw\2001-02.csv contains all required columns.
../data/raw\2002-03.csv contains all required columns.
../data/raw\2003-04.csv contains all required columns.
../data/raw\2004-05.csv contains all required columns.
../data/raw\2005-06.csv contains all required columns.
../data/raw\2006-07.csv contains all required columns.
../data/raw\2007-08.csv contains all required columns.
../data/raw\2008-09.csv contains all required columns.
../data/raw\2009-10.csv contains all required columns.
../data/raw\2010-11.csv contains all required columns.
../data/raw\2011-12.csv contains all required columns.
../data/raw\2012-13.csv contains all required columns.
../data/raw\2013-14.csv contains all required columns.
../data/raw\2014-15.csv contains all required columns.
../data/raw\2015-16.csv contains all required columns.
../data/raw\2016-17.csv contains all required columns.
../data/raw\2017-18.csv contains all required columns.
../data/ra

In [14]:
import pandas as pd
import glob

# Step 1: List all CSV files (sorted so the processing order is deterministic)
files = sorted(glob.glob('../data/raw/*.csv'))


def find_common_columns(paths):
    """Return the set of column names that appear in every CSV in ``paths``.

    Only the header row of each file is parsed (``nrows=0``), so the cost does
    not grow with the number of data rows.

    Parameters
    ----------
    paths : iterable of str
        CSV file paths to inspect.

    Returns
    -------
    set of str
        Column names shared by all files; an empty set when ``paths`` is empty.
    """
    common = None
    for path in paths:
        header = set(pd.read_csv(path, nrows=0).columns)
        # Seed with the first file's header, then intersect with the rest.
        common = header if common is None else common & header
    return common if common is not None else set()


# Steps 2-3: Intersect the headers of all CSVs and report the result
if files:
    common_cols = find_common_columns(files)

    print("Columns common to all CSVs:")
    print(sorted(common_cols))
else:
    print("No CSV files found.")


Columns common to all CSVs:
['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'AwayTeam', 'Date', 'Div', 'FTAG', 'FTHG', 'FTR', 'HC', 'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HTR', 'HY', 'HomeTeam', 'Referee']
