In [7]:
import pandas as pd
import glob
import os

'''
Loop through all excel files in the excel folder and convert them to csv
The header row conatains:
- 0: Exchange Date
- 1: Close
- 2: %Chg
- 3: Open
- 4: Low
- 5: High
- 6: Volume
- 7: Turnover - USD
Which also is the name of the column
'''

directory = os.fsencode('excel_data')

# column headers
expected_columns = ['Exchange Date', 'Close', '%Chg',
                    'Open', 'Low', 'High', 'Volume', 'Turnover - USD']

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".xlsx"):
        print(f'Processing {filename}')

        df = pd.read_excel(f'excel_data/{filename}', header=None)

        for i, row in df.iterrows():
            if list(row[:8]) == expected_columns:
                header_row = i
                print(f'Header found at row {header_row} in {filename}')
                break
        else:
            print(f'Header not found in {filename}, skipping file')
            continue

        df = pd.read_excel(f'excel_data/{filename}', skiprows=header_row)

        df.iloc[:, :8].to_csv(f'etf_csv/{filename[:-5]}.csv', index=False)
    else:
        continue

Processing GOVT.xlsx
Header found at row 29 in GOVT.xlsx
Processing EFAS.xlsx
Header found at row 32 in EFAS.xlsx
Processing IEV.xlsx
Header found at row 27 in IEV.xlsx
Processing EWG.xlsx
Header found at row 32 in EWG.xlsx
Processing VWOB.xlsx
Header found at row 27 in VWOB.xlsx
Processing SPY.xlsx
Header found at row 31 in SPY.xlsx
Processing SAUS.xlsx
Header found at row 29 in SAUS.xlsx
Processing VEU.xlsx
Header found at row 28 in VEU.xlsx
Processing GHYG.xlsx
Header found at row 28 in GHYG.xlsx
Processing EWJ.xlsx
Header found at row 30 in EWJ.xlsx
Processing IGOV.xlsx
Header found at row 26 in IGOV.xlsx
Processing LQD.xlsx
Header found at row 30 in LQD.xlsx
Processing AGGG.xlsx
Header found at row 29 in AGGG.xlsx
Processing VTI.xlsx
Header found at row 26 in VTI.xlsx
Processing EDIV.xlsx
Header found at row 29 in EDIV.xlsx
Processing IWM.xlsx
Header found at row 31 in IWM.xlsx
Processing HYG.xlsx
Header found at row 28 in HYG.xlsx
Processing PSP.xlsx
Header found at row 32 in PSP

In [8]:
'''
Verify that all csv files have the same number of rows
'''

csv_files = glob.glob('etf_csv/*.csv')
for file in csv_files:
    df = pd.read_csv(file)
    print(f'{file}: {len(df)} rows')


etf_csv/EDIV.csv: 3209 rows
etf_csv/IEV.csv: 3209 rows
etf_csv/BWX.csv: 1627 rows
etf_csv/AHYG.csv: 3263 rows
etf_csv/IGOV.csv: 3209 rows
etf_csv/EFA.csv: 3209 rows
etf_csv/EEM.csv: 3209 rows
etf_csv/VEU.csv: 3210 rows
etf_csv/IEF.csv: 3209 rows
etf_csv/VWOB.csv: 2862 rows
etf_csv/GHYG.csv: 1404 rows
etf_csv/EWJ.csv: 3209 rows
etf_csv/EMB.csv: 3209 rows
etf_csv/GLD.csv: 3268 rows
etf_csv/VTI.csv: 3210 rows
etf_csv/LQD.csv: 3209 rows
etf_csv/IHYG.csv: 3253 rows
etf_csv/EWZ.csv: 3209 rows
etf_csv/IWM.csv: 3209 rows
etf_csv/URTH.csv: 1578 rows
etf_csv/EWC.csv: 3209 rows
etf_csv/SDY.csv: 3209 rows
etf_csv/TLT.csv: 3209 rows
etf_csv/EWU.csv: 3209 rows
etf_csv/EFAS.csv: 1991 rows
etf_csv/ACWI.csv: 3209 rows
etf_csv/SPY.csv: 3268 rows
etf_csv/EWG.csv: 3209 rows
etf_csv/EPP.csv: 3209 rows
etf_csv/PSP.csv: 3209 rows
etf_csv/AGGG.csv: 1767 rows
etf_csv/SAUS.csv: 3253 rows
etf_csv/HYG.csv: 3209 rows
etf_csv/GOVT.csv: 3185 rows
etf_csv/FXI.csv: 3209 rows
etf_csv/REET.csv: 2583 rows
etf_csv/JNK.csv