## Load modules and specify paths

In [None]:
# Import required modules
import os

import glob2
import pandas as pd
from openpyxl import load_workbook

# Specify paths to files
master_path = r'C:\Users\MT1070\Desktop\Master Call Volume'
additional_files_path = r'C:\Users\MT1070\Desktop\Master Call Volume\speech-report-extracts'

# File name of the combined workbook that this notebook writes into master_path
master_workbook_name = 'SpinSci Call Volume Evaluation.xlsx'

# Collect the .xlsx files that sit directly inside the master folder
# NOTE(review): once the combined workbook has been written into master_path it
# matches this pattern as well, so a later run reads it back in as an input --
# confirm that this is intended.
master_file_names = glob2.glob(os.path.join(master_path, "*.xlsx"))

# Collect the .xlsx files that sit directly inside the additional files folder
additional_file_names = glob2.glob(os.path.join(additional_files_path, "*.xlsx"))

# Remove the master Excel file from additional_file_names list if present.
# The comparison is done on the file name (case-insensitively, as Windows file
# names are): comparing against a folder path can never equal a "*.xlsx" match,
# so such a check would filter nothing.
additional_file_names = [
    file
    for file in additional_file_names
    if os.path.basename(file).lower() != master_workbook_name.lower()
]

# Initialize an empty data frame to store data from all files
final_sheet = pd.DataFrame()

## Read data extracts and append to DF

In [None]:
# Load each master workbook, flatten all of its sheets into a single frame,
# and collect one frame per workbook for the merge step.
data_frames = []  # One combined DataFrame per successfully read master file
for file in master_file_names:
    try:
        # sheet_name=None returns a dict with one DataFrame per sheet;
        # skipfooter=2 drops the last two rows of every sheet.
        df = pd.read_excel(
            file,
            sheet_name=None,
            skipfooter=2,
            engine='openpyxl',
        )
        df_concat = pd.concat(df.values(), ignore_index=True, sort=False)
    except Exception as err:
        # Best effort: report the failing workbook and carry on with the rest
        print(f"Error occurred while processing file: {file}")
        print(str(err))
    else:
        data_frames.append(df_concat)

## Merge all DataFrames vertically and reset index

In [None]:
# Stack the per-workbook frames on top of each other; when nothing was read,
# final_sheet keeps whatever value it already had.
if data_frames:
    final_sheet = pd.concat(data_frames, axis=0, ignore_index=True)

# Renumber the rows from 0 and discard the previous index
final_sheet = final_sheet.reset_index(drop=True)

In [None]:
# Read df_additional data.
# Every additional workbook is flattened (all sheets stacked) and collected in a
# list; the list is concatenated once after the loop. Concatenating inside the
# loop would re-copy all previously accumulated rows on every iteration.
additional_frames = []  # One combined DataFrame per successfully read additional file
for file in additional_file_names:
    try:
        # sheet_name=None returns a dict with one DataFrame per sheet;
        # skipfooter=2 drops the last two rows of every sheet.
        df_additional_temp = pd.read_excel(file, sheet_name=None, skipfooter=2, engine='openpyxl')
        df_additional_temp_concat = pd.concat(df_additional_temp.values(), ignore_index=True, sort=False)
        additional_frames.append(df_additional_temp_concat)
    except Exception as e:
        # Best effort: report the failing workbook and carry on with the rest
        print(f"Error occurred while processing additional file: {file}")
        print(str(e))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# additional file could be read.
if additional_frames:
    df_additional = pd.concat(additional_frames, axis=0, ignore_index=True)
else:
    df_additional = pd.DataFrame()



## Merge data horizontally and set column headers

In [None]:
# Place the additional extracts beside the master data (column-wise concat);
# skipped when no additional data was loaded.
if not df_additional.empty:
    combined_parts = [final_sheet, df_additional]
    final_sheet = pd.concat(combined_parts, axis=1)

# Promote the first row to column headers, then drop that row from the data
header_row = final_sheet.iloc[0]
final_sheet.columns = header_row
final_sheet = final_sheet.iloc[1:]

# print(final_sheet.columns)


### Write the combined data to a new Excel file

In [19]:
# Destination workbook for the combined data
master_file_path = r"C:\Users\MT1070\Desktop\Master Call Volume\SpinSci Call Volume Evaluation.xlsx"

# Write all rows to the 'Raw Data' sheet with column headers and without the row index
with pd.ExcelWriter(master_file_path) as writer:
    final_sheet.to_excel(writer, sheet_name='Raw Data', header=True, index=False)


## Reformat date column and remove rows with 'date' in the date column

In [None]:
# Drop rows whose 'date' cell holds the text "date" (header rows repeated in
# the data). This has to happen BEFORE the datetime conversion: once the column
# is coerced, such cells have already become NaT and the text can no longer be found.
is_header_row = final_sheet['date'].astype(str).str.contains('date', case=False, na=False)
# .copy() so the column assignment below works on an independent frame rather
# than on a filtered view of the old one.
final_sheet = final_sheet[~is_header_row].copy()

# Parse the remaining values as month/day/year dates; values that cannot be
# parsed become NaT instead of raising.
final_sheet['date'] = pd.to_datetime(final_sheet['date'], format='%m/%d/%Y', errors='coerce')

# Group the data set based on calendar date
filtered_data = final_sheet.groupby(pd.Grouper(key='date', freq='D')).sum()

## Write the reformatted data to the Excel file

In [None]:
# Destination workbook for the export
master_file_path = r"C:\Users\MT1070\Desktop\Master Call Volume\SpinSci Call Volume Evaluation.xlsx"

# Export settings: target sheet, keep column headers, omit the row index
export_options = {'sheet_name': 'Raw Data', 'header': True, 'index': False}
final_sheet.to_excel(master_file_path, **export_options)


### Graveyard

In [None]:
# Group the data set based on calendar date
# filtered_data = final_sheet.groupby(final_sheet['date'].dt.date).sum()


# # Drop rows where 'date' column contains the string 'date'
# final_sheet = final_sheet[~final_sheet['date'].str.contains('date', case=False, na=False)]