# Excel Data Extractor - Extract & Write

This function builds on the ExcelDataExtractor tool: it extracts the data specified in each "_Extracted_Fields.xlsx" file and writes the extracted data into an .xlsx workbook.

Requirements: Run function in folder containing the source folder.

Usage: ExtractExcelData_ExtractWrite()

Output: Creates a folder (named "Extracted_Data") inside the source folder containing the workbook "Extracted Data - Master.xlsx", with one sheet of extracted Excel data for each "_Extracted_Fields.xlsx" file.

Return: merged_df_target_dict: dictionary mapping each output sheet name to its extracted dataframe.



In [1]:
def ExtractExcelData_ExtractWrite():
    """Extract the requested columns of each merged dataframe into one workbook.

    Loads ``source_folder`` and ``merged_df_dict`` from
    ``ExtractExcelDataObjects.pkl`` in the current working directory. For every
    ``<name>_Extracted_Fields.xlsx`` file found in
    ``<source_folder>/Extracted_Fields``, the rows whose "Template Column Name"
    is filled in define the ("Sheet Name", "Column Name") pairs to pull from
    ``merged_df_dict["<name>.xlsx"]``. Each selection is written to a sheet
    named ``<name>_Data`` in
    ``<source_folder>/Extracted_Data/Extracted Data - Master.xlsx``.

    Returns:
        dict: maps each sheet name (``<name>_Data``) to the extracted
        dataframe (with its index reset).

    Raises:
        FileNotFoundError: if the pickle file or the ``Extracted_Fields``
            folder does not exist.
        KeyError: if ``merged_df_dict`` has no entry for a fields file, or the
            entry lacks one of the requested columns.
    """
    import os
    import pandas as pd
    import re
    import pickle

    fields_file_pattern = re.compile(r'(.*)_Extracted_Fields\.xlsx$')

    # Retrieve source_folder and merged_df_dict.
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # pickle file that was produced by a trusted run of the extractor.
    with open('ExtractExcelDataObjects.pkl', 'rb') as f:
        source_folder, merged_df_dict = pickle.load(f)

    # Create the Extracted_Data folder if it does not exist yet.
    try:
        os.mkdir(os.path.join('.', source_folder, 'Extracted_Data'))
    except FileExistsError:
        pass

    # Relative to the Extracted_Fields folder, which is the working directory
    # while the workbook is being written.
    master_path = os.path.join('..', 'Extracted_Data', 'Extracted Data - Master.xlsx')
    print(master_path)

    merged_df_target_dict = dict()
    cwd = os.getcwd()
    os.chdir(os.path.join(".", source_folder, "Extracted_Fields"))
    try:
        # The writer is created only after the chdir so that master_path
        # resolves correctly even when pandas opens the file immediately.
        # The context manager saves and closes the workbook on exit
        # (ExcelWriter.save() no longer exists in pandas 2.x).
        with pd.ExcelWriter(master_path, engine='xlsxwriter') as writer:
            for fields_file in os.listdir():
                # Skip Excel lock files and anything that is not a fields file.
                if fields_file.startswith('~$'):
                    continue
                name_match = fields_file_pattern.search(fields_file)
                if name_match is None:
                    continue

                df_required_fields = pd.read_excel(fields_file, header=0)
                df_required_fields = df_required_fields.dropna(
                    axis=0, subset=["Template Column Name"])
                tpl_required_fields = list(zip(
                    df_required_fields["Sheet Name"].tolist(),
                    df_required_fields["Column Name"].tolist()))

                mass_dump_filename_withoutext = name_match.group(1)
                mass_dump_filename = ''.join([mass_dump_filename_withoutext, '.xlsx'])
                print("Extracting data from {0}".format(mass_dump_filename))

                # Extract specified columns
                df_target = merged_df_dict[mass_dump_filename].loc[:, tpl_required_fields]
                df_target.reset_index(inplace=True)

                # Write extracted fields into the master workbook
                sheetname = ''.join([mass_dump_filename_withoutext, '_Data'])
                merged_df_target_dict[sheetname] = df_target

                df_target.to_excel(writer, sheet_name=sheetname, index=True)
                worksheet = writer.sheets[sheetname]

                # index=True writes the index level(s) in the leading sheet
                # column(s), so the data columns start after them.
                first_data_col = df_target.index.nlevels
                for offset, col in enumerate(df_target):
                    series = df_target[col]
                    cell_lengths = series.astype(str).map(len)
                    # An empty frame has no cells to measure (max() would be NaN).
                    longest_cell = int(cell_lengths.max()) if len(cell_lengths) else 0
                    # Widest of the cells and the header, plus a little extra space.
                    width = max(longest_cell, len(str(series.name))) + 1
                    sheet_col = first_data_col + offset
                    worksheet.set_column(sheet_col, sheet_col, width)

                print("Sheet created! - {0}".format(sheetname))
    finally:
        # Always restore the caller's working directory, even on failure.
        os.chdir(cwd)

    print("\nData extracted!")

    return merged_df_target_dict

In [2]:
# Run the extraction and keep the returned dictionary of extracted dataframes.
merged_df_target_dict = ExtractExcelData_ExtractWrite()

..\Extracted_Data\Extracted Data - Master.xlsx
Extracting data from pc paperboard (1).xlsx
Sheet created! - pc paperboard (1)_Data
Extracting data from pc-corrugated.xlsx
Sheet created! - pc-corrugated_Data
Extracting data from pc-flexible.xlsx
Sheet created! - pc-flexible_Data
Extracting data from pc-injebm.xlsx
Sheet created! - pc-injebm_Data
Extracting data from pc-metal.xlsx
Sheet created! - pc-metal_Data
Extracting data from Thermo PC sheet.xlsx
Sheet created! - Thermo PC sheet_Data

Data extracted!
