# In [None]:
import os
import shutil
import pandas as pd
import xlrd
import numpy as np
import re
from datetime import datetime

# ==================== SPLIT FUNCTIONALITY ====================

def split(source_dir):
    """
    Copy qualifying Excel reports from source_dir into NewFormat / OldFormat folders.

    Both folders live in a "Consolidated_Data" folder created in the parent
    directory of source_dir. Source files are copied (never moved or modified).

    Selection rules, applied to every file with an .xls/.xlsx extension:
      - Directories whose own name contains 'daily' or 'payroll' are skipped.
        The walk is not pruned, so their subdirectories are still visited.
      - The file name must either contain one of the known spellings of
        "criteria", or contain 'revenue' while containing none of
        'all', 'lot', 'park', 'exception' and fewer than two 'trans'.
      - Cell F16 of the first sheet must be a string ending in ':00'. The cell
        is only read for '.xls' files, so '.xlsx' files never qualify.
      - Exactly 2 sheets -> NewFormat; more than 2 sheets -> OldFormat.
    Name collisions in the destination get a numeric suffix (_1, _2, ...).

    Folders containing Excel files from which nothing was copied are listed at
    the end, except folders whose name contains 'revis' or equals the name of
    their parent folder.

    Args:
        source_dir (str): Root directory to scan.

    Returns:
        tuple: (new_format_dir, old_format_dir)
    """
    # abspath() drops a trailing separator (and resolves "."), so dirname()
    # really yields the parent. Without it, "data/" would put Consolidated_Data
    # inside the tree being walked.
    parent_dir = os.path.dirname(os.path.abspath(source_dir))
    consolidated_dir = os.path.join(parent_dir, 'Consolidated_Data')
    new_format_dir = os.path.join(consolidated_dir, 'NewFormat')
    old_format_dir = os.path.join(consolidated_dir, 'OldFormat')
    os.makedirs(new_format_dir, exist_ok=True)
    os.makedirs(old_format_dir, exist_ok=True)

    excel_extensions = ('.xls', '.xlsx')
    skipped_dir_keywords = ('daily', 'payroll')
    criteria_variations = ('criteria', 'citeria', 'criertia', 'criteeria',
                           'critiera', 'criteera', 'creiteira', 'critieira', 'criterita')
    revenue_exclusions = ('all', 'lot', 'park', 'exception')
    folders_without_copy = []

    def get_excel_sheet_count(file_path):
        """Return the number of sheets in the workbook, or 0 if it cannot be opened."""
        try:
            # Only the sheet names are needed; no sheet contents are parsed.
            with pd.ExcelFile(file_path) as workbook:
                return len(workbook.sheet_names)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            return 0

    def check_f16_ends_with_00(file_path):
        """Return True when F16 of the first sheet is a string ending in ':00'."""
        if not file_path.lower().endswith('.xls'):
            return False
        try:
            workbook = xlrd.open_workbook(file_path)
            cell_value = workbook.sheet_by_index(0).cell_value(15, 5)
        except Exception as e:
            print(f"Error reading F16 from {file_path}: {e}")
            return False
        # A numeric cell can never end with ':00' once converted to an integer
        # string, so only string cells can match.
        return isinstance(cell_value, str) and cell_value.endswith(':00')

    def is_candidate_name(file_name):
        """Return True when the file name alone qualifies the file for copying."""
        lowered = file_name.lower()
        if any(variation in lowered for variation in criteria_variations):
            return True
        return ('revenue' in lowered
                and not any(keyword in lowered for keyword in revenue_exclusions)
                and lowered.count('trans') < 2)

    def get_unique_file_path(destination_folder, file_name):
        """Return file_name, suffixed with _N if needed to avoid overwriting."""
        base_name, extension = os.path.splitext(file_name)
        counter = 1
        new_file_name = file_name
        while os.path.exists(os.path.join(destination_folder, new_file_name)):
            new_file_name = f"{base_name}_{counter}{extension}"
            counter += 1
        return new_file_name

    for root, _dirs, files in os.walk(source_dir):
        folder_name = os.path.basename(root)
        if any(keyword in folder_name.lower() for keyword in skipped_dir_keywords):
            print(f"Skipping directory: {root}")
            continue

        excel_files_in_dir = [file for file in files if file.lower().endswith(excel_extensions)]
        if not excel_files_in_dir:
            continue

        conditions_met = 0
        for file in excel_files_in_dir:
            # Cheapest test first: workbooks are only opened for files whose
            # name already qualifies.
            if not is_candidate_name(file):
                continue
            file_path = os.path.join(root, file)
            if not check_f16_ends_with_00(file_path):
                continue

            sheet_count = get_excel_sheet_count(file_path)
            if sheet_count == 2:
                destination, label = new_format_dir, 'NewFormat'
            elif sheet_count > 2:
                destination, label = old_format_dir, 'OldFormat'
            else:
                continue

            dest_file_path = os.path.join(destination, get_unique_file_path(destination, file))
            shutil.copy2(file_path, dest_file_path)
            # The message says "Moved" but the source file is left in place.
            print(f"Moved {file} to {label}")
            conditions_met += 1

        if (conditions_met == 0
                and 'revis' not in folder_name.lower()
                and folder_name != os.path.basename(os.path.dirname(root))):
            folders_without_copy.append(os.path.abspath(root))

    if folders_without_copy:
        print("\nFolders with Excel files that did not meet any conditions:")
        for folder in folders_without_copy:
            print(f"- {folder}")
    else:
        print("\nAll directories with Excel files met at least one condition.")

    return new_format_dir, old_format_dir

# ==================== DEDUPLICATION FUNCTIONALITY ====================

def read_excel_file_for_dedup(file_path):
    """
    Read the two cells used for duplicate detection from a workbook's first sheet.

    Returns:
        tuple: (first 8 characters of str(F16), raw value of J7)
    """
    first_sheet = xlrd.open_workbook(file_path).sheet_by_index(0)
    # Zero-based (row, col): (15, 5) is cell F16 and (6, 9) is cell J7.
    date_prefix = str(first_sheet.cell_value(15, 5))[:8]
    return date_prefix, first_sheet.cell_value(6, 9)

def deduplicate(source_dir):
    """
    Delete duplicate workbooks from source_dir in place.

    Two files count as duplicates when the first 8 characters of their F16
    cell match and they have the same size in bytes. Within each such group
    the first file in sorted name order is kept and the others are deleted.
    Entries that are not regular files, or that cannot be read as workbooks,
    are left untouched.

    NOTE(review): the J7 value is collected for each file but does not take
    part in the comparison; size equality alone decides. Confirm this is
    strict enough for the data.

    Args:
        source_dir (str): Folder whose files are checked.
    """
    print(f"\nProcessing folder for deduplication: {source_dir}")
    files_by_date = {}
    # Sorted so that which copy survives does not depend on directory order.
    for file_name in sorted(os.listdir(source_dir)):
        file_path = os.path.join(source_dir, file_name)
        if not os.path.isfile(file_path):
            continue
        try:
            f16_date_part, j7_value = read_excel_file_for_dedup(file_path)
            file_size = os.path.getsize(file_path)
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
            continue
        files_by_date.setdefault(f16_date_part, []).append({
            'file_name': file_name,
            'file_size': file_size,
            'j7_value': j7_value,
            'file_path': file_path
        })

    for f16_date_part, files in files_by_date.items():
        if len(files) < 2:
            continue
        print(f"\nShared F16 (first 8 characters) content: {f16_date_part}")
        # The first file seen for each size is kept; later ones are duplicates.
        # Each deletion is attempted exactly once.
        kept_by_size = {}
        for candidate in files:
            kept = kept_by_size.setdefault(candidate['file_size'], candidate)
            if kept is candidate:
                continue
            print(f"\nFlagged duplicate files with F16: {f16_date_part}")
            print(f"File 1: {kept['file_name']} (Size: {kept['file_size']} bytes)")
            print(f"File 2: {candidate['file_name']} (Size: {candidate['file_size']} bytes)")
            try:
                os.remove(candidate['file_path'])
            except OSError as e:
                print(f"Error deleting file {candidate['file_path']}: {e}")
            else:
                print(f"Deleted duplicate file: {candidate['file_path']}")

# ==================== MERGE (CONSOLIDATION) FUNCTIONALITY ====================

def findHeader(df):
    """
    Return one more than the index label of the first row whose first-column
    value is exactly "Entry Time".

    Raises IndexError when no row matches.
    """
    first_column = df.iloc[:, 0]
    matches = df.index[first_column.eq("Entry Time")]
    return matches[0] + 1

def findFooter(df):
    """
    Return the number of rows from the first "Totals" row (case-insensitive
    substring match in the first column) through the end of df, inclusive.

    Raises IndexError when no row matches.
    """
    first_column = df.iloc[:, 0]
    totals_rows = df.index[first_column.str.contains("Totals", case=False, na=False)]
    return len(df) - totals_rows[0]

def findStation(df):
    """
    Return the index label of the first row whose first-column value contains
    "Station" (case-insensitive), or False when there is no such row.

    Callers must test the result with `is not False`, since a match on row 0
    is a valid result.
    """
    station_rows = df.index[df.iloc[:, 0].str.contains("Station", case=False, na=False)]
    return False if station_rows.empty else station_rows[0]

def _extract_station_name(label):
    """Return the text between "Station: " and the next "-" in label."""
    start_index = label.find("Station: ") + len("Station: ")
    end_index = label.find("-", start_index)
    # str.find returns -1 when no dash follows the name; slicing up to -1
    # would silently drop the last character, so take the rest of the string.
    if end_index == -1:
        return label[start_index:]
    return label[start_index:end_index]


def _append_sheet_to_csv(eFile, sName, oFile, extra_footer_rows, write_header):
    """
    Append the transaction rows of one worksheet to the CSV at oFile.

    The sheet is read twice: once in full to locate the "Entry Time" header
    row, the "Totals" footer and the optional "Station" row, then again with
    the rows above the header and the footer rows skipped.
    """
    df = pd.read_excel(eFile, sheet_name=sName)
    rowHeader = findHeader(df)
    rowFooter = findFooter(df) + extra_footer_rows
    StationRow = findStation(df)
    has_station = StationRow is not False
    if has_station:
        Station = _extract_station_name(df.iloc[StationRow, 0])

    df = pd.read_excel(eFile, sheet_name=sName, skiprows=rowHeader, skipfooter=rowFooter)
    if has_station:
        # The station from the sheet preamble becomes a constant column
        # placed right after "Prev Stn".
        df.insert(df.columns.get_loc("Prev Stn") + 1, 'Station', Station)
    df = df.rename(columns={'Trans Time': 'Transaction Time',
                            'Media\n': 'Media',
                            'Media ID\n': 'Media',
                            'Prev Stn': 'Previous Station',
                            'Trans Type': 'Transaction Type'})
    # Raises KeyError when a column is missing, e.g. no 'Station' column and
    # no "Station" row in the preamble.
    df = df[['Entry Time', 'Transaction Time', 'Previous Station', 'Station',
             'Media', 'Transaction Type', 'Revenue']]
    df.to_csv(oFile, mode="a", index=False, header=write_header)


def processSheetwithHeader(eFile, sName, oFile, adjust_footer=False):
    """
    Append one worksheet to oFile, including the CSV header row.

    Args:
        eFile (str): Path to the Excel workbook.
        sName (str): Name of the sheet to read.
        oFile (str): CSV file to append to.
        adjust_footer (bool): Accepted for signature parity with
            processSheetnoHeader but not used; this variant always skips one
            row more than the located footer.

    NOTE(review): it is unclear whether ignoring adjust_footer is intended.
    The behavior is preserved as found; confirm against a sample workbook.
    """
    print("Processing with header:", eFile)
    _append_sheet_to_csv(eFile, sName, oFile, extra_footer_rows=1, write_header=True)


def processSheetnoHeader(eFile, sName, oFile, adjust_footer=False):
    """
    Append one worksheet to oFile without a CSV header row.

    Args:
        eFile (str): Path to the Excel workbook.
        sName (str): Name of the sheet to read.
        oFile (str): CSV file to append to.
        adjust_footer (bool): When True, skip one row more than the located
            footer.
    """
    print("Processing without header:", eFile)
    extra_footer_rows = 1 if adjust_footer else 0
    _append_sheet_to_csv(eFile, sName, oFile, extra_footer_rows=extra_footer_rows, write_header=False)

def consolidateMultipleSheetswithHeadersandFooters(fileList, folder_path, oFile):
    """
    Append sheets "Sheet2" .. "SheetN" of every workbook in fileList to oFile.

    The first sheet written carries the CSV header; every later sheet is
    appended without one. Sheets are addressed by name ("Sheet" + number), so
    workbooks are expected to use the default sheet names. Workbooks with
    exactly two sheets are processed with adjust_footer=True.

    Args:
        fileList (list[str]): Workbook file names, relative to folder_path.
        folder_path (str): Folder containing the workbooks.
        oFile (str): CSV file to append to.

    Returns:
        None. An empty fileList is a no-op and oFile is not created.
    """
    if not fileList:
        print("No files to consolidate in: " + str(folder_path))
        return

    header_pending = True
    for eFile in fileList:
        neweFile = os.path.join(folder_path, eFile)
        print("Processing File:" + str(neweFile))
        # Only the sheet names are needed here; close the handle before the
        # per-sheet readers reopen the workbook by path.
        with pd.ExcelFile(neweFile) as workbook:
            sheet_total = len(workbook.sheet_names)
        adjust_footer = sheet_total == 2

        for sheet_number in range(2, sheet_total + 1):
            cSheet = "Sheet" + str(sheet_number)
            if header_pending:
                processSheetwithHeader(neweFile, cSheet, oFile, adjust_footer)
                header_pending = False
            else:
                processSheetnoHeader(neweFile, cSheet, oFile, adjust_footer)

def consolidateFolder(folder_path):
    """
    Consolidate every file in folder_path into "consolidated_data.csv" there.

    A CSV left over from a previous run is deleted first. The output lives in
    the folder being listed and is written in append mode, so a stale copy
    would otherwise be treated as an input workbook and have its rows
    duplicated. Files are processed in sorted name order so the result is
    reproducible.

    Args:
        folder_path (str): Folder containing the workbooks.

    Returns:
        str: Path of the consolidated CSV. The file is not created when the
        folder contains no input files.
    """
    output_file = os.path.join(folder_path, "consolidated_data.csv")
    if os.path.isfile(output_file):
        os.remove(output_file)

    fileList = sorted(
        name for name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, name))
    )
    if not fileList:
        print(f"No files to consolidate in: {folder_path}")
        return output_file

    consolidateMultipleSheetswithHeadersandFooters(fileList, folder_path, output_file)
    return output_file

def merge(new_format_dir, old_format_dir):
    """
    Consolidate the Excel files of each folder into one CSV per folder.

    The NewFormat folder is handled first, then OldFormat. Each result is
    written as "consolidated_data.csv" inside its own folder.
    """
    targets = (("NewFormat", new_format_dir), ("OldFormat", old_format_dir))
    for label, folder in targets:
        print(f"\nConsolidating folder: {folder}")
        consolidated_path = consolidateFolder(folder)
        print(f"Consolidated data for {label} created at: {consolidated_path}")

# ==================== CHOP FUNCTIONALITY ====================

def chop(input_file, output_dir, suffix, max_size_mb=24):
    """
    Split a large CSV file into smaller CSV files bounded by size.

    Every output file starts with the header line of the input. A new file is
    started when adding the next line would push the current file past the
    limit. A file always holds at least one data line, so a single line that
    is larger than the limit gets a file of its own instead of leaving a
    header-only file behind.

    Splitting is line based: a quoted field containing a line break could be
    cut between two output files.

    Args:
        input_file (str): Path to the large CSV file (UTF-8).
        output_dir (str): Directory where the split files will be saved; it is
            created if missing.
        suffix (str): Suffix appended to each split file's name (e.g., "new" or "old").
        max_size_mb (int | float, optional): Maximum size in megabytes for each
            split file. Defaults to 24.
    """
    max_size_bytes = max_size_mb * 1024 * 1024
    os.makedirs(output_dir, exist_ok=True)

    def open_chunk(index):
        chunk_path = os.path.join(output_dir, f"split_{index}_{suffix}.csv")
        # newline='' writes line endings exactly as read, so the byte count
        # below matches what lands on disk on every platform.
        return open(chunk_path, 'w', encoding='utf-8', newline='')

    with open(input_file, 'r', encoding='utf-8', newline='') as infile:
        header = infile.readline()
        header_size = len(header.encode('utf-8'))
        file_count = 1
        outfile = open_chunk(file_count)
        try:
            outfile.write(header)
            current_size = header_size
            rows_in_chunk = 0
            for line in infile:
                line_size = len(line.encode('utf-8'))
                # Roll over only when the current file already has data rows.
                if rows_in_chunk and current_size + line_size > max_size_bytes:
                    outfile.close()
                    file_count += 1
                    outfile = open_chunk(file_count)
                    outfile.write(header)
                    current_size = header_size
                    rows_in_chunk = 0
                outfile.write(line)
                current_size += line_size
                rows_in_chunk += 1
        finally:
            # Close the current chunk even if reading or writing failed.
            outfile.close()

def chop_all(new_format_dir, old_format_dir, max_size_mb=24):
    """
    Applies the chop function to the consolidated_data.csv file in both NewFormat and OldFormat folders.
    The split files are stored in a single folder called "csv_chunks" at the same level as NewFormat and OldFormat.

    The output files will be named like "split_1_new.csv", "split_2_new.csv" (for NewFormat) and
    "split_1_old.csv", "split_2_old.csv" (for OldFormat).

    A folder without a consolidated_data.csv (for example, a format for which
    no input files were found) is reported and skipped, so the other folder is
    still processed.

    Parameters:
        new_format_dir (str): Path to the NewFormat folder.
        old_format_dir (str): Path to the OldFormat folder.
        max_size_mb (int, optional): Maximum file size in MB for each split file.
    """
    output_dir = os.path.join(os.path.dirname(new_format_dir), "csv_chunks")

    # The trailing "\n" after the first job keeps a blank line between the
    # two reports.
    jobs = (
        ("NewFormat", new_format_dir, "new", "\n"),
        ("OldFormat", old_format_dir, "old", ""),
    )
    for label, folder, suffix, trailer in jobs:
        consolidated_file = os.path.join(folder, "consolidated_data.csv")
        if not os.path.isfile(consolidated_file):
            print(f"Skipping {label}: no consolidated file at {consolidated_file}{trailer}")
            continue
        print(f"Chopping consolidated file in {label}: {consolidated_file}")
        chop(consolidated_file, output_dir, suffix=suffix, max_size_mb=max_size_mb)
        print(f"Split files for {label} created in: {output_dir}{trailer}")

# ==================== MASTER WORKFLOW ====================

def main(source_dir, max_chop_mb=24):
    """
    Run the whole pipeline for one source directory.

    Steps, in order:
      1. split() copies qualifying workbooks into the NewFormat and OldFormat folders.
      2. deduplicate() runs on NewFormat, then on OldFormat.
      3. merge() consolidates each folder's workbooks into a CSV file.
      4. chop_all() splits each consolidated CSV into chunks of at most
         max_chop_mb megabytes.

    The user needs only supply the source directory.
    """
    print(f"Starting split process for data in: {source_dir}")
    new_dir, old_dir = split(source_dir)
    print("\nSplit process complete.")

    for label, folder in (("NewFormat", new_dir), ("OldFormat", old_dir)):
        print(f"\nStarting deduplication on {label} files...")
        deduplicate(folder)

    print("\nStarting merge (consolidation) process...")
    merge(new_dir, old_dir)
    print("\nMerge process complete.")

    print("\nStarting chop process on consolidated CSV files...")
    chop_all(new_dir, old_dir, max_size_mb=max_chop_mb)

    print("\nAll processing complete.")

# ==================== Example Usage ====================
# In a Jupyter Notebook, the user would simply call:
# main(r'C:\Users\mvalsania25\Desktop\Parking_Complete', max_chop_mb=24)
