In [14]:
import os
import xlrd

# Folder containing the Excel files to scan.
# NOTE(review): hard-coded, machine-specific path. The script below calls
# os.remove() on files inside this folder, so double-check it before running.
source_dir = r'C:\Users\mvalsania25\Desktop\Destination_X'

# Read cells F16 and J7 from the first sheet of an Excel file using xlrd
def read_excel_file(file_path):
    """Return the F16 prefix and the J7 value from a workbook's first sheet.

    Args:
        file_path: Path of the workbook, passed straight to
            ``xlrd.open_workbook``.

    Returns:
        A ``(f16_prefix, j7_value)`` tuple. ``f16_prefix`` is the first 8
        characters of ``str()`` applied to the F16 cell value (presumably
        the date portion of that cell -- confirm against the workbooks);
        ``j7_value`` is the J7 cell value exactly as xlrd returns it.

    Nothing is caught here: any error raised while opening the workbook or
    reading the cells propagates to the caller.
    """
    first_sheet = xlrd.open_workbook(file_path).sheet_by_index(0)

    # xlrd addresses cells as zero-based (row, column) pairs
    f16_position = (15, 5)  # spreadsheet cell F16
    j7_position = (6, 9)    # spreadsheet cell J7

    f16_text = str(first_sheet.cell_value(*f16_position))
    j7_value = first_sheet.cell_value(*j7_position)

    # Only the leading 8 characters of F16 are used as the grouping key
    return f16_text[:8], j7_value

# Dictionary mapping the first 8 characters of F16 to a list of detail dicts
# (file_name, file_size, j7_value, file_path), one per file sharing that value
f16_dict = {}

# Loop through the directory entries in sorted order: os.listdir() returns
# names in arbitrary order, and the order in which files are recorded here
# decides which copy of a duplicate is kept later on.
for file_name in sorted(os.listdir(source_dir)):
    # Construct the full file path
    file_path = os.path.join(source_dir, file_name)

    # Sub-folders and other non-regular entries cannot be opened as workbooks
    if not os.path.isfile(file_path):
        continue

    # One unreadable entry must not abort the whole scan: a non-Excel file, a
    # format xlrd cannot parse, a sheet too small to contain F16/J7, or a
    # locked file is reported and skipped.
    # NOTE: xlrd 2.0+ reads only .xls; on those versions .xlsx files raise
    # XLRDError and are therefore reported as skipped here.
    try:
        f16_date_part, j7_value = read_excel_file(file_path)
        file_size = os.path.getsize(file_path)  # Get the file size in bytes
    except (xlrd.XLRDError, OSError, IndexError) as e:
        print(f"Skipping file {file_path}: {e}")
        continue

    # A blank F16 gives no content to match on; grouping such files under the
    # empty key would make unrelated workbooks candidates for deletion.
    if not f16_date_part.strip():
        print(f"Skipping file {file_path}: F16 is empty")
        continue

    # Add the file and its details to the f16_dict (using the first 8 characters of F16)
    f16_dict.setdefault(f16_date_part, []).append({
        'file_name': file_name,
        'file_size': file_size,
        'j7_value': j7_value,
        'file_path': file_path,
    })

# Now, check if there are any F16 values (first 8 characters) that appear in multiple files.
# Within each such group, files that also have the same size are treated as
# duplicates: the first one recorded is kept and every later one is deleted.
# NOTE(review): "same F16 prefix + same byte size" is a weak equality test;
# confirm it is strict enough before running this against data with no backup.
for f16_date_part, files in f16_dict.items():
    if len(files) < 2:
        continue

    print(f"\nShared F16 (first 8 characters) content: {f16_date_part}")

    # Bucket the group by file size. Dicts preserve insertion order, so the
    # first entry of each bucket is the earliest-recorded file of that size.
    files_by_size = {}
    for file_info in files:
        files_by_size.setdefault(file_info['file_size'], []).append(file_info)

    for same_size_files in files_by_size.values():
        # file1 is the copy that is kept; every other entry is a duplicate of it
        file1, *duplicates = same_size_files

        for file2 in duplicates:
            # Print metadata for both files
            print(f"\nFlagged files with the same F16 content: {f16_date_part}")
            print(f"File 1: {file1['file_name']}")
            print(f"  Size: {file1['file_size']} bytes")
            print(f"  Content of J7: {file1['j7_value']}")
            print(f"  Full Path: {file1['file_path']}")

            print(f"File 2: {file2['file_name']}")
            print(f"  Size: {file2['file_size']} bytes")
            print(f"  Content of J7: {file2['j7_value']}")
            print(f"  Full Path: {file2['file_path']}")

            # Delete the later copy. Each duplicate is attempted exactly once;
            # a failure (missing file, permission denied, file in use) is
            # reported and the loop moves on to the next duplicate.
            try:
                os.remove(file2['file_path'])
            except OSError as e:
                print(f"Error deleting file {file2['file_path']}: {e}")
            else:
                print(f"Deleted file: {file2['file_path']}")



Shared F16 (first 8 characters) content: 04/28/23

Flagged files with the same F16 content: 04/28/23
File 1: CRITERIA 04.29.2023.xls
  Size: 2294784 bytes
  Content of J7: 4/30/2023
  Full Path: C:\Users\mvalsania25\Desktop\Destination_X\CRITERIA 04.29.2023.xls
File 2: CRITERIA 04.29.2023_1.xls
  Size: 2294784 bytes
  Content of J7: 4/30/2023
  Full Path: C:\Users\mvalsania25\Desktop\Destination_X\CRITERIA 04.29.2023_1.xls
Deleted file: C:\Users\mvalsania25\Desktop\Destination_X\CRITERIA 04.29.2023_1.xls

Shared F16 (first 8 characters) content: 05/13/23

Flagged files with the same F16 content: 05/13/23
File 1: CRITERIA 05.14.2023.xls
  Size: 2448384 bytes
  Content of J7: 5/15/2023
  Full Path: C:\Users\mvalsania25\Desktop\Destination_X\CRITERIA 05.14.2023.xls
File 2: CRITERIA 05.14.2023_1.xls
  Size: 2448384 bytes
  Content of J7: 5/15/2023
  Full Path: C:\Users\mvalsania25\Desktop\Destination_X\CRITERIA 05.14.2023_1.xls
Deleted file: C:\Users\mvalsania25\Desktop\Destination_X\CRIT