<a href="https://colab.research.google.com/github/mattwantshouses/name_parsing/blob/main/Prod_Combine_Scraper_Results_%2B_Foreclosure_file_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Combine Scraper results with Foreclosure info
### Combine the base file (ex: Foreclosures) with the scraper file to retain all the info

This will link both files together based on the Parcel ID.
The updated code keeps only the records found in the smaller file (the one with fewer rows) and adds the matching columns from the larger file to them.

Both files must have a "Parcel ID" column. The column name doesn't have to match exactly, but it must contain both "parcel" and "id" (case-insensitive), e.g. `parcel_id` or `Parcel ID:`.

In [None]:
import pandas as pd
import tkinter as tk
from tkinter import filedialog
import os
from google.colab import files
import io
from datetime import datetime
import pytz

In [None]:
# Version 2 - only matches the records present in the smaller of the two files

# 1. Function to find the parcel ID column
def find_parcel_id_column(df):
    """Return the first column label that looks like a parcel ID column.

    A label matches when its lower-cased text contains both ``"parcel"`` and
    ``"id"`` as substrings (e.g. ``"parcel_id"`` or ``"Parcel ID:"``). Because
    this is a plain substring test, a label such as ``"Parcel Width"`` also
    matches (``"width"`` contains ``"id"``).

    Args:
        df: DataFrame whose column labels are searched in order.

    Returns:
        The matching column label exactly as it appears in ``df.columns``, or
        ``None`` when no label matches.
    """
    for col in df.columns:
        # Column labels are not guaranteed to be strings (numeric or date
        # headers are possible), so normalise via str() before matching.
        label = str(col).lower()
        if 'parcel' in label and 'id' in label:
            return col
    return None

# 2. Function to load a file (CSV or Excel)
def load_file(file_name, file_content):
    """Read an uploaded CSV or Excel file into a DataFrame of strings.

    Args:
        file_name: Name of the uploaded file. Its extension selects the parser:
            ``.csv`` for CSV, ``.xls``/``.xlsx`` for Excel. The extension is
            matched case-insensitively.
        file_content: Raw bytes of the uploaded file.

    Returns:
        The parsed DataFrame with every column read as ``str``, or ``None``
        when the extension is unsupported or parsing fails. In the failure
        case the error is printed rather than raised.
    """
    # Match on a lower-cased name so "DATA.CSV" / "Report.XLSX" are accepted.
    normalized_name = file_name.lower()
    try:
        if normalized_name.endswith('.csv'):
            df = pd.read_csv(io.BytesIO(file_content), dtype=str)
        elif normalized_name.endswith(('.xls', '.xlsx')):
            df = pd.read_excel(io.BytesIO(file_content), dtype=str)
        else:
            raise ValueError(f"Unsupported file format for {file_name}. Please use CSV or Excel files.")
        return df
    except Exception as e:
        # Best-effort loader: report the problem and let the caller decide.
        print(f"Error loading file {file_name}: {e}")
        return None

# 3. Main script
def main():
    """Upload two files, left-join them on their parcel ID columns, and download the result.

    Steps:
      1. Prompt for an upload via ``files.upload()`` and require exactly two files.
      2. Load each file with ``load_file``; stop if either fails to load.
      3. Put the file with fewer rows on the left (ties keep upload order).
      4. Locate a parcel ID column in each file with ``find_parcel_id_column``.
      5. Left-merge so every row of the smaller file is kept; overlapping
         column names get the suffixes ``_1`` (left) and ``_2`` (right).
      6. Write the merged frame to a timestamped ``.xlsx`` and trigger a download.

    Every failure path prints a message and returns ``None``.
    """
    print("Please upload both files (CSV or Excel):")
    uploaded_files = files.upload()

    upload_count = len(uploaded_files)
    if upload_count != 2:
        print(f"Error: Expected 2 files, but {upload_count} were uploaded. Please run the cell again and upload exactly 2 files.")
        return

    # Keep each file name paired with its DataFrame, in upload order.
    loaded = []
    for name, content in uploaded_files.items():
        frame = load_file(name, content)
        if frame is None:
            print(f"Error loading {name}. Exiting.")
            return
        loaded.append((name, frame))

    # The file with fewer rows becomes the left side of the merge.
    (file_name_left, df_left), (file_name_right, df_right) = loaded
    if len(df_left) > len(df_right):
        file_name_left, file_name_right = file_name_right, file_name_left
        df_left, df_right = df_right, df_left

    # 4. Find parcel ID columns
    parcel_id_col_left = find_parcel_id_column(df_left)
    parcel_id_col_right = find_parcel_id_column(df_right)

    missing_key = parcel_id_col_left is None or parcel_id_col_right is None
    if missing_key:
        print("Error: Unable to find a suitable parcel ID column in one or both files.")
        return

    print(f"Using '{parcel_id_col_left}' from {file_name_left} and '{parcel_id_col_right}' from {file_name_right} as parcel ID columns.")

    # 5. Merge dataframes (left join keeps every row of the smaller file)
    try:
        merged_df = pd.merge(
            df_left,
            df_right,
            left_on=parcel_id_col_left,
            right_on=parcel_id_col_right,
            how='left',
            suffixes=('_1', '_2'),
        )
    except Exception as e:
        print(f"Error merging dataframes: {e}")
        return

    # 6. Save merged file
    try:
        # Timestamp the output name using US Eastern time.
        eastern = pytz.timezone('US/Eastern')
        stamp = datetime.now(eastern).strftime("%m-%d-%Y_%H-%M-%S")
        output_filename = f"Combined Base-Scraper Results_{stamp}.xlsx"

        # Write the workbook, then hand it to the browser for download.
        merged_df.to_excel(output_filename, index=False)
        files.download(output_filename)

        print(f"Merged file '{output_filename}' has been created and is ready for download.")
        print("If the download doesn't start automatically, please check your browser's download settings.")
    except Exception as e:
        print(f"Error saving merged file: {e}")

# Run the upload/merge/download workflow only when this code is executed
# directly (e.g. as a script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Please upload both files (CSV or Excel):


Saving Foreclosures 07-08-24.csv to Foreclosures 07-08-24.csv
Saving khaliq foreclosure scraper results 07-08-24 (1).xlsx to khaliq foreclosure scraper results 07-08-24 (1).xlsx
Using 'parcel_id' from khaliq foreclosure scraper results 07-08-24 (1).xlsx and 'Parcel ID:' from Foreclosures 07-08-24.csv as parcel ID columns.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Merged file 'Combined Base-Scraper Results_07-08-2024_23-27-48.xlsx' has been created and is ready for download.
If the download doesn't start automatically, please check your browser's download settings.


# Next iteration

To change the output filename to "Combined " + filename of the smaller file, we need to make modifications to sections 3 and 6 of the script. Here are the changes:

3. Main script (add these lines after loading the files):
```python
# After loading the files, add these lines:
file_sizes = {file_name: len(df) for file_name, df in zip(file_names, dfs)}
smaller_file_name = min(file_sizes, key=file_sizes.get)
smaller_file_name_without_extension = os.path.splitext(smaller_file_name)[0]
```

6. Save merged file (replace the existing code with this):
```python
# 6. Save merged file
try:
    # Get current date and time in EST
    est = pytz.timezone('US/Eastern')
    current_time = datetime.now(est).strftime("%m-%d-%Y_%H-%M-%S")
    output_filename = f"Combined {smaller_file_name_without_extension}_{current_time}.xlsx"

    # Save the DataFrame to an Excel file
    merged_df.to_excel(output_filename, index=False)

    # Download the file
    files.download(output_filename)

    print(f"Merged file '{output_filename}' has been created and is ready for download.")
    print("If the download doesn't start automatically, please check your browser's download settings.")

except Exception as e:
    print(f"Error saving merged file: {e}")

```

These changes will:
1. Determine which file is smaller based on the number of rows in each DataFrame.
2. Extract the filename of the smaller file without its extension.
3. Use this filename in the output filename, combined with "Combined " at the beginning and the current date and time at the end.

The rest of the script remains unchanged. These modifications will achieve the desired output filename format while maintaining all other functionality.