# In [2]:
import re
import time
import cProfile
import io
import sys
from pathlib import Path
import pstats
import csv  # Import the csv module

def is_valid_phone_number(PHONE, country_code="US"):
    """
    Validates a PHONE based on several criteria, with a focus on US numbers.

    For ``country_code`` "US" or "1" the number must reduce to exactly ten
    digits after an optional leading "+1" or "1" is removed, must not use one
    of the rejected area codes, and must not use the 555 exchange outside
    555-0100..555-0199. For any other ``country_code`` the number must start
    with "+" and contain at least 8 characters once formatting is stripped.
    In both cases a number is rejected when characters 0-2, 3-5 or 6-8 of the
    cleaned string are three identical characters.

    Args:
        PHONE (str): The PHONE to validate.
        country_code (str, optional): The country code.  Defaults to "US".
                Currently, this function primarily focuses on US number validation.
                More comprehensive international validation would require extensive
                country-specific rules and databases.

    Returns:
        bool: True if the PHONE is considered valid, False otherwise.
              Non-string input is reported as invalid (False).
    """
    if not isinstance(PHONE, str):
        return False

    # Remove any non-digit characters (except '+')
    cleaned_number = re.sub(r"[^+\d]", "", PHONE)
    is_us = country_code in ("US", "1")

    # Basic country code handling (US and very basic international)
    if is_us:
        # Accept both "+1XXXXXXXXXX" and "1XXXXXXXXXX" as a US prefix.
        if cleaned_number.startswith("+1"):
            cleaned_number = cleaned_number[2:]
        elif cleaned_number.startswith("1"):
            cleaned_number = cleaned_number[1:]
        # US numbers must be exactly 10 digits. The isdigit() test rejects a
        # leftover '+', which would otherwise break the int() parsing below.
        if len(cleaned_number) != 10 or not cleaned_number.isdigit():
            return False
    elif cleaned_number.startswith("+"):
        # Very basic check for international numbers.  A more robust check
        # would require a database of country codes and number lengths.
        if len(cleaned_number) < 8:  # Minimum length for *some* international numbers
            return False
    else:
        # If it is not US number and does not start with +, then it is invalid
        return False

    # US Specific Checks
    if is_us:
        # Area code restrictions (some area codes are invalid or not widely used)
        invalid_area_codes = (
            0,    # "000": not a valid area code
            200,  # Not a valid area code
            300,  # Not a valid area code
            400,  # Not a valid area code
            500,  # Not a valid area code
            555,  # Used for fictional numbers
            600,  # Not a valid area code
            700,  # Used for telecommunications services
            800,  # Toll-free, but should be checked separately
            900,  # Pay-per-call services
        )
        if int(cleaned_number[0:3]) in invalid_area_codes:
            return False

        # Check for 555 exchange (except for 555-0100 through 555-0199).
        # NOTE: the repeated-triple check below still rejects every 555
        # exchange, because "555" is itself three identical digits.
        exchange = int(cleaned_number[3:6])
        if exchange == 555 and not (100 <= int(cleaned_number[6:]) <= 199):
            return False

    # Check for repeating digits (potential fake number) - more lenient.
    # Slices are used instead of direct indexing so that an 8-character
    # international number (which has no character at index 8) cannot raise
    # IndexError; an incomplete group is simply not a repeated triple.
    for start in (0, 3, 6):
        group = cleaned_number[start:start + 3]
        if len(group) == 3 and len(set(group)) == 1:
            return False

    return True


def validate_phone_numbers(phone_numbers, country_code="US"):
    """
    Filters an iterable of PHONEs down to the ones that pass validation.

    Each entry is passed, together with ``country_code``, to
    ``is_valid_phone_number``; entries for which that call is truthy are
    kept in their original order and original (unformatted) form.

    Args:
        phone_numbers (list): The PHONEs to validate.
        country_code (str, optional): The country code forwarded to the
            validator. Defaults to "US".

    Returns:
        list: The PHONEs that were judged valid.
    """
    return [
        candidate
        for candidate in phone_numbers
        if is_valid_phone_number(candidate, country_code)
    ]



def identify_potentially_fake_numbers(phone_numbers, country_code="US"):
    """
    Identifies potentially fake PHONEs based on more aggressive criteria.

    Numbers that do not have a recognisable shape are skipped rather than
    flagged. For ``country_code`` "US" or "1" that means anything that is not
    exactly ten digits after an optional "+1"/"1" prefix is removed; for other
    country codes it means anything not starting with "+" or shorter than 8
    characters once formatting is stripped. Non-string entries are skipped.

    A remaining number is flagged when its cleaned form contains a repeated
    triple at positions 0-2, 3-5 or 6-8, four identical characters at
    positions 4-7, or two equal adjacent 3-character groups (1-3/4-6,
    0-2/3-5, 3-5/6-8). US numbers are additionally flagged for a rejected
    area code or a 555 exchange outside 555-0100..555-0199.

    Args:
        phone_numbers (list): A list of PHONEs to check.
        country_code (str, optional): The country code. Defaults to "US".

    Returns:
        list: The potentially fake PHONEs, in input order and in their
              original (unformatted) form. Empty when nothing is flagged.
    """

    def _is_run(chunk, length):
        # True when `chunk` is exactly `length` identical characters. The
        # length test keeps truncated slices of short numbers from matching.
        return len(chunk) == length and len(set(chunk)) == 1

    is_us = country_code in ("US", "1")
    invalid_area_codes = (0, 200, 300, 400, 500, 555, 600, 700, 800, 900)

    fake_numbers = []
    for number in phone_numbers:
        if not isinstance(number, str):
            continue
        cleaned_number = re.sub(r"[^+\d]", "", number)

        if is_us:
            # Accept both "+1XXXXXXXXXX" and "1XXXXXXXXXX" as a US prefix.
            if cleaned_number.startswith("+1"):
                cleaned_number = cleaned_number[2:]
            elif cleaned_number.startswith("1"):
                cleaned_number = cleaned_number[1:]
            # isdigit() rejects a leftover '+', which would break int() below.
            if len(cleaned_number) != 10 or not cleaned_number.isdigit():
                continue
        elif cleaned_number.startswith("+"):
            if len(cleaned_number) < 8:
                continue
        else:
            continue

        # Check for repeating digits (more strict). Slices replace direct
        # indexing so 8-character international numbers cannot raise
        # IndexError. A run of four at positions 0-3 needs no separate test:
        # it always contains the repeated triple at positions 0-2.
        suspicious = (
            _is_run(cleaned_number[0:3], 3)
            or _is_run(cleaned_number[3:6], 3)
            or _is_run(cleaned_number[6:9], 3)
            or _is_run(cleaned_number[4:8], 4)
            or cleaned_number[1:4] == cleaned_number[4:7]
            or cleaned_number[0:3] == cleaned_number[3:6]
            or cleaned_number[3:6] == cleaned_number[6:9]
        )

        if not suspicious and is_us:
            # Check for invalid area codes
            if int(cleaned_number[0:3]) in invalid_area_codes:
                suspicious = True
            else:
                # Check for 555 exchange (more strict)
                exchange = int(cleaned_number[3:6])
                suspicious = exchange == 555 and not (
                    100 <= int(cleaned_number[6:]) <= 199
                )

        if suspicious:
            fake_numbers.append(number)

    # Returned only after every number has been examined; the original
    # returned from inside the loop and so inspected at most one number.
    return fake_numbers

def process_phone_numbers_from_file(input_file_path, valid_output_file_path, fake_output_file_path, country_code="US"):
    """
    Reads PHONEs from a CSV file, validates them, and writes the rows with
    valid numbers and the rows with rejected numbers to separate CSV files.

    The input must have a header row containing a 'PHONE' column. Rows that
    are too short to contain that column, or whose PHONE cell is blank, are
    dropped. Every remaining row is classified with ``is_valid_phone_number``
    and written, unchanged, under the original header to one of the two
    output files. Problems are reported with ``print`` and end processing
    early; nothing is raised for a missing file or a failed write.

    Rows are parsed by the ``csv`` module from start to finish, so quoted
    fields containing commas or line breaks stay in one piece and no row is
    cut at a read-buffer boundary.

    Args:
        input_file_path (str): The path to the input file containing PHONEs.
        valid_output_file_path (str): The path to the output file for valid numbers.
        fake_output_file_path (str): The path to the output file for potentially
            fake numbers.
        country_code (str, optional): The country code. Defaults to "US".

    Returns:
        None
    """
    input_path = Path(input_file_path)
    valid_output_path = Path(valid_output_file_path)
    fake_output_path = Path(fake_output_file_path)
    phone_data = []
    line_count = 0

    try:
        # newline='' is what the csv module expects so it can handle line
        # breaks inside quoted fields itself; 'utf-8-sig' reads plain UTF-8
        # and also drops a leading byte-order mark so the first header cell
        # still compares equal to 'PHONE'.
        with input_path.open('r', newline='', encoding='utf-8-sig') as infile_csv:
            csv_reader = csv.reader(infile_csv)
            header = next(csv_reader, None)  # None when the file is empty
            if header is None or 'PHONE' not in header:
                print("Error: 'PHONE' column not found in input file.  Please ensure the file has a 'PHONE' column.")
                return  # Stop processing if the required column is not found.

            phone_index = header.index('PHONE')  # looked up once, not per row
            for row_data in csv_reader:
                if len(row_data) <= phone_index:
                    continue  # row too short to have a PHONE cell
                if not row_data[phone_index].strip():
                    continue  # blank PHONE cell
                phone_data.append(row_data)
                line_count += 1
                if line_count % 1000 == 0:
                    print(F"Processed {line_count} lines...")

    except FileNotFoundError:
        print(F"Error: Input file not found at {input_file_path}.  Please provide a valid file path.")
        return  # Stop processing if the file is not found.
    except Exception as e:
        # Top-level boundary for read/decode/parse problems: report and stop.
        print(F"An error occurred while reading the input file: {e}.  Please check the file and try again.")
        return

    # Classify only after the whole file was read successfully, so a read
    # error never leaves partially written output behind.
    valid_data = []
    fake_data = []
    for row_data in phone_data:
        if is_valid_phone_number(row_data[phone_index], country_code):
            valid_data.append(row_data)
        else:
            fake_data.append(row_data)

    # Write to CSV files
    try:
        with valid_output_path.open('w', newline='', encoding='utf-8') as valid_outfile_csv:
            csv_writer = csv.writer(valid_outfile_csv)
            csv_writer.writerow(header)
            csv_writer.writerows(valid_data)
        print(F"Valid PHONEs and associated data written to {valid_output_file_path}")
    except Exception as e:
        print(F"Error writing to valid numbers file: {e}.  Please check the output path and permissions.")
        return

    try:
        with fake_output_path.open('w', newline='', encoding='utf-8') as fake_outfile_csv:
            csv_writer = csv.writer(fake_outfile_csv)
            csv_writer.writerow(header)
            csv_writer.writerows(fake_data)
        print(F"Potentially fake PHONEs and associated data written to {fake_output_file_path}")
    except Exception as e:
        print(F"Error writing to fake numbers file: {e}. Please check the output path and permissions.")
        return

    print("Processing complete.  Please check the output files.") # Added to show completion


if __name__ == "__main__":

    # NOTE(review): input_file, valid_output_file, fake_output_file and
    # country_code are not assigned anywhere in the visible source; presumably
    # they are set earlier in the notebook session -- confirm before running
    # this as a standalone script.

    # Refuse to start unless all three paths are non-empty.
    if not (input_file and valid_output_file and fake_output_file):
        print("Error: All file paths are required.  Please provide input and output file paths.")
        sys.exit(1)  # non-zero exit status signals the failure

    # Fail early, before profiling starts, when the input is missing.
    if not Path(input_file).exists():
        print(f"Error: Input file not found at {input_file}.  Please check the file path.")
        sys.exit(1)

    # Run the whole pipeline under cProfile.
    profiler = cProfile.Profile()
    profiler.enable()
    process_phone_numbers_from_file(input_file, valid_output_file, fake_output_file, country_code)
    profiler.disable()

    # Report the 20 entries with the highest cumulative time.
    stats = pstats.Stats(profiler)
    stats.sort_stats('cumulative')
    stats.print_stats(20)


# Captured notebook output (no print call in the code above emits these lines):
# Saved to C:\RE-folder\data\RE-OH.csv
# Saved to C:\RE-folder\data\RE-IL.csv
# Saved to C:\RE-folder\data\RE-NY.csv
# Saved to C:\RE-folder\data\RE-LA.csv
# Saved to C:\RE-folder\data\RE-MS.csv
# Saved to C:\RE-folder\data\RE-GA.csv
# Saved to C:\RE-folder\data\RE-NC.csv
