01 Read forty datasets

In [6]:
import pandas as pd
from typing import Union, Tuple, List, Dict
import logging
import re
import numpy as np
import string
from IPython.display import display, HTML

# Load the correct file to get the FORTY dataset details
forty_datasets_file_path = 'Fortydatasets.xlsx'

# Load the sheet to obtain the details of all datasets
forty_datasets_df = pd.read_excel(forty_datasets_file_path)

# Display the first few rows to verify the loaded data
forty_datasets_df.head()

Unnamed: 0,index,name,area,url,instances,attributes,year,#webhits,Order,New_area,...,names_file_format,attribute_info,source,data_set_information,relevant_papers,papers_that_cite_this_data_set,num_papers,#numpapers,2#,FinalRank
0,52,Iris,Life,https://archive.ics.uci.edu/ml/datasets/Iris,150.0,4.0,1988.0,1,,,...,,,,,,,100,1.0,2.0,1.0
1,107,Wine,Physical,https://archive.ics.uci.edu/ml/datasets/Wine,178.0,13.0,1991.0,5,,,...,,,,,,,40,13.0,18.0,4.0
2,92,Spambase,Computer,https://archive.ics.uci.edu/ml/datasets/Spambase,4601.0,57.0,1999.0,25,,,...,,,,,,,4,75.0,100.0,24.0
3,45,Heart Disease,Life,https://archive.ics.uci.edu/ml/datasets/Heart+...,303.0,75.0,1988.0,4,,,...,,,,,,,58,3.0,7.0,2.0
4,2,Adult,Social,https://archive.ics.uci.edu/ml/datasets/Adult,48842.0,14.0,1996.0,2,,,...,,,,,,,51,8.0,10.0,3.0


02 Read Analysed Columns and define dataset index

In [7]:
# Load the 'AnalysedColumns' sheet to identify the columns 
analysed_columns_file_path = 'AnalysedColumns.xlsx' # for 50 datasets
#analysed_columns_file_path = 'AnalysedColumns1411.xlsx' # for 10 datasets
#analysed_columns_file_path = 'AnalysedColumns2111.xlsx' # for 92 fake data

analysed_columns_df = pd.read_excel(analysed_columns_file_path)

# Define the dataset index for which you want the dataset file URL and name
desired_dataset_index = 275 # Replace with the actual index you are interested in 

#changed_dataset_local_path = 'iris_CHANGED.txt' # 52
#changed_dataset_local_path = 'glass_CHANGED.txt' # 42
#changed_dataset_local_path = 'letter-recognition_CHANGED.txt' #  58
#changed_dataset_local_path = 'Rice_Cammeo_Osmancik.txt' #545 - Obtained from @DATA part 
#changed_dataset_local_path = 'chronic_kidney_disease_full.txt' #336 - Obtained from @DATA part

from datetime import datetime
print(f"desired_dataset_index: {desired_dataset_index} Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

desired_dataset_index: 275 Last run on: 2024-04-07 21:40:09


03 Get dataset file URL

In [8]:
import pandas as pd
from typing import Union, Tuple
import logging

logging.basicConfig(level=logging.INFO)

def get_dataset_file_url(excel_file_path: str, dataset_index: int) -> Union[Tuple[str, str], Exception]:
    """
    Load dataset details from an Excel file and return the dataset file URL and name for a specific dataset index.

    Parameters:
    - excel_file_path (str): The path to the Excel file containing dataset details.
    - dataset_index (int): The index number of the dataset for which to get the URL.

    Returns:
    - Tuple or Exception: A tuple containing the dataset file URL and name, or an exception if something goes wrong.
    """
    try:
        # Load the Excel sheet into a DataFrame
        datasets_df = pd.read_excel(excel_file_path)
        
        # Check if 'index', 'dataset_file_url', and 'name' columns exist in the DataFrame
        required_columns = ['index', 'dataset_file_url', 'name']
        for col in required_columns:
            if col not in datasets_df.columns:
                return f"Required column '{col}' does not exist in the DataFrame"
        
        # Extract the dataset file URL and name for the given dataset index
        dataset_details = datasets_df.loc[datasets_df['index'] == dataset_index, ['dataset_file_url', 'name']]
        if dataset_details.empty:
            return f"No dataset found with index {dataset_index}"

        dataset_file_url = dataset_details['dataset_file_url'].values[0]
        dataset_name = dataset_details['name'].values[0]

        #logging.info(f"Dataset name: {dataset_name}, File URL: {dataset_file_url}")
        return (dataset_file_url, dataset_name)
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        return e

# Define the Excel file path where the dataset details are stored
excel_file_path = forty_datasets_file_path

# Call the function to get the dataset details
dataset_details = get_dataset_file_url(excel_file_path, desired_dataset_index)

# Check the type of the return value to see if it's a tuple (indicating success)
if isinstance(dataset_details, tuple):
    dataset_file_url, dataset_name = dataset_details
    print(f"The dataset file URL for dataset {dataset_name} (index {desired_dataset_index}) is: {dataset_file_url}")
else:
    print(f"An error occurred or the dataset was not found: {dataset_details}")

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


The dataset file URL for dataset Bike Sharing Dataset (index 275) is: https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
Last run on: 2024-04-07 21:40:09


04 Load dataset

In [9]:
import pandas as pd
import requests
import io
from io import BytesIO
import zipfile
import tarfile
import gzip
import logging
import csv
from typing import Optional, Union
import os
import tempfile
import shutil
import sys

logging.basicConfig(level=logging.INFO)

def is_header_for_csv(line, delimiter=' '):
    """
    Determine if a line is likely a header by checking if there are no numeric values.
    If at least one numeric value is found, the line is considered not a header (i.e., a data line).
    """
    # Regex to match quoted strings or non-whitespace sequences
    pattern = re.compile(r'\".*?\"|\S+')

    elements = pattern.findall(line.replace(delimiter, ' '))  # Replace delimiter with space for easier parsing

    # Check if any element is a number
    is_numeric_present = any(element.replace('.', '', 1).lstrip('-').isdigit() for element in elements)

    # Check for replicated elements
    unique_elements = set(elements)
    replicated_elements = len(elements) - len(unique_elements)

    # If no numeric value is found and there are replicated elements, consider this a header line
    is_header = not is_numeric_present and replicated_elements == 0

    return is_header


def load_csv(file_content: Union[str, bytes], header: Optional[int], na_values: Optional[Union[str, list]]) -> pd.DataFrame:
    # Adjust to decode bytes if necessary
    if isinstance(file_content, bytes):
        try:
            file_content_decoded = file_content.decode('utf-8')
        except UnicodeDecodeError:
            file_content_decoded = file_content.decode('ISO-8859-1')
    else:
        file_content_decoded = file_content
    
    # Preprocess to replace multiple tabs with a single tab
    #file_content_processed = re.sub('\t+', '\t', file_content_decoded)
    file_content_processed = re.sub('[ \t]+', ' ', file_content_decoded)

    file_stream = io.StringIO(file_content_processed)
        
    try:
        dialect = csv.Sniffer().sniff(file_stream.readline())
        delimiter = dialect.delimiter
        file_stream.seek(0)
    except csv.Error:
        delimiter = ','
        logging.info("Falling back to default delimiter ',' due to detection failure.")
  
    # Use the heuristic to decide if the first line is likely a header
    first_line = file_stream.readline()
    # Print the first row for inspection
    print("First row for inspection:", first_line)
    is_header_row = is_header_for_csv(first_line, delimiter)
    print('Is first row header ?',is_header_row )
    file_stream.seek(0)  # Reset to start of file after reading the first line
    
    header_decision = 0 if is_header_for_csv(first_line, delimiter) else None
    #header_decision = None if is_data_row else 0
    # Regex pattern to match quoted strings or non-whitespace sequences
    pattern = re.compile(r'\".*?\"|\S+')

    df = pd.read_csv(file_stream, header=header_decision, delimiter=delimiter, na_values=na_values, keep_default_na=False)
   
  # Check if the DataFrame needs re-parsing with regex pattern
    if len(df.columns) < 2:
        file_stream.seek(0)
        lines = file_stream.readlines()
        parsed_data = [pattern.findall(line) for line in lines]

        if header is None and header_decision == 0:
            header_row = parsed_data.pop(0)  # Use the first row as header
        else:
            header_row = None  # Let pandas create default headers or use provided header index

        df = pd.DataFrame(parsed_data, columns=header_row)
        df = df.apply(pd.to_numeric, errors='ignore')  # Attempt to correct data types
    return df

def is_header_for_excel(first_row: pd.DataFrame) -> bool:
    """
    Determines if the first row of a DataFrame is likely to be a header by checking the entire row's content.

    Args:
        first_row (pd.DataFrame): DataFrame containing at least one row of data to inspect.

    Returns:
        bool: True if the first row is likely a header (i.e., contains mostly non-numeric data), False if it's likely data.
    """
    # Check if the majority of the items in the first row are non-numeric
    non_numeric_count = first_row.applymap(lambda x: not isinstance(x, (int, float))).iloc[0].sum()
    
    # Determine if the first row is likely a header based on the proportion of non-numeric items
    is_header = non_numeric_count > len(first_row.columns) / 2
    return is_header

def load_excel(file_content: Union[str, bytes, io.BytesIO], na_values: Optional[Union[str, list]] = None, skip_rows: Union[int, list] = 0, parse_dates: bool = False) -> pd.DataFrame:
    # Ajuste para o file_content ser um objeto BytesIO, se for bytes
    if isinstance(file_content, bytes):
        file_content = BytesIO(file_content)  # Envolve bytes em BytesIO
    try:
        # Load the first row to check if it's likely to be a header
        first_row = pd.read_excel(file_content, nrows=1, header=None)
        
        # Print the first row for inspection
        print("First row for inspection:", first_row.iloc[0].values)

        # Execute the heuristic
        likely_header = is_header_for_excel(first_row)

        # Decide on using the first row as header based on heuristic
        #header_decision = None if likely_header else 0  
        header_decision = 0 if likely_header else None

        # Reset file_content to read from the beginning if it's a stream
        if isinstance(file_content, io.BytesIO):
            print("Resetting stream position")  # Diagnostic print
            file_content.seek(0)

        # Load the full Excel file with determined header option
        df = pd.read_excel(file_content, na_values=na_values, skiprows=skip_rows, header=header_decision, parse_dates=parse_dates)
        return df
    except ValueError as e:
        logging.error(f"Excel loading error: {e}")
        return pd.DataFrame()
        
def download_and_extract(url: str) -> str:
    """
    Downloads an archive from the given URL and extracts it into a temporary directory.
    Handles .zip, .tar.gz, and .gz files. Returns the path to the directory or file.
    """
    temp_dir = tempfile.mkdtemp()
    response = requests.get(url, stream=True)
    file_name = os.path.basename(url)
    temp_file_path = os.path.join(temp_dir, file_name)
    
    with open(temp_file_path, 'wb') as file:
        shutil.copyfileobj(response.raw, file)
    
    if file_name.endswith('.zip'):
        with zipfile.ZipFile(temp_file_path, 'r') as archive:
            archive.extractall(temp_dir)
    elif file_name.endswith(('.tar.gz', '.tgz')):
        with tarfile.open(temp_file_path, 'r:gz') as archive:
            archive.extractall(temp_dir)
    elif file_name.endswith('.gz'):
        # Handle single .gz files by extracting to the same directory
        extracted_file_path = temp_file_path[:-3]  # Remove .gz extension
        with gzip.open(temp_file_path, 'rb') as f_in, open(extracted_file_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.remove(temp_file_path)  # Remove the original .gz file
        return extracted_file_path  # Direct path to the extracted file for .gz
    
    os.remove(temp_file_path)  # Clean up archive file after extraction
    return temp_dir  # Path to directory with extracted files

def select_file_from_extracted(directory: str) -> Optional[str]:
    """
    Recursively lists all files in the extracted directory and prompts the user to select one.
    """
    files_list = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            files_list.append(os.path.join(root, file))

    if not files_list:
        logging.error("No files found in the directory.")
        return None

    for index, file in enumerate(files_list, start=1):
        relative_path = os.path.relpath(file, directory)  # Show relative path for clarity
        print(f"{index}: {relative_path}", flush=True)

    try:
        file_index = int(input("Enter the number of the file you want to load: ")) - 1
        if 0 <= file_index < len(files_list):
            selected_relative_path = os.path.relpath(files_list[file_index], directory)  # Get relative path
            print(f"Selected File: {selected_relative_path}")  # Print relative path
            return files_list[file_index]  # Return full path for further processing
        else:
            logging.error("Invalid selection.")
    except ValueError:
        logging.error("Please enter a valid number.")
    
    return None

def fetch_file_content(url: str) -> str:
    response = requests.get(url, stream=True)
    try:
        content = response.content.decode('utf-8')
    except UnicodeDecodeError:
        content = response.content.decode('ISO-8859-1')
    return content

def load_dataset(file_path_or_url: str, na_values: Optional[Union[str, list]] = None, skip_rows: Union[int, list] = 0, parse_dates: bool = False) -> Optional[pd.DataFrame]:
    if file_path_or_url.startswith(('http://', 'https://')):
        if file_path_or_url.endswith(('.csv', '.txt', '.data')):
            response = requests.get(file_path_or_url)
            file_content = response.content
            return load_csv(file_content, header=None, na_values=na_values)
        elif file_path_or_url.endswith(('.xlsx', '.xls')):
            response = requests.get(file_path_or_url)
            return load_excel(response.content, na_values=na_values, skip_rows=skip_rows, parse_dates=parse_dates)
    else:
        # Handle local file path
        if os.path.exists(file_path_or_url):
            if file_path_or_url.endswith(('.csv', '.txt', '.data')):
                with open(file_path_or_url, 'r', encoding='utf-8') as f:
                    file_content = f.read()
                return load_csv(file_content, header=None, na_values=na_values)
            elif file_path_or_url.endswith(('.xlsx', '.xls')):
                with open(file_path_or_url, 'rb') as f:
                    file_content = f.read()
                return load_excel(file_content, na_values=na_values, skip_rows=skip_rows, parse_dates=parse_dates)
        else:
            logging.error(f"File does not exist: {file_path_or_url}")
            return None
    # Handling archives
    extracted_path = download_and_extract(file_path_or_url)
    if os.path.isdir(extracted_path):
        selected_file = select_file_from_extracted(extracted_path)
        if selected_file and selected_file.endswith(('.csv', '.txt', '.data')):
            with open(selected_file, 'rb') as f:
                file_content = f.read()
            return load_csv(file_content, header=None, na_values=na_values)
        elif selected_file and selected_file.endswith(('.xlsx', '.xls')):
            with open(selected_file, 'rb') as f:
                file_content = f.read()
            return load_excel(file_content, na_values=na_values, skip_rows=skip_rows, parse_dates=parse_dates)
    elif os.path.isfile(extracted_path):
        if extracted_path.endswith(('.csv', '.txt', '.data')):
            with open(extracted_path, 'rb') as f:
                file_content = f.read()
            return load_csv(file_content, header=None, na_values=na_values)
        elif extracted_path.endswith(('.xlsx', '.xls')):
            with open(extracted_path, 'rb') as f:
                file_content = f.read()
            return load_excel(file_content, na_values=na_values, skip_rows=skip_rows, parse_dates=parse_dates)
        else:
            # Attempt to load the file without assuming an extension, particularly useful for .gz extracted files
            with open(extracted_path, 'rb') as f:
                file_content = f.read()
            try:
                # Attempt to load as CSV first; this part assumes CSV if no extension is found
                return load_csv(file_content, header=None, na_values=na_values)
            except Exception as e:
                logging.error(f"Failed to automatically determine file type for {extracted_path}: {e}")
                return None
    return None


# Example usage
if __name__ == "__main__":
    print(f"Dataset file url: {dataset_file_url}")
    dataset_df = load_dataset(dataset_file_url)

    if dataset_df is not None:
        print("Dataset loaded successfully.")
        print(dataset_df.head())
    else:
        print("Failed to load dataset.")

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        

Dataset file url: https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip


1: day.csv
2: hour.csv
3: Readme.txt
Selected File: hour.csv
First row for inspection: instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt

Is first row header ? True
Dataset loaded successfully.
   instant      dteday  season  yr  mnth  hr  holiday  weekday  workingday  \
0        1  2011-01-01       1   0     1   0        0        6           0   
1        2  2011-01-01       1   0     1   1        0        6           0   
2        3  2011-01-01       1   0     1   2        0        6           0   
3        4  2011-01-01       1   0     1   3        0        6           0   
4        5  2011-01-01       1   0     1   4        0        6           0   

   weathersit  temp   atemp   hum  windspeed  casual  registered  cnt  
0           1  0.24  0.2879  0.81        0.0       3          13   16  
1           1  0.22  0.2727  0.80        0.0       8          32   40  
2           1  0.22  0.2727  0.80        0.0       5 

05 Assign Column names

In [10]:
import pandas as pd
from typing import Union
import logging

logging.basicConfig(level=logging.INFO)

def assign_column_names(analysed_columns_df: pd.DataFrame, dataset_index: int, dataset_df: pd.DataFrame, dataset_name: str = "Unknown") -> Union[pd.DataFrame, Exception]:
    """
    Assign column names to a DataFrame based on a given dataset index from an "AnalysedColumns" DataFrame.

    Parameters:
    - analysed_columns_df (pd.DataFrame): The DataFrame containing analysed columns information.
    - dataset_index (int): The index number of the dataset for which to get the column names.
    - target_df (pd.DataFrame): The DataFrame to which the column names will be assigned.
    - dataset_name (str): The name of the dataset. Default is "Unknown".

    Returns:
    - pd.DataFrame or Exception: The DataFrame with assigned column names or an exception if something goes wrong.
    """
    try:
        # Check if 'index' and 'Column' columns exist in the DataFrame
        if 'index' not in analysed_columns_df.columns or 'Column' not in analysed_columns_df.columns:
            return f"Required columns 'index' or 'Column' do not exist in the DataFrame"

        # Extract the column names for the given dataset index
        column_names = analysed_columns_df.loc[analysed_columns_df['index'] == dataset_index, 'Column'].tolist()
        if not column_names:
            return f"No column names found for dataset index {dataset_index}"

        # Assign the extracted column names to the target DataFrame
        dataset_df.columns = column_names

        logging.info(f"Successfully assigned column names to the dataset '{dataset_name}' for index {dataset_index}")
        return dataset_df
    except Exception as e:
        logging.error(f"An error occurred while assigning column names to the dataset '{dataset_name}': {e}")
        return e

# Assume analysed_columns_df contains the analysed columns information
# Assume target_df is the DataFrame loaded previously
# Assume dataset_name contains the name of the dataset

# Call the function to assign the column names
dataset_df = assign_column_names(analysed_columns_df, desired_dataset_index, dataset_df, dataset_name)

# Display the first few rows to verify the column names have been correctly assigned


from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

dataset_df




INFO:root:Successfully assigned column names to the dataset 'Bike Sharing Dataset' for index 275


Last run on: 2024-04-07 21:40:26


Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17375,17376,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17376,17377,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17377,17378,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


List of functions in the DataQualityIssues class along with their associated DQI (Data Quality Issue) numbers:

is_blank - Not directly associated with a DQI number

handle_blank_empty_null_nan - DQI #1 (Missing Data - Completeness)

handle_predefined_unacceptable_values - DQI #4 (Ambiguous Data - Accuracy, Consistency)

handle_extraneous_data - DQI #5 (Extraneous Data - Consistency, Uniqueness)

handle_street_extraneous_data - DQI #5 (Extraneous Data - Consistency, Uniqueness)

standardize_date - Not directly associated with a DQI number

standardize_date_time - Not directly associated with a DQI number

is_valid_time - Not directly associated with a DQI number

handle_outdated_temporal_data - DQI #6 (Outdated Temporal Data - Timeliness)

handle_outdated_temporal_data_datetime - DQI #6 (Outdated Temporal Data - Timeliness)

handle_duplicates - DQI #9 (Duplicates - Uniqueness)

handle_excessive_distinct_values - DQI #10 (Structural Conflicts - Consistency, Uniqueness)

handle_dates_format - DQI #14 (Different units/representations - Consistency)

handle_datetimes_format - DQI #14 (Different units/representations - Consistency)

handle_negative_values - DQI #15 (Domain Violation - Accuracy)

handle_values_outside_range - DQI #15 (Domain Violation - Accuracy)

handle_floating_point_values - DQI #15 (Domain Violation - Accuracy)

handle_capitalization_format - DQI #15 (Domain Violation - Accuracy)

handle_short_length_values - DQI #15 (Domain Violation - Accuracy)

handle_invalid_months - DQI #15 (Domain Violation - Accuracy)

handle_invalid_weekdays - DQI #15 (Domain Violation - Accuracy)

handle_street_format - DQI #15 (Domain Violation - Accuracy)

is_date_valid - Not directly associated with a DQI number

handle_invalid_dates - DQI #15 (Domain Violation - Accuracy)

is_structurally_valid_date - Not directly associated with a DQI number

handle_invalid_datetimes - DQI #15 (Domain Violation - Accuracy)

standardize_phone_number - Not directly associated with a DQI number

handle_phone_number_format - DQI #15 (Domain Violation - Accuracy)

handle_ip_format - DQI #15 (Domain Violation - Accuracy)

handle_url_format - DQI #15 (Domain Violation - Accuracy)

is_valid_email - Not directly associated with a DQI number

handle_email_format - DQI #15 (Domain Violation - Accuracy)

handle_binary_values - DQI #15 (Domain Violation - Accuracy)

handle_non_numeric_values - DQI #17 (Wrong Data Type - Consistency)

handle_non_alphanumeric_values - DQI #17 (Wrong Data Type - Consistency)

handle_non_string_values - DQI #17 (Non-String Data Type - Consistency)

handle_alphanumeric_consistency - DQI #17 (Wrong Data Type - Consistency)

handle_uniqueness_violation - DQI #19 (Uniqueness Violation - Uniqueness)

handle_special_characters - DQI #21 (Use of Special Characters - Consistency)

06 Data Quality Issues

In [11]:
import numpy as np
import pandas as pd
import re
import string
from datetime import datetime, time  

min_valid_year = 1800
max_valid_year = 2100

class DataQualityIssues:

    @staticmethod    
    def is_blank(x):
        # Function to determine if a value is blank
        x_str = str(x).strip()
        return pd.isnull(x) or x_str == '' or x_str.lower() == 'null' or x_str in ['""', "''", '" "', "' '"]

    @staticmethod
    def handle_blank_empty_null_nan(df, column):          
        # Get indices for blank/empty/null/NaN values
        blank_indices = df[df[column].apply(DataQualityIssues.is_blank)].index

        # Convert 'nan' to a string representation for display
        issue_data = []
        for index in blank_indices:
            val = df.at[index, column]
            if isinstance(val, float) and np.isnan(val):
                # Convert NaN floats to a string for display
                issue_data.append((index, ''))  # Represent empty values as empty strings
            else:
                # Use the original value
                issue_data.append((index, val))

        total_issues = len(issue_data)

        if issue_data:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = issue_data[:10]
                last_10 = issue_data[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Blank/Empty/Null/NaN value(s) at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Blank/Empty/Null/NaN value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #1 (Missing Data - Completeness)"
            }

        return {"issue": False}


    @staticmethod
    def handle_predefined_unacceptable_values(df, column):
        predefined_unacceptable_values = ['?']
        unacceptable_indices_and_values = []

        for idx, val in df[column].items():
            val_str = str(val).strip()
            if val_str in predefined_unacceptable_values:
                unacceptable_indices_and_values.append((idx, val))

        total_issues = len(unacceptable_indices_and_values)
        if unacceptable_indices_and_values:
            if total_issues > 20:
                first_10 = unacceptable_indices_and_values[:10]
                last_10 = unacceptable_indices_and_values[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Unacceptable value(s) at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Unacceptable value(s) at index(es): {unacceptable_indices_and_values}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #4 (Ambiguous Data - Accuracy, Consistency)"
            }
        return {"issue": False}


    @staticmethod
    def handle_extraneous_data(df, column):
        # Define the function to check for extraneous data
        def has_extraneous_data(x):
            return any(char.isdigit() or char in ['!', '?'] for char in str(x))

        # Get indices and values for extraneous data
        extraneous_data_indices = df[df[column].apply(has_extraneous_data)].index
        extraneous_data_values = df.loc[extraneous_data_indices, column].tolist()
        issue_data = list(zip(extraneous_data_indices, extraneous_data_values))
        total_issues = len(issue_data)

        if issue_data:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = issue_data[:10]
                last_10 = issue_data[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Extraneous data value(s) at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Extraneous data value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "indices": extraneous_data_indices,  # Include all indices directly
                "dq_issue": "DQI #5 (Extraneous Data - Consistency, Uniqueness)"
            }

        return {"issue": False}


    @staticmethod
    def handle_street_extraneous_data(df, column):
        """
        Check for extraneous data in street names, allowing common patterns including numbers,
        hyphens, periods, slashes, and commas which are typical in street addresses, but flagging street names
        composed solely of numbers as errors.
        """
        def has_extraneous_street_data(x):
            # Allow numbers, letters, spaces, hyphens, periods, slashes, and commas
            allowed_chars = string.ascii_letters + string.digits + ' -./,'
            # Flag if the string is solely numeric
            if x.isdigit():
                return True
            return any(char not in allowed_chars for char in x)

        extraneous_data_indices = df[df[column].apply(lambda x: isinstance(x, str) and has_extraneous_street_data(x))].index
        extraneous_data_values = df.loc[extraneous_data_indices, column].tolist()
        issue_data = list(zip(extraneous_data_indices, extraneous_data_values))
        total_issues = len(issue_data)

        if issue_data:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = issue_data[:10]
                last_10 = issue_data[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Extraneous street data value(s) at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Extraneous street data value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "indices": extraneous_data_indices,  # Include all indices directly
                "dq_issue": "DQI #5 (Extraneous Data - Consistency, Uniqueness)"
            }

        return {"issue": False}

    @staticmethod
    def convert_to_strftime_format(deduced_format):
        format_mappings = {
            "DDMMYYYY": "%d/%m/%Y",
            "MMDDYYYY": "%m/%d/%Y",
            "YYYYMMDD": "%Y/%m/%d"
        }
        return format_mappings.get(deduced_format, "%Y-%m-%d")  # Default format

        
    @staticmethod
    def standardize_date(date_str, deduced_format):
        month_mapping = {
            'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05', 'June': '06',
            'July': '07', 'August': '08', 'September': '09', 'October': '10', 'November': '11', 'December': '12',
            'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
            'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
        }
        # Remove ordinal suffixes and commas
        date_str = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str.replace(',', ''))

        for month, num in month_mapping.items():
            date_str = re.sub(r'\b' + month + r'\b', num, date_str, flags=re.IGNORECASE)

        # Split the date string into components
        date_parts = re.split(r'[-/. ]', date_str)

        if len(date_parts) == 3:
            if deduced_format == 'YYYYMMDD':
                standardized_date = date_parts[0][:4] + date_parts[1].zfill(2) + date_parts[2].zfill(2)
            elif deduced_format == 'DDMMYYYY':
                standardized_date = date_parts[0].zfill(2) + date_parts[1].zfill(2) + date_parts[2][:4]
            elif deduced_format == 'MMDDYYYY':
                standardized_date = date_parts[0].zfill(2) + date_parts[1].zfill(2) + date_parts[2][:4]
            else:
                return None  # Format mismatch
        else:
            return None  # Format not recognized    
        
        return standardized_date

    @staticmethod
    def standardize_date_time(date_str, deduced_format):
        
       # Convert to string in case the input is not a string (e.g., float, int)
        date_str = str(date_str)
        
        # Handle dates with ordinal suffixes and commas
        date_str = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str.replace(',', ''))

        parts = date_str.split(' ')
        
        if len(parts) > 1 and parts[-1] in ["AM", "PM"]:
            time_part = ' '.join(parts[-2:])
            date_part = ' '.join(parts[:-2])
        elif len(parts) > 1 and ':' in parts[-1]:
            time_part = parts[-1]
            date_part = ' '.join(parts[:-1])
        else:
            date_part = date_str
            time_part = '00:00:00'  # Default time for date-only entries

        # Convert textual months to numbers
        month_mapping = {
            'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05', 'June': '06',
            'July': '07', 'August': '08', 'September': '09', 'October': '10', 'November': '11', 'December': '12',
            'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
            'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
        }

        # Only apply month conversion if there's a textual month
        needs_conversion = any(month in date_part for month in month_mapping)

        if needs_conversion:
            for month, num in month_mapping.items():
                date_part = re.sub(r'\b' + month + r'\b', num, date_part, flags=re.IGNORECASE)

        # Process the date part
        date_parts = re.split(r'[-/. ]', date_part)

        if len(date_parts) == 3:
            if deduced_format == 'YYYYMMDD':
                standardized_date = date_parts[0][:4] + date_parts[1].zfill(2) + date_parts[2].zfill(2)
            elif deduced_format == 'DDMMYYYY':
                standardized_date = date_parts[0].zfill(2) + date_parts[1].zfill(2) + date_parts[2][:4]
            elif deduced_format == 'MMDDYYYY':
                standardized_date = date_parts[0].zfill(2) + date_parts[1].zfill(2) + date_parts[2][:4]
            else:
                return None  # Format mismatch
        else:
            return None  # Format not recognized

        return standardized_date + ' ' + time_part if time_part and standardized_date else None

        
    @staticmethod
    def is_valid_time(time_str):
        if isinstance(time_str, time):  
            time_str = time_str.strftime("%H:%M:%S")

        if not time_str:
            return False

        time_formats = ['%H:%M', '%I:%M %p', '%H:%M:%S', '%I:%M:%S %p']

        for fmt in time_formats:
            try:
                datetime.strptime(time_str, fmt)
                return True
            except ValueError:
                continue

        return False

    @staticmethod
    def handle_outdated_temporal_data(df, column, min_year, max_year):
        outdated_entries = []
        # Assume a default format for date standardization
        default_format = 'DDMMYYYY'  

        for idx, val in df[column].items():
            standardized_date = DataQualityIssues.standardize_date(str(val), default_format)
            year = None

            # Skip the processing if standardized_date is None
            if standardized_date is None:
                continue

            if len(standardized_date) == 8 and standardized_date.isdigit():
                # Check for YYYYMMDD format
                if int(standardized_date[4:6]) <= 12 and int(standardized_date[6:]) <= 31:
                    year = int(standardized_date[:4])
                # Check for DDMMYYYY format
                elif int(standardized_date[:2]) <= 31 and int(standardized_date[2:4]) <= 12:
                    year = int(standardized_date[4:])

            # Check if the year is within the valid range
            if year and not (min_year <= year <= max_year):
                outdated_entries.append((idx, val))

        total_issues = len(outdated_entries)
        if outdated_entries:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = outdated_entries[:10]
                last_10 = outdated_entries[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Date value(s) not in [{min_year}-{max_year}] period at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Date value(s) not in [{min_year}-{max_year}] period at index(es): {outdated_entries}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #6 (Outdated Temporal Data - Timeliness)"
            }

        return {"issue": False}
    
    @staticmethod
    def handle_outdated_temporal_data_datetime(df, column, min_year, max_year):
        outdated_entries = []
        # Assume a default format for date standardization
        default_format = 'DDMMYYYY'  

        for idx, val in df[column].items():
            # Split the datetime into date and time parts using standardize_date_time
            standardized_datetime = DataQualityIssues.standardize_date_time(str(val), default_format)
            if standardized_datetime is None:
                continue

            parts = standardized_datetime.split(' ')
            date_part = parts[0]  # The date part is always the first part

            year = None

            # Extract the year from the standardized date
            if len(date_part) == 8 and date_part.isdigit():
                # Check for YYYYMMDD format
                if int(date_part[4:6]) <= 12 and int(date_part[6:]) <= 31:
                    year = int(date_part[:4])
                # Check for DDMMYYYY format
                elif int(date_part[:2]) <= 31 and int(date_part[2:4]) <= 12:
                    year = int(date_part[4:])

            # Check if the year is within the valid range
            if year and not (min_year <= year <= max_year):
                outdated_entries.append((idx, val))

        total_issues = len(outdated_entries)
        if outdated_entries:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = outdated_entries[:10]
                last_10 = outdated_entries[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Datetime value(s) not in [{min_year}-{max_year}] period at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Datetime value(s) not in [{min_year}-{max_year}] period at index(es): {outdated_entries}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #6 (Outdated Temporal Data - Timeliness)"
            }

        return {"issue": False}
    
    @staticmethod
    def handle_duplicates(df, column):
        duplicate_values = df[column].duplicated(keep=False)  # Mark all duplicates
        issue_data = df[duplicate_values][column].reset_index().values.tolist()
        total_issues = len(issue_data)

        if issue_data:
            if total_issues > 20:
                first_10 = issue_data[:10]
                last_10 = issue_data[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Duplicate value(s) at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Duplicate value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #9 (Duplicates - Uniqueness)"
            }
        return {"issue": False}


    @staticmethod
    def handle_excessive_distinct_values(df, column, threshold=100):
        unique_values_count = df[column].nunique(dropna=False)

        if unique_values_count > threshold:
            sample_values = df[column].dropna().unique()[:10]  # Sample of up to 10 unique values
            return {
                "issue": True,
                "error_message": f"Data seems not categorical or has too many categories (> {threshold}). Sample values: {list(sample_values)}",
                "dq_issue": "DQI #10 (Structural Conflicts - Consistency, Uniqueness)"
            }
        return {"issue": False}

                
    @staticmethod
    def is_date_valid(date_str, fmt):
        try:
            # Parse the date string based on the provided format
            if fmt == 'YYYYMMDD':
                year, month, day = int(date_str[:4]), int(date_str[4:6]), int(date_str[6:8])
            elif fmt == 'DDMMYYYY':
                day, month, year = int(date_str[:2]), int(date_str[2:4]), int(date_str[4:8])
            elif fmt == 'MMDDYYYY':
                month, day, year = int(date_str[:2]), int(date_str[2:4]), int(date_str[4:8])
            else:
                return False

            # Debug print to check extracted date components
            # print(f"Extracted components for {fmt}: Year: {year}, Month: {month}, Day: {day}")

            # Construct datetime object to validate the date
            datetime(year, month, day)

            return min_valid_year <= year <= max_valid_year
        except ValueError as e:
            # print(f"Date parsing resulted in an error: {e}")
            return False

    @staticmethod
    def is_valid_in_any_format(date_str):
        formats = ['%Y%m%d', '%d%m%Y', '%m%d%Y']  # Add other potential formats
        for fmt in formats:
            try:
                datetime.strptime(date_str, fmt)
                return True
            except ValueError:
                continue
        return False
        
    @staticmethod
    def handle_invalid_dates(df, column):
        invalid_entries = []
        # Assume a default format for date standardization
        default_format = 'DDMMYYYY'  

        for idx, val in df[column].items():
            standardized_date = DataQualityIssues.standardize_date(str(val),default_format)

            # Skip the validation if standardized_date is None
            if standardized_date is None:
                continue

            # Check if date is structurally valid in any format
            if not (DataQualityIssues.is_date_valid(standardized_date, 'DDMMYYYY') or
                    DataQualityIssues.is_date_valid(standardized_date, 'MMDDYYYY') or
                    DataQualityIssues.is_date_valid(standardized_date, 'YYYYMMDD')):
                invalid_entries.append((idx, val))    

        total_issues = len(invalid_entries)
        if invalid_entries:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = invalid_entries[:10]
                last_10 = invalid_entries[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                # Convert the list to a string while keeping the square brackets
                issue_data = str(display_list)
                message = f"{total_issues} Invalid date value(s) at index(es): {issue_data} (displaying only the first and last 10 items)"
            else:
                # Convert the list to a string while keeping the square brackets
                issue_data = str(invalid_entries)
                message = f"{total_issues} Invalid date value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue":  "DQI #13 (Temporal mismatch - Accuracy, Timeliness)"
            }

        return {"issue": False}

    @staticmethod
    def is_structurally_valid_date(date_str, expected_format):
        # Check the date in any format for structural validity
        return DataQualityIssues.is_date_valid(date_str, expected_format)

    @staticmethod
    def handle_invalid_datetimes(df, column, deduced_format):
        #print(f"Handling invalid datetimes. Deduced format: {deduced_format}")
        invalid_datetime_errors = []

        for idx, val in df[column].items():
            #print(f"Processing index {idx}, value: {val}")
            standardized_datetime = DataQualityIssues.standardize_date_time(val, deduced_format)

            if not standardized_datetime:
                #print(f"Standardization failed for: {val}")
                invalid_datetime_errors.append((idx, val))
                continue

            parts = standardized_datetime.split(' ')
            date_str = parts[0]
            time_str = parts[1] if len(parts) > 1 else '00:00:00'

            #print(f"Standardized datetime: {standardized_datetime}, Date: {date_str}, Time: {time_str}")

            if not DataQualityIssues.is_valid_time(time_str):
                #print(f"Invalid time for: {val}")
                invalid_datetime_errors.append((idx, val))
            elif not DataQualityIssues.is_structurally_valid_date(date_str, deduced_format):
                #print(f"Date structure invalid for: {val}")
                invalid_datetime_errors.append((idx, val))

        total_issues = len(invalid_datetime_errors)
        if invalid_datetime_errors:
            #print(f"Total invalid datetime values: {total_issues}")
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = invalid_datetime_errors[:10]
                last_10 = invalid_datetime_errors[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                issue_data = str(display_list)
                message = f"{total_issues} Invalid datetime value(s) at index(es): {issue_data} (displaying only the first and last 10 items)"
            else:
                issue_data = str(invalid_datetime_errors)
                message = f"{total_issues} Invalid datetime value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue":  "DQI #13 (Temporal mismatch - Accuracy, Timeliness)"
            }

        #print("No invalid datetimes found.")
        return {"issue": False}


    
    @staticmethod
    def handle_invalid_times(df, column):
        invalid_time_errors = []
        for idx, time_str in df[column].items():
            if not DataQualityIssues.is_valid_time(time_str):
                invalid_time_errors.append((idx, time_str))

        total_issues = len(invalid_time_errors)
        if invalid_time_errors:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = invalid_time_errors[:10]
                last_10 = invalid_time_errors[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                # Convert the list to a string while keeping the square brackets
                issue_data = str(display_list)
                message = f"{total_issues} Invalid time value(s) at index(es): {issue_data} (displaying only the first and last 10 items)"
            else:
                # Convert the list to a string while keeping the square brackets
                issue_data = str(invalid_time_errors)
                message = f"{total_issues} Invalid time value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #13 (Temporal mismatch - Accuracy, Timeliness)"
            }

        return {"issue": False}

    @staticmethod
    def deduce_regional_format(date_samples):
        month_mapping = {
            'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05', 'June': '06',
            'July': '07', 'August': '08', 'September': '09', 'October': '10', 'November': '11', 'December': '12',
            'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
            'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
        }
        standardized_dates = []
        format_counts = {'DDMMYYYY': 0, 'MMDDYYYY': 0, 'YYYYMMDD': 0}

        for date in date_samples:
            # Convert textual months to numbers
            for month, num in month_mapping.items():
                date = re.sub(r'\b' + month + r'\b', num, date, flags=re.IGNORECASE)

            # Handle continuous string dates (without separators)
            if len(date) == 8 and date.isdigit():
                # Check if the first four characters represent a plausible year
                if 1800 <= int(date[:4]) <= 2100:
                    format_counts['YYYYMMDD'] += 1
                # Check if the first two characters represent a plausible day and next two a month
                elif 1 <= int(date[:2]) <= 31 and 1 <= int(date[2:4]) <= 12:
                    format_counts['DDMMYYYY'] += 1
                # Check if the first two characters represent a plausible month and next two a day
                elif 1 <= int(date[:2]) <= 12 and 1 <= int(date[2:4]) <= 31:
                    format_counts['MMDDYYYY'] += 1
            else:
                date_parts = re.split(r'[-/. ]', date)
                if len(date_parts) == 3:
                    # Check if the year is the first or last component
                    if len(date_parts[0]) == 4:
                        format_counts['YYYYMMDD'] += 1
                    elif len(date_parts[2]) == 4:
                        format_counts['DDMMYYYY'] += 1
                    else:
                        # Default to MMDDYYYY if year is not first or last
                        format_counts['MMDDYYYY'] += 1

                    standardized_date = '-'.join(date_parts)  # Rejoin the date parts for standardized dates
                    standardized_dates.append(standardized_date)

        preferred_format = max(format_counts, key=format_counts.get)
        #print(f"Deduced format: {preferred_format}")  # Diagnostic print

        return preferred_format, standardized_dates
    
    @staticmethod
    def handle_dates_format(df, column, expected_format):
        format_errors = []
        for idx, val in df[column].items():
            standardized_date = DataQualityIssues.standardize_date(str(val), expected_format)

            # Check if standardized_date is None
            if standardized_date is None:
                # If the date can't be standardized, skip it (it will be caught in other checks)
                continue
            
            # Check if the date is valid according to the expected format
            if not DataQualityIssues.is_date_valid(standardized_date, expected_format):
                format_errors.append((idx, val))

        total_issues = len(format_errors)
        if format_errors:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = format_errors[:10]
                last_10 = format_errors[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Date value(s) without format '{expected_format}' in [{min_valid_year}-{max_valid_year}] period at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Date value(s) without format '{expected_format}' in [{min_valid_year}-{max_valid_year}] period at index(es): {format_errors}"
           
            return{
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #14 (Different units/representations - Consistency)"
            }

        return {"issue": False}

    @staticmethod
    def handle_datetimes_format(df, column, expected_format):
        format_errors = []
 
        for idx, val in df[column].items():
            standardized_datetime = DataQualityIssues.standardize_date_time(str(val), expected_format)

            if not standardized_datetime:
                print(f"Index {idx}: Standardization failed.")
                continue

            date_str = standardized_datetime.split(' ')[0]
            
             # Check if the date is valid according to the expected format
            if DataQualityIssues.is_valid_in_any_format(date_str) and not DataQualityIssues.is_structurally_valid_date(date_str, expected_format):
                format_errors.append((idx, val))
       
        total_issues = len(format_errors)
        if format_errors:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = format_errors[:10]
                last_10 = format_errors[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Datetime value(s) without format '{expected_format}' in [{min_valid_year}-{max_valid_year}] period at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Datetime value(s) without format '{expected_format}' in [{min_valid_year}-{max_valid_year}] period at index(es): {format_errors}"
           
            return{
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #14 (Different units/representations - Consistency)"
            }

        return {"issue": False}

    @staticmethod
    def handle_negative_values(df, column):
        # Function to check if a value is negative
        def is_negative(x):
            try:
                num_val = float(x)
                return num_val < 0  # Check if the value is negative
            except (ValueError, TypeError):
                return False  # Non-numeric values are not considered here

        # Get indices and values for negative values
        negative_indices = df[df[column].apply(is_negative)].index
        negative_values = df.loc[negative_indices, column].tolist()
        issue_data = list(zip(negative_indices, negative_values))
        total_issues = len(issue_data)

        if issue_data:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = issue_data[:10]
                last_10 = issue_data[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Negative value(s) at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Negative value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}

    
    @staticmethod
    def handle_values_outside_range(df, column, min_value, max_value):
        # Function to check if a value is within the specified range
        def is_outside_range(x):
            try:
                num_val = float(x)
                return num_val < min_value or num_val > max_value
            except (ValueError, TypeError):
                return False  # Non-numeric values are not considered here

        # Get indices and values for values outside the specified range
        outside_range_indices = df[df[column].apply(is_outside_range)].index
        outside_range_values = df.loc[outside_range_indices, column].tolist()
        issue_data = list(zip(outside_range_indices, outside_range_values))
        total_issues = len(issue_data)

        if issue_data:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = issue_data[:10]
                last_10 = issue_data[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Value(s) outside range [{min_value}, {max_value}] at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Value(s) outside range [{min_value}, {max_value}] at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}
       
           
    @staticmethod
    def handle_floating_point_values(df, column):
        """
        Flags floating-point numbers in a column.

        Parameters:
        - df (pd.DataFrame): The pandas DataFrame to analyze.
        - column (str): The name of the column to check.

        Returns:
        - dict: A dictionary indicating if there are floating-point values, with their indices and values, and the associated DQI.
        """
        floating_point_indices = []
        for idx, value in df[column].items():
            try:
                # If it's a floating point and not an integer
                if float(value) and not float(value).is_integer():
                    floating_point_indices.append((idx, value))
            except ValueError:
                # Ignore non-numeric values, handle them in other checks
                continue

        total_issues = len(floating_point_indices)
        if floating_point_indices:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = floating_point_indices[:10]
                last_10 = floating_point_indices[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Floating-point number(s) at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Floating-point number(s) at index(es): {floating_point_indices}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}


    @staticmethod
    def handle_capitalization_format(df, column, linking_words):
        """
        Check if the values in the column adhere to capitalization and format standards.
        """
        def is_capitalization_issue(word):
            # Allow all-uppercase words, ordinal numbers like '3rd', '5th', etc., and linking words
            if word.isupper() or re.match(r"^\d+(st|nd|rd|th)$", word.lower()) or word.lower() in linking_words:
                return False
            return word != word.title()

        format_issues = []
        for idx, val in df[column].items():
            words = str(val).split()
            if any(is_capitalization_issue(word) for word in words):
                format_issues.append((idx, val))

        total_issues = len(format_issues)
        if format_issues:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = format_issues[:10]
                last_10 = format_issues[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                # Convert the list to a string while keeping the square brackets
                issue_data = str(display_list)
                message = f"{total_issues} Capitalization/Format issue(s) at index(es): " + issue_data + " (displaying only the first and last 10 items)"
            else:
                # Convert the list to a string while keeping the square brackets
                issue_data = str(format_issues)
                message = f"{total_issues} Capitalization/Format issue(s) at index(es): " + issue_data

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}

    @staticmethod
    def handle_capitalization_format_country(df, column, linking_words):
        """
        Check if the values in the column adhere to capitalization and format standards,
        with exceptions for certain patterns typical in country names.
        """
        def is_exceptional_case(word, prev_word=None):
            # Directly return False (no issue) for all-uppercase words or ordinal numbers
            if word.isupper() or re.match(r"^\d+(st|nd|rd|th)$", word.lower()):
                return False
            # Check if the word is a linking word or part of an exception pattern (e.g., within parentheses or after a hyphen)
            if word.lower() in linking_words or (prev_word and prev_word.endswith('-')):
                return True
            return False
        
        def is_capitalization_issue(words):
            # Check for capitalization issue while considering exceptions
            for i, word in enumerate(words):
                prev_word = words[i-1] if i > 0 else None
                # Skip words that are part of an exceptional case
                if is_exceptional_case(word, prev_word):
                    continue
                # Identify words that fail the capitalization check (excluding exceptions)
                if not word.istitle() and not word.isupper():
                    return True
            return False

        format_issues = []
        for idx, val in df[column].items():
            original_val = str(val)
            # Split the value considering spaces and treating content in parentheses as a single block
            words = re.findall(r'\b\w+\b', original_val)
            if is_capitalization_issue(words):
                format_issues.append((idx, val))

        total_issues = len(format_issues)
        if format_issues:
            # Prepare the message with a simplified display if there are many issues
            display_list = format_issues[:10] + [('...', '...')] if total_issues > 20 else format_issues
            issue_data = str(display_list)
            message = f"{total_issues} Capitalization/Format issue(s) at index(es): " + issue_data
            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}


    @staticmethod
    def handle_short_length_values(df, column, min_length):
        """
        Flags values in a column that are shorter than the specified minimum length, excluding non-alphanumeric characters.

        Parameters:
        - df (pd.DataFrame): The pandas DataFrame to analyze.
        - column (str): The name of the column to check.
        - min_length (int): The minimum acceptable length for values.

        Returns:
        - dict: A dictionary indicating if there are short values, with their indices and values, and the associated DQI.
        """
        def is_short_and_alphanumeric(x):
            return isinstance(x, str) and len(x) < min_length and x.isalnum()

        short_values = df[df[column].apply(is_short_and_alphanumeric)]
        issue_data = list(zip(short_values.index, short_values[column]))
        total_issues = len(issue_data)

        if issue_data:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = issue_data[:10]
                last_10 = issue_data[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Short length alphanumeric value(s) at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Short length alphanumeric value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}

    @staticmethod
    def normalize_month(month_str):
        """
        Normalize the given month string to its full month name if valid.

        Parameters:
        - month_str (str): The month string to normalize.

        Returns:
        - str/None: Normalized month name if valid, None otherwise.
        """
        try:
            if month_str.isdigit():
                month_val = int(month_str)
                if 1 <= month_val <= 12:
                    return datetime(2000, month_val, 1).strftime('%B')
            elif len(month_str) == 3:
                return datetime.strptime(month_str.title(), '%b').strftime('%B')
            else:
                datetime.strptime(month_str.title(), '%B')
                return month_str.title()
        except ValueError:
            return None
    
    @staticmethod
    def handle_invalid_months(df, column):
        """
        Check if all values in the specified column are valid representations of months.

        Parameters:
        - df (pd.DataFrame): The pandas DataFrame to analyze.
        - column (str): The name of the column to analyze.

        Returns:
        - dict: A dictionary indicating if there are invalid month values, with their indices and values.
        """
        invalid_entries = []
        for idx, val in df[column].items():
            if not DataQualityIssues.normalize_month(str(val)):
                invalid_entries.append((idx, val))

        total_issues = len(invalid_entries)
        if invalid_entries:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = invalid_entries[:10]
                last_10 = invalid_entries[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Invalid month value(s) at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Invalid month value(s) at index(es): {invalid_entries}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}

    
    @staticmethod
    def handle_invalid_weekdays(df, column):
        """
        Check if all values in the specified column are valid representations of weekdays.

        Parameters:
        - df (pd.DataFrame): The pandas DataFrame to analyze.
        - column (str): The name of the column to analyze.

        Returns:
        - dict: A dictionary indicating if there are invalid weekday values, with their indices and values.
        """
        invalid_entries = []
        weekdays = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
        weekday_abbr = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]
        weekday_abbr_short = ["su", "mo", "tu", "we", "th", "fr", "sa"]

        for idx, val in df[column].items():
            val_str = str(val).strip().lower()
            # Normalize full names and abbreviations to full weekday name
            if val_str.title() in weekdays or val_str.title() in weekday_abbr or val_str in weekday_abbr_short:
                continue
            elif val_str.isdigit() and 0 <= int(val_str) <= 7:
                continue
            else:
                invalid_entries.append((idx, val))

        total_issues = len(invalid_entries)
        if invalid_entries:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = invalid_entries[:10]
                last_10 = invalid_entries[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Invalid weekday value(s) at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Invalid weekday value(s) at index(es): {invalid_entries}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}


    @staticmethod
    def handle_street_format(df, column):
        """
        Check if the street names conform to a typical street format.
        """
        invalid_format_indices = []
        street_format_regex = re.compile(r'^[\dA-Za-zÀ-ÖØ-öø-ÿ .,\-\'#]+(?:\s[A-Za-zÀ-ÖØ-öø-ÿ]+)*$')

        for idx, val in df[column].items():
            if not isinstance(val, str) or not street_format_regex.match(val):
                invalid_format_indices.append((idx, val))

        total_issues = len(invalid_format_indices)
        if invalid_format_indices:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = invalid_format_indices[:10]
                last_10 = invalid_format_indices[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                issue_data = ", ".join([f"({idx}, '{value}')" for idx, value in display_list])
                message = f"{total_issues} Incorrect street format issue(s) at index(es): {issue_data} (displaying only the first and last 10 items)"
            else:
                issue_data = ", ".join([f"({idx}, '{value}')" for idx, value in invalid_format_indices])
                message = f"{total_issues} Incorrect street format issue(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}

    @staticmethod
    def standardize_phone_number(phone_number):
        """
        Standardize the phone number by removing common separators.
        """
        return re.sub(r'[()\-+ ]', '', str(phone_number))

    @staticmethod
    def handle_phone_number_format(df, column):
        """
        Check if phone numbers in the specified column conform to expected formats.
        """
        incorrect_format = []

        for idx, phone_number in df[column].items():
            if phone_number is None or isinstance(phone_number, str) and phone_number.strip() == '':
                incorrect_format.append((idx, phone_number))
                continue

            cleaned_number = DataQualityIssues.standardize_phone_number(phone_number)
            
            if not (cleaned_number.isdigit() and 3 <= len(cleaned_number) <= 15):
                incorrect_format.append((idx, phone_number))

        total_issues = len(incorrect_format)
        if incorrect_format:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = incorrect_format[:10]
                last_10 = incorrect_format[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                # Convert the list to a string while keeping the square brackets
                issue_data = str(display_list)
                message = f"{total_issues} Incorrect telephone number format issue(s) at index(es): {issue_data} (displaying only the first and last 10 items)"
            else:
                # Convert the list to a string while keeping the square brackets
                issue_data = str(incorrect_format)
                message = f"{total_issues} Incorrect telephone number format issue(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}



    @staticmethod
    def handle_ip_format(df, column):
        # Regular expression patterns for IPv4 and IPv6
        ipv4_pattern = re.compile(r'^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$')
        ipv6_pattern = re.compile(r'^([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$')

        incorrect_indices_and_values = []

        for idx, ip in df[column].items():
            ip_str = str(ip).strip()  # Convert to string and strip whitespace
            if not (ipv4_pattern.match(ip_str) or ipv6_pattern.match(ip_str)):
                incorrect_indices_and_values.append((idx, ip))

        total_issues = len(incorrect_indices_and_values)
        if incorrect_indices_and_values:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = incorrect_indices_and_values[:10]
                last_10 = incorrect_indices_and_values[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                # Convert the list to a string while keeping the square brackets
                issue_data = str(display_list)
                message = f"{total_issues} Invalid IP format issue(s) at index(es): {issue_data} (displaying only the first and last 10 items)"
            else:
                # Convert the list to a string while keeping the square brackets
                issue_data = str(incorrect_indices_and_values)
                message = f"{total_issues} Invalid IP format issue(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}

    
    @staticmethod
    def handle_url_format(df, column):
        url_pattern = re.compile(
            r'^(https?:\/\/)?'  # protocol
            r'((([a-zA-Z0-9-]+\.)+[a-zA-Z]{2,})|'  # domain name
            r'localhost|'  # localhost
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # or ip
            r'(?::\d+)?(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?$'  # port and path
        )
        incorrect_indices_and_values = []

        for idx, url in df[column].items():
            url_str = str(url)  # Convert to string
            if not url_pattern.match(url_str):
                incorrect_indices_and_values.append((idx, url))

        total_issues = len(incorrect_indices_and_values)
        if incorrect_indices_and_values:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = incorrect_indices_and_values[:10]
                last_10 = incorrect_indices_and_values[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                # Convert the list to a string while keeping the square brackets
                issue_data = str(display_list)
                message = f"{total_issues} Invalid URL format issue(s) at index(es): {issue_data} (displaying only the first and last 10 items)"
            else:
                # Convert the list to a string while keeping the square brackets
                issue_data = str(incorrect_indices_and_values)
                message = f"{total_issues} Invalid URL format issue(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}


    
    @staticmethod
    def is_valid_email(email):
        email_str = str(email)  # Convert to string

        # Check the length of the email
        if len(email_str) > 254:
            return False

        # Check if '@' is present
        if '@' not in email_str:
            return False

        # Enhanced email regex pattern
        email_pattern = re.compile(
            r'^[a-zA-Z0-9.!#$%&\'*+/=?^_`{|}~-]+@'  # local part
            r'(?:[a-zA-Z0-9-]+)'  # domain name part (subdomains allowed)
            r'(?:\.[a-zA-Z0-9-]+)*'  # additional subdomains
            r'\.[a-zA-Z]{2,}$'  # TLD part
        )

        # Check for consecutive dots in local part
        if '..' in email.split('@')[0]:
            return False

        # Extract and check the domain part
        domain_part = email.split('@')[1]
        if '--' in domain_part or domain_part.startswith('-') or domain_part.endswith('-'):
            return False

        return email_pattern.match(email) is not None

    @staticmethod
    def handle_email_format(df, column):
        incorrect_indices_and_values = []
        
        for idx, email in df[column].items():
            if not DataQualityIssues.is_valid_email(email):
                incorrect_indices_and_values.append((idx, email))

        total_issues = len(incorrect_indices_and_values)
        if incorrect_indices_and_values:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = incorrect_indices_and_values[:10]
                last_10 = incorrect_indices_and_values[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                # Convert the list to a string while keeping the square brackets
                issue_data = str(display_list)
                message = f"{total_issues} Invalid email format issue(s) at index(es): {issue_data} (displaying only the first and last 10 items)"
            else:
                # Convert the list to a string while keeping the square brackets
                issue_data = str(incorrect_indices_and_values)
                message = f"{total_issues} Invalid email format issue(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}


    @staticmethod
    def handle_binary_values(df, column):
        """
        Check if the values in the specified column are valid binary values or if the column has exactly two unique values.

        Parameters:
        - df (pd.DataFrame): The pandas DataFrame to analyze.
        - column (str): The name of the column to check.

        Returns:
        - dict: A dictionary indicating if there are non-binary values, with their indices and values, and the associated DQI.
        """
        # Calculate unique non-null values
        unique_values = df[column].dropna().unique()
        
        # If there are exactly two unique values, consider it binary and return no issue
        if len(unique_values) == 2:
            return {"issue": False}
        
        # Define acceptable binary values (including lowercase)
        true_values = ['1', 'True', 'Yes', 'T', 'Y']
        false_values = ['0', 'False', 'No', 'F','N']

        non_binary_indices = []

        for idx, val in df[column].items():
            str_val = str(val).strip().capitalize()  # Capitalize to match 'True', 'False', etc.
            if str_val not in true_values + false_values:
                non_binary_indices.append((idx, val))

        total_issues = len(non_binary_indices)
        if non_binary_indices:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = non_binary_indices[:10]
                last_10 = non_binary_indices[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                # Convert the list to a string while keeping the square brackets
                issue_data = str(display_list)
                message = f"{total_issues} Non-binary value(s) at index(es): {issue_data} (displaying only the first and last 10 items)"
            else:
                # Convert the list to a string while keeping the square brackets
                issue_data = str(non_binary_indices)
                message = f"{total_issues} Non-binary value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #15 (Domain Violation - Accuracy)"
            }

        return {"issue": False}


    
    @staticmethod
    def handle_non_numeric_values(df, column):
        # Define a function to check if a value is non-numeric and not a special placeholder
        def is_non_numeric_and_not_placeholder(x):
            special_placeholders = ["''", '""', "``"]  # List of special placeholder strings
            x_str = str(x).strip()
            if x_str in special_placeholders:
                return False  # Do not flag special placeholders as non-numeric

            try:
                float(x_str)  # Attempt to convert to float
                return False  # Conversion successful, value is numeric
            except (ValueError, TypeError):
                return True  # Conversion failed, value is non-numeric

        # Get indices and values for non-numeric values
        non_numeric_indices = df[df[column].apply(is_non_numeric_and_not_placeholder)].index
        non_numeric_values = df.loc[non_numeric_indices, column].tolist()
        issue_data = list(zip(non_numeric_indices, non_numeric_values))
        total_issues = len(issue_data)

        if issue_data:
            if total_issues > 20:
                first_10 = issue_data[:10]
                last_10 = issue_data[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Non-numeric value(s) at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Non-numeric value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #17 (Wrong Data Type - Consistency)"
            }
        return {"issue": False}


    @staticmethod
    def handle_non_alphanumeric_values(df, column):
        # Define a function to check for strictly non-alphanumeric characters (excluding spaces and hyphens)
        def is_strictly_non_alphanumeric(x):
            return any(char not in string.ascii_letters + string.digits + ' -' for char in x)

        # Get indices and values that are strictly non-alphanumeric
        non_alphanumeric_indices = df[df[column].apply(lambda x: isinstance(x, str) and is_strictly_non_alphanumeric(x))].index
        non_alphanumeric_values = df.loc[non_alphanumeric_indices, column].tolist()
        issue_data = list(zip(non_alphanumeric_indices, non_alphanumeric_values))

        total_issues = len(issue_data)
        if issue_data:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = issue_data[:10]
                last_10 = issue_data[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                # Convert the list to a string while keeping the square brackets
                issue_data = str(display_list)
                message = f"{total_issues} Non-alphanumeric value(s) at index(es): {issue_data} (displaying only the first and last 10 items)"
            else:
                # Convert the list to a string while keeping the square brackets
                issue_data = str(issue_data)
                message = f"{total_issues} Non-alphanumeric value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #17 (Wrong Data Type - Consistency)"
            }

        return {"issue": False}



    @staticmethod
    def handle_non_string_values(df, column):
        """
        Flags values in a column that are not of string type.

        Parameters:
        - df (pd.DataFrame): The pandas DataFrame to analyze.
        - column (str): The name of the column to check.

        Returns:
        - dict: A dictionary indicating if there are non-string values, with their indices and values, and the associated DQI.
        """
        non_string_indices = df[df[column].apply(lambda x: not isinstance(x, str))].index
        non_string_values = df.loc[non_string_indices, column].tolist()
        issue_data = list(zip(non_string_indices, non_string_values))

        total_issues = len(issue_data)
        if issue_data:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = issue_data[:10]
                last_10 = issue_data[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                # Convert the list to a string while keeping the square brackets
                issue_data = str(display_list)
                message = f"{total_issues} Non-string value(s) at index(es): {issue_data} (displaying only the first and last 10 items)"
            else:
                # Convert the list to a string while keeping the square brackets
                issue_data = str(issue_data)
                message = f"{total_issues} Non-string value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #17 (Non-String Data Type - Consistency)"
            }

        return {"issue": False}


        
    @staticmethod
    def handle_alphanumeric_consistency(df, column):
        """
        Check if the values in the column are alphanumeric and of consistent length,
        excluding negative and floating-point numbers.
        """
        # Filter out negative and floating-point numbers
        filtered_df = df[df[column].apply(lambda x: not isinstance(x, (float, int)) or x >= 0 and float(x).is_integer())]

        # Sample the first 10 non-null, non-empty, non-negative, non-floating values
        id_samples = filtered_df[column].dropna().astype(str).str.strip()
        id_samples = id_samples[id_samples != ''].head(10)

        length_set = {len(val) for val in id_samples if val.isalnum()}
        alphanumeric_consistent = len(length_set) == 1

        if not alphanumeric_consistent:
            return {"issue": False}  # Return no issue if first 10 samples are not consistent

        consistent_length = length_set.pop()
        inconsistent_indices = []

        for idx, val in filtered_df[column].items():
            val_str = str(val).strip()
            if len(val_str) != consistent_length or not val_str.isalnum():
                inconsistent_indices.append((idx, val))

        total_issues = len(inconsistent_indices)
        if inconsistent_indices:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = inconsistent_indices[:10]
                last_10 = inconsistent_indices[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                # Convert the list to a string while keeping the square brackets
                issue_data = str(display_list)
                message = f"{total_issues} Inconsistent length in alphanumeric value(s) at index(es): {issue_data} (displaying only the first and last 10 items)"
            else:
                # Convert the list to a string while keeping the square brackets
                issue_data = str(inconsistent_indices)
                message = f"{total_issues} Inconsistent length in alphanumeric value(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #17 (Wrong Data Type - Consistency)"
            }

        return {"issue": False}




    @staticmethod
    def handle_uniqueness_violation(df, column):
        unique_violation_indices = df[df[column].duplicated()].index
        unique_violation_values = df.loc[unique_violation_indices, column].tolist()
        issue_data = list(zip(unique_violation_indices, unique_violation_values))
        total_issues = len(issue_data)

        if issue_data:
            if total_issues > 20:
                first_10 = issue_data[:10]
                last_10 = issue_data[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                message = f"{total_issues} Uniqueness violation(s) at index(es): {display_list} (displaying only the first and last 10 items)"
            else:
                message = f"{total_issues} Uniqueness violation(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #19 (Uniqueness Violation - Uniqueness)"
            }
        return {"issue": False}

    @staticmethod
    def handle_special_characters(df, column):
        """
        Check for special characters in the specified column.
        Allow periods in names for titles (e.g., Mr., Mrs.).
        """
        def has_special_chars(x):
            return any(char in x for char in ['!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '+', '=', '{', '}', '[', ']', '|', '\\', ':', ';', '"', "'", '<', '>', ',', '/', '?']) and '.' not in x

        # Get indices and values with special characters
        special_chars_indices = df[df[column].apply(lambda x: isinstance(x, str) and has_special_chars(x))].index
        special_chars_values = df.loc[special_chars_indices, column].tolist()
        issue_data = list(zip(special_chars_indices, special_chars_values))

        total_issues = len(issue_data)
        if issue_data:
            if total_issues > 20:
                # Keep only the first 10 and last 10 items
                first_10 = issue_data[:10]
                last_10 = issue_data[-10:]
                display_list = first_10 + [('...', '...')] + last_10
                issue_data = ", ".join([f"({idx}, '{value}')" for idx, value in display_list])
                message = f"{total_issues} Special character(s) at index(es): {issue_data} (displaying only the first and last 10 items)"
            else:
                issue_data = ", ".join([f"({idx}, '{value}')" for idx, value in issue_data])
                message = f"{total_issues} Special character(s) at index(es): {issue_data}"

            return {
                "issue": True,
                "error_message": message,
                "dq_issue": "DQI #21 (Use of Special Characters - Consistency)"
            }

        return {"issue": False}

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Last run on: 2024-04-07 21:40:26


07 Check Numerical ge zero

In [12]:
def check_numerical_ge_zero(df: pd.DataFrame, column: str) -> dict:
    """
    Check if all the values in the specified numerical column are greater than or equal to zero,
    and also flag any non-numeric values and blank/empty/null/NaN values.
    Additionally, report the range of numeric values.
    """
    # First, handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    # Only proceed with other checks if the value is not blank/empty/null/NaN
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
    
    result_negative = DataQualityIssues.handle_negative_values(df_filtered, column)
    result_non_numeric = DataQualityIssues.handle_non_numeric_values(df_filtered, column)

    error_summary_parts = []

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message']+ '\n')

    if result_negative['issue']:
        error_summary_parts.append(result_negative['dq_issue'] + ':\n ' + result_negative['error_message']+ '\n')

    if result_non_numeric['issue']:
        error_summary_parts.append(result_non_numeric['dq_issue'] + ':\n ' + result_non_numeric['error_message']+ '\n')

   # Convert the column to numeric, ignoring errors to leave non-numeric as NaN
    numeric_values = pd.to_numeric(df_filtered[column], errors='coerce')

    # Filter out non-numeric values to avoid them in min/max calculations
    numeric_values_filtered = numeric_values.dropna()

    # Calculate the range (min and max) of the numeric values
    min_value = numeric_values_filtered.min()
    max_value = numeric_values_filtered.max()
    
    # Count correct values
    correct_values_count = df_filtered[df_filtered[column].apply(lambda x: pd.to_numeric(x, errors='coerce') >= 0)].shape[0]

    if not error_summary_parts:
        range_summary = f"in the range ({min_value}:{max_value})" if not numeric_values_filtered.empty else "No valid numerical values found."
        return f"All {correct_values_count} values are numerical and greater or equal to 0 {range_summary}."

    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts)
    range_summary = f"Range of values: ({min_value}:{max_value})." if not numeric_values_filtered.empty else "No valid numerical values found."


    return f"{error_summary}\n{range_summary}"

# Test function
df_test = pd.DataFrame({'hours-per-week': [1, 2, 3, 'a3', 5, '?', None, -1, '', " ", 'null', 5.67, np.nan, '    ',  "''"]})
#df_test = pd.DataFrame({'hours-per-week': [1, 2, 3]})
result = check_numerical_ge_zero(df_test, 'hours-per-week')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Error(s) found: 
DQI #1 (Missing Data - Completeness):
 7 Blank/Empty/Null/NaN value(s) at index(es): [(6, None), (8, ''), (9, ' '), (10, 'null'), (12, ''), (13, '    '), (14, "''")]

DQI #15 (Domain Violation - Accuracy):
 1 Negative value(s) at index(es): [(7, -1)]

DQI #17 (Wrong Data Type - Consistency):
 2 Non-numeric value(s) at index(es): [(3, 'a3'), (5, '?')]

Range of values: (-1.0:5.67).
Last run on: 2024-04-07 21:40:26


08 Check Numerical

In [13]:
def check_numerical(df: pd.DataFrame, column: str) -> str:
    """
    Check if all the values in the specified numerical column are numerical.
    Additionally, report the range of numeric values.
    """
    # First, handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    # Only proceed with non-numeric value checks if the value is not blank/empty/null/NaN
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
    result_non_numeric = DataQualityIssues.handle_non_numeric_values(df_filtered, column)

    error_summary_parts = []

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message']+ '\n')

    if result_non_numeric['issue']:
        error_summary_parts.append(result_non_numeric['dq_issue'] + ':\n ' + result_non_numeric['error_message'])

    # Convert the column to numeric, ignoring errors to leave non-numeric as NaN
    numeric_values = pd.to_numeric(df_filtered[column], errors='coerce')

    # Filter out non-numeric values to avoid them in min/max calculations
    numeric_values_filtered = numeric_values.dropna()

    # Calculate the range (min and max) of the numeric values
    min_value = numeric_values_filtered.min()
    max_value = numeric_values_filtered.max()

    # Count correct values
    correct_values_count = numeric_values_filtered.notna().sum()

    if not error_summary_parts:
        range_summary = f"in the range ({min_value}:{max_value})" if not numeric_values_filtered.empty else "No valid numeric values found."
        return f"All {correct_values_count} values are numerical {range_summary}."

    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts)
    range_summary = f"Range of values: ({min_value}:{max_value})." if not numeric_values_filtered.empty else "No valid numeric values found."

    return f"{error_summary}\n\n{range_summary}"

# Test function
#df_test = pd.DataFrame({'hours': [1, 2, 3, -2, 4.777]})
df_test = pd.DataFrame({'hours': [1, 2, 3, 'a3', 5, '?', None, -1, '', " ", 'null', 5.67, '  ']})
result = check_numerical(df_test, 'hours')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Error(s) found: 
DQI #1 (Missing Data - Completeness):
 5 Blank/Empty/Null/NaN value(s) at index(es): [(6, None), (8, ''), (9, ' '), (10, 'null'), (12, '  ')]

DQI #17 (Wrong Data Type - Consistency):
 2 Non-numeric value(s) at index(es): [(3, 'a3'), (5, '?')]

Range of values: (-1.0:5.67).
Last run on: 2024-04-07 21:40:26


09 Check Numerical between

In [14]:
def check_numerical_between(df: pd.DataFrame, column: str, min_value: float, max_value: float) -> str:
    """
    Check if all values in the specified column are numerical and fall within the given range.
    Additionally, report the actual range of numeric values.
    """
    # First, handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    # Only proceed with other checks if the value is not blank/empty/null/NaN
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]

    # Checks for non-numeric and out-of-range values
    result_non_numeric = DataQualityIssues.handle_non_numeric_values(df_filtered, column)
    result_outside_range = DataQualityIssues.handle_values_outside_range(df_filtered, column, min_value, max_value)

    error_summary_parts = []

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')

    if result_outside_range['issue']:
        error_summary_parts.append(result_outside_range['dq_issue'] + ':\n ' + result_outside_range['error_message'] + '\n')

    if result_non_numeric['issue']:
        error_summary_parts.append(result_non_numeric['dq_issue'] + ':\n ' + result_non_numeric['error_message'] + '\n')

    # Convert the column to numeric, ignoring errors to leave non-numeric as NaN
    numeric_values = pd.to_numeric(df_filtered[column], errors='coerce')

    # Filter out non-numeric values to avoid them in min/max calculations
    numeric_values_filtered = numeric_values.dropna()

    # Calculate the range (min and max) of the numeric values
    actual_min_value = numeric_values_filtered.min()
    actual_max_value = numeric_values_filtered.max()

    range_summary = f"Actual range of values: ({actual_min_value} : {actual_max_value})" if not numeric_values_filtered.empty else "No valid numeric values found."

    if not error_summary_parts:
        return f"All {len(numeric_values_filtered)} values are numerical and valid in the range [{min_value}, {max_value}].\n{range_summary}"

    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts)

    return f"{error_summary}\n{range_summary}"

# Test function
df_test = pd.DataFrame({'age': [1, 2, 131, 'a3', 5, '?', None, -1, '', "", 'NULL', '  ']})
result = check_numerical_between(df_test, 'age', 0, 130)
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Error(s) found: 
DQI #1 (Missing Data - Completeness):
 5 Blank/Empty/Null/NaN value(s) at index(es): [(6, None), (8, ''), (9, ''), (10, 'NULL'), (11, '  ')]

DQI #15 (Domain Violation - Accuracy):
 2 Value(s) outside range [0, 130] at index(es): [(2, 131), (7, -1)]

DQI #17 (Wrong Data Type - Consistency):
 2 Non-numeric value(s) at index(es): [(3, 'a3'), (5, '?')]

Actual range of values: (-1.0 : 131.0)
Last run on: 2024-04-07 21:40:26


10 Check if ID

In [15]:
def check_id_attributes(df: pd.DataFrame, column: str) -> str:
    """
    Check if the values in the specified column are suitable for use as a Primary Key (PK).
    This function checks for blank, non-numeric, negative values, duplicates, and uniqueness violations.
    Additionally, report the range of alphanumeric ID values.
    """
    # Calculate the total number of values in the column
    total_values_count = df[column].size

    # Handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    # Filter out blank values for further checks
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
 
    # Handle negative values in the filtered DataFrame
    result_negative = DataQualityIssues.handle_negative_values(df_filtered, column)

    # Check for floating-point numbers in the filtered DataFrame
    result_floating_point = DataQualityIssues.handle_floating_point_values(df_filtered, column)

    # Check if values are alphanumeric and consistent in length in the filtered DataFrame
    result_alphanumeric_consistency = DataQualityIssues.handle_alphanumeric_consistency(df_filtered, column)
  
    # Handle duplicates and uniqueness violations in the original DataFrame
    result_duplicates = DataQualityIssues.handle_duplicates(df, column)
    result_uniqueness = DataQualityIssues.handle_uniqueness_violation(df, column)

    error_summary_parts = []

    # Append results to error summary
    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')

    if result_duplicates['issue']:
        error_summary_parts.append(result_duplicates['dq_issue'] + ':\n ' + result_duplicates['error_message'] + '\n')

    if result_negative['issue']:
        error_summary_parts.append(result_negative['dq_issue'] + ':\n ' + result_negative['error_message'] + '\n')

    if result_floating_point['issue']:
        error_summary_parts.append(result_floating_point['dq_issue'] + ':\n ' + result_floating_point['error_message'] + '\n')

    if result_alphanumeric_consistency['issue']:
        error_summary_parts.append(result_alphanumeric_consistency['dq_issue'] + ':\n ' + result_alphanumeric_consistency['error_message'] + '\n')

    if result_uniqueness['issue']:
        error_summary_parts.append(result_uniqueness['dq_issue'] + ':\n ' + result_uniqueness['error_message'] + '\n')

    # Calculate the alphanumeric range (min and max) of the values
    alphanumeric_values = df_filtered[column].dropna().astype(str)
    alphanumeric_min_value = alphanumeric_values.min()
    alphanumeric_max_value = alphanumeric_values.max()

    range_summary = f"Alphanumeric range of values: ({alphanumeric_min_value} : {alphanumeric_max_value})" if not alphanumeric_values.empty else "No valid alphanumeric values found."

    if not error_summary_parts:
        return f"All {total_values_count} ID values are unique and valid, and thus suitable for use as a Primary Key.\n{range_summary}"

    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts)
    return f"{error_summary}\n{range_summary}"

# Example usage
df_test = pd.DataFrame({'id_column': [1, 2, 3, 5, None, -1, 1, '', " ", 'null', 5.67]})

'''df_test = pd.DataFrame({'id_column': [1, 2, 3, 5]})'''

'''df_test = pd.DataFrame({
    'id_column': ['AB123CD456', 'AB123CD457', 'AB123CD458', 'AB123CD459', 'AB123CD460', 
                  'AB123CD461', 'AB123CD462', 'AB123CD463', 'AB123CD464', 'AB123CD465', 
                  'AB123CD466', 'AB123CD467', 'AB123CD468', 'AB123CD469', 'AB123CD470', 
                  'AB123CD471', 'AB123CD472', 'AB123CD473', 'AB123CD474', 'AB123CD475', 
                  'Duplicate', 'Duplicate', 'WrongLength1', 'AB123CD456', 'AB123CD479', 
                  'AB123CD480', 'AB123CD481', 'AB123CD482', 'AB123CD483', 'AB123CD484', 
                  'AB123CD48', 'AB123CD456', '?','--', 'Aaa']})'''

'''df_test = pd.DataFrame({
    'id_column': ['AB123CD456', 'AB123CD457', 'AB123CD458', 'AB123CD459', 'AB123CD460', 
                  'AB123CD461', 'AB123CD462', 'AB123CD463', 'AB123CD464', 'AB123CD465', 
                  'AB123CD466', 'AB123CD467', 'AB123CD468', 'AB123CD469', 'AB123CD470', 
                  'AB123CD471', 'AB123CD472', 'AB123CD473']})'''

result = check_id_attributes(df_test, 'id_column')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Error(s) found: 
DQI #1 (Missing Data - Completeness):
 4 Blank/Empty/Null/NaN value(s) at index(es): [(4, None), (7, ''), (8, ' '), (9, 'null')]

DQI #9 (Duplicates - Uniqueness):
 2 Duplicate value(s) at index(es): [[0, 1], [6, 1]]

DQI #15 (Domain Violation - Accuracy):
 1 Negative value(s) at index(es): [(5, -1)]

DQI #15 (Domain Violation - Accuracy):
 1 Floating-point number(s) at index(es): [(10, 5.67)]

DQI #19 (Uniqueness Violation - Uniqueness):
 1 Uniqueness violation(s) at index(es): [(6, 1)]

Alphanumeric range of values: (-1 : 5.67)
Last run on: 2024-04-07 21:40:26


11 Check String

In [16]:
import pandas as pd
import numpy as np

def check_string_content(df: pd.DataFrame, column: str) -> str:
    """
    Check if all the values in the specified column are non-empty, non-null strings.
    Additionally, report the range of string values based on lexicographical order.
    """
    # Calculate the total number of values in the column
    total_values_count = df[column].size

    if column not in df.columns:
        return f"Column {column} does not exist in the DataFrame"
    
    try:
        # First, handle blank/empty/null/NaN values
        result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

        # Handling special characters and extraneous data
        df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
        result_non_string = DataQualityIssues.handle_non_string_values(df_filtered, column)

        error_summary_parts = []
        
        if result_blank['issue']:
            error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')
        
        if result_non_string['issue']:
            error_summary_parts.append(result_non_string['dq_issue'] + ':\n ' + result_non_string['error_message'] + '\n')

        # Calculate the lexicographical range of the string values
        string_values = df_filtered[column].dropna().astype(str)
        lex_min_value = string_values.min()
        lex_max_value = string_values.max()

        range_summary = f"String range (lexicographical): ({lex_min_value} : {lex_max_value})" if not string_values.empty else "No valid string values found."

        if error_summary_parts:
            return "Error(s) found: \n" + "\n".join(error_summary_parts) + '\n' + range_summary
        else:
            return f"All {total_values_count} string values are valid.\n{range_summary}"
    except Exception as e:
        return str(e)

# Example usage
df_bad_string_data = pd.DataFrame({
    'text_column': [
        'Hello',          # Valid string
        None,             # Null value
        np.nan,           # NaN value (also treated as empty)
        '',               # Empty string
        '    ',           # String with only spaces
        123,              # Integer (non-string data type)
        5.67,             # Float (non-string data type)
        True,             # Boolean (non-string data type)
        {'key': 'value'}, # Dictionary (non-string data type)
        '  Goodbye  '     # Whitespace-padded string
    ]
}, columns=['text_column'])

'''df_bad_string_data = pd.DataFrame({
    'text_column': [
        'Hello',          # Valid string
        '  Goodbye  '     # Whitespace-padded string
    ]
}, columns=['text_column'])'''


result = check_string_content(df_bad_string_data, 'text_column')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Error(s) found: 
DQI #1 (Missing Data - Completeness):
 4 Blank/Empty/Null/NaN value(s) at index(es): [(1, None), (2, ''), (3, ''), (4, '    ')]

DQI #17 (Non-String Data Type - Consistency):
 4 Non-string value(s) at index(es): [(5, 123), (6, 5.67), (7, True), (8, {'key': 'value'})]

String range (lexicographical): (  Goodbye   : {'key': 'value'})
Last run on: 2024-04-07 21:40:26


12 Check if Categorical

In [17]:
import pandas as pd

def check_if_categorical(df, column, threshold=100):

    # Calculate the total number of values in the column
    total_values_count = df[column].size

    # Handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    # Extract indices from the result
    blank_indices = []
    if result_blank["issue"]:
        # Assuming the format of the error message remains consistent
        blank_indices = re.findall(r"\((\d+),", result_blank["error_message"])

    # Convert indices to integers
    blank_indices = [int(idx) for idx in blank_indices]

    # Filter out blank/empty/null/NaN values for further analysis
    df_filtered = df[~df.index.isin(blank_indices)]

    # Exclude the indices/values identified in DQI #1 when checking for DQI #4
    df_filtered_for_dqi4 = df_filtered

    # Handle predefined unacceptable values
    result_unacceptable_values = DataQualityIssues.handle_predefined_unacceptable_values(df_filtered_for_dqi4, column)

    # Check for excessive distinct values
    result_excessive_values = DataQualityIssues.handle_excessive_distinct_values(df_filtered, column, threshold)

    error_summary_parts = []

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')

    if result_unacceptable_values['issue']:
        error_summary_parts.append(result_unacceptable_values['dq_issue'] + ':\n ' + result_unacceptable_values['error_message'] + '\n')

    if result_excessive_values['issue']:
        error_summary_parts.append(result_excessive_values['dq_issue'] + ':\n ' + result_excessive_values['error_message'] + '\n')

    # Display frequency distribution with a maximum of 20 entries 10 from the beginning and 10 from the end)
    frequency_table = df_filtered[column].value_counts(dropna=True).reset_index()
    frequency_table.columns = ['Category', 'Frequency']
    # Sort first by Frequency (descending) then by Category (alphabetically)
    frequency_table = frequency_table.sort_values(by=['Frequency', 'Category'], ascending=[False, True]).reset_index(drop=True)

    # Select the first 10 and last 10 rows
    top_rows = frequency_table.head(10)
    bottom_rows = frequency_table.tail(10)
    if len(frequency_table) > 20:
        ellipsis_row = pd.DataFrame([("...", "...")], columns=['Category', 'Frequency'])
        display_table = pd.concat([top_rows, ellipsis_row, bottom_rows], ignore_index=True)
    else:
        display_table = frequency_table

    frequency_distribution_str = f"Categorical format with {df_filtered[column].nunique(dropna=False)} unique values:\n{display_table.to_string(index=False)}"

    # If no errors are found, return a message stating that all values are correct along with frequency distribution
    if not error_summary_parts:
        return f"All {total_values_count} values are correctly categorical.\n\n{frequency_distribution_str}"

    # Combine frequency distribution with error summary (if any)
    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts) if error_summary_parts else ""
    return error_summary + frequency_distribution_str

# Example usage
#df_test = pd.DataFrame({'categorical_column': ['cat', 'dog', 'bird']})
df_test = pd.DataFrame({'categorical_column': ['cat', 'dog', 'bird', '', 'Null', 100, 200, 'cat', 'dog', 'fish', 'fish', 'bird', '?', None,' " "','', "", "''"]})
result = check_if_categorical(df_test, 'categorical_column')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Error(s) found: 
DQI #1 (Missing Data - Completeness):
 7 Blank/Empty/Null/NaN value(s) at index(es): [(3, ''), (4, 'Null'), (13, None), (14, ' " "'), (15, ''), (16, ''), (17, "''")]

DQI #4 (Ambiguous Data - Accuracy, Consistency):
 1 Unacceptable value(s) at index(es): [(12, '?')]
Categorical format with 7 unique values:
Category  Frequency
    bird          2
     cat          2
     dog          2
    fish          2
     100          1
     200          1
       ?          1
Last run on: 2024-04-07 21:40:26


13 Check Month

In [18]:
import pandas as pd
from datetime import datetime

def check_month(df, column):
    error_summary_parts = []
    month_counts = {}

    # Calculate the total number of values in the column
    total_values_count = df[column].size
    
    # First, handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')

    # Handling invalid months
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
    result_invalid_months = DataQualityIssues.handle_invalid_months(df_filtered, column)

    if result_invalid_months['issue']:
        error_summary_parts.append(result_invalid_months['dq_issue'] + ':\n ' + result_invalid_months['error_message'] + '\n')

    # Create frequency distribution for valid months
    for val in df_filtered[column].dropna():
        normalized_month = DataQualityIssues.normalize_month(str(val))
        if normalized_month:
            month_counts[normalized_month] = month_counts.get(normalized_month, 0) + 1

    # Compile frequency distribution
    frequency_table = pd.DataFrame([
        (month, month_to_number(month), count) 
        for month, count in month_counts.items()
    ], columns=['Month', 'MonthNum', 'Frequency'])
    frequency_table = frequency_table.sort_values(by=['Frequency', 'MonthNum'], ascending=[False, True]).reset_index(drop=True)
    frequency_distribution = f"\nFrequency Distribution:\n{frequency_table[['Month', 'Frequency']].to_string(index=False)}"

    # If no errors are found, return a message stating that all values are valid along with frequency distribution
    if not error_summary_parts:
        return f"All {total_values_count} month values are valid.\n{frequency_distribution}"

    # Compile the final result message
    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts) if error_summary_parts else ""
    return error_summary + frequency_distribution

def month_to_number(month_name):
    """
    Convert a month name to its corresponding numeric value.

    Parameters:
    - month_name (str): The full name of the month.

    Returns:
    - int: Numeric representation of the month.
    """
    try:
        return datetime.strptime(month_name, '%B').month
    except ValueError:
        return None

# Example usage
#df_test = pd.DataFrame({'month_column': [1, '3', 12, 'Jan', 'January', 'feb', 'NOV', 'Dec','FEBRUARY']})
df_test = pd.DataFrame({'month_column': [1, '3', 12, '0', '13', 'Jan', 'January', 'feb', 'NOV', 'Dec', 'not a month', None, 'mn', 'FEBRUARY']})
result = check_month(df_test, 'month_column')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Error(s) found: 
DQI #1 (Missing Data - Completeness):
 1 Blank/Empty/Null/NaN value(s) at index(es): [(11, None)]

DQI #15 (Domain Violation - Accuracy):
 4 Invalid month value(s) at index(es): [(3, '0'), (4, '13'), (10, 'not a month'), (12, 'mn')]

Frequency Distribution:
   Month  Frequency
 January          3
February          2
December          2
   March          1
November          1
Last run on: 2024-04-07 21:40:26


14 Check Weekday

In [19]:
from datetime import datetime

def normalize_weekday(weekday_str):
    """
    Normalize the given weekday string to its full weekday name if valid.

    Parameters:
    - weekday_str (str): The weekday string to normalize.

    Returns:
    - str/None: Normalized weekday name if valid, None otherwise.
    """
    weekdays = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
    weekday_abbr = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]
    weekday_abbr_short = ["su", "mo", "tu", "we", "th", "fr", "sa"]

    weekday_str_norm = weekday_str.lower()

    # Normalize full names and abbreviations to full weekday name
    if weekday_str_norm.title() in weekdays:
        return weekday_str_norm.title()
    elif weekday_str_norm.title() in weekday_abbr:
        return weekdays[weekday_abbr.index(weekday_str_norm.title())]
    elif weekday_str_norm in weekday_abbr_short:
        return weekdays[weekday_abbr_short.index(weekday_str_norm)]
    elif weekday_str_norm.isdigit():
        # Convert numeric representation to weekday name
        weekday_num = int(weekday_str_norm)
        if 1 <= weekday_num <= 7:
            return weekdays[weekday_num - 1]  # Adjusting for 1-indexed weekdays
    return None

def check_weekday(df, column):
    error_summary_parts = []
    weekday_counts = {}

    # Calculate the total number of values in the column
    total_values_count = df[column].size

    # First, handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')

    # Handling invalid weekdays
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
    result_invalid_weekdays = DataQualityIssues.handle_invalid_weekdays(df_filtered, column)

    if result_invalid_weekdays['issue']:
        error_summary_parts.append(result_invalid_weekdays['dq_issue'] + ':\n ' + result_invalid_weekdays['error_message'] + '\n')

    # Create frequency distribution for valid weekdays
    for val in df_filtered[column].dropna():
        normalized_weekday = normalize_weekday(str(val))
        if normalized_weekday:
            weekday_counts[normalized_weekday] = weekday_counts.get(normalized_weekday, 0) + 1

    # Compile frequency distribution
    frequency_table = pd.DataFrame(weekday_counts.items(), columns=['Weekday', 'Frequency'])
    frequency_table = frequency_table.sort_values(by='Frequency', ascending=False).reset_index(drop=True)
    frequency_distribution = f"\nFrequency Distribution:\n{frequency_table.to_string(index=False)}"

    # If no errors are found, return a message stating that all values are valid along with frequency distribution
    if not error_summary_parts:
        return f"All {total_values_count} weekday values are valid.\n{frequency_distribution}"

    # Compile the final result message
    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts) if error_summary_parts else ""
    return error_summary + frequency_distribution

# Example usage
df_test = pd.DataFrame({'weekday_column': ['Monday', 'Tue', 2, 'wed', 'Sunday', 'sun', 0, 'not a weekday', None, 'Mo', 'WED', 7, -1, 'Mn']})
#df_test = pd.DataFrame({'weekday_column': ['Monday', 'Tue', 2, 'wed', 'Sunday', 'sun', 'Mo', 'WED', 7, 0]})

result = check_weekday(df_test, 'weekday_column')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Error(s) found: 
DQI #1 (Missing Data - Completeness):
 1 Blank/Empty/Null/NaN value(s) at index(es): [(8, None)]

DQI #15 (Domain Violation - Accuracy):
 3 Invalid weekday value(s) at index(es): [(7, 'not a weekday'), (12, -1), (13, 'Mn')]

Frequency Distribution:
  Weekday  Frequency
   Monday          3
Wednesday          2
   Sunday          2
  Tuesday          1
 Saturday          1
Last run on: 2024-04-07 21:40:26


15 Check Date 

In [20]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

min_valid_year = 1800
max_valid_year = 2100

def extract_indices_from_error_message(error_message):
    # Regular expression to find tuples in the format (index, 'date')
    return [int(m.group(1)) for m in re.finditer(r"\((\d+), '.*?'\)", error_message)]

def check_date(df, column, sample_size=10):
    # Calculate the total number of values in the column
    total_values_count = df[column].size

    error_summary_parts = []

    # Handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)
    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')
        
    # Filter out blank/empty/null/NaN values
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]

    # Deduce regional format
    date_samples = df_filtered[column].astype(str).str.strip()
    date_samples = date_samples[:sample_size]
    deduced_date_format, _ = DataQualityIssues.deduce_regional_format(date_samples)
    deduced_strftime_format = DataQualityIssues.convert_to_strftime_format(deduced_date_format)

    # Handling outdated temporal data
    result_outdated = DataQualityIssues.handle_outdated_temporal_data(df_filtered, column, min_valid_year, max_valid_year)
    if result_outdated['issue']:
        error_summary_parts.append(result_outdated['dq_issue'] + ':\n ' + result_outdated['error_message'] + '\n')
        outdated_indices = extract_indices_from_error_message(result_outdated['error_message'])
        df_filtered = df_filtered[~df_filtered.index.isin(outdated_indices)]

    # Handling invalid dates
    result_invalid_dates = DataQualityIssues.handle_invalid_dates(df_filtered, column)
    if result_invalid_dates['issue']:
        error_summary_parts.append(result_invalid_dates['dq_issue'] + ':\n ' + result_invalid_dates['error_message'] + '\n')
        invalid_indices = extract_indices_from_error_message(result_invalid_dates['error_message'])
        df_filtered = df_filtered[~df_filtered.index.isin(invalid_indices )]

    # Handling format issues for valid dates
    result_format_issues = DataQualityIssues.handle_dates_format(df_filtered, column, deduced_date_format)
    if result_format_issues['issue']:
        error_summary_parts.append(result_format_issues['dq_issue'] + ':\n ' + result_format_issues['error_message'] + '\n')
    
    # Find the earliest and latest dates without strictly relying on deduced_strftime_format
    try:
        valid_dates = pd.to_datetime(df_filtered[column], errors='coerce').dropna()
    except Exception as e:
        print(f"Error converting dates: {e}")
        valid_dates = pd.Series()

    if not valid_dates.empty:
        earliest_date = valid_dates.min()
        latest_date = valid_dates.max()
        # Convert earliest and latest date to a more flexible format for display
        earliest_date_str = earliest_date.strftime('%Y-%m-%d')
        latest_date_str = latest_date.strftime('%Y-%m-%d')
        date_range_summary = f"Date range: {earliest_date_str} to {latest_date_str}"
    else:
        date_range_summary = "No valid date values found."
        
    # Compile the final result message
    if error_summary_parts:
        error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts) + '\n' + date_range_summary
    else:
        error_summary = f"All {total_values_count} date values are valid in the {deduced_date_format} format in the range {earliest_date_str} to {latest_date_str}."

    return error_summary  # Add this line to return the summary


# Example usage
dates = [
    '3/4/2021', '14/5/2021', '01/01/2021', '12/31/2021', '2021/12/25', # Valid dates
    '14 January 2021', '1 Feb 2021', '28 Mar 2021', '4 Apr 2021', '15 Oct 2021', '23 Nov 2021', # Textual months
    '2021/04/7', '2021/08/15', '2021/11/03', '12/30/2021', '07/04/2021', '11/11/2021', # Various formats
    '32/01/2021', '29/02/2021', '31/11/2021', '00/01/2021', '01/00/2021', '2021/13/01', # Invalid dates
    'not a date', '', ' ', '2021-02-30', np.nan, None, 'Null', # Non-date and empty strings
    '1 Feb 2021','1 February 2021',
    '3/4/2121', '14/5/2222', '01/01/1500', '31/12/2321', '2121/12/25', # Dates before 1800 or greater than 2100
]

'''dates = [
'2010-12-01','2010-12-02','2010-12-03','2010-12-14','2010-12-05',
'2010-12-06','2010-12-07','2010-12-08','2010-12-09','2010-12-10',
'2010-12-11','2010-12-12','2010-12-13','2010-12-14']'''

'''dates = [
    '01/01/2021', '02/01/2021', '03/01/2021', 
    '04/01/2021', '05/01/2021', '06/01/2021',
    '07/01/2021', '08/01/2021', '09/01/2021', 
    '10/01/2021', '11/01/2021', '13/01/2021']'''
   
df_test_dates = pd.DataFrame({'date_column': dates})
result = check_date(df_test_dates, 'date_column')
print(result)

print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Error(s) found: 
DQI #1 (Missing Data - Completeness):
 5 Blank/Empty/Null/NaN value(s) at index(es): [(24, ''), (25, ' '), (27, ''), (28, None), (29, 'Null')]

DQI #6 (Outdated Temporal Data - Timeliness):
 5 Date value(s) not in [1800-2100] period at index(es): [(32, '3/4/2121'), (33, '14/5/2222'), (34, '01/01/1500'), (35, '31/12/2321'), (36, '2121/12/25')]

DQI #13 (Temporal mismatch - Accuracy, Timeliness):
 8 Invalid date value(s) at index(es): [(17, '32/01/2021'), (18, '29/02/2021'), (19, '31/11/2021'), (20, '00/01/2021'), (21, '01/00/2021'), (22, '2021/13/01'), (23, 'not a date'), (26, '2021-02-30')]

DQI #14 (Different units/representations - Consistency):
 6 Date value(s) without format 'DDMMYYYY' in [1800-2100] period at index(es): [(3, '12/31/2021'), (4, '2021/12/25'), (11, '2021/04/7'), (12, '2021/08/15'), (13, '2021/11/03'), (14, '12/30/2021')]

Date range: 2021-01-01 to 2021-12-31
Last run on: 2024-04-07 21:40:26


16 Check DateTime

In [21]:
def extract_indices_from_error_message(error_message):
    # Regular expression to find tuples in the format (index, 'value')
    return [int(m.group(1)) for m in re.finditer(r"\((\d+), '.*?'\)", error_message)]

def update_format_counts(standardized_date, format_counts):
    if len(standardized_date) == 8:
        day, month, year = int(standardized_date[:2]), int(standardized_date[2:4]), int(standardized_date[4:])
        if 1 <= day <= 31 and 1 <= month <= 12:
            format_counts['DDMMYYYY'] += 1
        if 1 <= month <= 12 and 1 <= day <= 31:
            format_counts['MMDDYYYY'] += 1
        year, month, day = int(standardized_date[:4]), int(standardized_date[4:6]), int(standardized_date[6:])
        if min_valid_year <= year <= max_valid_year and 1 <= month <= 12 and 1 <= day <= 31:
            format_counts['YYYYMMDD'] += 1
    return format_counts

def check_datetime(df, column, sample_size=10):
    # Calculate the total number of values in the column
    total_values_count = df[column].size
    error_summary_parts = []

    # Handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)
    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')
    
    # Filter out blank/empty/null/NaN values
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]

    # Deduce regional format
    datetime_samples = df_filtered[column].astype(str).str.strip()
    date_samples = [sample.split(' ')[0] for sample in datetime_samples][:sample_size]
    deduced_date_format, _ = DataQualityIssues.deduce_regional_format(date_samples)
    deduced_strftime_format = DataQualityIssues.convert_to_strftime_format(deduced_date_format) + " %H:%M"
    
    # Handling outdated temporal datetimes
    result_outdated = DataQualityIssues.handle_outdated_temporal_data_datetime(df_filtered, column, min_valid_year, max_valid_year)
    if result_outdated['issue']:
        error_summary_parts.append(result_outdated['dq_issue'] + ':\n ' + result_outdated['error_message'] + '\n')
        outdated_indices = extract_indices_from_error_message(result_outdated['error_message'])
        df_filtered = df_filtered[~df_filtered.index.isin(outdated_indices)]

    # Handling format issues for valid dates
    result_format_issues = DataQualityIssues.handle_datetimes_format(df_filtered, column, deduced_date_format)
    if result_format_issues['issue']:
        error_summary_parts.append(result_format_issues['dq_issue'] + ':\n ' + result_format_issues['error_message'] + '\n')
        invalid_indices = extract_indices_from_error_message(result_format_issues['error_message'])
        df_filtered = df_filtered[~df_filtered.index.isin(invalid_indices )]
  
    # Handle invalid datetime formats
    result_invalid_formats = DataQualityIssues.handle_invalid_datetimes(df_filtered, column, deduced_date_format)
    if result_invalid_formats['issue']:
        error_summary_parts.append(result_invalid_formats['dq_issue'] + ':\n ' + result_invalid_formats['error_message'] + '\n')

    # Find the earliest and latest datetime values
    valid_datetimes = pd.to_datetime(df_filtered[column], errors='coerce', format=deduced_strftime_format).dropna()
    earliest_datetime = valid_datetimes.min()
    latest_datetime = valid_datetimes.max()

    # Convert earliest and latest datetime to the deduced date format
    datetime_range_summary = ""
    if not valid_datetimes.empty:
        earliest_datetime_str = earliest_datetime.strftime(deduced_strftime_format)
        latest_datetime_str = latest_datetime.strftime(deduced_strftime_format)
        datetime_range_summary = f"Date range: {earliest_datetime_str} to {latest_datetime_str}\n"
    else:
        datetime_range_summary = "No valid datetime values found.\n"

   # Compile final result message
    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts) if error_summary_parts else f"All {total_values_count} datetime values are valid in the {deduced_date_format} format in the range {earliest_datetime_str} to {latest_datetime_str}."
    return error_summary

# Example usage
timestamps = [
    '01/01/2021 13:00', '02/01/2021 14:30', '03/01/2021 15:45', 
    '04/01/2021 16:00', '05/01/2021 17:15', '06/01/2021 18:30',
    '07/01/2021 19:45', '08/01/2021 20:00', '09/01/2021 21:15', 
    '10/01/2021 22:30', '2021-01-11 23:45', '11-01-2021 10:00 PM', 
    '2021/01/12 23:00', '13/01/2021 12:60', 'January 14, 2021 12:00', 
    '15/01/2021', '2021/16/01 14:00', '17-01-2021', '18/01/2021 25:00', 
    '2021-01-19T15:30', '20th Jan 2021 16:00', '20 Jan 2021 16:00:10', '21/01/2021 16:00:60', 
    'not a datetime', '2021/01/23', '24/01/2021 26:30', '14 January 2021 12:00',
    np.nan, None, 'Null', '', '  ', '29/02/2021 15:20','2021-02-30 15:20:05',
    '3/4/2121 13:00', '14/5/2222 13:05', '01/01/1500 13:00:10', '31/12/2321 13:20', 
    '2121/12/25 13:00:12' # Dates before 1800 or greater than 2100
]

'''timestamps = [
    '01/01/2021 13:00', '02/01/2021 14:30', '03/01/2021 15:45', 
    '04/01/2021 16:00', '05/01/2021 17:15', '06/01/2021 18:30',
    '07/01/2021 19:45', '08/01/2021 20:00', '09/01/2021 21:15', 
    '10/01/2021 22:30']'''

'''timestamps = ['2010-12-01 08:26:00','2010-12-02 08:26:00','2010-12-03 08:28:00','2010-12-01 08:26:00','2010-12-01 08:26:45','2010-12-01 08:26:50',
              '2010-12-01 08:27:00','2010-12-01 08:27:00','2010-12-01 08:26:15','2010-12-01 08:26:00','2010-12-01 08:26:05','2010-12-01 08:26:10',
              '2010-12-01 08:26:20','2010-12-01 08:26:06']'''

df_test_timestamps = pd.DataFrame({'timestamp_column': timestamps})
result = check_datetime(df_test_timestamps, 'timestamp_column')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Error(s) found: 
DQI #1 (Missing Data - Completeness):
 5 Blank/Empty/Null/NaN value(s) at index(es): [(27, ''), (28, None), (29, 'Null'), (30, ''), (31, '  ')]

DQI #6 (Outdated Temporal Data - Timeliness):
 5 Datetime value(s) not in [1800-2100] period at index(es): [(34, '3/4/2121 13:00'), (35, '14/5/2222 13:05'), (36, '01/01/1500 13:00:10'), (37, '31/12/2321 13:20'), (38, '2121/12/25 13:00:12')]

DQI #14 (Different units/representations - Consistency):
 4 Datetime value(s) without format 'DDMMYYYY' in [1800-2100] period at index(es): [(10, '2021-01-11 23:45'), (12, '2021/01/12 23:00'), (14, 'January 14, 2021 12:00'), (24, '2021/01/23')]

DQI #13 (Temporal mismatch - Accuracy, Timeliness):
 9 Invalid datetime value(s) at index(es): [(13, '13/01/2021 12:60'), (16, '2021/16/01 14:00'), (18, '18/01/2021 25:00'), (19, '2021-01-19T15:30'), (22, '21/01/2021 16:00:60'), (23, 'not a datetime'), (25, '24/01/2021 26:30'), (32, '29/02/2021 15:20'), (33, '2021-02-30 15:20:05')]

Last run on: 20

17 Check Time

In [22]:
from datetime import datetime

def parse_time(time_str):
    # If the input is already a time object, return it directly
    if isinstance(time_str, time):
        return time_str

    for fmt in ('%H:%M:%S', '%H:%M', '%I:%M %p'):
        try:
            return datetime.strptime(time_str, fmt).time()
        except ValueError:
            continue
    return None

        
def check_time(df, column):
    error_summary_parts = []
    
    # Calculate the total number of values in the column
    total_values_count = df[column].size
    
    # Handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)
    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')
    
    # Filter out blank/empty/null/NaN values and create a copy for safe modification
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')].copy()

    # Handle invalid time formats
    result_invalid_times = DataQualityIssues.handle_invalid_times(df_filtered, column)
    if result_invalid_times['issue']:
        error_summary_parts.append(result_invalid_times['dq_issue'] + ':\n ' + result_invalid_times['error_message'] + '\n')

    # Convert time strings to datetime.time objects
    df_filtered['time_converted'] = df_filtered[column].apply(parse_time)

    valid_times = df_filtered['time_converted'].dropna()
    earliest_time = min(valid_times, default=None)
    latest_time = max(valid_times, default=None)

    time_range_summary = ""
    if earliest_time and latest_time:
        time_range_summary = f"\nTime range: ({earliest_time} to {latest_time})\n"
    else:
        time_range_summary = "No valid time values found.\n"

    final_summary = ""
    if error_summary_parts:
        final_summary = "Error(s) found: \n" + "\n".join(error_summary_parts) + time_range_summary
    else:
        final_summary = f"All {total_values_count} time values are valid in the range {earliest_time} to {latest_time}."

    return final_summary

# Example usage
#df_test = pd.DataFrame({'time_column': ['12:30', '02:30 PM', '14:30:15', '03:05 AM']})
df_test = pd.DataFrame({'time_column': ['12:30', '13:61', '02:30 PM', '14:30:15', '03:05 AM', 'invalid', '', None, '02:30 PN', '25:03']})

result = check_time(df_test, 'time_column')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Error(s) found: 
DQI #1 (Missing Data - Completeness):
 2 Blank/Empty/Null/NaN value(s) at index(es): [(6, ''), (7, None)]

DQI #13 (Temporal mismatch - Accuracy, Timeliness):
 4 Invalid time value(s) at index(es): [(1, '13:61'), (5, 'invalid'), (8, '02:30 PN'), (9, '25:03')]

Time range: (03:05:00 to 14:30:15)

Last run on: 2024-04-07 21:40:26


18 Check Model Name

In [23]:
def check_model_name(df, column):
    error_summary_parts = []
    
    total_values_count = df[column].size
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message']+ '\n')

    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts) if error_summary_parts else ""

    # Frequency distribution calculation
    model_counts = df[column].value_counts().to_dict()
    frequency_table = pd.DataFrame(model_counts.items(), columns=[column, 'Frequency'])
    frequency_table = frequency_table.sort_values(by=['Frequency', column], ascending=[False, True]).reset_index(drop=True)

    if len(frequency_table) > 20:
        top_rows = frequency_table.head(10)
        bottom_rows = frequency_table.tail(10)
        ellipsis_row = pd.DataFrame([("...", "...")], columns=[column, 'Frequency'])
        display_table = pd.concat([top_rows, ellipsis_row, bottom_rows], ignore_index=True)
    else:
        display_table = frequency_table

    frequency_distribution = f"\nFrequency Distribution:\n{display_table.to_string(index=False)}\n"

    # Get the range of values (smallest and largest)
    sorted_df = df.sort_values(by=column)
    smallest_model = sorted_df.iloc[0][column]
    biggest_model = sorted_df.iloc[-1][column]
    range_of_values = f"\nRange of Values: ({smallest_model} to {biggest_model})"

    return error_summary + frequency_distribution + range_of_values if error_summary_parts else f"All {total_values_count} {column} values are valid.\n{frequency_distribution}{range_of_values}"

# Test the function with your dataframe
df_test = pd.DataFrame({
    'model_name_column': [
        '32/60', '470v/7', 'vs-100', '', '  ', '?',
        '90/80-model-3', '11',  '50-850-ii'
        'dn420', '580-5840'
    ]
})
result = check_model_name(df_test, 'model_name_column')
print(result)

print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")



Error(s) found: 
DQI #1 (Missing Data - Completeness):
 2 Blank/Empty/Null/NaN value(s) at index(es): [(3, ''), (4, '  ')]

Frequency Distribution:
model_name_column  Frequency
                           1
                           1
               11          1
            32/60          1
           470v/7          1
   50-850-iidn420          1
         580-5840          1
    90/80-model-3          1
                ?          1
           vs-100          1

Range of Values: ( to vs-100)
Last run on: 2024-04-07 21:40:26


18.5 Check Name

In [24]:
import pandas as pd

def check_name(df, column):
    error_summary_parts = []
    linking_words = {'the', 'and', 'of', 'do', 'da', 'de', 'del', 'dos', 'e', 'md', 'ii', 'iii', 'iv', 'v', 'jr', 'sr', 'phd'}  # Set of lowercase linking words and suffixes
        
    # Calculate the total number of values in the column
    total_values_count = df[column].size

    # First, handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    # Check if first 10 values are all lowercase
    first_10_lowercase = df[column].head(10).str.islower().all()

    # Handling special characters and extraneous data
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
    result_extraneous = DataQualityIssues.handle_extraneous_data(df_filtered, column)

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')

    indices_to_exclude = set()

    if result_extraneous['issue']:
        # Convert indices to integers (if necessary)
        indices_to_exclude.update(result_extraneous.get('indices', []))
        #print("indices_to_exclude",indices_to_exclude)
        error_summary_parts.append(result_extraneous['dq_issue'] + ':\n ' + result_extraneous['error_message'] + '\n')

    if indices_to_exclude:
        #print(f"Indices to be excluded: {sorted(indices_to_exclude)}")    
        # Filter out rows with extraneous data
        df_filtered = df_filtered.loc[~df_filtered.index.isin(indices_to_exclude)]    
 
    # Perform capitalization format check only if not all first 10 names are lowercase
    if not first_10_lowercase:
        result_capitalization_format = DataQualityIssues.handle_capitalization_format(df_filtered, column, linking_words)
        if result_capitalization_format['issue']:
            error_summary_parts.append(result_capitalization_format['dq_issue'] + ':\n ' + result_capitalization_format['error_message']+ '\n')
    
    # Compile the final result message
    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts) if error_summary_parts else ""

   # Frequency distribution calculation
    name_counts = df[column].value_counts().to_dict()
    frequency_table = pd.DataFrame(name_counts.items(), columns=[column, 'Frequency'])
    frequency_table = frequency_table.sort_values(by=['Frequency', column], ascending=[False, True]).reset_index(drop=True)

    if len(frequency_table) > 20:
        top_rows = frequency_table.head(10)
        bottom_rows = frequency_table.tail(10)
        ellipsis_row = pd.DataFrame([("...", "...")], columns=[column, 'Frequency'])
        display_table = pd.concat([top_rows, ellipsis_row, bottom_rows], ignore_index=True)
    else:
        display_table = frequency_table

    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts) if error_summary_parts else ""
    frequency_distribution = f"\nFrequency Distribution:\n{display_table.to_string(index=False)}\n"

    # Get the range of values (smallest and largest)
    sorted_df = df.sort_values(by=column)
    smallest_name = sorted_df.iloc[0][column]
    biggest_name = sorted_df.iloc[-1][column]
    range_of_values = f"\nRange of Values: ({smallest_name} to {biggest_name})"

    return error_summary + frequency_distribution + range_of_values if error_summary_parts else f"All {total_values_count} {column} values are valid.\n{frequency_distribution}{range_of_values}"

# Test the function
df_test = pd.DataFrame({
    'name_column': [
        'John Doe', 'jane doe', 'Mr. Smith', 'Anne-Marie', '', '  ', '?',
        'John3 Doe', 'Emily!', '11', 'Mary Joe MD', 'John Williams II', 'Madonna',
        'Jean Paul Gautier, Jr', 'João Paulo Pereira e Souza Filho', 'José Augusto Napoleão Ferreira dos Santos', 
        'John F. Kennedy, Phd', 'John Newman, PhD'
    ]
})

'''df_test = pd.DataFrame({
    'name_column': [
        'John Doe','Mr. Smith', 'Anne-Marie', 'Madonna',
        'Jean Paul Gautier, Jr', 'João Paulo Pereira e Souza Filho', 'José Augusto Napoleão Ferreira dos Santos', 
        'John F. Kennedy, Phd', 'John Newman, PhD'
    ]
})'''
result = check_name(df_test, 'name_column')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Error(s) found: 
DQI #1 (Missing Data - Completeness):
 2 Blank/Empty/Null/NaN value(s) at index(es): [(4, ''), (5, '  ')]

DQI #5 (Extraneous Data - Consistency, Uniqueness):
 4 Extraneous data value(s) at index(es): [(6, '?'), (7, 'John3 Doe'), (8, 'Emily!'), (9, '11')]

DQI #15 (Domain Violation - Accuracy):
 1 Capitalization/Format issue(s) at index(es): [(1, 'jane doe')]

Frequency Distribution:
                              name_column  Frequency
                                                   1
                                                   1
                                       11          1
                                        ?          1
                               Anne-Marie          1
                                   Emily!          1
                    Jean Paul Gautier, Jr          1
                                 John Doe          1
                     John F. Kennedy, Phd          1
                         John Newman, PhD          1
             

19 Check Street

In [25]:
def check_street(df, column):
    """
    Check if street names in the specified column conform to expected standards.

    Parameters:
    - df (pd.DataFrame): The pandas DataFrame to analyze.
    - column (str): The name of the column with street names.

    Returns:
    - str: A message indicating the result of the street name checks.
    """
    
    # Calculate the total number of values in the column
    total_values_count = df[column].size

    error_summary_parts = []
    linking_words = {'the', 'and', 'of', 'do', 'da', 'de', 'del', 'e', 'th','rd'}  # Set of lowercase linking words

    # First, handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    # Handling special characters and extraneous data
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
    result_street_extraneous = DataQualityIssues.handle_street_extraneous_data(df_filtered, column)
    result_capitalization_format = DataQualityIssues.handle_capitalization_format(df_filtered, column, linking_words)

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')

    if result_street_extraneous['issue']:
        error_summary_parts.append(result_street_extraneous['dq_issue'] + ':\n ' + result_street_extraneous['error_message'] + '\n')

    if result_capitalization_format['issue']:
        error_summary_parts.append(result_capitalization_format['dq_issue'] + ':\n ' + result_capitalization_format['error_message']+ '\n')
    
    # Compile the final result message
    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts)
    
    # Sort the DataFrame by the name column
    sorted_df = df.sort_values(by=column)

    # Get the first and last name after sorting
    smallest_name = sorted_df.iloc[0][column]
    biggest_name = sorted_df.iloc[-1][column]
 
    return error_summary if error_summary_parts else f"All {total_values_count} street values are valid in the range ({smallest_name} to {biggest_name}).\n"

# Test the function with sample data
# Sample data for street checks
streets = [
    '123 Main St', '45 Oxford Road', 'Broadway Ave', '5th Avenue', 
    'Mt. Everest Street', 'InvalidStreet', '12, Elm Street', '77 Sunset Strip', 
    '221B Baker Street', 'Elm St.', 'Ocean Drive', 'Park Ave', 'Sesame St', 
    'Main Street 123', 'Pennsylvania Avenue NW', 'Sunset boulevard', 
    'Abbey Road', 'Fleet Street', 'Diagon Alley', '15/250 Beaufort St',
    None, '', '  ', '?', 'John3 Doe', 'Emily!', '11', 'R. Prof Paulo Roberto Martins, 2', 'Null'
]

'''streets = [
    '123 Main St', '45 Oxford Road', 'Broadway Ave', '5th Avenue', 
    'Mt. Everest Street', '12, Elm Street', '77 Sunset Strip', 
    '221B Baker Street', 'Elm St.', 'Ocean Drive', 'Park Ave', 'Sesame St', 
    'Main Street 123', 'Pennsylvania Avenue NW', 
    'Abbey Road', 'Fleet Street', 'Diagon Alley', '15/250 Beaufort St',
    'R. Prof Paulo Roberto Martins, 2'
]'''

df_streets = pd.DataFrame({'street': streets})
result = check_street(df_streets, 'street')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Error(s) found: 
DQI #1 (Missing Data - Completeness):
 4 Blank/Empty/Null/NaN value(s) at index(es): [(20, None), (21, ''), (22, '  '), (28, 'Null')]

DQI #5 (Extraneous Data - Consistency, Uniqueness):
 3 Extraneous street data value(s) at index(es): [(23, '?'), (25, 'Emily!'), (26, '11')]

DQI #15 (Domain Violation - Accuracy):
 2 Capitalization/Format issue(s) at index(es): [(5, 'InvalidStreet'), (15, 'Sunset boulevard')]

Last run on: 2024-04-07 21:40:26


20 Check City

In [26]:
def check_city(df, column):
    """
    Check if city names in the specified column conform to expected standards and provide a frequency distribution.
    This includes checks for proper capitalization and invalid characters.

    Parameters:
    - df (pd.DataFrame): The pandas DataFrame to analyze.
    - column (str): The name of the column with city names.

    Returns:
    - str: A message indicating the result of the city name checks and their frequency distribution.
    """
    
    # Calculate the total number of values in the column
    total_values_count = df[column].size

    city_counts = {}
    linking_words = {'the', 'and', 'of', 'do', 'da', 'de', 'del', 'e','dos'}  # Set of lowercase linking words

    # First, handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    # Handling special characters and extraneous data
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
    result_extraneous = DataQualityIssues.handle_extraneous_data(df_filtered, column)
    result_capitalization_format = DataQualityIssues.handle_capitalization_format(df_filtered, column, linking_words)

    error_summary_parts = []

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message']+ '\n')

    if result_extraneous['issue']:
        error_summary_parts.append(result_extraneous['dq_issue'] + ':\n ' + result_extraneous['error_message']+ '\n')

    if result_capitalization_format['issue']:
        error_summary_parts.append(result_capitalization_format['dq_issue'] + ':\n ' + result_capitalization_format['error_message']+ '\n')

    # Frequency distribution calculation
    for idx, city in df_filtered.iterrows():
        city_str = str(city[column]).strip()

        if city_str not in [item[1] for item in error_summary_parts]:
            # Counting occurrences of each city
            city_counts[city_str] = city_counts.get(city_str, 0) + 1

    # Creating a frequency table sorted first by frequency and then alphabetically
    if city_counts:
        frequency_table = pd.DataFrame(city_counts.items(), columns=['City', 'Frequency'])
        frequency_table = frequency_table.sort_values(by=['Frequency', 'City'], ascending=[False, True]).reset_index(drop=True)
        # Select the first 10 and last 10 rows if more than 20 distinct items
        top_rows = frequency_table.head(10)
        bottom_rows = frequency_table.tail(10)
        if len(frequency_table) > 20:
            ellipsis_row = pd.DataFrame([("...", "...")], columns=['City', 'Frequency'])
            display_table = pd.concat([top_rows, ellipsis_row, bottom_rows], ignore_index=True)
        else:
            display_table = frequency_table

        result_str = f"Frequency Distribution (showing top and bottom 10 of {len(frequency_table)} categories):\n{display_table.to_string(index=False)}\n"

        # Compile the final result message
        error_summary = ''
        if error_summary_parts:
            error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts) + "\n"

        final_message = error_summary + result_str

        # If no errors are found, return a message stating that all values are valid along with frequency distribution
        if not error_summary_parts:
            final_message = f"All {total_values_count} city values are valid.\n{result_str}"

        return final_message

# Sample data for city checks
cities = [
    'New York', 'London', 'Paris', 'Tokyo', 'Tokyo',
    'los angeles', 'Sydney', 'Beijing', 'Cairo', 'new delhi', 'San Francisco', 
    'San francisco', 'Chicago', 'Boston', 'Berlin', 'Amsterdam', 
    'Hong Kong', 'Singapore', 'Dubai', 'Moscow', 'São Paulo','São José dos Campos',
    None, '', '  ', '?', 'Dubai!',  11
]

'''cities = [
    'New York', 'London', 'Paris', 'Tokyo', 'Tokyo',
    'Sydney', 'Beijing', 'Cairo', 'San Francisco', 
    'Chicago', 'Boston', 'Berlin', 'Amsterdam', 
    'Hong Kong', 'Singapore', 'Dubai', 'Moscow', 'São Paulo','São José dos Campos',
    'NY', 'Los Angeles', 'Cairo', 'Brasilia'
]'''
df_cities = pd.DataFrame({'city': cities})
result = check_city(df_cities, 'city')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Error(s) found: 
DQI #1 (Missing Data - Completeness):
 3 Blank/Empty/Null/NaN value(s) at index(es): [(22, None), (23, ''), (24, '  ')]

DQI #5 (Extraneous Data - Consistency, Uniqueness):
 3 Extraneous data value(s) at index(es): [(25, '?'), (26, 'Dubai!'), (27, 11)]

DQI #15 (Domain Violation - Accuracy):
 3 Capitalization/Format issue(s) at index(es): [(5, 'los angeles'), (9, 'new delhi'), (11, 'San francisco')]

Frequency Distribution (showing top and bottom 10 of 24 categories):
               City Frequency
              Tokyo         2
                 11         1
                  ?         1
          Amsterdam         1
            Beijing         1
             Berlin         1
             Boston         1
              Cairo         1
            Chicago         1
              Dubai         1
                ...       ...
           New York         1
              Paris         1
      San Francisco         1
      San francisco         1
          Singapore         1


21 Check State

In [27]:
def check_state(df, column):
    """
    Check if state names in the specified column conform to expected standards of capitalization
    and provide a frequency distribution, allowing certain lowercase words and abbreviations.

    Parameters:
    - df (pd.DataFrame): The pandas DataFrame to analyze.
    - column (str): The name of the column with state names.

    Returns:
    - str: A message indicating the result of the state name checks and their frequency distribution.
    """
    incorrect_indices_and_values = []
    state_counts = {}
    lowercase_exceptions = {"e", "do", "dos", "da", "das", "de","dos"}  # Lowercase exceptions
    
    # Calculate the total number of values in the column
    total_values_count = df[column].size

    # First, handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    # Handling special characters and extraneous data
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
    result_extraneous = DataQualityIssues.handle_extraneous_data(df_filtered, column)
    result_capitalization_format = DataQualityIssues.handle_capitalization_format(df_filtered, column, lowercase_exceptions)

    error_summary_parts = []

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message']+ '\n')

    if result_extraneous['issue']:
        error_summary_parts.append(result_extraneous['dq_issue'] + ':\n ' + result_extraneous['error_message']+ '\n')

    if result_capitalization_format['issue']:
        error_summary_parts.append(result_capitalization_format['dq_issue'] + ':\n ' + result_capitalization_format['error_message']+ '\n')

    # Frequency distribution calculation
    for idx, state in df.iterrows():
        state_str = str(state[column]).strip()

        # Skip blank/empty/null/NaN values and incorrect values for frequency calculation
        if state_str and state_str not in [item[1] for item in incorrect_indices_and_values]:
            # Counting occurrences of each state
            state_counts[state_str] = state_counts.get(state_str, 0) + 1

    # Creating a frequency table sorted first by frequency and then alphabetically
    if state_counts:
        frequency_table = pd.DataFrame(state_counts.items(), columns=['State', 'Frequency'])
        frequency_table = frequency_table.sort_values(by=['Frequency', 'State'], ascending=[False, True]).reset_index(drop=True)

        # Select the first 10 and last 10 rows if more than 20 distinct items
        top_rows = frequency_table.head(10)
        bottom_rows = frequency_table.tail(10)
        if len(frequency_table) > 20:
            ellipsis_row = pd.DataFrame([("...", "...")], columns=['State', 'Frequency'])
            display_table = pd.concat([top_rows, ellipsis_row, bottom_rows], ignore_index=True)
        else:
            display_table = frequency_table

        result_str = f"\nFrequency Distribution (showing top and bottom 10 of {len(frequency_table)} categories):\n{display_table.to_string(index=False)}\n"   

        # Compile the final result message
        error_summary = ''
        if error_summary_parts:
            error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts) + "\n"

        final_message = error_summary + result_str

        # If no errors are found, return a message stating that all values are valid along with frequency distribution
        if not error_summary_parts:
            final_message = f"All {total_values_count} state values are valid.\n{result_str}"

        return final_message

# Sample data for state checks
states = [
     'CA2', 'New York', 'Texas', 'FL', 'Texas',
    'Nevada', 'WA', 'Queensland', 'Bavaria', 'Delhi', 'New york',
    'Illinois', 'Victoria', 'Ontario', 'Colorado', 'Arizona', 
    'NSW', 'Gauteng', 'Hawaii', 'Alaska', 'Punjab', 'new south wales',
    None, '', '  ', '?', 'California!', '11', 'Rio Grande do Sul', 'new Jersey', 'n york', 'N. Dakota', 'São Paulo'
]

'''states = [
     'CA', 'New York', 'Texas', 'FL', 'Texas',
    'Nevada', 'WA', 'Queensland', 'Bavaria', 'Delhi', 'New York',
    'Illinois', 'Victoria', 'Ontario', 'Colorado', 'Arizona', 
    'NSW', 'Gauteng', 'Hawaii', 'Alaska', 'Punjab', 
    'Rio Grande do Sul', 'N. Dakota', 'São Paulo'
]'''

df_states = pd.DataFrame({'state': states})
result = check_state(df_states, 'state')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Error(s) found: 
DQI #1 (Missing Data - Completeness):
 3 Blank/Empty/Null/NaN value(s) at index(es): [(22, None), (23, ''), (24, '  ')]

DQI #5 (Extraneous Data - Consistency, Uniqueness):
 4 Extraneous data value(s) at index(es): [(0, 'CA2'), (25, '?'), (26, 'California!'), (27, '11')]

DQI #15 (Domain Violation - Accuracy):
 4 Capitalization/Format issue(s) at index(es): [(10, 'New york'), (21, 'new south wales'), (29, 'new Jersey'), (30, 'n york')]


Frequency Distribution (showing top and bottom 10 of 30 categories):
            State Frequency
            Texas         2
               11         1
                ?         1
           Alaska         1
          Arizona         1
          Bavaria         1
              CA2         1
      California!         1
         Colorado         1
            Delhi         1
              ...       ...
          Ontario         1
           Punjab         1
       Queensland         1
Rio Grande do Sul         1
        São Paulo       

22 Check Country

In [28]:
def check_country(df, column):
    """
    Check if country names in the specified column conform to expected standards of capitalization 
    and provide a frequency distribution.
    """
    incorrect_indices_and_values = []
    country_counts = {}
    linking_words = {'the', 'and', 'of', 'do', 'da', 'de', 'del', 'e', 'dos','etc'}  # Set of lowercase linking words

    # Calculate the total number of values in the column
    total_values_count = df[column].size

    # First, handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    # Filtering df based on blank checks
    indices_to_exclude_blank = [idx for idx, _ in result_blank.get('indices_and_values', [])]
    print(indices_to_exclude_blank)
    df_filtered = df.drop(indices_to_exclude_blank)

    # Handling special characters and extraneous data
    #df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
    result_extraneous = DataQualityIssues.handle_extraneous_data(df_filtered, column)

    if result_extraneous['issue']:
        indices_to_exclude_extraneous = result_extraneous['indices']  # Directly use the indices
        df_filtered = df_filtered.drop(index=indices_to_exclude_extraneous)

    result_capitalization_format = DataQualityIssues.handle_capitalization_format_country(df_filtered, column, linking_words)

    error_summary_parts = []

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message']+ '\n')

    if result_extraneous['issue']:
        error_summary_parts.append(result_extraneous['dq_issue'] + ':\n ' + result_extraneous['error_message']+ '\n')

    if result_capitalization_format['issue']:
        error_summary_parts.append(result_capitalization_format['dq_issue'] + ':\n ' + result_capitalization_format['error_message']+ '\n')

    # Frequency distribution calculation
    for idx, country in df.iterrows():
        country_str = str(country[column]).strip()
        
        # Skip blank/empty/null/NaN values and incorrect values for frequency calculation
        if country_str and country_str not in [item[1] for item in incorrect_indices_and_values]:
            # Counting occurrences of each country
            country_counts[country_str] = country_counts.get(country_str, 0) + 1

    # Creating a frequency table sorted first by frequency and then alphabetically
    if country_counts:
        frequency_table = pd.DataFrame(country_counts.items(), columns=['Country', 'Frequency'])
        frequency_table = frequency_table.sort_values(by=['Frequency', 'Country'], ascending=[False, True]).reset_index(drop=True)

        # Select the first 10 and last 10 rows if more than 20 distinct items
        top_rows = frequency_table.head(10)
        bottom_rows = frequency_table.tail(10)
        if len(frequency_table) > 20:
            ellipsis_row = pd.DataFrame([("...", "...")], columns=['Country', 'Frequency'])
            display_table = pd.concat([top_rows, ellipsis_row, bottom_rows], ignore_index=True)
            result_str = f"\nFrequency Distribution (showing top and bottom 10 of {len(frequency_table)} categories):\n{display_table.to_string(index=False)}\n"
        else:
            result_str = f"\nFrequency Distribution:\n{frequency_table.to_string(index=False)}\n"
    else:
        result_str = "All country values are valid"

    # Compile the final result message
    error_summary = ''
    if error_summary_parts:
        error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts) + "\n"

    final_message = error_summary + result_str

    # If no errors are found, return a message stating that all values are valid along with frequency distribution
    if not error_summary_parts:
        final_message = f"All {total_values_count} country values are valid.\n{result_str}"

    return final_message


# Test function
countries = [
    'USA', 'United Kingdom', 'France', 'Japan', 'Australia', 
    'India', 'Australia', 'China', 'Egypt', 'Canada', 
    'Germany', 'Brazil', 'South Africa', 'Russia', 'puerto rico', 
    'ITA', 'SPA', 'BR', 'SWE', 'Papua New Guinea',
    None, '', '  ', '?', 'Canada!', '11', 'The United States of America', 'Outlying-US (Guam-USVI-etc)',
    'guatemala', 'papua New Guinea', 'Saint Vincent and the Grenadines', 'Null', 'US-Virgin-Isles'
]

'''"AnalysedColumns 2202.xlsx"countries = [
    'USA', 'United Kingdom', 'France', 'Japan', 'Australia', 
    'India', 'Australia', 'China', 'Egypt', 'Canada', 
    'Germany', 'Brazil', 'South Africa', 'Russia', 'Puerto Rico', 
    'ITA', 'SPA', 'BR', 'SWE', 'Papua New Guinea',
    'Canada', 'The United States of America',
    'Guatemala', 'Papua New Guinea', 'Saint Vincent and the Grenadines', 'US-Virgin-Isles','?'
]'''

df_countries = pd.DataFrame({'country': countries})
result = check_country(df_countries, 'country')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

[]
Error(s) found: 
DQI #1 (Missing Data - Completeness):
 4 Blank/Empty/Null/NaN value(s) at index(es): [(20, None), (21, ''), (22, '  '), (31, 'Null')]

DQI #5 (Extraneous Data - Consistency, Uniqueness):
 3 Extraneous data value(s) at index(es): [(23, '?'), (24, 'Canada!'), (25, '11')]

DQI #15 (Domain Violation - Accuracy):
 3 Capitalization/Format issue(s) at index(es): [(14, 'puerto rico'), (28, 'guatemala'), (29, 'papua New Guinea')]


Frequency Distribution (showing top and bottom 10 of 30 categories):
                         Country Frequency
                       Australia         2
                              11         1
                               ?         1
                              BR         1
                          Brazil         1
                          Canada         1
                         Canada!         1
                           China         1
                           Egypt         1
                          France         1
           

23 Check Postal Code

In [29]:
def check_postal_code(df: pd.DataFrame, column: str) -> dict:
    """
    Check if postal code entries in the specified column are valid.
    """
    # Calculate the total number of values in the column
    total_values_count = df[column].size

    # First, handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    # Only proceed with other checks if the value is not blank/empty/null/NaN
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
    
    result_non_alphanumeric = DataQualityIssues.handle_non_alphanumeric_values(df_filtered, column)
    result_short_length = DataQualityIssues.handle_short_length_values(df_filtered, column, 4)

    error_summary_parts = []

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message']+ '\n')

    if result_short_length['issue']:
        error_summary_parts.append(result_short_length['dq_issue'] + ':\n ' + result_short_length['error_message']+ '\n')
        
    if result_non_alphanumeric['issue']:
        error_summary_parts.append(result_non_alphanumeric['dq_issue'] + ':\n ' + result_non_alphanumeric['error_message']+ '\n')
 
    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts)  
    # Sort the DataFrame by the name column
    sorted_df = df.sort_values(by=column)

    # Get the first and last name after sorting
    smallest_name = sorted_df.iloc[0][column]
    biggest_name = sorted_df.iloc[-1][column]
 
    return error_summary if error_summary_parts else f"All {total_values_count} postal codes values are valid in the range ({smallest_name} to {biggest_name}).\n"

# Sample data for postal code checks, including edge cases
'''postal_codes = [
    '10001', 'SW1A 1AA', '75008', '100-0001', 
    '110001', '2000', '100000', '11511', 'M4W 1A8', 
    '10115', '01311', '2001', '101000', '06500', 
    '00184', '28013', '1012 WX', '111 20', '0101', 
    '71676-110', '6000', None, '', '  ', '?', '1000!', '11'
]'''

postal_codes = [
    '10001', 'SW1A 1AA', '75008', '100-0001', 
    '110001', '2000', '100000', '11511', 'M4W 1A8', 
    '10115', '01311', '2001', '101000', '06500', 
    '00184', '28013', '1012 WX', '111 20', '0101', 
    '71676-110', '6000'
]

df_postal_codes = pd.DataFrame({'postal_code': postal_codes})

# Test the function
result = check_postal_code(df_postal_codes, 'postal_code')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

All 21 postal codes values are valid in the range (00184 to SW1A 1AA).

Last run on: 2024-04-07 21:40:27


24 Check Phone Numbers

In [30]:
def check_phone_numbers(df, column):
    """
    Check if phone number entries in the specified column are valid.
    """
    # Calculate the total number of values in the column
    total_values_count = df[column].size

    # First, handle blank/empty/null/NaN values
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)

    # Only proceed with other checks if the value is not blank/empty/null/NaN
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]
    result_format = DataQualityIssues.handle_phone_number_format(df_filtered, column)

    error_summary_parts = []

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message']+ '\n')

    if result_format['issue']:
        error_summary_parts.append(result_format['dq_issue'] + ':\n ' + result_format['error_message']+ '\n')
      
    error_summary = "Error(s) found: \n" + "\n".join(error_summary_parts)

    # Sort the DataFrame by the name column
    sorted_df = df.sort_values(by=column)

    # Get the first and last name after sorting
    smallest_name = sorted_df.iloc[0][column]
    biggest_name = sorted_df.iloc[-1][column]
 
    return error_summary if error_summary_parts else f"All {total_values_count} telephone numbers are valid in the range ({smallest_name} to {biggest_name}).\n"
    
# Test data
'''df_test = pd.DataFrame({
    'phone_numbers': [
        '123-456-7890', '(123) 456-7890', '+1 123 456 7890', 'InvalidNumber', 
        '+55 21 11 3415 1515', '04148991268624', '+55 48 3224-4209', '+55 48 91268-624', 
        '000', '+61137425', '+1 414-690-7935', '04121993720444', '01188335944', 
        '4144494331', '+55 31 3414-2179', '+61 405 833 952', '0405 833 952',
        None, '', '  ', '?', 'John Doe', '0405 833 952!', 11
    ]
})'''

df_test = pd.DataFrame({
    'phone_numbers': [
        '123-456-7890', '(123) 456-7890', '+1 123 456 7890', 
        '+55 21 11 3415 1515', '04148991268624', '+55 48 3224-4209', '+55 48 91268-624', 
        '000', '+61137425', '+1 414-690-7935', '04121993720444', '01188335944', 
        '4144494331', '+55 31 3414-2179', '+61 405 833 952', '0405 833 952',
        '0405 833 952'
    ]
})

result = check_phone_numbers(df_test, 'phone_numbers')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

All 17 telephone numbers are valid in the range ((123) 456-7890 to 4144494331).

Last run on: 2024-04-07 21:40:27


25 Check IP format

In [31]:
def check_ip_format(df, column):

    # Calculate the total number of values in the column
    total_values_count = df[column].size

    # Handle blank/empty/null/NaN values first
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]

    # Then check IP format
    result_ip_format = DataQualityIssues.handle_ip_format(df_filtered, column)

    error_summary_parts = []

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')

    if result_ip_format['issue']:
        error_summary_parts.append(result_ip_format['dq_issue'] + ':\n ' + result_ip_format['error_message'] + '\n')

    return "Error(s) found: \n" + "\n".join(error_summary_parts) if error_summary_parts else f"All {total_values_count} IP values are valid."

# Example usage
'''df_test = pd.DataFrame({
    'ip_column': [
        '192.168.1.1', '256.256.256.256', '127.0.0.1', '1.1', 
        '2001:0db8:85a3:0000:0000:8a2e:0370:7334', '::1', 
        '2001:db8::1234:5678', 'fe80::1ff:fe23:4567:890a', 
        None, '', '  ', '?', '0.0.0.0.0!', '11', 'incorrect:ipv6:address'
    ]
})'''

df_test = pd.DataFrame({
    'ip_column': [
        '192.168.1.1', '256.256.256.256', '127.0.0.1'
    ]
})

result = check_ip_format(df_test, 'ip_column')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

All 3 IP values are valid.
Last run on: 2024-04-07 21:40:27


26 Check URL format

In [32]:
def check_url_format(df, column):
    # Calculate the total number of values in the column
    total_values_count = df[column].size
    
    # Handle blank/empty/null/NaN values first
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]

    # Then check URL format
    result_url_format = DataQualityIssues.handle_url_format(df_filtered, column)

    error_summary_parts = []

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')

    if result_url_format['issue']:
        error_summary_parts.append(result_url_format['dq_issue'] + ':\n ' + result_url_format['error_message'] + '\n')

    return "Error(s) found: \n" + "\n".join(error_summary_parts) if error_summary_parts else f"All {total_values_count} URL values are valid."

'''test_urls = [
    'https://www.example.com', 'http://example.org', 'http://192.168.1.1', 'http://localhost/test',
    'https://www.example.com:8080/path/to/resource', 'ftp://example.com', 'http://exa_mple.com',
    'http://999.999.999.999', 'https://', 'http://', 'http://exam!ple.com', 'https://www.example..com',
    'http:// example .com', 'justsometext', '12345', '', '  ', None, 'null',
    'https://chat.openai.com/c/9c317ba2-cefe-44b9-b9f4-7ef818744434',
    'https:--www.uol.com.br', 'https://www.uol.com.br', 'https:///www.uol.com.br'
]'''

test_urls = [
    'https://www.example.com', 'http://example.org', 'http://192.168.1.1', 'http://localhost/test',
    'https://www.example.com:8080/path/to/resource', 'https://www.uol.com.br'
]

# Example usage
df_test_urls = pd.DataFrame({'url_column': test_urls})
result = check_url_format(df_test_urls, 'url_column')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

All 6 URL values are valid.
Last run on: 2024-04-07 21:40:27


27 Check Email Format

In [33]:

def check_email_format(df, column):
    # Calculate the total number of values in the column
    total_values_count = df[column].size

    # Handle blank/empty/null/NaN values first
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]

    # Then check email format
    result_email_format = DataQualityIssues.handle_email_format(df_filtered, column)

    error_summary_parts = []

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')

    if result_email_format['issue']:
        error_summary_parts.append(result_email_format['dq_issue'] + ':\n ' + result_email_format['error_message'] + '\n')

    return "Error(s) found: \n" + "\n".join(error_summary_parts) if error_summary_parts else f"All {total_values_count} email values are valid."

# Example usage

'''df_test = pd.DataFrame({
    'email_column': [
        'example.com', 'userexample.com', 'name.domain.com',  # Missing @ Symbol
        'user@.com', 'name@',  # Missing Domain
        'user name@example.com', 'user@ exam ple.com', 'user @example.com',  # Spaces in Email Address
        'user!@example.com', 'name#domain.com', 'user*name@example.com',  # Special Characters
        'user@@example.com', 'name@domain@domain.com',  # Multiple @ Symbols
        '@example.com', '@domain.com',  # Missing Username
        'user@example.c', 'name@domain.',  # Domain Extension Too Short or Missing
        'user..name@example.com', 'user@domain..com',  # Consecutive Dots
        'user@-example.com', 'user@domain--name.com',  # Dashes in Domain
        'user@[192.168.0.1]', 'name@[123.123.123.123]',  # IP Address in Domain
        'user[name]@example.com', 'name[123]@domain.com',  # Brackets in Local Part
        'a'*255 + '@example.com',  # Too Long Email Address
        '', '  ', None, 'null'
    ]
})'''

df_test = pd.DataFrame({
    'email_column': [
        'marcelo.valentimsilva@postgrad.curtin.edu.au', 'marcelovalentimsilva@gmail.com', 'marcelo_valentim@uol.com.br'
    ]
})

result = check_email_format(df_test, 'email_column')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

All 3 email values are valid.
Last run on: 2024-04-07 21:40:27


28 Check Binary Values

In [34]:
def check_binary_values(df, column):
    """
    Check if the values in the specified column conform to binary values and provide a frequency distribution.

    Parameters:
    - df (pd.DataFrame): The pandas DataFrame to analyze.
    - column (str): The name of the column with binary values.

    Returns:
    - str: A message indicating the result of the binary value checks.
    """
    # Calculate the total number of values in the column
    total_values_count = df[column].size
    
    # Handle blank/empty/null/NaN values first
    result_blank = DataQualityIssues.handle_blank_empty_null_nan(df, column)
    df_filtered = df[~df[column].apply(lambda x: pd.isnull(x) or str(x).strip() == '' or str(x).lower() == 'null')]

    # Then check binary format
    result_binary_format = DataQualityIssues.handle_binary_values(df_filtered, column)

    error_summary_parts = []

    if result_blank['issue']:
        error_summary_parts.append(result_blank['dq_issue'] + ':\n ' + result_blank['error_message'] + '\n')

    if result_binary_format['issue']:
        error_summary_parts.append(result_binary_format['dq_issue'] + ':\n ' + result_binary_format['error_message'] + '\n')

    # Frequency distribution calculation
    binary_counts = df_filtered[column].value_counts(dropna=False).reset_index()
    binary_counts.columns = ['Value', 'Frequency']
    # Sort by frequency and then alphabetically
    binary_counts = binary_counts.sort_values(by=['Frequency', 'Value'], ascending=[False, True])
    frequency_distribution_str = f"\nFrequency Distribution:\n{binary_counts.to_string(index=False)}\n"

    return ("Error(s) found: \n" + "\n".join(error_summary_parts) if error_summary_parts else f"All {total_values_count} binary values are valid.") + '\n'+ frequency_distribution_str


'''df_test = pd.DataFrame({
    'binary_column': ['1', '0', 'Yes', 'YES', 'y', 'no', 'true', 'False', 'Invalid', '2', 'Y', 0, 3, None, '', '?','   ',"T", 0.1, '-2', 'no']
})'''

df_test = pd.DataFrame({
    'binary_column': ['1', '0', 'Yes', 'YES', 'y', 'no', 'true', 'False', 'Y', 0, "T", 'no']
})

result = check_binary_values(df_test, 'binary_column')
print(result)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

All 12 binary values are valid.

Frequency Distribution:
Value  Frequency
   no          2
    0          1
    0          1
    1          1
False          1
    T          1
    Y          1
  YES          1
  Yes          1
 true          1
    y          1

Last run on: 2024-04-07 21:40:27


29 Analyse Data Quality

In [35]:
def analyse_data_quality(df, analysed_columns_df, desired_dataset_index):

    print(f"Start of Analysis on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    all_results_ordered = {}

    # Define a range for valid years and weeks
    min_valid_year = 1800
    max_valid_year = 2100
        
    categorical_threshold = 100

    column_order = analysed_columns_df[analysed_columns_df['index'] == desired_dataset_index].sort_values('ID')['Column'].tolist()

    keyword = "categorical"  # Keyword to search for in the "FinalFormat"

    # Find columns marked as "categorical" for the specified dataset index
    categorical_columns = analysed_columns_df[
        (analysed_columns_df['index'] == desired_dataset_index) & 
        (analysed_columns_df['FinalFormat'].str.contains(keyword, case=False, na=False))
    ]['Column'].tolist()

    # Iterate through the columns in the custom order and analyze them
    for column in column_order:
        # Ensure that the column exists in the DataFrame
        if column in dataset_df.columns:
            if column in categorical_columns:
                all_results_ordered[column] = check_if_categorical(dataset_df, column, categorical_threshold)
            else:
                # Check the format specified in the "AnalysedColumns" sheet
                format_in_sheet = analysed_columns_df[
                    (analysed_columns_df['index'] == desired_dataset_index) &
                    (analysed_columns_df['Column'] == column)
                ]['FinalFormat'].iloc[0]
                
                if pd.isna(format_in_sheet):
                    all_results_ordered[column] = "Target word not found, and Format not determined"
                else:
                    if "ID column" in format_in_sheet:
                        all_results_ordered[column] = {
                            "ID column format": check_id_attributes(dataset_df, column),
                        }
                    elif "numerical >= 0" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Numerical >=0 format": check_numerical_ge_zero(dataset_df, column),
                        }
                    elif "percentage" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Percentage format": check_numerical_between(dataset_df, column, 0, 100),
                        }
                    elif "age" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Age format": check_numerical_between(dataset_df, column, 0, 130),
                        }
                    elif "numerical between 0 and 24" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Numerical (between 0 and 24) format": check_numerical_between(dataset_df, column, 0, 24),
                        }
                    elif "numerical between 0 and 360" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Numerical (between 0 and 360) format": check_numerical_between(dataset_df, column, 0, 360),
                        }
                    elif "numerical between 0 and 60" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Numerical (between 0 and 60) format": check_numerical_between(dataset_df, column, 0, 60),
                        }
                    elif "numerical" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Numerical format": check_numerical(dataset_df, column),
                        }
                    elif "string" in format_in_sheet:
                        all_results_ordered[column] = {
                            "String format": check_string_content(dataset_df, column),
                        }
                    elif "datetime" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Datetime format": check_datetime(dataset_df, column),
                        }
                    elif "date" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Date format": check_date(dataset_df, column),
                        }
                    elif "time" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Time format": check_time(dataset_df, column),
                        }
                    elif "month" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Month format": check_month(dataset_df, column),
                        }
                    elif "year" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Year format": check_numerical_between(dataset_df, column, min_valid_year, max_valid_year),
                        }
                    elif "weekday" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Weekday format": check_weekday(dataset_df, column),
                        } 
                    elif "week" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Week format": check_numerical_between(dataset_df, column, 1, 53),
                        }
                    elif "day" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Day format" : check_numerical_between(dataset_df, column, 1, 366),
                        }
                    elif "model name" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Model Name format": check_model_name(dataset_df, column),
                        }
                    elif "name" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Name format": check_name(dataset_df, column),
                        }
                    elif "street" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Street format": check_street(dataset_df, column),
                        }
                    elif "city" in format_in_sheet:
                        all_results_ordered[column] = {
                            "City format": check_city(dataset_df, column),
                        }
                    elif "state" in format_in_sheet:
                        all_results_ordered[column] = {
                            "State format": check_state(dataset_df, column),
                        }
                    elif "country" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Country format": check_country(dataset_df, column),
                        }
                    elif "postal code" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Postal Code format": check_postal_code(dataset_df, column),
                        }  
                    elif "phone" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Phone format": check_phone_numbers(dataset_df, column),
                        }
                    elif "ph" in format_in_sheet:
                        all_results_ordered[column] = {
                            "pH format": check_numerical_between(dataset_df, column, 0, 14),
                        }    
                    elif "latitude" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Latitude format": check_numerical_between(dataset_df, column, -90, 90),
                        }
                    elif "longitude" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Longitude format": check_numerical_between(dataset_df, column, -180, 180),
                        }
                    elif "normalized" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Normalized format": check_numerical_between(dataset_df, column, 0, 1),
                        }
                    elif "IP format" in format_in_sheet:
                        all_results_ordered[column] = {
                            "IP Address format": check_ip_format(df, column),
                        }
                    elif "URL format" in format_in_sheet:
                        url_result = check_url_format(df, column)
                        all_results_ordered[column] = {
                            "URL format": url_result
                        }
                    elif "E-mail format" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Email format": check_email_format(df, column),
                        }
                    elif "binary" in format_in_sheet:
                        all_results_ordered[column] = {
                            "Binary format": check_binary_values(df, column),
                        }
                    else:
                        print(f"Column '{column}' not found in the dataset.")
        
    # Loop to print the results in the desired format
    for column, analysis_result in all_results_ordered.items():
        # Fetch the 'SourceKeyword' for this column from 'analysed_columns_df'
        SourceKeyword_value = analysed_columns_df.loc[(analysed_columns_df['index'] == desired_dataset_index) & (analysed_columns_df['Column'] == column), 'SourceKeyword'].iloc[0]
        # Check if 'SourceKeyword' is not NaN before converting to lower case
        if pd.notna(SourceKeyword_value) and SourceKeyword_value.lower() != column.lower():
            print(f"{column} ({SourceKeyword_value}):")
        else:
            print(f"{column}:")
            
        if isinstance(analysis_result, pd.DataFrame):
            print(analysis_result.to_string(index=False))
        elif isinstance(analysis_result, dict):
            for key, value in analysis_result.items():
                print(f"  {key}: {value}")
        elif isinstance(analysis_result, str):  # Added condition to handle string results
            print(f"  {analysis_result}")
        else:
            print("No results available.")
        
        print()  # Add an empty line for separation

analyse_data_quality(dataset_df, analysed_columns_df, desired_dataset_index)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Start of Analysis on: 2024-04-07 21:40:27


instant:
  Numerical >=0 format: All 17379 values are numerical and greater or equal to 0 in the range (1:17379).

dteday (date):
  Date format: All 17379 date values are valid in the YYYYMMDD format in the range 2011-01-01 to 2012-12-31.

season:
  All 17379 values are correctly categorical.

Categorical format with 4 unique values:
 Category  Frequency
        3       4496
        2       4409
        1       4242
        4       4232

yr (nominal):
  All 17379 values are correctly categorical.

Categorical format with 2 unique values:
 Category  Frequency
        1       8734
        0       8645

mnth (month):
  Month format: All 17379 month values are valid.

Frequency Distribution:
    Month  Frequency
      May       1488
     July       1488
 December       1483
   August       1475
    March       1473
  October       1451
     June       1440
    April       1437
September       1437
 November       1437
  January       1429
 February       1341

hr (hour):
  Numerical (betwe

EXIT


In [36]:
raise SystemExit

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


30 Load CHANGED dataset

In [None]:
# Load the CHANGED dataset from changed_dataset_local_path 
#df = pd.read_csv('your_file.csv', na_values=['', ' ', 'null', 'none', 'None', 'Nan', 'NULL'], keep_default_na=False)

#changed_df = pd.read_csv(changed_dataset_local_path, header=None, delimiter=',', na_values=['', ' ', 'null', 'none', 'None', 'Nan', 'NULL'], keep_default_na=False)
#changed_df = pd.read_csv(changed_dataset_local_path, header=None, delimiter=',', na_values=[], keep_default_na=False)

#changed_df = pd.read_csv(changed_dataset_local_path, delimiter=';', na_values=[], keep_default_na=False) #186, 222
#changed_df = pd.read_excel(changed_dataset_local_path, header=header, parse_dates=parse_dates, na_values=[], keep_default_na=False) #602 350
#changed_df.head(10)

changed_df = pd.read_csv(changed_dataset_local_path, header=None, delimiter=',', na_values=[], keep_default_na=False)
changed_df.head(10)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
changed_df

Last run on: 2024-03-12 19:34:33


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,@,2,,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,%%%,5,12,-3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,,AA,11,6,-8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,b,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,a3,1,1,8,6,6,6,6,5,9,1,7,5,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,D,2,2,3,3,2,7,7,7,6,6,6,4,2,8,3,7
19996,C,7,10,8,8,4,4,8,6,9,12,9,13,2,9,3,7
19997,T,6,9,6,7,5,6,11,3,7,11,9,5,2,12,2,4
19998,S,2,3,4,2,1,8,7,2,6,10,6,8,1,9,5,8


In [None]:
dataset_df = changed_df


# Call the function to assign the column names
dataset_df = assign_column_names(analysed_columns_df, desired_dataset_index, dataset_df, dataset_name)


from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

dataset_df.head(10)



INFO:root:Successfully assigned column names to the dataset 'Letter Recognition' for index 58


Last run on: 2024-03-12 19:34:44


Unnamed: 0,lettr,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,@,2,,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,%%%,5,12.0,-3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,,AA,11.0,6,-8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,b,11.0,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1.0,a3,1,1,8,6,6,6,6,5,9,1,7,5,10
5,S,4,11.0,5,8,3,8,8,6,9,5,6,6,0,8,9,7
6,B,4,2.0,5,4,4,8,7,6,6,7,6,6,2,8,7,10
7,A,1,1.0,3,2,1,8,2,2,2,8,2,8,1,6,2,7
8,J,2,2.0,4,4,2,10,6,2,6,12,4,8,1,6,1,7
9,M,11,15.0,13,9,7,13,2,6,2,12,1,9,8,1,1,8


31 Run CHANGED dataset

In [None]:
#print(changed_df, analysed_columns_df, desired_dataset_index)

analyse_data_quality(changed_df, analysed_columns_df, desired_dataset_index)

from datetime import datetime
print(f"Last run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Start of Analysis on: 2024-03-12 19:34:54
lettr (letter):
  String format: Error(s) found: 
DQI #1 (Missing Data - Completeness):
 1 Blank/Empty/Null/NaN value(s) at index(es): [(2, '')]

String range (lexicographical): (%%% : Z)

x-box (x):
  Numerical format: Error(s) found: 
DQI #17 (Wrong Data Type - Consistency):
 2 Non-numeric value(s) at index(es): [(2, 'AA'), (3, 'b')]

Range of values: (0.0:15.0).

y-box (y):
  Numerical format: Error(s) found: 
DQI #1 (Missing Data - Completeness):
 1 Blank/Empty/Null/NaN value(s) at index(es): [(0, '')]


Range of values: (0:15).

width:
  Numerical >=0 format: Error(s) found: 
DQI #15 (Domain Violation - Accuracy):
 1 Negative value(s) at index(es): [(1, '-3')]

DQI #17 (Wrong Data Type - Consistency):
 1 Non-numeric value(s) at index(es): [(4, 'a3')]

Range of values: (-3.0:15.0).

high (height):
  Numerical >=0 format: Error(s) found: 
DQI #15 (Domain Violation - Accuracy):
 1 Negative value(s) at index(es): [(2, -8)]

Range of values: (-