# This Python notebook performs Exploratory Data Analysis (EDA) on fields from TED data.


In [17]:
from pathlib import Path
from ted_data_eu.services.etl_pipelines.postgres_etl_pipeline import POSTGRES_URL, SQLALCHEMY_ISOLATION_LEVEL
import sqlalchemy
import pandas as pd


# SQL used to load the contact-point fields analysed in this notebook.
# It is a plain string on purpose: there are no placeholders to interpolate, and an
# f-string would start treating any future "{" / "}" in the SQL as a replacement field.
# NOTE(review): both JOINs are inner joins, so contact points without a matching row in
# "ContactPointEmail" or "ContactPointInternetAddress" are not returned — confirm this is
# intended, since it affects the completeness KPIs computed later.
DB_QUERY = """
            SELECT
                contact_point."ContactPointId",
                contact_point."ContactPointFax",
                contact_point."ContactPointTelephone",
                contact_point."ContactPointName",
                contact_email."Email",
                contact_internet_addr."InternetAddress"
            FROM public."ContactPoint" contact_point
            JOIN public."ContactPointEmail" contact_email
                ON contact_point."ContactPointId" = contact_email."ContactPointId"
            JOIN public."ContactPointInternetAddress" contact_internet_addr
                ON contact_point."ContactPointId" = contact_internet_addr."ContactPointId"
            """

# Text fields that are analysed in this notebook, each mapped to the Python type str.
COLUMNS = dict.fromkeys(
    (
        'ContactPointFax',
        'ContactPointTelephone',
        'ContactPointName',
        'Email',
        'InternetAddress',
    ),
    str,
)

In [19]:
def create_csv_file_path():
    """
    Creates a directory named "data" in the current working directory (if it doesn't exist already)
    and returns the path to the CSV file "eda_data.csv" within that directory
    :return: the path to the CSV file
    :raises FileExistsError: if "data" exists but is not a directory
    """
    data_dir = Path("data")
    # exist_ok=True replaces the separate exists() check, which could race with another
    # process creating the directory between the check and the mkdir() call.
    data_dir.mkdir(exist_ok=True)
    return data_dir / "eda_data.csv"

# Path of the CSV export; evaluating this also creates the "data" directory if it is missing.
CSV_FILE_PATH = create_csv_file_path()

In [20]:
# Load the query result from Postgres into a DataFrame and store a CSV copy of it.
sql_engine = sqlalchemy.create_engine(
    POSTGRES_URL,
    echo=False,
    isolation_level=SQLALCHEMY_ISOLATION_LEVEL,
)
with sql_engine.connect() as sql_connection:
    df = pd.read_sql(DB_QUERY, sql_connection)
df.to_csv(CSV_FILE_PATH, index=False)

In [None]:
import plotly.graph_objects as go
import re
import phonenumbers


def add_sphere_trace(fig, x, color, label):
    """
    Adds one circular marker that represents an indicator to the figure
    :param fig: the figure that receives the marker trace
    :param x: calculated indicator, used as the x position of the marker (y is always 0)
    :param color: the fill color of the marker
    :param label: the legend name of the marker representing the indicator
    :return: None; the trace is added to the figure through fig.add_trace
    """
    marker_style = dict(
        size=10,
        symbol='circle',
        color=color,
        line=dict(color='black', width=1),
        opacity=0.7
    )
    indicator_trace = go.Scatter(x=[x], y=[0], mode='markers', marker=marker_style, name=label)
    fig.add_trace(indicator_trace)

def generate_histogram(data, column, nbinsx):
    """
    Builds and shows the histogram of one field together with its summary indicators
    :param data: numeric pandas Series to plot (the callers in this notebook pass string
                 lengths or integer columns)
    :param column: name of the column, used in the figure title
    :param nbinsx: number of bins
    :return: None; the figure is displayed with fig.show()
    """
    fig = go.Figure()

    std = data.std()
    average = data.mean()
    percentile1 = data.quantile(0.01)
    percentile99 = data.quantile(0.99)
    # Named minimum/maximum so that the builtins min() and max() are not shadowed.
    minimum = data.min()
    maximum = data.max()
    median = data.median()
    iqr = data.quantile(0.75) - data.quantile(0.25)

    # The z-score is one value per record, not a single indicator, so it cannot be drawn
    # as one marker. It is summarised instead: largest absolute z-score and the number of
    # records further than `outlier_threshold` standard deviations from the average.
    outlier_threshold = 3
    abs_z_score = ((data - average) / std).abs()
    max_abs_z_score = abs_z_score.max()
    z_score_outliers = int((abs_z_score > outlier_threshold).sum())

    fig.add_trace(go.Histogram(x=data, nbinsx=nbinsx))
    std_line_color = 'white'
    std_line_width = 6
    # NOTE(review): this dashed line is drawn at y=std and spans x from 0 to nbinsx, i.e. it
    # uses a value measured on the x axis as a y position and a bin count as an x position —
    # confirm this is the intended visual.
    fig.add_shape(
        type="line",
        x0=0,
        y0=std,
        x1=nbinsx,
        y1=std,
        line=dict(color=std_line_color, width=std_line_width, dash="dash"),
        layer='below'
    )

    # One marker per scalar indicator, in the same order as the legend is expected to show.
    indicators = (
        (average, 'red', 'Average'),
        (std, 'white', 'STD'),
        (percentile1, 'blue', 'Percentile 1'),
        (percentile99, 'blue', 'Percentile 99'),
        (minimum, 'green', 'Min'),
        (maximum, 'green', 'Max'),
        (median, 'orange', 'Median'),
        (iqr, 'yellow', 'IQR'),
    )
    for value, color, label in indicators:
        add_sphere_trace(fig, value, color, label)

    title = f'<b>Distribution of the length of the string for {column}</b><br>'
    title += f'Std: {std:.2f}, Average: {average:.2f}, Percentile 1: {percentile1:.2f}, Percentile 99: {percentile99:.2f}<br>'
    title += f'Min: {minimum:.2f}, Max: {maximum:.2f}, Median: {median:.2f}, IQR: {iqr:.2f}<br>'
    title += f'Z_Score: max |z| = {max_abs_z_score:.2f}, records with |z| > {outlier_threshold}: {z_score_outliers}<br>'
    fig.update_layout(
        title=title,
        xaxis_title='The length of the string',
        yaxis_title='Number of records',
        title_font=dict(size=14)
    )
    fig.show()

# Show one histogram per analysed column: string lengths for the text fields listed in
# COLUMNS, raw values for int64 columns. Other columns are skipped.
for column in df.columns:
    column_dtype = df[column].dtype
    if column in COLUMNS and column_dtype == 'object':
        # The lengths are stored in a helper column that is overwritten on every iteration.
        df['number_of_characters'] = df[column].str.len()
        generate_histogram(df['number_of_characters'], column, nbinsx=100)
        continue
    if column_dtype == 'int64':
        generate_histogram(df[column], column, nbinsx=100)

In [None]:

def calculate_data_completeness(column, data=None):
    """
    Calculates the data completeness for a specific column in the DataFrame
    :param column: name of the column for which data completeness is calculated
    :param data: optional DataFrame to analyse; when omitted the notebook-level df is used
    :return: percentage of non-null records as a floating-point number
             (NaN when the DataFrame has no records)
    """
    # The global df is only looked up when no explicit DataFrame is passed.
    frame = df if data is None else data
    number_of_records = len(frame[column])
    if number_of_records == 0:
        # Completeness is undefined without records; avoid the 0 / 0 division warning.
        return float('nan')
    number_of_records_not_null = frame[column].notnull().sum()
    data_completeness = (number_of_records_not_null / number_of_records) * 100
    return data_completeness

def calculate_data_consistency(column, data=None):
    """
    Calculates the data consistency for a specific column in the DataFrame.
    A record counts as consistent when the length of its string lies between the 1st and
    the 99th percentile of all string lengths in the column (both bounds included).
    Records without a value have no length and therefore count as not consistent.
    Side effect: the helper columns "length <column>" and "threshold <column>" are written
    to the analysed DataFrame.
    :param column: name of the column for which data consistency is calculated
    :param data: optional DataFrame to analyse; when omitted the notebook-level df is used
    :return: data consistency percentage as a floating-point number
             (NaN when the DataFrame has no records)
    """
    # The global df is only looked up when no explicit DataFrame is passed.
    frame = df if data is None else data
    length_column = f'length {column}'
    threshold_column = f'threshold {column}'

    number_of_records = len(frame[column])
    frame[length_column] = frame[column].str.len()
    percentile_1 = frame[length_column].quantile(0.01)
    percentile_99 = frame[length_column].quantile(0.99)
    frame[threshold_column] = frame[length_column].between(percentile_1, percentile_99)
    if number_of_records == 0:
        # Consistency is undefined without records; avoid the 0 / 0 division warning.
        return float('nan')
    number_of_items = frame[threshold_column].sum()
    data_consistency = (number_of_items / number_of_records) * 100
    return data_consistency

def calculate_data_uniqueness(column, data=None):
    """
    Calculates the data uniqueness for a specific column in the DataFrame.
    The number of distinct non-null values is divided by the total number of records.
    :param column: name of the column for which data uniqueness is calculated
    :param data: optional DataFrame to analyse; when omitted the notebook-level df is used
    :return: data uniqueness percentage as a floating-point number
             (NaN when the DataFrame has no records)
    """
    # The global df is only looked up when no explicit DataFrame is passed.
    frame = df if data is None else data
    number_of_records = len(frame[column])
    if number_of_records == 0:
        # Uniqueness is undefined without records; avoid a ZeroDivisionError.
        return float('nan')
    number_of_unique_values = frame[column].nunique()
    data_uniqueness = (number_of_unique_values / number_of_records) * 100
    return data_uniqueness

def calculate_data_completness_int(column, data=None):
    """
    Calculates the data completeness for a specific column type int in the DataFrame.
    A record counts as complete when its value is neither null nor zero.
    :param column: name of the column for which data completeness is calculated
    :param data: optional DataFrame to analyse; when omitted the notebook-level df is used
    :return: data completeness percentage as a floating-point number
             (NaN when the DataFrame has no records)
    """
    # The global df is only looked up when no explicit DataFrame is passed.
    frame = df if data is None else data
    values = frame[column]
    total_values = len(values)
    if total_values == 0:
        # Completeness is undefined without records; avoid the 0 / 0 division warning.
        return float('nan')
    # A null compares as "!= 0", so nulls must be excluded explicitly; otherwise a column
    # that contains only missing values would be reported as 100% complete.
    num_non_zeros = (values.notnull() & (values != 0)).sum()
    percentage_non_zeros = (num_non_zeros / total_values) * 100
    return percentage_non_zeros


def is_valid_email(email):
    """
    Checks if an email has a correct pattern
    :param email: email as str, or as bytes (decoded as UTF-8 before matching)
    :return: True or False for text input depending on whether the whole value matches the
             pattern; None for any other type (for example a missing value)
    """
    if isinstance(email, bytes):
        # The pattern below is a str pattern; matching it against bytes raises TypeError.
        email = email.decode('utf-8', errors='replace')
    if not isinstance(email, str):
        return None
    pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
    return bool(re.fullmatch(pattern, email))


def is_valid_phone_number(phone_number):
    """
    Checks if a phone number is valid according to the phonenumbers library.
    The number is parsed without a default region.
    :param phone_number: phone number as str, or as bytes (decoded as UTF-8 before parsing)
    :return: True when the value can be parsed and is a valid number; False otherwise,
             including for missing or non-text values
    """
    if isinstance(phone_number, bytes):
        phone_number = phone_number.decode('utf-8', errors='replace')
    if not isinstance(phone_number, str):
        # Missing values (None, NaN) and other non-text values can never be valid numbers;
        # returning early keeps them from reaching the parser, which only handles text.
        return False
    try:
        parsed_number = phonenumbers.parse(phone_number, None)
        return phonenumbers.is_valid_number(parsed_number)
    except phonenumbers.phonenumberutil.NumberParseException:
        return False

def is_valid_fax_number(fax_number):
    """
    Checks if a fax number has a correct pattern: a leading "+" followed by one or more
    digits, whitespace characters, "/" or "-"
    :param fax_number: fax number as str, or as bytes (decoded as UTF-8 before matching)
    :return: True or False for text input depending on whether the whole value matches the
             pattern; None for any other type (for example a missing value)
    """
    if isinstance(fax_number, bytes):
        # The pattern below is a str pattern; matching it against bytes raises TypeError.
        fax_number = fax_number.decode('utf-8', errors='replace')
    if not isinstance(fax_number, str):
        return None
    pattern = r"^\+[\d\s/-]+$"
    # fullmatch keeps this check in line with is_valid_email.
    return bool(re.fullmatch(pattern, fax_number))

# Print the KPIs of every analysed column. Columns that contain at least one string get
# completeness, consistency and uniqueness plus every validity check whose share of matching
# values is above 10%; columns without any string only get the int completeness.
for column in df.columns:
    if column not in COLUMNS:
        continue

    contains_text = df[column].apply(lambda x: isinstance(x, str) and not pd.isna(x)).any()
    if not contains_text:
        data_completeness_int = calculate_data_completness_int(column)
        print(f"The KPI data completeness in the field {column} is {data_completeness_int:.2f}%.")
        continue

    data_completeness = calculate_data_completeness(column)
    print(f"The KPI data completeness in the field {column} is {data_completeness:.2f}%.")
    data_consistency = calculate_data_consistency(column)
    print(f"The KPI data consistency in the field {column} is {data_consistency:.2f}%.")
    data_uniqueness = calculate_data_uniqueness(column)
    print(f"The KPI data uniqueness in the field {column} is {data_uniqueness:.2f}%.")

    # Each validity check runs on every text column; its result is printed only when more
    # than 10% of the values match.
    email_validation = df[column].apply(is_valid_email)
    valid_email_percentage = email_validation.mean() * 100
    if valid_email_percentage > 10:
        print(f"The KPI data validity in the field {column} is {valid_email_percentage:.2f}%.")

    phone_validation = df[column].apply(is_valid_phone_number)
    valid_telephone_percentage = phone_validation.mean() * 100
    if valid_telephone_percentage > 10:
        print(f"The KPI data validity in the field {column} is {valid_telephone_percentage:.2f}%")

    # The fax check sees str(x), so missing values are tested as text and count as invalid.
    fax_validation = df[column].apply(lambda x: is_valid_fax_number(str(x))).sum()
    total_number = len(df[column])
    valid_fax_percentage = (fax_validation / total_number) * 100
    if valid_fax_percentage > 10:
        print(f"The KPI data validity in the field {column} is {valid_fax_percentage:.2f}%")

