# This Python notebook performs Exploratory Data Analysis (EDA) on the ContactPoint tables from TED data.


In [3]:
from pathlib import Path
from ted_data_eu.services.etl_pipelines.postgres_etl_pipeline import POSTGRES_URL, SQLALCHEMY_ISOLATION_LEVEL
import sqlalchemy
import pandas as pd

# Location of the CSV snapshot that the query result is written to and read back from.
CSV_FILE_PATH = Path("/data") / "eda_data.csv"

# Contact point attributes together with their e-mail and internet address rows.
# NOTE(review): both joins are plain (inner) JOINs, so contact points without an
# e-mail or without an internet address are not part of the result - confirm intended.
DB_QUERY = """
            SELECT
                contact_point."ContactPointId",
                contact_point."ContactPointFax",
                contact_point."ContactPointTelephone",
                contact_point."ContactPointName",
                contact_email."Email",
                contact_internet_addr."InternetAddress"
            FROM public."ContactPoint" contact_point
            JOIN public."ContactPointEmail" contact_email
                ON contact_point."ContactPointId" = contact_email."ContactPointId"
            JOIN public."ContactPointInternetAddress" contact_internet_addr
                ON contact_point."ContactPointId" = contact_internet_addr."ContactPointId"
            """

# Every selected column is read back from the CSV as a string.
COLUMNS = dict.fromkeys(
    (
        'ContactPointId',
        'ContactPointFax',
        'ContactPointTelephone',
        'ContactPointName',
        'Email',
        'InternetAddress',
    ),
    str,
)

In [None]:
# Run the query against PostgreSQL and snapshot the result as a CSV file.
sql_engine = sqlalchemy.create_engine(POSTGRES_URL, echo=False, isolation_level=SQLALCHEMY_ISOLATION_LEVEL)
with sql_engine.connect() as sql_connection:
    df = pd.read_sql(DB_QUERY, sql_connection)
df.to_csv(CSV_FILE_PATH, index=False)

# Read the snapshot back with every column typed as declared in COLUMNS.
data_table = pd.read_csv(CSV_FILE_PATH, dtype=COLUMNS)
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
data_table.info()

In [None]:
# Preview the first rows of the table that was read back from the CSV snapshot.
print(data_table.head())

In [None]:
import plotly.graph_objects as go
import re
import phonenumbers


def add_sphere_trace(fig, x, color, label):
    """
    Adds one labelled marker for a single indicator to the figure
    :param fig: the figure that receives the marker trace
    :param x: indicator value, used as the x coordinate (y is fixed at 0)
    :param color: fill color of the marker
    :param label: legend name of the trace
    :return: None, the trace is added to the given figure
    """
    marker_style = dict(
        size=10,
        symbol='circle',
        color=color,
        line=dict(color='black', width=1),
        opacity=0.7,
    )
    indicator = go.Scatter(x=[x], y=[0], mode='markers', marker=marker_style, name=label)
    fig.add_trace(indicator)

def generate_histogram(data, column, nbinsx):
    """
    Builds and shows the histogram of one field together with its summary indicators
    :param data: numeric pandas Series to plot (for text columns, the length of each value)
    :param column: name of the column, used in the title
    :param nbinsx: number of bins
    :return: None, the figure is displayed with fig.show()
    """
    fig = go.Figure()

    std = data.std()
    average = data.mean()
    percentile1 = data.quantile(0.01)
    percentile99 = data.quantile(0.99)
    # Named minimum/maximum so the built-in min()/max() are not shadowed.
    minimum = data.min()
    maximum = data.max()
    median = data.median()
    # One z-score per record. It is a vector, not a single indicator, so only
    # its range is reported instead of plotting it as one marker.
    z_score = (data - average) / std
    iqr = data.quantile(0.75) - data.quantile(0.25)

    fig.add_trace(go.Histogram(x=data, nbinsx=nbinsx))
    std_line_color = 'white'
    std_line_width = 6
    # NOTE(review): this draws a horizontal line at y=std from x=0 to x=nbinsx.
    # std is expressed in x-axis units while y is a record count - confirm intended.
    fig.add_shape(
        type="line",
        x0=0,
        y0=std,
        x1=nbinsx,
        y1=std,
        line=dict(color=std_line_color, width=std_line_width, dash="dash"),
        layer='below'
    )
    add_sphere_trace(fig, average, 'red', 'Average')
    add_sphere_trace(fig, std, 'white', 'STD')
    add_sphere_trace(fig, percentile1, 'blue', 'Percentile 1')
    add_sphere_trace(fig, percentile99, 'blue', 'Percentile 99')
    add_sphere_trace(fig, minimum, 'green', 'Min')
    add_sphere_trace(fig, maximum, 'green', 'Max')
    add_sphere_trace(fig, median, 'orange', 'Median')
    add_sphere_trace(fig, iqr, 'yellow', 'IQR')

    title = f'<b>Distribution of the length of the string for {column}</b><br>'
    title += f'Std: {std:.2f}, Average: {average:.2f}, Percentile 1: {percentile1:.2f}, Percentile 99: {percentile99:.2f}<br>'
    title += f'Min: {minimum:.2f}, Max: {maximum:.2f}, Median: {median:.2f}, IQR: {iqr:.2f}<br>'
    # Summarise the z-scores by their range; printing the whole array made the
    # title unreadable.
    title += f'Z_Score range: [{z_score.min():.2f}, {z_score.max():.2f}]<br>'
    fig.update_layout(
        title=title,
        xaxis_title='The length of the string',
        yaxis_title='Number of records',
        title_font=dict(size=14)
    )
    fig.show()

def is_valid_email(email):
    """
    Checks if an email has a correct pattern
    :param email: email to check; any non-string value (e.g. None or NaN) is treated as invalid
    :return: a boolean result based on email pattern
    """
    # re.fullmatch raises TypeError on non-strings, which would abort a whole
    # Series.apply() as soon as one value is missing.
    if not isinstance(email, str):
        return False
    pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
    return bool(re.fullmatch(pattern, email))

def is_valid_phone_number(phone_number):
    """
    Checks if a phone number has a correct pattern
    :param phone_number: phone number to check; any non-string value (e.g. None or NaN) is treated as invalid
    :return: a boolean result based on phone pattern
    """
    # Only NumberParseException is handled below, so reject non-string values
    # (such as NaN) up front instead of letting another exception escape.
    if not isinstance(phone_number, str):
        return False
    try:
        # Parsed without a default region (the region argument is None).
        parsed_number = phonenumbers.parse(phone_number, None)
        return phonenumbers.is_valid_number(parsed_number)
    except phonenumbers.phonenumberutil.NumberParseException:
        return False

def is_valid_fax_number(fax_number):
    """
    Checks if a fax number has a correct pattern
    :param fax_number: fax number to check; any non-string value (e.g. None or NaN) is treated as invalid
    :return: a boolean result based on fax pattern
    """
    # A regex match on a non-string raises TypeError; treat such values as invalid.
    if not isinstance(fax_number, str):
        return False
    # A leading '+' followed by one or more digits, whitespace, '/' or '-'.
    pattern = r"^\+[\d\s/-]+$"
    # fullmatch keeps this validator consistent with is_valid_email.
    return bool(re.fullmatch(pattern, fax_number))

# Plot one histogram per column: string lengths for text (object) columns,
# raw values for int64 columns. Columns of any other dtype are skipped.
for column in df.columns:
    if df[column].dtype == 'object':
        # Keep the lengths in a local Series. Writing them into df as a
        # 'number_of_characters' column would modify the frame while its
        # columns are being iterated and leave a helper column behind.
        string_lengths = df[column].str.len()
        generate_histogram(string_lengths, column, nbinsx=100)
    elif df[column].dtype == 'int64':
        generate_histogram(df[column], column, nbinsx=100)

In [None]:
# Fields for which the data completeness KPI is reported.
columns_to_calculate = [
    "ContactPointFax",
    "ContactPointTelephone",
    "Email",
    "ContactPointName",
    "InternetAddress",
]

def calculate_data_completeness(column, data=None):
    """
    Calculates the data completeness for a specific column in a DataFrame
    :param column: name of the column for which data completeness is calculated
    :param data: DataFrame to inspect; when omitted, the notebook-level df is used
    :return: data completeness percentage as a floating-point number (NaN when the column has no records)
    """
    if data is None:
        data = df
    values = data[column]
    number_of_records = len(values)
    # Completeness is undefined without records; avoid dividing by zero.
    if number_of_records == 0:
        return float('nan')
    number_of_records_not_null = values.notnull().sum()
    return float(number_of_records_not_null) / number_of_records * 100

# Report the completeness KPI for the selected fields and the validity KPI for
# the Email, ContactPointTelephone and ContactPointFax fields.
# For validity, values that are not strings (e.g. missing values) count as
# invalid and stay in the denominator; they are never passed to the validators,
# so a single missing value cannot abort the calculation.
for column in df.columns:
    if column in columns_to_calculate:
        data_completeness = calculate_data_completeness(column)
        print(f"The KPI data completeness in the field {column} is {data_completeness:.2f}%.")

    if column == 'Email':
        email_validation = df[column].apply(lambda x: isinstance(x, str) and is_valid_email(x))
        valid_email_percentage = email_validation.mean() * 100
        print(f"The KPI validity for Email field is {valid_email_percentage:.2f}%")

    elif column == 'ContactPointTelephone':
        phone_validation = df[column].apply(lambda x: isinstance(x, str) and is_valid_phone_number(x))
        valid_telephone_percentage = phone_validation.mean() * 100
        print(f"The KPI validity for ContactPointTelephone field is {valid_telephone_percentage:.2f}%")

    elif column == 'ContactPointFax':
        total_number = len(df[column])
        fax_validation = df[column].apply(lambda x: isinstance(x, str) and is_valid_fax_number(x)).sum()
        valid_fax_percentage = (fax_validation / total_number) * 100
        print(f"The KPI validity for ContactPointFax field is {valid_fax_percentage:.2f}%")