# Info Parameters

## Packages

In [1]:
import sqlite3
import pandas as pd
import re
from unidecode import unidecode
import matplotlib.pyplot as plt

## Code (functions)

In [2]:
# Maps each canonical parameter name (the key) to the spellings that should be
# folded into it. Entries are compared against the output of
# `normalize_parameter_name`, so every entry must already be in normalized form:
# lowercase ASCII letters and digits only (no hyphens, spaces or punctuation).
# Each canonical name is also listed among its own synonyms, so a table that
# has already been relabelled still matches when the notebook is re-run.
#
# NOTE(review): a few entries look pharmacologically distinct from their group
# (trough concentrations under 'cmax'; rate constants, 'excretion' under
# 'clearance'; 'aucratio' under 'auc'). They are kept as-is -- confirm that
# merging them is intended.
synonym_groups = {
    'halflife': [
        'halflife',
        'eliminationhalflife',
        't12',
        'terminalhalflife',
        'terminalhalflifet12',
        'halflive',
        't12beta',
        't12alpha',
        'halflifet12',
        'biologicalhalflife',
        'eliminationt12',
        'eliminationhalflifet12',
        'eliminationhalflive',
        'terminaleliminationhalflifet12',
        'terminalplasmahalflife',
        'apparentterminalhalflife',
        'serumeliminationhalflife',
        'plasmahalflifet12',
        'plasmahalflife',
    ],
    'cmax': [
        'cmax',
        'maximumconcentration',
        'peakplasmaconcentration',
        'maximumplasmaconcentration',
        'peakplasmalevel',
        'peakconcentration',
        'maximalplasmaconcentration',
        'maximalconcentration',
        'maximalplasmaconcentrationscmax',
        'troughconcentration',
        'plasmacmax',
        'maximumplasmaconcentrationcmax',
        'maximalplasmaconcentrationcmax',
        'ctrough',
        'troughplasmaconcentration',
    ],
    'timetocmax': [
        'timetomaximumplasmaconcentration',
        'timetomaximumconcentration',
        'timetocmax',
        'timetocmaxtmax',
        'timetopeakplasmaconcentration',
        'timetopeakconcentration',
        'timetoreachcmaxtmax',
        'peaktime',
    ],
    'cssmax': [
        # The hyphenated spelling 'steady-stateplasmaconcentration' was dropped:
        # normalization strips hyphens, so it could never match.
        'cssmax',
        'steadystateplasmaconcentration',
    ],
    'auc': [
        'auc',
        'auc0infinity',
        'auc0t',
        'auc0',
        'aucratio',
        'auc024',
        'auc024h',
        'areaunderthecurve',
        'areaundertheconcentrationtimecurve',
        'areaundertheplasmaconcentrationtimecurve',
        'areaundertheplasmaconcentrationtimecurveauc',
        'areaunderthecurveauc',
        'auclast',
        'aucinf',
        'auc012h',
        'auct',
        'aucinfinity',
        'auc08',
        'auc048',
    ],
    'clearance': [
        'clearance',
        'totalbodyclearance',
        'plasmaclearance',
        'systemicclearance',
        'apparentclearance',
        'apparentoralclearance',
        'clearancerate',
        'renalclearance',
        'totalclearance',
        'excretion',
        'eliminationrateconstant',
        'totalbodyclearancecl',
        'renalclearanceclr',
        'clearanceclf',
        'metabolicclearance',
        'bloodclearance',
        'hepaticclearance',
        'eliminationclearance',
        'metabolicclearancerate',
        'absorptionrateconstantka',
        'nonrenalclearance',
        'plasmaclearancecl',
    ],
    'bioavailability': [
        'bioavailability',
        'absolutebioavailability',
        'oralbioavailability',
        'relativebioavailability',
        'systemicavailability',
        'absoluteoralbioavailability',
        'biologicalavailability',
        'bioavailabilitie',
        'bioavailabilities',
        'relativebioavailabilitie',
        'bioavailabilityf',
        'absolutebioavailabilitie',
        'relativeoralbioavailability',
        'bioavailable',
    ],
    'maximaltolerateddose': [
        'mtd',
        'maximaltolerateddose',
    ],
}



## Main

In [4]:
def normalize_parameter_name(parameter):
    """
    Normalize a parameter name into a compact, comparable key.

    Steps, in this order:
    1. Return an empty string for ``None`` or an empty value (e.g. a SQL NULL cell);
       any other non-string value is converted with ``str()`` first.
    2. Transliterate special characters to ASCII with ``unidecode``.
    3. Convert to lowercase.
    4. Strip every trailing "s" from the END OF THE WHOLE STRING (not from each word).
       This happens before punctuation is removed, so an "s" that is followed by
       punctuation or whitespace is kept.
    5. Remove every character that is not a lowercase ASCII letter or a digit
       (this covers hyphens and spaces as well).

    Parameters:
    parameter (str | None): The input parameter name to be normalized.

    Returns:
    str: The normalized parameter name; "" when there is nothing to normalize.

    Example:
    >>> normalize_parameter_name("t1/2 and t ½")
    't12andt12'
    >>> normalize_parameter_name("Example-Parameter-s")
    'exampleparameter'
    >>> normalize_parameter_name("Half-lives")
    'halflive'
    >>> normalize_parameter_name(None)
    ''
    """
    # Missing values carry no name; report "no match" instead of crashing in unidecode.
    if parameter is None:
        return ""
    text = parameter if isinstance(parameter, str) else str(parameter)
    if not text:
        return ""

    # Transliterate to ASCII and lowercase.
    normalized = unidecode(text).lower()

    # Crude de-pluralisation: drops all trailing "s" of the string as a whole.
    normalized = normalized.rstrip("s")

    # Keep letters and digits only. The text is already lowercase, so [a-z0-9]
    # is sufficient, and hyphens/spaces need no separate replace() pass.
    return re.sub(r"[^a-z0-9]", "", normalized)
def _quote_identifier(name):
    """Return `name` as a double-quoted SQLite identifier, doubling embedded quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def _relabel_parameter_columns(conn, table_name, synonym_groups):
    """
    Relabel one table in place over an open connection; return the set of groups found.

    For every column whose name starts with "Parameter", each distinct cell value is
    normalized with `normalize_parameter_name`. Rows whose value belongs to a synonym
    group are rewritten to the group name; all other rows (including NULL cells) are
    deleted. The caller is responsible for committing.
    """
    cursor = conn.cursor()
    table_sql = _quote_identifier(table_name)

    # PRAGMA table_info yields one row per column; index 1 is the column name.
    cursor.execute(f"PRAGMA table_info({table_sql})")
    parameter_columns = [
        info[1] for info in cursor.fetchall() if info[1].startswith("Parameter")
    ]

    groups_found = set()
    for col_name in parameter_columns:
        col_sql = _quote_identifier(col_name)

        # DISTINCT: every UPDATE/DELETE below already affects all rows sharing the
        # value, so each value only needs to be handled once.
        cursor.execute(f"SELECT DISTINCT {col_sql} FROM {table_sql}")
        distinct_values = [row[0] for row in cursor.fetchall()]

        for value in distinct_values:
            if value is None:
                # "= ?" never matches NULL in SQL; unmatched NULL rows need IS NULL.
                cursor.execute(f"DELETE FROM {table_sql} WHERE {col_sql} IS NULL")
                continue

            # str() so numeric cells are compared as text rather than crashing.
            normalized_value = normalize_parameter_name(str(value))
            matching_groups = [
                group_name
                for group_name, synonyms in synonym_groups.items()
                if normalized_value in synonyms
            ]

            if not matching_groups:
                cursor.execute(
                    f"DELETE FROM {table_sql} WHERE {col_sql} = ?", (value,)
                )
                continue

            groups_found.update(matching_groups)
            # The first matching group becomes the stored label.
            canonical = matching_groups[0]
            if value != canonical:
                # Operate on the column that was actually read, not a fixed name.
                cursor.execute(
                    f"UPDATE {table_sql} SET {col_sql} = ? WHERE {col_sql} = ?",
                    (canonical, value),
                )

    return groups_found


def analyze_table_for_synonym_groups(table_name, synonym_groups, db_path=None):
    """
    Analyze a table (drug) for the presence of synonym groups in its parameter columns.

    WARNING: this modifies the database. In every column whose name starts with
    "Parameter", values that belong to a synonym group are replaced by the group
    name, and rows whose value belongs to no group are deleted.

    Parameters:
    table_name (str): The name of the table (drug) to analyze.
    synonym_groups (dict): Keys are group names, values are collections (lists or
        sets) of normalized synonyms.
    db_path (str, optional): Path to the SQLite database file. When omitted, the
        notebook-level variable `db_file` is used, as in earlier versions.

    Returns:
    list: A list of group names to which the table's parameters belong.
    """
    if db_path is None:
        # Backward compatibility with the original two-argument call.
        db_path = db_file

    conn = sqlite3.connect(db_path)
    try:
        groups_found = _relabel_parameter_columns(conn, table_name, synonym_groups)
        conn.commit()
    finally:
        # Closing without a commit discards a half-applied relabelling on error.
        conn.close()

    return list(groups_found)


def analyze_tables_for_synonym_groups(db_file, synonym_groups):
    """
    Analyze all tables (drugs) in the database for the presence of synonym groups.

    WARNING: this modifies the database at `db_file`. Each user table is relabelled
    (see `analyze_table_for_synonym_groups`), and any table left with zero rows is
    dropped. Changes are committed after each table. SQLite's internal `sqlite_*`
    tables are ignored.

    Parameters:
    db_file (str): The path to the SQLite database file.
    synonym_groups (dict): Keys are group names, values are collections (lists or
        sets) of normalized synonyms.

    Returns:
    float: Percentage of drugs that have at least one parameter in a synonym group.
    float: Percentage of drugs that have none.
    dict: Group name -> number of drugs that have a parameter in that group.

    Both percentages are 0.0 when the database contains no tables. Tables dropped
    during the run still count towards the totals.
    """
    drugs_with_group_column = 0
    drugs_without_group_column = 0
    group_drug_counts = {group_name: 0 for group_name in synonym_groups}

    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()

        # Internal bookkeeping tables (e.g. sqlite_sequence) are not drugs and
        # cannot be dropped, so leave them out.
        cursor.execute(
            "SELECT name FROM sqlite_master "
            "WHERE type='table' AND name NOT LIKE 'sqlite_%';"
        )
        table_names = [row[0] for row in cursor.fetchall()]

        for table_name in table_names:
            # Same connection, hence the same database as `db_file`.
            groups_found = _relabel_parameter_columns(conn, table_name, synonym_groups)
            if groups_found:
                drugs_with_group_column += 1
                for group in groups_found:
                    group_drug_counts[group] += 1
            else:
                drugs_without_group_column += 1

            # A table emptied by the relabelling carries no information: drop it.
            table_sql = _quote_identifier(table_name)
            cursor.execute(f"SELECT count(*) FROM {table_sql}")
            if cursor.fetchone()[0] == 0:
                cursor.execute(f"DROP TABLE {table_sql}")

            conn.commit()
    finally:
        conn.close()

    total_drugs = len(table_names)
    if total_drugs == 0:
        # Nothing to analyze; avoid dividing by zero.
        return 0.0, 0.0, group_drug_counts

    percentage_with_group_column = (drugs_with_group_column / total_drugs) * 100
    percentage_without_group_column = (drugs_without_group_column / total_drugs) * 100

    return percentage_with_group_column, percentage_without_group_column, group_drug_counts

if __name__ == "__main__":
    # NOTE: running this cell rewrites the database in place -- parameter names are
    # relabelled, unmatched rows are deleted and emptied tables are dropped.
    # Work on a copy if the original file must be preserved.
    db_file = "DRUG_DB.db"  # Replace with the path to your SQLite database

    # `synonym_groups` is the dictionary defined in the earlier cell.
    with_group, without_group, group_drug_counts = analyze_tables_for_synonym_groups(
        db_file, synonym_groups
    )

    # Assemble the report first, then emit it in one go.
    report_lines = [
        f"Percentage of drugs with at least one column with a synonym group: {with_group}%",
        f"Percentage of drugs without any column with a synonym group: {without_group}%",
        "Number of drugs in each synonym group:",
    ]
    report_lines.extend(
        f"{group}: {count}" for group, count in group_drug_counts.items()
    )
    print("\n".join(report_lines))


Percentage of drugs with at least one column with a synonym group: 94.48160535117057%
Percentage of drugs without any column with a synonym group: 5.518394648829431%
Number of drugs in each synonym group:
halflife: 1057
cmax: 891
timetocmax: 344
cssmax: 182
auc: 809
clearance: 866
bioavailability: 840
maximaltolerateddose: 126
