In [6]:
# Create co-occurrence matrices
def create_co_occurrence_matrix(df):
    if df.empty:
        return pd.DataFrame()
    patient_matrix = df.pivot_table(index='PatientID', columns='Codes', aggfunc='size', fill_value=0)
    patient_matrix = patient_matrix.loc[:, (patient_matrix != 0).any(axis=0)]
    co_occurrence_matrix = patient_matrix.T.dot(patient_matrix)
    np.fill_diagonal(co_occurrence_matrix.values, 0)
    return co_occurrence_matrix



def is_icd_code(code):
    """Check if the given code is a valid ICD code."""
    return bool(re.match(r"^[A-Z]", code))

def is_loinc_code(code):
    """Check if the given code is a valid LOINC code with a hyphen at [-2]."""
    return len(code) > 1 and code[-2] == '-'

def is_ops_code(code):
    """Check if the given code is a valid OPS code."""
    return len(code) > 1 and code[1] == '-'

def get_resource_type(code):
    """Determine the resource type based on the code."""
    if is_icd_code(code):
        return "ICD"
    elif is_loinc_code(code):
        return "LOINC"
    elif is_ops_code(code):
        return "OPS"
    else:
        return "Unknown"  # Default case for unrecognized codes

def get_color_for_resource_type(resource_type):
    """Map resource types to colors using SUBGROUP_COLORS."""
    return SUBGROUP_COLORS.get(resource_type, 'gray')  # Default to gray if not found


def generate_network_viz(df, code1_col, code2_col, weight_col, 
                         layout='barnes_hut', node_color=None, edge_color=None,
                         central_gravity=0.005,
                         node_distance=420,
                         spring_length=1000,
                         spring_constant=0.01,
                         spring_strength=0.15,
                         damping=0.96):
    # Generate a NetworkX graph
    G = nx.from_pandas_edgelist(df, source=code1_col, target=code2_col, edge_attr=weight_col)

    bgcolor, font_color = 'white', 'black'  # Default colors

    # Initiate PyVis network object
    net = Network(
        height='700px', 
        width='100%',
        bgcolor=bgcolor, 
        font_color=font_color, 
        notebook=True
    )

    # Take NetworkX graph and translate it to a PyVis graph format
    net.from_nx(G)

    # Set colors for nodes
    if node_color is not None:
        for node in G.nodes():
            net.get_node(node)['color'] = node_color.get(node, 'gray')  # Default to gray if no color is provided

    # Set colors for edges
    if edge_color is not None:
        for u, v in G.edges():
            net.get_edge(u, v)['color'] = edge_color.get((u, v), 'rgba(255, 255, 255, 0.3)')  # Default to white with transparency

    # Default to barnes_hut layout
    net.barnes_hut(
        central_gravity=central_gravity, 
        spring_length=spring_length, 
        spring_strength=spring_strength, 
        damping=damping
    )      

    return net

    


def create_dendrogram_plot(cooccurrence_array, labels, flat_df, show_labels):
    # Adjust labels based on the 'show_labels' input
    if 'show' in show_labels:
        # Use 'Displays' from flat_df for labels
        labels = [
            flat_df.loc[flat_df['Codes'] == label, 'Displays'].iloc[0] 
            if not flat_df.loc[flat_df['Codes'] == label, 'Displays'].empty 
            else label  # Fallback to code if display is missing
            for label in labels
        ]
#     else:
#         # Use truncated codes (remove the first two characters)
#         #labels = [label[2:] for label in labels]

    # Create the dendrogram plot with Plotly
    fig = ff.create_dendrogram(cooccurrence_array, orientation='bottom', labels=labels)

        # Update line color for all links in the dendrogram
    for line in fig.data:
        line.update(line=dict(color='gray'))  # Set your desired color here
    
    # Update layout to improve appearance
    fig.update_layout(
        title='Dendrogram',
        title_x=0.5,
        xaxis_title='',
        yaxis_title='Distance',
        xaxis={'tickangle': -45},  # Rotate labels for better readability
    )
    
    return fig




# MAIN CODE

In [23]:
import os
import pandas as pd
import dash
from dash import html, dcc
from dash.dependencies import Input, Output, State
from pyvis.network import Network
import tempfile
import base64
import io
import numpy as np
import plotly.figure_factory as ff
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from dash import dcc, html, Input, Output, State
import plotly.graph_objs as go
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
import networkx as nx
from collections import Counter
import re


SUBGROUP_COLORS = {
    'ICD': "#00bfff",
    'LOINC': "#ffc0cb",
    'OPS': "#9a31a8"
}

# Dash application setup
app = dash.Dash(__name__)
server = app.server


app.layout = html.Div([
    html.H1("Co-Occurrences in FHIR Codes"),
    dcc.Upload(
        id='upload-data',
        children=html.Button('Upload Data'),
        multiple=False
    ),
    html.Div(id='upload-feedback', children='', style={'color': 'red'}),

    
    # Slider for the number of nodes
    html.Div(id='slider-container', children=[
        html.Label("Select the number of nodes to visualize:"),
        dcc.Slider(
            id='num-nodes-slider',
            min=1,
            max=10,
            step=1,
            value=1,
            marks={i: str(i) for i in range(1, 11)},
            tooltip={"placement": "bottom", "always_visible": False}
        )
    ], style={'display': 'none'}),  # Initially hidden
    
    # Slider for the levels
    html.Div(id='level-slider-container', children=[
        html.Label("Select the hierarchy level:"),
        dcc.Slider(
            id='level-slider',
            min=1,
            max=4,
            step=1,
            value=1,
            marks={i: str(i) for i in range(5)},  # 0 to 4
            tooltip={"placement": "bottom", "always_visible": False}
        )
    ], style={'display': 'none'}),  # Initially hidden 
    
    html.Div([
        html.Label("Select a code:"),
        dcc.Dropdown(
            id='code-dropdown',
            options=[],  # Options will be populated after loading data
            placeholder="Select a code",
            clearable=False
        )
    ]),
    dcc.Checklist(
        id='show-labels',
        options=[{'label': 'Show Labels', 'value': 'show'}],
        value=[]  # Start with an empty list so labels are not shown by default
    ),
    
    dcc.Loading(
        id="loading",
        type="circle",
        children=[
            html.Div(id='data-container', style={'display': 'none'}),  # Hidden div for callbacks
            
        ]
    ),

    # Graphs positioned side-by-side using CSS flexbox
    html.Div([
        # Left column - PyVis graph
        html.Div([
            html.Label("Enter code to search:"),
            dcc.Input(id='code-input', type='text', placeholder='Enter code', debounce=False),  # Debounce=False to update as you type
            html.Iframe(id='graph-iframe', style={'width': '100%', 'height': '100%'}),
        ], style={'flex': '1', 'padding': '10px'}),  # PyVis on the left side, takes 50% of space

        # Right column - Bar chart and dendrogram stacked
        html.Div([
            dcc.Store(id='codes-of-interest-store'),
            dcc.Graph(id='dendrogram', style={ 'width': '100%', 'height': '50%'}),  # Dendrogram below bar chart
            dcc.Graph(id='bar-chart', style={'width': '100%', 'height': '50%'})  # Bar chart on top , 'margin-bottom': '20px'
        ], style={'flex': '1', 'padding': '0px'}),  # Bar chart and dendrogram on the right side, takes 50% of space

    ], style={'display': 'flex', 'flex-direction': 'row'}),  # Use flexbox to position the graphs side by side
    
    dcc.Store(id='data-store')  # Hidden store to keep data
])

@app.callback(
    Output('slider-container', 'style'),
    Output('level-slider-container', 'style'),
    Input('code-dropdown', 'value')
)
def update_slider_visibility(selected_code):
    if selected_code == 'ALL_CODES':
        return {'display': 'none'}, {'display': 'block'}  # Show level slider, hide num-nodes
    else:
        return {'display': 'block'}, {'display': 'none'}  # Show num-nodes slider, hide level slider


def fetch_and_process_data(file_content):
    
    # Read CSV data from uploaded content
    flat_df = pd.read_parquet(io.BytesIO(file_content))

    # Check for required columns
    required_columns = ['PatientID', 'Codes', 'ResourceType']
    missing_columns = [col for col in required_columns if col not in flat_df.columns]
    if missing_columns:
        raise ValueError(f"Missing columns: {', '.join(missing_columns)}")

    icd_df = pd.read_csv('ICD_Katalog_2023_DWH_export_202406071440.csv')    # Make sure to adjust the column names as necessary
    ops_df = pd.read_csv('OPS_Katalog_2023_DWH_export_202409200944.csv')    # Make sure to adjust the column names as necessary
    loinc_df = pd.read_csv('LOINC_DWH_export_202409230739.csv')  # Make sure to adjust the column names as necessary
    
    def get_display_label(code, level,  resource_type):
        """Retrieve the display label for codes and their associated group or chapter labels based on resource type."""
        code = str(code).strip()
        # Attempt to get the main display label based on the specific code
        if resource_type == 'ICD':
            print('ICD code', code)
            if level == 4:
                result = icd_df.loc[icd_df['ICD_CODE'] == code, 'ICD_NAME']
                if not result.empty:
                    return result.iloc[0]  # Return the first result if found
            if level == 3:
                # Attempt to get group or chapter label
                gruppe_result = icd_df.loc[icd_df['GRUPPE_CODE'] == code, 'GRUPPE_NURNAME']
                print('GRUPPE', code, gruppe_result.iloc[0])
                if not gruppe_result.empty:
                    return gruppe_result.iloc[0]  # Return the first result
                    
            if level == 2:
                icd_df['KAPITEL_CODE'] = icd_df['KAPITEL_CODE'].astype(str)  # Convert KAPITEL_CODE to string
                code = str(code).strip()

                # Attempt to get group or chapter label for level 2
                kapitel_result = icd_df.loc[icd_df['KAPITEL_CODE'] == code, 'KAPITEL_NURNAME']

                if not kapitel_result.empty:
                    print(f"Level 2 display found: {kapitel_result.iloc[0]}")
                    return kapitel_result.iloc[0]  # Return the first result


        elif resource_type == 'OPS':
            print('OPS code', code)
            if level == 4:
                result = ops_df.loc[ops_df['OPS_CODE'] == code, 'OPS_NAME']
                if not result.empty:
                    return result.iloc[0]  # Return the first result if found
                
            if level == 3:
                # Attempt to get group or chapter label
                gruppe_result = ops_df.loc[ops_df['GRUPPE_CODE'] == code, 'GRUPPE_NURNAME']
                print('GRUPPE', code, gruppe_result.iloc[0])
                if not gruppe_result.empty:
                    return gruppe_result.iloc[0]  # Return the first result
                
            if level == 2:
                icd_df['KAPITEL_CODE'] = icd_df['KAPITEL_CODE'].astype(str)  # Convert KAPITEL_CODE to string
                code = str(code).strip()
                kapitel_result = ops_df.loc[ops_df['KAPITEL_CODE'] == code, 'KAPITEL_NURNAME']
                if not kapitel_result.empty:
                    return kapitel_result.iloc[0]  # Return the first result

        elif resource_type == 'LOINC':
            print('LOINC code', code)
            if level == 4:            
                result = loinc_df.loc[loinc_df['LOINC_CODE'] == code, 'LOINC_NAME']
                print('LOINC result',result)
                if not result.empty:
                    return result.iloc[0]  # Return the first result if found
                
            if level == 3:
                # Attempt to get group or chapter label
                gruppe_result = loinc_df.loc[loinc_df['LOINC_PROPERTY'] == code, 'LOINC_PROPERTY']
                if not gruppe_result.empty:
                    return gruppe_result.iloc[0]  # Return the first result
                
            if level == 2:
                kapitel_result = loinc_df.loc[loinc_df['LOINC_SYSTEM'] == code, 'LOINC_SYSTEM']
                if not kapitel_result.empty:
                    return kapitel_result.iloc[0]  # Return the first result

        return None  # If resource type is unknown or no labels found

    
##################################################################################################   
    
    main_df = create_co_occurrence_matrix(flat_df)

    # Initialize a list to store code pairs
    code_pairs = []

    # Iterate through main_df to create initial pairs
    for i in range(len(main_df)):
        for j in range(i + 1, len(main_df)):
            code1 = main_df.index[i]
            code2 = main_df.columns[j]
            weight = main_df.iloc[i, j]

            if weight > 0:
                code_pairs.append((code1, code2, weight))

    # Create pairs_df DataFrame
    pairs_df = pd.DataFrame(code_pairs, columns=['Code1', 'Code2', 'Weight'])


    # Step 3: Assign level 3 to current pairs
    pairs_df['level'] = 4
    
    def build_hierarchy_and_get_pairs(df, code_column, kapitel_column, gruppe_column):
        if df is None:
            return []

        # Step 3: Filter df based on flat_df['Codes']
        df = df[df[code_column].isin(flat_df['Codes'])]

        # Step 4: Extract relevant columns
        df_subset = df[[kapitel_column, gruppe_column, code_column]]  # Select by column names

        # Step 5: Build the hierarchy using anytree
        level_0 = []

        for index, row in df_subset.iterrows():
            level_2 = str(row[kapitel_column])
            #print('level_2', level_2)
            level_3 = f"{level_2},{str(row[gruppe_column])}"  # Make level unique
            #print('level_3', level_3)
            level_4 = f"{level_3},{str(row[code_column])}"
            #print('level_4', level_4)      

            resource_type1 = get_resource_type(row[code_column])  # Custom function to get resource type

            if resource_type1 == 'ICD':
                level_1 = f"{'ICD'}, {level_4}"
                level_0.append((f"{'FHIR'}, {level_1}"))
                #print('level_0', level_0)

            if resource_type1 == 'OPS':
                level_1 = f"{'OPS'}, {level_4}"
                level_0.append((f"{'FHIR'}, {level_1}"))
                #print('level_0', level_0)

            if resource_type1 == 'LOINC':
                level_1 = f"{'LOINC'}, {level_4}"
                level_0.append((f"{'FHIR'}, {level_1}"))
                #print('level_0', level_0)

        return level_0

    # Get node structure for each DataFrame
    icd_level_0 = build_hierarchy_and_get_pairs(icd_df, 'ICD_CODE', 'KAPITEL_CODE', 'GRUPPE_CODE')
    ops_level_0 = build_hierarchy_and_get_pairs(ops_df, 'OPS_CODE', 'KAPITEL_CODE', 'GRUPPE_CODE')  # Adjust column names if necessary
    loinc_level_0 = build_hierarchy_and_get_pairs(loinc_df, 'LOINC_CODE', 'LOINC_SYSTEM', 'LOINC_PROPERTY')  # Adjust column names if necessary


    # Initialize a list to store new rows
    new_rows = []

    # level 0
    new_rows.append({'Code1':'FHIR' , 'Code2':'ICD' , 'Weight': len(icd_level_0), 'level': 0, 'ResourceType':'ICD'})
    new_rows.append({'Code1':'FHIR' , 'Code2':'OPS' , 'Weight': len(ops_level_0), 'level': 0, 'ResourceType':'OPS'})
    new_rows.append({'Code1':'FHIR' , 'Code2':'LOINC' , 'Weight': len(loinc_level_0), 'level': 0, 'ResourceType':'LOINC'})

    # For level 1 rows, we'll calculate the counts (weights) for the connections
    # Level 1 for ICD
    # Level 1 - Split the 3rd item (index 2) in icd_level_0
    icd_items = [item.split(',')[2] for item in icd_level_0]
    icd_item_counts = Counter(icd_items)

    # Iterate over each unique ICD level 1 item and its count
    for item, count in icd_item_counts.items():
        # Add a row for each level 1 ICD item
        new_rows.append({'Code1': 'ICD', 'Code2': 'icd'+item, 'Weight': count, 'level': 1, 'ResourceType':'ICD',
                        'Displays': 'ICD'})

        # Level 2 - Split the 4th item (index 3) for level 1 connections
        icd_items1 = [lvl_0_item.split(',')[3] for lvl_0_item in icd_level_0 if lvl_0_item.split(',')[2] == item]
        icd_item_counts1 = Counter(icd_items1)

        for item1, count1 in icd_item_counts1.items():
            new_rows.append({
                            'Code1': 'icd' + item,  # Ensure the code is prefixed with 'icd'
                            'Code2': item1,          # Level 2 ICD code
                            'Weight': count1,        # Count for this item
                            'level': 2,              # Specify level
                            'ResourceType': 'ICD',   # Set resource type
                            'Displays': get_display_label(item, 2, 'ICD')  # Fetch display label or group name
                        })
            print(item, 2, 'ICD')

            # Level 3 - Split the 5th item (index 4) for level 2 connections
            icd_items2 = [lvl_0_item.split(',')[4] for lvl_0_item in icd_level_0 if lvl_0_item.split(',')[3] == item1]
            icd_item_counts2 = Counter(icd_items2)

            for item2, count2 in icd_item_counts2.items():
                new_rows.append({
                            'Code1': item1,  # Ensure the code is prefixed with 'icd'
                            'Code2': item2,          # Level 2 ICD code
                            'Weight': count2,        # Count for this item
                            'level': 3,              # Specify level
                            'ResourceType': 'ICD',   # Set resource type
                            'Displays': get_display_label(item1, 3, 'ICD')  # Fetch display label or group name
                        })
            print(item1, 3, 'ICD')

    # OPS Level 1 - Split the 3rd item (index 2) in ops_level_0
    ops_items = [item.split(',')[2] for item in ops_level_0]
    ops_item_counts = Counter(ops_items)

    # Iterate over each unique OPS level 1 item and its count
    for item, count in ops_item_counts.items():
        # Add a row for each level 1 OPS item
        new_rows.append({'Code1': 'OPS', 'Code2': 'ops'+item, 'Weight': count, 'level': 1, 'ResourceType':'OPS',
                        'Displays': 'OPS'})

        # OPS Level 2 - Split the 4th item (index 3) for level 1 connections
        ops_items1 = [lvl_0_item.split(',')[3] for lvl_0_item in ops_level_0 if lvl_0_item.split(',')[2] == item]
        ops_item_counts1 = Counter(ops_items1)

        for item1, count1 in ops_item_counts1.items():
            new_rows.append({
                            'Code1': 'ops' + item,  # Ensure the code is prefixed with 'icd'
                            'Code2': item1,          # Level 2 ICD code
                            'Weight': count1,        # Count for this item
                            'level': 2,              # Specify level
                            'ResourceType': 'OPS',   # Set resource type
                            'Displays': get_display_label(item, 2, 'OPS')  # Fetch display label or group name
                        })

            # OPS Level 3 - Split the 5th item (index 4) for level 2 connections
            ops_items2 = [lvl_0_item.split(',')[4] for lvl_0_item in ops_level_0 if lvl_0_item.split(',')[3] == item1]
            ops_item_counts2 = Counter(ops_items2)

            for item2, count2 in ops_item_counts2.items():
                new_rows.append({
                            'Code1': item1,  # Ensure the code is prefixed with 'icd'
                            'Code2': item2,          # Level 2 ICD code
                            'Weight': count2,        # Count for this item
                            'level': 3,              # Specify level
                            'ResourceType': 'OPS',   # Set resource type
                            'Displays': get_display_label(item1, 3, 'OPS')  # Fetch display label or group name
                        })

    # LOINC Level 1 - Split the 3rd item (index 2) in loinc_level_0
    loinc_items = [item.split(',')[2] for item in loinc_level_0]
    print('loinc_items', loinc_items)
    loinc_item_counts = Counter(loinc_items)

    # Iterate over each unique LOINC level 1 item and its count
    for item, count in loinc_item_counts.items():
        # Add a row for each level 1 LOINC item
        new_rows.append({'Code1': 'LOINC', 'Code2': item, 'Weight': count, 'level': 1, 'ResourceType':'LOINC',
                        'Displays': 'LOINC'})

        # LOINC Level 2 - Split the 4th item (index 3) for level 1 connections
        loinc_items1 = [lvl_0_item.split(',')[3] for lvl_0_item in loinc_level_0 if lvl_0_item.split(',')[2] == item]
        loinc_item_counts1 = Counter(loinc_items1)

        for item1, count1 in loinc_item_counts1.items():
            # Add a row for each level 2 LOINC item
            new_rows.append({'Code1': item, 'Code2': item1, 'Weight': count1, 'level': 2, 'ResourceType':'LOINC',
                            'Displays':get_display_label(item, 2, 'LOINC') })

            # LOINC Level 3 - Split the 5th item (index 4) for level 2 connections
            loinc_items2 = [lvl_0_item.split(',')[4] for lvl_0_item in loinc_level_0 if lvl_0_item.split(',')[3] == item1]

            loinc_item_counts2 = Counter(loinc_items2)

            for item2, count2 in loinc_item_counts2.items():
                # Add a row for each level 3 LOINC item
                new_rows.append({'Code1': item1, 'Code2': item2, 'Weight': count2, 'level': 3, 'ResourceType':'LOINC',
                                'Displays':get_display_label(item1, 3, 'LOINC')})


    # Convert the new_rows list into a DataFrame
    new_entries_df = pd.DataFrame(new_rows)
    

    new_pairs_df = pd.concat([pairs_df, new_entries_df], ignore_index=True)
    print('new_pairs_df', new_pairs_df)

    new_pairs_df = new_pairs_df.drop_duplicates(subset=['Code1', 'Code2', 'Weight','level'])

#################################################################################################################


    # Fill the Displays column
    flat_df['Displays'] = flat_df.apply(
        lambda row: get_display_label(row['Codes'], 4, row['ResourceType']),
        axis=1
    )
    # Apply transformation for ICD and OPS only
    flat_df.loc[flat_df['ResourceType'].isin(['ICD', 'OPS']), 'Displays'] = \
        flat_df.loc[flat_df['ResourceType'].isin(['ICD', 'OPS']), 'Displays'].apply(lambda x: ': '.join(x.split(':')[1:]).strip())

    # For LOINC, you may want to modify the Displays as needed without splitting
    # Here, I'm just assigning a placeholder or retaining the existing display
    flat_df.loc[flat_df['ResourceType'] == 'LOINC', 'Displays'] = flat_df.loc[flat_df['ResourceType'] == 'LOINC', 'Displays']
   
    
    flat_df['Displays'] = flat_df['Displays'].astype(str)

    flat_df['Full_Displays'] = flat_df['Displays']  # Store the full text
    flat_df['Displays'] = flat_df['Displays'].str.slice(0, 11) + '...'
    print('flat_df', flat_df)


    ICD_df = flat_df[flat_df['ResourceType'] == 'ICD']
    LOINC_df = flat_df[flat_df['ResourceType'] == 'LOINC']
    OPS_df = flat_df[flat_df['ResourceType'] == 'OPS']



    co_occurrence_matrices = {
        'Main': create_co_occurrence_matrix(flat_df),
        'ICD': create_co_occurrence_matrix(ICD_df),
        'LOINC': create_co_occurrence_matrix(LOINC_df),
        'OPS': create_co_occurrence_matrix(OPS_df)
    }


    # Include new_pairs_df in the returned data
    return {
        'success': True,
        'message': 'Data is loaded.',
        'data': {
            'flat_df': flat_df.to_dict(),
            'co_occurrence_matrices': co_occurrence_matrices,
            'new_pairs_df': new_pairs_df.to_dict()  # Ensure this is returned
        }
    }

@app.callback(
    Output('upload-feedback', 'children'),
    Output('data-container', 'style'),
    Output('code-dropdown', 'options'),
    Output('data-store', 'data'),  # Store `flat_df` and matrices here
    Input('upload-data', 'contents')
)


def upload_file(file_content):
    feedback_message = ""
    data_style = {'display': 'none'}
    options = []
    data = {}

    if file_content:
        # Decode and process uploaded file
        content_type, content_string = file_content.split(',')
        decoded = base64.b64decode(content_string)
        result = fetch_and_process_data(decoded)
        
        if result['success']:
            co_occurrence_matrices = result['data']['co_occurrence_matrices']
            flat_df = result['data']['flat_df']
            new_pairs_df = result['data']['new_pairs_df']  # Retrieve new_pairs_df
            
            # Prepend "All codes" to the dropdown options
            options = [{'label': 'All codes', 'value': 'ALL_CODES'}] + [{'label': code, 'value': code} for code in co_occurrence_matrices.get('Main', pd.DataFrame()).columns]

            # Update this part in upload_file callback
            data = {
                'flat_df': flat_df,  # Store the `flat_df` here
                'co_occurrence_matrices': {
                    key: matrix.to_dict() for key, matrix in co_occurrence_matrices.items()
                },
                'new_pairs_df': new_pairs_df  # Store new_pairs_df here directly
            }

            feedback_message = result['message']
            data_style = {'display': 'block'}
        else:
            feedback_message = result['message']

    return feedback_message, data_style, options, data


@app.callback(
    [Output('graph-iframe', 'srcDoc'),
     Output('codes-of-interest-store', 'data'),
     Output('graph-iframe', 'style'),
     Output('bar-chart', 'style'),
     Output('dendrogram', 'style')],
    [Input('code-dropdown', 'value'),
     Input('num-nodes-slider', 'value'),
     Input('level-slider', 'value'),
     Input('show-labels', 'value'),
     Input('code-input', 'value'),  # Add input for user code
     State('data-store', 'data')]
)


#def update_graph(selected_code, num_nodes_to_visualize, show_labels, user_code, data):
def update_graph(selected_code, num_nodes_to_visualize, selected_level, show_labels, user_code, data):

    # Initialize styles
    graph_style = {'display': 'none'}
    bar_chart_style = {'display': 'none'}
    dendrogram_style = {'display': 'none'}
    
    # Check if selected_code is None or empty
    if not selected_code:
        return "", {'codes_of_interest': [], 'top_neighbor_info': {}}, graph_style, bar_chart_style, dendrogram_style

    # Initialize PyVis network
    net = Network(notebook=True, cdn_resources='remote')

    # Handle data
    try:
        flat_df = pd.DataFrame(data.get('flat_df', {}))
        co_occurrence_matrices = data.get('co_occurrence_matrices', {})
        new_pairs_df = pd.DataFrame(data.get('new_pairs_df', {}))  # Access new_pairs_df
        main_df = pd.DataFrame(co_occurrence_matrices.get('Main', {}))
    except Exception as e:
        print(f"Error processing data: {e}")
        return "", {'codes_of_interest': [], 'top_neighbor_info': {}}, graph_style, bar_chart_style, dendrogram_style



    codes_of_interest = []
    top_neighbor_info = {}
    flat_df['Color'] = flat_df['ResourceType'].map(SUBGROUP_COLORS)
    color_mapping = flat_df.set_index('ResourceType')['Color'].to_dict()

    # Define the maximum number of level 4 nodes to display for each resource type
    MAX_LEVEL_4_NODES_PER_TYPE = 2  # Maximum nodes per resource type

    if selected_code == 'ALL_CODES':
        # Ensure all node IDs are strings
        new_pairs_df['Code1'] = new_pairs_df['Code1'].astype(str)
        new_pairs_df['Code2'] = new_pairs_df['Code2'].astype(str)

        # Define node sizes based on levels
        size_mapping = {0: 128, 1: 64, 2: 32, 3: 16, 4: 4}

        filtered_pairs_df = new_pairs_df 
        print('new_pairs_df', new_pairs_df.columns)

        # Generate the network visualization using the filtered DataFrame
        fhir_net = generate_network_viz(new_pairs_df, 'Code1', 'Code2', 'Weight', layout='barnes_hut')

        # Dictionary to store level 4 nodes by resource type
        level_4_nodes_by_type = {}

        # First, process each node to assign sizes, colors, and labels
        for node in fhir_net.nodes:
            # Identify the node's level and assign the size
            level = filtered_pairs_df.loc[filtered_pairs_df['Code1'] == node['id'], 'level'].values
            level = level[0] if len(level) > 0 else None

            # Set the node size
            size = size_mapping.get(level, 5)
            node['size'] = size

            # Determine the color based on the level
            if node['id'] == 'FHIR':
                color = 'black'  # FHIR node is black
                
            elif level in [1, 2, 3]:  # Levels 1, 2, and 3 get their color from ResourceType
                resource_type = filtered_pairs_df.loc[filtered_pairs_df['Code1'] == node['id'], 'ResourceType'].values
                if len(resource_type) > 0:
                    color = color_mapping.get(resource_type[0], 'gray')
                else:
                    color = 'gray'

                # Check if 'show' is in show_labels before assigning display labels for level 2 and level 3 nodes
                if 'show' in show_labels:  # Only assign labels if 'show' is in show_labels
                    display_label = filtered_pairs_df.loc[filtered_pairs_df['Code1'] == node['id'], 'Displays'].values
                    if len(display_label) > 0 and display_label[0] is not None:
                        full_display = display_label[0]  # Full display label
                        truncated_label = full_display[:15]  # Truncate to first 15 characters

                        node['label'] = truncated_label  # Set the truncated display label
                        node['title'] = full_display #if full_display else 'No Description Available'  # Store full display label in 'title'
                        node['text'] = full_display  # Use truncated label for the text field as well
                    else:
                        node['label'] = 'No Label'  # Default or empty label if None
                        node['title'] = 'No Description Available'  # Default title if None

            elif level == 4:  # Level 4 nodes
                resource_type = node['id']  # Store the current node's ID

                # Fetch resource type and assign color
                resource_type_result = get_resource_type(resource_type)
                color = get_color_for_resource_type(resource_type_result)

                # Store level 4 nodes by resource type for later filtering
                if resource_type_result not in level_4_nodes_by_type:
                    level_4_nodes_by_type[resource_type_result] = []
                level_4_nodes_by_type[resource_type_result].append(node['id'])

                # Add labels and titles for level 4 nodes using flat_df['Displays']
                if 'show' in show_labels:
                    display_label = flat_df.loc[flat_df['Codes'] == node['id'], 'Displays'].values
                    if len(display_label) > 0 and display_label[0] is not None:
                        full_display = display_label[0]
                        truncated_label = full_display[:15]  # Limit to first 15 characters

                        node['label'] = truncated_label
                        node['title'] = full_display #if full_display else 'No Description Available'
                        node['text'] = full_display
                    else:
                        node['label'] = 'No Label'
                        node['title'] = 'No Description Available'


                # Store the color for the level 4 node
                node['color'] = color  # Ensure that color is assigned to the node
                print(f"Assigned color: {color}")  # Debug line
            else:
                color = 'gray'  # Default color for undefined levels

            # Finally, assign the determined color to the node
            node['color'] = color  # Assign color to the node



            # Optionally set font size
            node['font'] = {'size': 88}
        print('level_4_nodes_by_type', level_4_nodes_by_type)
        print('fhir_net.nodes', fhir_net.nodes)
        
        # Group nodes by level
        nodes_by_level = {0: [], 1: [], 2: [], 3: [], 4: []}
        for node in fhir_net.nodes:
            level = filtered_pairs_df.loc[filtered_pairs_df['Code1'] == node['id'], 'level'].values
            level = level[0] if len(level) > 0 else None
            if level in nodes_by_level:
                nodes_by_level[level].append(node)

        # Filter nodes based on levels and the selected level
        filtered_nodes = []
        for level in range(selected_level + 1):  # Include levels up to the selected_level
            print('level', level)
            if not level==4:
                filtered_nodes.extend(nodes_by_level[level])
                    # Ensure level 4 nodes are always included
            # Handle level 4 nodes with a limit for each resource type
            else:
                for resource_type, nodes in level_4_nodes_by_type.items():
                    limited_nodes = nodes[:MAX_LEVEL_4_NODES_PER_TYPE]  # Limit to a certain number of nodes
                    for node in nodes_by_level[4]:
                        if node['id'] in limited_nodes:
                            filtered_nodes.append(node)

        # Finally, update the fhir_net nodes with the filtered ones
        fhir_net.nodes = filtered_nodes

        # Debugging output to verify the final filtered nodes
        print("Final Nodes in fhir_net after level-based filtering:", [node['id'] for node in fhir_net.nodes])
        
            
        # Highlight the selected node and its connections
        if user_code:
            # Search for the node in the network
            node_found = False
            for node in fhir_net.nodes:
                print('USER_CODE',node['id'], user_code)
                if node['id'].strip() == user_code:
                    node_found = True
                    node['color'] = 'lime'  # Highlight the selected node
                    node['size'] += 50  # Increase size for visibility

            # Debugging output
            if node_found:
                print(f"User code {user_code} found and highlighted.")
            else:
                print(f"User code {user_code} not found in the network.")

            # Highlight connected edges
            for edge in fhir_net.edges:
                if edge['from'] == user_code or edge['to'] == user_code:
                    edge['color'] = 'lime'  # Highlight edges connected to the selected node

        # Show the network visualization
        fhir_net.show('fhir_interactions_highlighted.html')

        # Read the HTML file and return its content for the iframe
        graph_style = {'display': 'block', 'width': '200%', 'height': '750px'}
        try:
            with open('fhir_interactions_highlighted.html', 'r') as file:
                html_content = file.read()
            return html_content, {'codes_of_interest': codes_of_interest, 'top_neighbor_info': top_neighbor_info}, graph_style, bar_chart_style, dendrogram_style
        except Exception as e:
            print(f"Error reading HTML file: {e}")
            return "", {'codes_of_interest': [], 'top_neighbor_info': {}}, graph_style, bar_chart_style, dendrogram_style

    else:
        graph_style = {'display': 'block', 'width': '100%', 'height': '600px'}
        bar_chart_style = {'display': 'block', 'width': '95%', 'height': '300px'}
        dendrogram_style = {'display': 'block', 'width': '95%', 'height': '300px'}

        # Calculate node degrees (number of neighbors)
        node_degree = main_df.astype(bool).sum(axis=1)
        
        # Get neighbors of the selected code
        if selected_code not in main_df.index:
            return "", {'codes_of_interest': [], 'top_neighbor_info': {}}, graph_style, bar_chart_style, dendrogram_style

        neighbors_sorted = main_df.loc[selected_code].sort_values(ascending=False)
        print('neighbors_sorted', neighbors_sorted)
        top_neighbors = list(neighbors_sorted.index[:])
        print('top_neighbors', top_neighbors)

        codes_of_interest = [selected_code]
        top_neighbor_info = {}
        
        def add_nodes_edges(graph, child_df, group_name):
            top_neighbor = None

            # Iterate over the neighbor codes in the DataFrame
            for neighbor_code in neighbors_sorted.index:
                if group_name == 'ICD' and is_icd_code(neighbor_code):
                    top_neighbor = neighbor_code
                    break
                elif group_name == 'LOINC' and is_loinc_code(neighbor_code):
                    top_neighbor = neighbor_code
                    break
                elif group_name == 'OPS' and is_ops_code(neighbor_code):
                    top_neighbor = neighbor_code
                    break

            if top_neighbor:
                top_neighbor_info['top_neighbor'] = top_neighbor
                top_neighbor_row = child_df.loc[top_neighbor].sort_values(ascending=False)
                top_neighbors_list = list(top_neighbor_row.index[:num_nodes_to_visualize])
                top_neighbor_info['top_neighbors_list'] = top_neighbors_list

                codes_of_interest.extend([top_neighbor] + top_neighbors_list)

                if 'show' in show_labels:
                    selected_code_label = flat_df.loc[flat_df['Codes'] == selected_code, 'Displays'].iloc[0] if not flat_df.empty else selected_code
                    top_neighbor_label = flat_df.loc[flat_df['Codes'] == top_neighbor, 'Displays'].iloc[0] if not flat_df.empty else top_neighbor
                else:
                    selected_code_label = selected_code
                    top_neighbor_label = top_neighbor

                group_name1 = 'ICD' if selected_code in co_occurrence_matrices.get('ICD', {}) else \
                              'LOINC' if selected_code in co_occurrence_matrices.get('LOINC', {}) else \
                              'OPS' if selected_code in co_occurrence_matrices.get('OPS', {}) else 'Unknown'

                node_size = int(node_degree.get(selected_code, 1)) * 2
                print('selected_code node size', node_size)
                if selected_code not in net.get_nodes():
                    net.add_node(selected_code, size=node_size, title=flat_df.loc[flat_df['Codes'] == selected_code, 'Full_Displays'].iloc[0], label=selected_code_label, color=SUBGROUP_COLORS.get(group_name1, 'gray'))

                node_size = int(node_degree.get(top_neighbor, 1)) * 2
                print('top_neighbor node size', node_size)
                if top_neighbor not in net.get_nodes():
                    net.add_node(top_neighbor, size=node_size, title=flat_df.loc[flat_df['Codes'] == top_neighbor, 'Full_Displays'].iloc[0], label=top_neighbor_label, color=SUBGROUP_COLORS.get(group_name, 'gray'))

                # Prevent adding edges if the nodes are the same
                if selected_code in net.get_nodes() and top_neighbor in net.get_nodes() and selected_code != top_neighbor:
                    edge_value = int(main_df.loc[selected_code, top_neighbor])
                    net.add_edge(selected_code, top_neighbor, value=edge_value / 2, color=SUBGROUP_COLORS.get(group_name, 'gray'))

                top_neighbor_row = child_df.loc[top_neighbor].sort_values(ascending=False)
                top_neighbors_list = list(top_neighbor_row.index[:num_nodes_to_visualize])

                for neighbor in top_neighbors_list:
                    if neighbor != top_neighbor and child_df.loc[top_neighbor, neighbor] > 0:
                        neighbor_label = flat_df.loc[flat_df['Codes'] == neighbor, 'Displays'].iloc[0] if 'show' in show_labels else neighbor

                        node_size = int(node_degree.get(neighbor, 1)) * 2
                        print('neighbor node size', node_size)
                        if neighbor not in net.get_nodes():
                            net.add_node(neighbor, size=node_size, title=flat_df.loc[flat_df['Codes'] == neighbor, 'Full_Displays'].iloc[0], label=neighbor_label, color=SUBGROUP_COLORS.get(group_name, 'gray'))

                        # Prevent adding edges if the nodes are the same
                        if top_neighbor in net.get_nodes() and neighbor in net.get_nodes() and top_neighbor != neighbor:
                            edge_value = int(child_df.loc[top_neighbor, neighbor])
                            net.add_edge(top_neighbor, neighbor, value=edge_value / 2)

                for i in range(len(top_neighbors_list)):
                    for j in range(i + 1, len(top_neighbors_list)):
                        neighbor1 = top_neighbors_list[i]
                        neighbor2 = top_neighbors_list[j]

                        if neighbor1 in child_df.index and neighbor2 in child_df.columns:
                            count = child_df.loc[neighbor1, neighbor2]
                            if count > 0:
                                # Prevent adding edges if the nodes are the same
                                if neighbor1 in net.get_nodes() and neighbor2 in net.get_nodes() and neighbor1 != neighbor2:
                                    net.add_edge(neighbor1, neighbor2, value=int(count) / 2, color=SUBGROUP_COLORS.get(group_name, 'gray'))


        # Check for specific keys and print the corresponding matrix in pandas DataFrame style
        if 'ICD' in co_occurrence_matrices:
            print('\nICD Co-Occurrence Matrix:')
            icd_matrix = pd.DataFrame(co_occurrence_matrices['ICD'])
            print(icd_matrix)
            add_nodes_edges(net, icd_matrix, 'ICD')

        if 'LOINC' in co_occurrence_matrices:
            print('\nLOINC Co-Occurrence Matrix:')
            loinc_matrix = pd.DataFrame(co_occurrence_matrices['LOINC'])
            print(loinc_matrix)
            add_nodes_edges(net, loinc_matrix, 'LOINC')

        if 'OPS' in co_occurrence_matrices:
            print('\nOPS Co-Occurrence Matrix:')
            ops_matrix = pd.DataFrame(co_occurrence_matrices['OPS'])
            print(ops_matrix)
            add_nodes_edges(net, ops_matrix, 'OPS')

        temp_file = tempfile.NamedTemporaryFile(delete=True, suffix='.html')
        temp_file_name = temp_file.name
        temp_file.close()

        net.show(temp_file_name)
        
        #graph_style = {'display': 'block', 'width': '100%', 'height': '600px'}  # Default height for specific codes
        return open(temp_file_name, 'r').read(), {'codes_of_interest': codes_of_interest, 'top_neighbor_info': top_neighbor_info}, graph_style, bar_chart_style, dendrogram_style


@app.callback(
    [Output('bar-chart', 'figure'),
     Output('dendrogram', 'figure')],
    [Input('code-dropdown', 'value'),
     Input('show-labels', 'value'),
     Input('num-nodes-slider', 'value'),
     Input('codes-of-interest-store', 'data')],  # Corrected input to fetch 'codes_of_interest'
    State('data-store', 'data')
)



def update_charts(selected_code, show_labels, slider_value, codes_of_interest, data):
    if not selected_code or selected_code == 'ALL_CODES':
        return (
            {
                'data': [],
                'layout': {'title': 'Bar chart not available'}
            },
            {
                'data': [],
                'layout': {'title': 'Dendrogram not available'}
            }
        )
    
    # Retrieve co-occurrence matrices and flat_df
    co_occurrence_matrices = data.get('co_occurrence_matrices', {})
    flat_df = pd.DataFrame(data.get('flat_df', {}))
    num_neighbors_to_display = slider_value or 0
    
    # Compute frequency distribution
    main_df = pd.DataFrame(co_occurrence_matrices.get('Main', {}))
    frequency_distribution = main_df.sum(axis=1)
    total_sum = frequency_distribution.sum()
    total_freq_dist = frequency_distribution / total_sum
    
    # Get the selected code's label
    selected_code_label = flat_df.loc[flat_df['Codes'] == selected_code, 'Displays'].iloc[0] if 'show' in show_labels else selected_code

    # Ensure codes_of_interest is a list
    codes_of_interest = codes_of_interest.get('codes_of_interest', [])
    print("Codes of interest:", codes_of_interest)

    # Prepare bar chart data
    bar_data = []
    x_labels = []
    y_values = []
    line_widths = []
    bar_colors = []

    for neighbor in codes_of_interest:
        occurrence_count = total_freq_dist.get(neighbor, 0)
        neighbor_label = flat_df.loc[flat_df['Codes'] == neighbor, 'Displays'].iloc[0] if 'show' in show_labels else neighbor
        bar_data.append({'x': neighbor_label, 'y': occurrence_count, 'code': neighbor})
        x_labels.append(neighbor_label)
        y_values.append(occurrence_count)
        line_widths.append(5 if neighbor == selected_code else 1)
        color = 'gray'
        for subgroup, color_code in SUBGROUP_COLORS.items():
            if neighbor in co_occurrence_matrices.get(subgroup, {}):
                color = color_code
                break
        bar_colors.append(color)

    # Sort the bar data based on the 'code' value
    bar_data_sorted = sorted(bar_data, key=lambda x: x['code'])
    sorted_x = [item['x'] for item in bar_data_sorted]
    sorted_y = [item['y'] for item in bar_data_sorted]
    sorted_line_widths = [line_widths[x_labels.index(item['x'])] for item in bar_data_sorted]
    sorted_colors = [bar_colors[x_labels.index(item['x'])] for item in bar_data_sorted]

    # Drop duplicates from sorted_x and keep corresponding y values
    unique_labels = []
    unique_y_values = []
    unique_colors = []
    unique_line_widths = []

    for x, y, color, line_width in zip(sorted_x, sorted_y, sorted_colors, sorted_line_widths):
        if x not in unique_labels:
            unique_labels.append(x)
            unique_y_values.append(y)
            unique_colors.append(color)
            unique_line_widths.append(line_width)

    # Create the bar chart
    bar_chart_figure = {
        'data': [{
            'x': unique_labels if 'show' in show_labels else str(unique_labels),  # Conditional for 'x'
            'y': unique_y_values,
            'type': 'bar',
            'name': 'Occurrences',
            'marker': {'color': unique_colors},
            'line': {'width': unique_line_widths},
            'text': unique_labels,#[flat_df.loc[flat_df['Codes'] == label, 'Full_Displays'].iloc[0] for label in unique_labels],#unique_labels,
            'textposition': 'none' if 'show' in show_labels else 'inside'
        }],
        'layout': {
            'title': f'Frequency Distribution',
            'xaxis': {
                'title': '',
                'tickangle': -45,  # Rotate x-tick labels for better readability
                'showticklabels': True,  # Show only the labels, no numbers
            },
            'yaxis': {'title': 'Frequency'}
        }
    }


    # Create dendrogram figure
    try:
        if len(codes_of_interest) < 1:
            raise ValueError("Not enough codes for clustering")

        def create_sub_cooccurrence_matrix(cooccurrence_dict, codes):
            valid_codes = [code for code in codes if code in cooccurrence_dict]
            if not valid_codes:
                raise ValueError("No valid codes found for sub-co-occurrence matrix")
            sub_matrix = pd.DataFrame(
                {code: {sub_code: cooccurrence_dict.get(code, {}).get(sub_code, 0) for sub_code in valid_codes} for code in valid_codes}
            ).fillna(0)
            return sub_matrix

        co_dict = co_occurrence_matrices.get('Main', {})
        cooccurrence_dict = create_sub_cooccurrence_matrix(co_dict, codes_of_interest)
        
        if cooccurrence_dict.shape[0] < 2:
            raise ValueError("Sub-co-occurrence matrix does not have enough samples for clustering")

        cooccurrence_matrix = cooccurrence_dict.dot(cooccurrence_dict.T).fillna(0)
        cooccurrence_array = cooccurrence_matrix.values

        print("Co-occurrence matrix:\n", cooccurrence_matrix)
        print("Co-occurrence array shape:", cooccurrence_array.shape)

        clustering = AgglomerativeClustering(n_clusters=1, metric='euclidean', linkage='ward')
        cluster_labels = clustering.fit_predict(cooccurrence_array)
        cooccurrence_matrix['Cluster'] = cluster_labels

        # Generate dendrogram plot
        dendrogram_figure = create_dendrogram_plot(cooccurrence_array, cooccurrence_matrix.index.tolist(), flat_df, show_labels)

        return bar_chart_figure, dendrogram_figure

    except Exception as e:
        print(f"Error in generating dendrogram: {e}")
        return bar_chart_figure, {'data': [], 'layout': {'title': 'Error generating dendrogram'}}

    
if __name__ == '__main__':
    app.run_server(debug=True, port=8051)


ICD code 19
Level 2 display found: Verletzungen, Vergiftungen und bestimmte andere Folgen äußerer Ursachen
 19 2 ICD
ICD code T51
GRUPPE T51 Toxische Wirkungen von vorwiegend nicht medizinisch verwendeten Substanzen
T51 3 ICD
ICD code 22
Level 2 display found: Schlüsselnummern für besondere Zwecke
 22 2 ICD
ICD code U00
GRUPPE U00 Vorläufige Zuordnungen für Krankheiten mit unklarer Ätiologie, belegte und nicht belegte Schlüsselnummern
U00 3 ICD
ICD code 13
Level 2 display found: Krankheiten des Muskel-Skelett-Systems und des Bindegewebes
 13 2 ICD
ICD code M86
GRUPPE M86 Sonstige Osteopathien
M86 3 ICD
ICD code 9
Level 2 display found: Krankheiten des Kreislaufsystems
 9 2 ICD
ICD code I30
GRUPPE I30 Sonstige Formen der Herzkrankheit
I30 3 ICD
OPS code 9
OPS code 9-65
GRUPPE 9-65 Behandlung bei psychischen und psychosomatischen Störungen und Verhaltensstörungen bei Kindern und Jugendlichen
OPS code 8
OPS code 8-80
GRUPPE 8-80 Maßnahmen für den Blutkreislauf
OPS code 6
OPS code 6-00
GRU

# NODE DEGREE DISTRIBUTION

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import io

# Use the correct path to read the parquet file
file_path = 'C:/dataset/FHIR_real_data.parquet'
flat_df = pd.read_parquet(file_path)

# Continue with the rest of your code


# Define resource type functions
def is_icd_code(code):
    """Check if the given code is a valid ICD code."""
    if not isinstance(code, str) or not code:  # Check for string type and non-empty
        return False
    return bool(re.match(r"^[A-Z]", code))

def is_loinc_code(code):
    """Check if the given code is a valid LOINC code with a hyphen at [-2]."""
    if not isinstance(code, str) or len(code) < 2:  # Check for string type and minimum length
        return False
    return code[-2] == '-'

def is_ops_code(code):
    """Check if the given code is a valid OPS code."""
    if not isinstance(code, str) or len(code) < 2:  # Check for string type and minimum length
        return False
    return code[1] == '-'

def get_resource_type(code):
    """Determine the resource type based on the code."""
    if is_icd_code(code):
        return "ICD"
    elif is_loinc_code(code):
        return "LOINC"
    elif is_ops_code(code):
        return "OPS"
    else:
        return "Unknown"  # Default case for unrecognized codes



# Create co-occurrence matrices
def create_co_occurrence_matrix(df):
    if df.empty:
        return pd.DataFrame()
    patient_matrix = df.pivot_table(index='PatientID', columns='Codes', aggfunc='size', fill_value=0)
    print("patient_matrix:\n", patient_matrix)  # Displaying patient matrix for debugging
    patient_matrix = patient_matrix.loc[:, (patient_matrix != 0).any(axis=0)]
    co_occurrence_matrix = patient_matrix.T.dot(patient_matrix)
    np.fill_diagonal(co_occurrence_matrix.values, 0)  # Filling diagonal with 0
    return co_occurrence_matrix

# Create co-occurrence matrix from flat_df
main_df = create_co_occurrence_matrix(flat_df)

# Get the degree for each code from the co-occurrence matrix
degrees = main_df.sum(axis=1).reset_index()
degrees.columns = ['Code', 'Degree']  # Renaming for clarity

# Assign resource types to degrees DataFrame using the new function
degrees['ResourceType'] = degrees['Code'].apply(get_resource_type)

# Extract degree values for each resource type
icd_degrees = degrees[degrees['ResourceType'] == 'ICD']['Degree']
loinc_degrees = degrees[degrees['ResourceType'] == 'LOINC']['Degree']
ops_degrees = degrees[degrees['ResourceType'] == 'OPS']['Degree']

# Combine degrees from all resource types for overall histogram properties
sorted_degree_values = pd.concat([icd_degrees, loinc_degrees, ops_degrees]).values

# Set fixed bin size for histogram
bins = np.arange(0, max(sorted_degree_values) + 10, 2)  # Adjusted to accommodate maximum degree

# Plot overlapping histograms for each resource type
plt.figure(figsize=(10, 6))
plt.xlim([min(sorted_degree_values) - 15, max(sorted_degree_values) + 15])



# Plot histograms
plt.hist(icd_degrees, bins=bins, alpha=0.5, color="#00bfff", label='ICD')
plt.hist(loinc_degrees, bins=bins, alpha=0.5, color="#ffc0cb", label='LOINC')
plt.hist(ops_degrees, bins=bins, alpha=0.5, color="#9a31a8", label='OPS')

# Add titles and labels
plt.title('Node Degree Distribution by Resource Type (Fixed Bin Size)')
plt.xlabel('Degree')
plt.ylabel('Count')
plt.grid(axis='y')  # Optional: Add grid for better visibility
plt.legend(loc='upper right')  # Add a legend

# Show the plot
plt.show()


In [None]:
import networkx as nx
from pyvis.network import Network
import random

# 1. Create a NetworkX Graph
G = nx.Graph()

# 2. Add nodes with attributes (e.g., age, location)
people = [
    (1, {'name': 'Alice', 'age': 25, 'location': 'New York'}),
    (2, {'name': 'Bob', 'age': 30, 'location': 'Paris'}),
    (3, {'name': 'Charlie', 'age': 35, 'location': 'London'}),
    (4, {'name': 'Diana', 'age': 28, 'location': 'Tokyo'}),
    (5, {'name': 'Edward', 'age': 40, 'location': 'Berlin'}),
]

G.add_nodes_from(people)

# 3. Add edges with weights (representing friendship strength)
friendships = [
    (1, 2, 5),  # Alice and Bob with friendship strength 5
    (1, 3, 2),  # Alice and Charlie with friendship strength 2
    (2, 3, 4),  # Bob and Charlie with friendship strength 4
    (3, 4, 1),  # Charlie and Diana with friendship strength 1
    (4, 5, 3),  # Diana and Edward with friendship strength 3
    (5, 1, 7),  # Edward and Alice with friendship strength 7
]

# Add edges with weights
G.add_weighted_edges_from(friendships)

# 4. Pyvis Visualization
net = Network(notebook=True, height='750px', width='100%', bgcolor='#222222', font_color='white')

# Set the physics layout of the network
net.force_atlas_2based(gravity=-50)

# Define color mapping by location
location_colors = {
    'New York': 'blue',
    'Paris': 'green',
    'London': 'red',
    'Tokyo': 'orange',
    'Berlin': 'purple',
}

# Add nodes with labels and custom attributes, including color based on location
for node, data in G.nodes(data=True):
    # Assign color based on location
    node_color = location_colors.get(data['location'], 'gray')
    
    # Format title with \n for newlines
    title_text = f"Age: {data['age']}\nLocation: {data['location']}"
    
    # Add the node with color and other attributes
    net.add_node(
        node, 
        label=data['name'], 
        title=title_text, 
        value=data['age'],  # Node size based on age
        color=node_color    # Assign node color based on location
    )

# Add edges with custom white color and transparency
for u, v, weight in G.edges(data='weight'):
    net.add_edge(
        u, 
        v, 
        value=weight, 
        title=f"Friendship Strength: {weight}",
        color='rgba(255, 255, 255, 0.3)'  # White with 30% transparency
    )

# Enable physics for a dynamic layout
net.toggle_physics(True)

# 5. Save and show the graph in an HTML file
net.show('social_network.html')


In [None]:
# Create co-occurrence matrices
def create_co_occurrence_matrix(df):
    if df.empty:
        return pd.DataFrame()
    patient_matrix = df.pivot_table(index='PatientID', columns='Codes', aggfunc='size', fill_value=0)
    patient_matrix = patient_matrix.loc[:, (patient_matrix != 0).any(axis=0)]
    co_occurrence_matrix = patient_matrix.T.dot(patient_matrix)
    np.fill_diagonal(co_occurrence_matrix.values, 0)
    return co_occurrence_matrix



def is_icd_code(code):
    """Check if the given code is a valid ICD code."""
    return bool(re.match(r"^[A-Z]", code))

def is_loinc_code(code):
    """Check if the given code is a valid LOINC code with a hyphen at [-2]."""
    return len(code) > 1 and code[-2] == '-'

def is_ops_code(code):
    """Check if the given code is a valid OPS code."""
    return len(code) > 1 and code[1] == '-'

def get_resource_type(code):
    """Determine the resource type based on the code."""
    if is_icd_code(code):
        return "ICD"
    elif is_loinc_code(code):
        return "LOINC"
    elif is_ops_code(code):
        return "OPS"
    else:
        return "Unknown"  # Default case for unrecognized codes

def get_color_for_resource_type(resource_type):
    """Map resource types to colors using SUBGROUP_COLORS."""
    return SUBGROUP_COLORS.get(resource_type, 'gray')  # Default to gray if not found


def generate_network_viz(df, code1_col, code2_col, weight_col, 
                         layout='barnes_hut', node_color=None, edge_color=None,
                         central_gravity=0.005,
                         node_distance=420,
                         spring_length=1000,
                         spring_constant=0.01,
                         spring_strength=0.15,
                         damping=0.96):
    # Generate a NetworkX graph
    G = nx.from_pandas_edgelist(df, source=code1_col, target=code2_col, edge_attr=weight_col)

    bgcolor, font_color = 'white', 'black'  # Default colors

    # Initiate PyVis network object
    net = Network(
        height='700px', 
        width='100%',
        bgcolor=bgcolor, 
        font_color=font_color, 
        notebook=True
    )

    # Take NetworkX graph and translate it to a PyVis graph format
    net.from_nx(G)

    # Set colors for nodes
    if node_color is not None:
        for node in G.nodes():
            net.get_node(node)['color'] = node_color.get(node, 'gray')  # Default to gray if no color is provided

    # Set colors for edges
    if edge_color is not None:
        for u, v in G.edges():
            net.get_edge(u, v)['color'] = edge_color.get((u, v), 'rgba(255, 255, 255, 0.3)')  # Default to white with transparency

    # Default to barnes_hut layout
    net.barnes_hut(
        central_gravity=central_gravity, 
        spring_length=spring_length, 
        spring_strength=spring_strength, 
        damping=damping
    )      

    return net

    


def create_dendrogram_plot(cooccurrence_array, labels, flat_df, show_labels):
    # Adjust labels based on the 'show_labels' input
    if 'show' in show_labels:
        # Use 'Displays' from flat_df for labels
        labels = [
            flat_df.loc[flat_df['Codes'] == label, 'Displays'].iloc[0] 
            if not flat_df.loc[flat_df['Codes'] == label, 'Displays'].empty 
            else label  # Fallback to code if display is missing
            for label in labels
        ]
#     else:
#         # Use truncated codes (remove the first two characters)
#         #labels = [label[2:] for label in labels]

    # Create the dendrogram plot with Plotly
    fig = ff.create_dendrogram(cooccurrence_array, orientation='bottom', labels=labels)

        # Update line color for all links in the dendrogram
    for line in fig.data:
        line.update(line=dict(color='gray'))  # Set your desired color here
    
    # Update layout to improve appearance
    fig.update_layout(
        title='Dendrogram',
        title_x=0.5,
        xaxis_title='',
        yaxis_title='Distance',
        xaxis={'tickangle': -45},  # Rotate labels for better readability
    )
    
    return fig



def fetch_and_process_data(file_content):
    
    # Read CSV data from uploaded content
    flat_df = pd.read_parquet(io.BytesIO(file_content))

    # Check for required columns
    required_columns = ['PatientID', 'Codes', 'ResourceType']
    missing_columns = [col for col in required_columns if col not in flat_df.columns]
    if missing_columns:
        raise ValueError(f"Missing columns: {', '.join(missing_columns)}")

    icd_df = pd.read_csv('ICD_Katalog_2023_DWH_export_202406071440.csv')    # Make sure to adjust the column names as necessary
    ops_df = pd.read_csv('OPS_Katalog_2023_DWH_export_202409200944.csv')    # Make sure to adjust the column names as necessary
    loinc_df = pd.read_csv('LOINC_DWH_export_202409230739.csv')  # Make sure to adjust the column names as necessary
    
    def get_display_label(code, level,  resource_type):
        """Retrieve the display label for codes and their associated group or chapter labels based on resource type."""
        code = str(code).strip()
        # Attempt to get the main display label based on the specific code
        if resource_type == 'ICD':
            print('ICD code', code)
            if level == 4:
                result = icd_df.loc[icd_df['ICD_CODE'] == code, 'ICD_NAME']
                if not result.empty:
                    return result.iloc[0]  # Return the first result if found
            if level == 3:
                # Attempt to get group or chapter label
                gruppe_result = icd_df.loc[icd_df['GRUPPE_CODE'] == code, 'GRUPPE_NURNAME']
                print('GRUPPE', code, gruppe_result.iloc[0])
                if not gruppe_result.empty:
                    return gruppe_result.iloc[0]  # Return the first result
                    
            if level == 2:
                icd_df['KAPITEL_CODE'] = icd_df['KAPITEL_CODE'].astype(str)  # Convert KAPITEL_CODE to string
                code = str(code).strip()

                # Attempt to get group or chapter label for level 2
                kapitel_result = icd_df.loc[icd_df['KAPITEL_CODE'] == code, 'KAPITEL_NURNAME']

                if not kapitel_result.empty:
                    print(f"Level 2 display found: {kapitel_result.iloc[0]}")
                    return kapitel_result.iloc[0]  # Return the first result


        elif resource_type == 'OPS':
            print('OPS code', code)
            if level == 4:
                result = ops_df.loc[ops_df['OPS_CODE'] == code, 'OPS_NAME']
                if not result.empty:
                    return result.iloc[0]  # Return the first result if found
                
            if level == 3:
                # Attempt to get group or chapter label
                gruppe_result = ops_df.loc[ops_df['GRUPPE_CODE'] == code, 'GRUPPE_NURNAME']
                print('GRUPPE', code, gruppe_result.iloc[0])
                if not gruppe_result.empty:
                    return gruppe_result.iloc[0]  # Return the first result
                
            if level == 2:
                icd_df['KAPITEL_CODE'] = icd_df['KAPITEL_CODE'].astype(str)  # Convert KAPITEL_CODE to string
                code = str(code).strip()
                kapitel_result = ops_df.loc[ops_df['KAPITEL_CODE'] == code, 'KAPITEL_NURNAME']
                if not kapitel_result.empty:
                    return kapitel_result.iloc[0]  # Return the first result

        elif resource_type == 'LOINC':
            if level == 4:            
                result = loinc_df.loc[loinc_df['LOINC_CODE'] == code, 'LOINC_NAME']
                if not result.empty:
                    return result.iloc[0]  # Return the first result if found
                
            if level == 3:
                # Attempt to get group or chapter label
                gruppe_result = ops_df.loc[ops_df['LOINC_PROPERTY'] == code, 'LOINC_PROPERTY']
                if not gruppe_result.empty:
                    return gruppe_result.iloc[0]  # Return the first result
                
            if level == 2:
                kapitel_result = ops_df.loc[ops_df['LOINC_SYSTEM'] == code, 'LOINC_SYSTEM']
                if not kapitel_result.empty:
                    return kapitel_result.iloc[0]  # Return the first result

        return None  # If resource type is unknown or no labels found

    
##################################################################################################   
    
    main_df = create_co_occurrence_matrix(flat_df)

    # Initialize a list to store code pairs
    code_pairs = []

    # Iterate through main_df to create initial pairs
    for i in range(len(main_df)):
        for j in range(i + 1, len(main_df)):
            code1 = main_df.index[i]
            code2 = main_df.columns[j]
            weight = main_df.iloc[i, j]

            if weight > 0:
                code_pairs.append((code1, code2, weight))

    # Create pairs_df DataFrame
    pairs_df = pd.DataFrame(code_pairs, columns=['Code1', 'Code2', 'Weight'])


    # Step 3: Assign level 3 to current pairs
    pairs_df['level'] = 4
    
    def build_hierarchy_and_get_pairs(df, code_column, kapitel_column, gruppe_column):
        if df is None:
            return []

        # Step 3: Filter df based on flat_df['Codes']
        df = df[df[code_column].isin(flat_df['Codes'])]

        # Step 4: Extract relevant columns
        df_subset = df[[kapitel_column, gruppe_column, code_column]]  # Select by column names

        # Step 5: Build the hierarchy using anytree
        level_0 = []

        for index, row in df_subset.iterrows():
            level_2 = str(row[kapitel_column])
            #print('level_2', level_2)
            level_3 = f"{level_2},{str(row[gruppe_column])}"  # Make level unique
            #print('level_3', level_3)
            level_4 = f"{level_3},{str(row[code_column])}"
            #print('level_4', level_4)      

            resource_type1 = get_resource_type(row[code_column])  # Custom function to get resource type

            if resource_type1 == 'ICD':
                level_1 = f"{'ICD'}, {level_4}"
                level_0.append((f"{'FHIR'}, {level_1}"))
                #print('level_0', level_0)

            if resource_type1 == 'OPS':
                level_1 = f"{'OPS'}, {level_4}"
                level_0.append((f"{'FHIR'}, {level_1}"))
                #print('level_0', level_0)

            if resource_type1 == 'LOINC':
                level_1 = f"{'LOINC'}, {level_4}"
                level_0.append((f"{'FHIR'}, {level_1}"))
                #print('level_0', level_0)

        return level_0

    # Get node structure for each DataFrame
    icd_level_0 = build_hierarchy_and_get_pairs(icd_df, 'ICD_CODE', 'KAPITEL_CODE', 'GRUPPE_CODE')
    ops_level_0 = build_hierarchy_and_get_pairs(ops_df, 'OPS_CODE', 'KAPITEL_CODE', 'GRUPPE_CODE')  # Adjust column names if necessary
    loinc_level_0 = build_hierarchy_and_get_pairs(loinc_df, 'LOINC_CODE', 'LOINC_SYSTEM', 'LOINC_PROPERTY')  # Adjust column names if necessary


    # Initialize a list to store new rows
    new_rows = []

    # level 0
    new_rows.append({'Code1':'FHIR' , 'Code2':'ICD' , 'Weight': len(icd_level_0), 'level': 0, 'ResourceType':'ICD'})
    new_rows.append({'Code1':'FHIR' , 'Code2':'OPS' , 'Weight': len(ops_level_0), 'level': 0, 'ResourceType':'OPS'})
    new_rows.append({'Code1':'FHIR' , 'Code2':'LOINC' , 'Weight': len(loinc_level_0), 'level': 0, 'ResourceType':'LOINC'})

    # For level 1 rows, we'll calculate the counts (weights) for the connections
    # Level 1 for ICD
    # Level 1 - Split the 3rd item (index 2) in icd_level_0
    icd_items = [item.split(',')[2] for item in icd_level_0]
    icd_item_counts = Counter(icd_items)

    # Iterate over each unique ICD level 1 item and its count
    for item, count in icd_item_counts.items():
        # Add a row for each level 1 ICD item
        new_rows.append({'Code1': 'ICD', 'Code2': 'icd'+item, 'Weight': count, 'level': 1, 'ResourceType':'ICD',
                        'Displays': 'ICD'})

        # Level 2 - Split the 4th item (index 3) for level 1 connections
        icd_items1 = [lvl_0_item.split(',')[3] for lvl_0_item in icd_level_0 if lvl_0_item.split(',')[2] == item]
        icd_item_counts1 = Counter(icd_items1)

        for item1, count1 in icd_item_counts1.items():
            new_rows.append({
                            'Code1': 'icd' + item,  # Ensure the code is prefixed with 'icd'
                            'Code2': item1,          # Level 2 ICD code
                            'Weight': count1,        # Count for this item
                            'level': 2,              # Specify level
                            'ResourceType': 'ICD',   # Set resource type
                            'Displays': get_display_label(item, 2, 'ICD')  # Fetch display label or group name
                        })
            print(item, 2, 'ICD')

            # Level 3 - Split the 5th item (index 4) for level 2 connections
            icd_items2 = [lvl_0_item.split(',')[4] for lvl_0_item in icd_level_0 if lvl_0_item.split(',')[3] == item1]
            icd_item_counts2 = Counter(icd_items2)

            for item2, count2 in icd_item_counts2.items():
                new_rows.append({
                            'Code1': item1,  # Ensure the code is prefixed with 'icd'
                            'Code2': item2,          # Level 2 ICD code
                            'Weight': count2,        # Count for this item
                            'level': 3,              # Specify level
                            'ResourceType': 'ICD',   # Set resource type
                            'Displays': get_display_label(item1, 3, 'ICD')  # Fetch display label or group name
                        })
            print(item1, 3, 'ICD')

    # OPS Level 1 - Split the 3rd item (index 2) in ops_level_0
    ops_items = [item.split(',')[2] for item in ops_level_0]
    ops_item_counts = Counter(ops_items)

    # Iterate over each unique OPS level 1 item and its count
    for item, count in ops_item_counts.items():
        # Add a row for each level 1 OPS item
        new_rows.append({'Code1': 'OPS', 'Code2': 'ops'+item, 'Weight': count, 'level': 1, 'ResourceType':'OPS',
                        'Displays': 'OPS'})

        # OPS Level 2 - Split the 4th item (index 3) for level 1 connections
        ops_items1 = [lvl_0_item.split(',')[3] for lvl_0_item in ops_level_0 if lvl_0_item.split(',')[2] == item]
        ops_item_counts1 = Counter(ops_items1)

        for item1, count1 in ops_item_counts1.items():
            new_rows.append({
                            'Code1': 'ops' + item,  # Ensure the code is prefixed with 'icd'
                            'Code2': item1,          # Level 2 ICD code
                            'Weight': count1,        # Count for this item
                            'level': 2,              # Specify level
                            'ResourceType': 'OPS',   # Set resource type
                            'Displays': get_display_label(item, 2, 'OPS')  # Fetch display label or group name
                        })

            # OPS Level 3 - Split the 5th item (index 4) for level 2 connections
            ops_items2 = [lvl_0_item.split(',')[4] for lvl_0_item in ops_level_0 if lvl_0_item.split(',')[3] == item1]
            ops_item_counts2 = Counter(ops_items2)

            for item2, count2 in ops_item_counts2.items():
                new_rows.append({
                            'Code1': item1,  # Ensure the code is prefixed with 'icd'
                            'Code2': item2,          # Level 2 ICD code
                            'Weight': count2,        # Count for this item
                            'level': 3,              # Specify level
                            'ResourceType': 'OPS',   # Set resource type
                            'Displays': get_display_label(item1, 3, 'OPS')  # Fetch display label or group name
                        })

                # LOINC Level 1 - Split the 3rd item (index 2) in loinc_level_0
    loinc_items = [item.split(',')[2] for item in loinc_level_0]
    loinc_item_counts = Counter(loinc_items)

    # Iterate over each unique LOINC level 1 item and its count
    for item, count in loinc_item_counts.items():
        # Add a row for each level 1 LOINC item
        new_rows.append({'Code1': 'LOINC', 'Code2': item, 'Weight': count, 'level': 1, 'ResourceType':'LOINC',
                        'Displays': 'LOINC'})

        # LOINC Level 2 - Split the 4th item (index 3) for level 1 connections
        loinc_items1 = [lvl_0_item.split(',')[3] for lvl_0_item in loinc_level_0 if lvl_0_item.split(',')[2] == item]
        loinc_item_counts1 = Counter(loinc_items1)

        for item1, count1 in loinc_item_counts1.items():
            # Add a row for each level 2 LOINC item
            new_rows.append({'Code1': item, 'Code2': item1, 'Weight': count1, 'level': 2, 'ResourceType':'LOINC',
                            'Displays':item})

            # LOINC Level 3 - Split the 5th item (index 4) for level 2 connections
            loinc_items2 = [lvl_0_item.split(',')[4] for lvl_0_item in loinc_level_0 if lvl_0_item.split(',')[3] == item1]
            loinc_item_counts2 = Counter(loinc_items2)

            for item2, count2 in loinc_item_counts2.items():
                # Add a row for each level 3 LOINC item
                new_rows.append({'Code1': item1, 'Code2': item2, 'Weight': count2, 'level': 3, 'ResourceType':'LOINC',
                                'Displays':item1})


    # Convert the new_rows list into a DataFrame
    new_entries_df = pd.DataFrame(new_rows)
    

    new_pairs_df = pd.concat([pairs_df, new_entries_df], ignore_index=True)
    print('new_pairs_df', new_pairs_df)

    new_pairs_df = new_pairs_df.drop_duplicates(subset=['Code1', 'Code2', 'Weight','level'])

#################################################################################################################


    # Fill the Displays column
    flat_df['Displays'] = flat_df.apply(
        lambda row: get_display_label(row['Codes'], 4, row['ResourceType']),
        axis=1
    )
    # Apply transformation only to rows with 'ICD' or 'OPS'
    flat_df.loc[flat_df['ResourceType'].isin(['ICD', 'OPS']), 'Displays'] = \
        flat_df.loc[flat_df['ResourceType'].isin(['ICD', 'OPS']), 'Displays'].apply(lambda x: ': '.join(x.split(':')[1:]).strip())

    flat_df['Displays'] = flat_df['Displays'].astype(str)

    flat_df['Full_Displays'] = flat_df['Displays']  # Store the full text
    flat_df['Displays'] = flat_df['Displays'].str.slice(0, 11) + '...'
    print('flat_df', flat_df)


    ICD_df = flat_df[flat_df['ResourceType'] == 'ICD']
    LOINC_df = flat_df[flat_df['ResourceType'] == 'LOINC']
    OPS_df = flat_df[flat_df['ResourceType'] == 'OPS']



    co_occurrence_matrices = {
        'Main': create_co_occurrence_matrix(flat_df),
        'ICD': create_co_occurrence_matrix(ICD_df),
        'LOINC': create_co_occurrence_matrix(LOINC_df),
        'OPS': create_co_occurrence_matrix(OPS_df)
    }


    # Include new_pairs_df in the returned data
    return {
        'success': True,
        'message': 'Data is loaded.',
        'data': {
            'flat_df': flat_df.to_dict(),
            'co_occurrence_matrices': co_occurrence_matrices,
            'new_pairs_df': new_pairs_df.to_dict()  # Ensure this is returned
        }
    }