In [1]:
import os
import json
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import plotly.graph_objs as go
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import dash
from dash import dash_table
from dash.dash_table.Format import Group
from dash import dcc
from dash import html
from dash.dependencies import Input, Output, State, ClientsideFunction, MATCH, ALL, ClientsideFunction, Output, Input
from dash.exceptions import PreventUpdate
import dash_bootstrap_components as dbc
#from navbar_tabs_layout import app_layout
import time
from datetime import datetime, date, time, timedelta
from dateutil.relativedelta import relativedelta
from flask_caching import Cache
import webbrowser as web
from threading import Timer
from pandasai import PandasAI
from pandasai.llm.openai import OpenAI

In [2]:
#web.open_new_tab('http://127.0.0.1:8090/')
def open_browser(port=8080):
    """Open the locally served dashboard in the default web browser.

    Args:
        port: Port on localhost to open. Defaults to 8080, the value that
            used to be hard-coded here, so existing no-argument calls behave
            exactly as before.
    """
    web.open_new("http://localhost:{}".format(port))

In [3]:
# Dash application instance using the Cerulean Bootstrap stylesheet; a
# viewport meta tag is registered through `meta_tags`.
app = dash.Dash(
    __name__,
    meta_tags=[{"name": "viewport", "content": "width=device-width"}],
    external_stylesheets=[dbc.themes.CERULEAN],
)
app.title = 'Dashboard'

In [4]:
# Flask-Caching: filesystem-backed cache attached to the Flask server that
# underlies the Dash app.
cache = Cache(
    app.server,
    config={
        'CACHE_TYPE': 'filesystem',
        'CACHE_DIR': 'cache-directory',
        'CACHE_DEFAULT_TIMEOUT': 24 * 60 * 60,  # 24 hours, expressed in seconds
    },
)
# dcc.Store component with id 'local', using browser local storage.
store = dcc.Store(storage_type='local', id='local')

In [5]:
# Empty the cache every time this cell runs, so values stored under the
# 'data' key by an earlier run are discarded.
cache.clear()

True

In [6]:
def preprocess_data(df):
    """Clean the merged recruitment log and add derived per-application columns.

    Args:
        df: DataFrame containing at least the columns application_id,
            candidate_id, stage_name, status, source_id, job_id, referrer_id,
            rejection_reason_type_id, rejection_reason_id, applied_at,
            entered_on, exited_on and isDelete.

    Returns:
        A new DataFrame (the argument is not modified) with:
        * rows whose stage is 'Interview 1 - F2F - delete' or whose isDelete
          is not 0 removed;
        * id columns cast to str / numeric and the three date columns parsed;
        * the derived columns entry_log_wait, length_per_application_stage,
          length_per_application, unique_stages_per_application_candidate,
          sum_applications_per_candidate, month, year and exit_flag;
        * stage_name spelling variants and interview sub-types normalised;
        * rows sorted by application, candidate and the date columns.
    """
    df = df.astype({'application_id': str, 'candidate_id': str, 'stage_name': str, 'status': str})
    numeric_columns = ['source_id', 'job_id', 'referrer_id', 'rejection_reason_type_id', 'rejection_reason_id']
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
    # applied_at is parsed as well: it is subtracted from entered_on and read
    # through `.dt` below, which fails when the column arrives as text.
    for column in ('applied_at', 'entered_on', 'exited_on'):
        df[column] = pd.to_datetime(df[column])

    # Only keep clean data. `.copy()` gives an independent frame so the column
    # assignments below do not write into a slice of the unfiltered frame.
    keep = (df['stage_name'] != 'Interview 1 - F2F - delete') & (df['isDelete'] == 0)
    df = df[keep].copy()

    # Wait time (whole days) between applying and entering the stage.
    df['entry_log_wait'] = (df['entered_on'] - df['applied_at']).dt.days
    # Days spent in each stage, and days from the first entered_on to the last
    # exited_on of the application. 'first'/'last' follow the current row
    # order within each application.
    df['length_per_application_stage'] = (df['exited_on'] - df['entered_on']).dt.days
    df['length_per_application'] = (
        df.groupby('application_id')['exited_on'].transform('last')
        - df.groupby('application_id')['entered_on'].transform('first')
    ).dt.days
    # Number of distinct stage names per (candidate, application).
    # NOTE(review): this is counted before stage names are normalised below,
    # so spelling variants count as separate stages - confirm that is intended.
    df['unique_stages_per_application_candidate'] = (
        df.groupby(['candidate_id', 'application_id'])['stage_name'].transform('nunique')
    )
    # Number of distinct applications per candidate.
    df['sum_applications_per_candidate'] = df.groupby('candidate_id')['application_id'].transform('nunique')
    df['month'] = df['applied_at'].dt.month
    df['year'] = df['applied_at'].dt.year

    # Flag the row with the latest exited_on of every (candidate, application).
    # Missing exited_on values sort last, so such a row is the one flagged.
    # A stable sort makes ties resolve to the row that appears last in the frame.
    last_rows = (
        df.sort_values('exited_on', kind='stable')
        .groupby(['candidate_id', 'application_id'], sort=False)
        .tail(1)
        .index
    )
    df['exit_flag'] = ''
    df.loc[last_rows, 'exit_flag'] = 'exit'

    # Normalise stage names. No replacement value is itself a key, so a single
    # mapping is equivalent to applying the groups one after another.
    df['stage_name'] = df['stage_name'].replace({
        # spelling variants
        'Recruiter aplication': 'Recruiter application',
        'Recruiter applicaton': 'Recruiter application',
        # interview sub-types collapsed into their round
        'Interview 1 - Phone/Video': 'Interview 1',
        'Interview 1 - F2F': 'Interview 1',
        'Interview 2 - Phone/Video': 'Interview 2',
        'Interview 2 - F2F': 'Interview 2',
        'Interview 3 - F2F': 'Interview 3',
        'Interview 3 - Phone/Video': 'Interview 3',
        'Application Review': 'Hiring Manager application',
    })
    return df.sort_values(['application_id', 'candidate_id', 'applied_at', 'entered_on', 'exited_on'])

In [7]:
def load_current(path='Case_Recruitment_Dataset.xlsx'):
    """Read both worksheets of the recruitment workbook and join them.

    Args:
        path: Location of the Excel workbook. Defaults to the file name that
            used to be hard-coded, so no-argument calls are unchanged.

    Returns:
        DataFrame made by merging sheets 'dataset 1' and 'dataset 2' on
        application_id, reduced to the columns the dashboard uses and sorted
        by application, candidate and the date columns. No cleaning is applied.
    """
    # Passing the path (instead of an `open(...)` handle that was never
    # closed) lets pandas manage the file, and requesting both sheets in one
    # call opens the workbook once. A list of sheet names yields a dict.
    sheets = pd.read_excel(path, sheet_name=['dataset 1', 'dataset 2'])
    merged = pd.merge(sheets['dataset 1'], sheets['dataset 2'], on="application_id")
    columns = [
        'candidate_id', 'application_id', 'stage_name', 'entered_on', 'exited_on',
        'status', 'rejection_reason_type_id', 'rejection_reason_id', 'job_id', 'applied_at',
        'source_id', 'referrer_id', 'isDelete',
    ]
    return merged[columns].sort_values(
        ['application_id', 'candidate_id', 'applied_at', 'entered_on', 'exited_on']
    )

In [8]:
def load_cleaned():
    """Return the dataset from load_current() after running preprocess_data() on it."""
    return preprocess_data(load_current())

In [9]:
@app.callback(Output('local', 'data'),
              Output("loading-fetch-data", "children"),
              Input('fetch-data-button', 'n_clicks'))
def fetch_data(n_clicks):
    """Provide the dataset as JSON for the 'local' store plus a status element.

    Args:
        n_clicks: Click count of 'fetch-data-button'. None or 0 means no click
            has happened yet.

    Returns:
        A ``(json_payload, status_children)`` pair.
        * No click yet: the cached payload if one exists; otherwise the
          uncleaned dataset from load_current(), which is then cached.
        * After a click: the cleaned dataset from load_cleaned(), cached under
          the same key, with a success message.
        On failure the payload is '' and the status explains what went wrong.
    """
    if not n_clicks:
        stored_data = cache.get('data')
        if stored_data is not None:
            # Data from the cache if it is there.
            return stored_data, ''

        not_found = html.Div([
            html.P('Dataset not found.')
        ])
        try:
            raw_df = load_current()
        except FileNotFoundError:
            # A missing workbook is reported with the same message as an empty
            # one instead of surfacing as an unhandled callback error.
            app.server.logger.exception('Dataset file could not be opened')
            return '', not_found
        if raw_df.empty:
            # Checked before caching so an empty result is not stored and
            # served on later page loads.
            return '', not_found

        payload = raw_df.to_json(date_format='iso', orient='split')
        cache.set('data', payload)
        return payload, ''

    try:
        clean_df = load_cleaned()
        payload = clean_df.to_json(date_format='iso', orient='split')
        cache.set('data', payload)
    except Exception:
        # Still best-effort for the UI, but the traceback is now logged rather
        # than discarded.
        app.server.logger.exception('Preprocessing failed')
        return '', html.Div([
            html.P('Preprocessing not completed', style={'color': 'red'})
        ])
    return payload, html.Div([
        html.P('Data cleaned successfully')
    ])

In [10]:
@app.callback(Output('table-container', 'children'),
              [Input('fetch-data-button', 'n_clicks'),
               State('local', 'data')])
def update_data(n_clicks, data):
    """Render a one-row table showing the last row of the current dataset.

    Args:
        n_clicks: Click count of 'fetch-data-button' (None or 0 before a click).
        data: Current content of the 'local' store. Not used: the payload is
            obtained by calling fetch_data() directly.

    Returns:
        A dbc.Container holding a heading and a DataTable, or None when
        fetch_data() returned no payload.

    NOTE(review): an unconditional ``return None`` used to sit in front of the
    table construction, leaving it unreachable. It has been removed on the
    assumption that it was a debugging leftover - confirm.
    """
    from io import StringIO

    json_resp = fetch_data(1 if n_clicks else 0)[0]
    if not json_resp:
        # fetch_data() signals failure with an empty payload; nothing to show.
        return None

    # Convert the JSON payload to a frame. The text is wrapped in StringIO
    # because passing a literal JSON string to read_json is deprecated.
    df = pd.read_json(StringIO(json_resp), orient='split')
    last_row = df.tail(1)

    return dbc.Container([
        html.H6('Last row of dataset'),
        dash_table.DataTable(
            id='table',
            columns=[{"name": i, "id": i} for i in last_row.columns],
            data=last_row.to_dict('records'),
            style_cell={'textAlign': 'center'},
            style_header={
                'backgroundColor': 'rgb(230, 230, 230)',
                'fontWeight': 'bold'
            },
            style_data_conditional=[
                {
                    'if': {'row_index': 'odd'},
                    'backgroundColor': 'rgb(248, 248, 248)'
                }
            ]
        )
    ])


In [11]:
def load_cached_data():
    """Return the dataset as a DataFrame, preferring the cached JSON copy.

    Returns:
        The frame decoded from the JSON stored under cache key 'data', or the
        result of load_current() when nothing usable is cached.
    """
    from io import StringIO

    cached_data = cache.get('data')
    if not cached_data:
        return load_current()

    # The JSON text is wrapped in StringIO because passing a literal string to
    # read_json is deprecated. The date columns are named explicitly: by
    # default read_json only converts columns whose names look date-like, so
    # `applied_at` came back as datetime while `entered_on` / `exited_on`
    # stayed text. Callers that re-parse them with pd.to_datetime still work.
    return pd.read_json(
        StringIO(cached_data),
        orient='split',
        convert_dates=['applied_at', 'entered_on', 'exited_on'],
    )

In [12]:
#result layout 
search_container = dbc.Container(
    [
        # Query row: free-text input next to the button that submits it.
        dbc.Row([
            dbc.Col(
                dcc.Input(
                    id="input",
                    type="text",
                    placeholder="Ex: what is the average length of each unique stage_name based on exited_on and entered_on, ordered by length",
                    n_submit=1,
                    style={'width': '100%'},
                ),
                width=9,
            ),
            dbc.Col(
                dbc.Button('Search', id='search-button', className='btn-primary'),
                width=2,
            ),
        ]),
        # Result row: starts hidden; the search_output callback writes the
        # container's style and the graph's figure.
        dbc.Row([
            dbc.Col(
                dcc.Graph(id='output-graph'),
                id='output-graph-container',
                width=12,
                style={'display': 'none'},
            )
        ]),
    ],
    fluid=True,
    style={'padding': '2rem'},
)

In [13]:
def _text_figure(message):
    """Return an otherwise empty figure that shows `message` as centred text."""
    fig = go.Figure()
    fig.update_layout(
        xaxis={'visible': False},
        yaxis={'visible': False},
        annotations=[{
            'text': str(message),
            'xref': 'paper',
            'yref': 'paper',
            'x': 0.5,
            'y': 0.5,
            'showarrow': False,
        }],
    )
    return fig


@app.callback(
    Output("output-graph-container", "style"),
    Output("output-graph", "figure"),
    Input("search-button", "n_clicks"),
    State("input", "value")
)
def search_output(n_clicks, input_value):
    """Run the typed question through PandasAI and display the answer.

    Args:
        n_clicks: Click count of 'search-button'; None until the first click.
        input_value: Text currently in the 'input' box.

    Returns:
        ``(style, figure)`` for the result container and its graph. The style
        makes the container visible. The figure is the PandasAI result when
        that is a Plotly figure, otherwise a figure showing the result as text.

    Raises:
        PreventUpdate: when the button has not been clicked or no question
            was entered.
    """
    if n_clicks is None or not input_value:
        raise PreventUpdate

    visible = {'display': 'block'}

    # The API key is read from the environment instead of living in the
    # notebook. The key that used to be hard-coded here has been exposed and
    # should be revoked.
    api_token = os.environ.get("OPENAI_API_KEY")
    if not api_token:
        return visible, _text_figure("OPENAI_API_KEY is not set; the query cannot be run.")

    df = load_cached_data()
    pandas_ai = PandasAI(OpenAI(api_token=api_token))
    result = pandas_ai.run(df, prompt=input_value)

    if isinstance(result, go.Figure):
        return visible, result
    # This output is a Graph `figure`, which cannot take an html.Div; a
    # non-figure answer is therefore rendered as text inside a figure.
    return visible, _text_figure(result)

In [14]:
# Today's date as a 'YYYY-MM-DD' string, evaluated once when this cell runs.
# NOTE(review): this rebinds `date`, which the import cell also imports from
# `datetime`; after this line the name refers to the string, not the class.
date = datetime.now().date().isoformat()

In [15]:
navbar = dbc.Navbar(
    dbc.Container([
        dbc.Row(
            [
                # Logo image, brand text and the module-level `date` string.
                dbc.Col(
                    html.Img(src="images.jpg", height="20px"),
                    width="auto",
                    align="center",
                ),
                dbc.Col(
                    dbc.NavbarBrand("Recruitment Use Case", className="ml-2"),
                    width="auto",
                    align="center",
                ),
                dbc.Col(
                    html.Div(children=date, id="date"),
                    width="auto",
                    align="center",
                ),
                # Button used as Input by the data callbacks, followed by the
                # loading wrapper whose children fetch_data() overwrites.
                dbc.Col(
                    [
                        dbc.Button(
                            'Add Pipeline Scenario',
                            id='fetch-data-button',
                            n_clicks=0,
                            className="mr-2",
                        ),
                        dcc.Loading(
                            id="loading-fetch-data",
                            type="default",
                            children=[
                                dbc.Button(
                                    id='query-status-button',
                                    children='',
                                    style={'width': 'auto', 'margin-left': '10px'},
                                )
                            ],
                        ),
                        html.Div(id='status'),
                    ],
                    width="auto",
                    align="end",
                    style={'display': 'flex', 'align-items': 'center'},
                ),
                # Hidden element with id 'page-load'.
                html.Div(children='page-load', id='page-load', style={'display': 'none'}),
            ],
            align="center",
            className="my-row",
        ),
        dbc.NavbarToggler(id="navbar-toggler"),
    ]),
    sticky="top",
    color="light",
    dark=False,
)

In [16]:
top_level = dbc.Container(
    [
        # One row with a single card: the objectives text above the graph that
        # update_high_level_table() fills.
        dbc.Row([
            dbc.Col(
                dbc.Card(
                    [
                        dbc.CardHeader("Department Statistics"),
                        dbc.CardBody([
                            html.P(
                                "OBJECTIVES : "
                                "✔️ REDUCE FAILED HIRE RATE "
                                "✔️ REDUCE COST PER HIRE "
                                "✔️ IMPROVE CANDIDATE EXPERIENCE "
                                "✔️ IMPROVE TALENT POOL",
                                style={"font-size": "16px", "margin-top": "15px"},
                            ),
                            dcc.Graph(
                                id='high-level-table',
                                config={'displayModeBar': False},
                            ),
                        ]),
                    ],
                    className="rounded-0 border-0",
                )
            )
        ]),
        # A commented-out "Influential Stage Analysis" row used to follow here;
        # the live card with the 'correlation-heatmap' graph is in page_layout.
    ],
    fluid=True,
    style={'padding': '2rem 2rem 8rem 2rem'},
)


In [17]:
def _summarise_by_status(df):
    """Build the per-status summary frame, ending with a totals/max row.

    Expects `entered_on` and `exited_on` to already be datetime columns.
    """
    df = df.assign(
        # Days from the first entered_on to the last exited_on of an application.
        length_per_application=(
            df.groupby('application_id')['exited_on'].transform('last')
            - df.groupby('application_id')['entered_on'].transform('first')
        ).dt.days,
        # Distinct stage names per (candidate, application).
        unique_stages_per_application_candidate=(
            df.groupby(['candidate_id', 'application_id'])['stage_name'].transform('nunique')
        ),
        # Distinct applications per candidate.
        sum_applications_per_candidate=(
            df.groupby('candidate_id')['application_id'].transform('nunique')
        ),
    )

    high_level = df.groupby('status').agg({
        'candidate_id': 'nunique',
        'application_id': 'nunique',
        'sum_applications_per_candidate': 'median',
        'unique_stages_per_application_candidate': 'median',
        'length_per_application': 'mean',
    })
    high_level.columns = ['candidates', 'applications', 'applications per candidate (mid)',
                          'stages per application (mid)', 'application length in days (avg)']
    high_level['application length in days (avg)'] = high_level['application length in days (avg)'].round(0)
    high_level['% candidates'] = (high_level['candidates'] / high_level['candidates'].sum() * 100).round(2)
    high_level['% applications'] = (high_level['applications'] / high_level['applications'].sum() * 100).round(2)
    high_level = high_level.sort_values('candidates', ascending=False)

    # Last row: overall distinct counts, and the maximum of each per-row
    # metric (not a total) for the median/average columns.
    grand_total_row = pd.DataFrame({
        'candidates': df['candidate_id'].nunique(),
        '% candidates': '',
        'applications': df['application_id'].nunique(),
        '% applications': '',
        'applications per candidate (mid)': df['sum_applications_per_candidate'].max(),
        'stages per application (mid)': df['unique_stages_per_application_candidate'].max(),
        'application length in days (avg)': df['length_per_application'].max(),
    }, index=['Total or Max (if avg)'])

    high_level = pd.concat([high_level, grand_total_row])
    return high_level[['candidates', '% candidates', 'applications', '% applications',
                       'applications per candidate (mid)', 'stages per application (mid)',
                       'application length in days (avg)']]


def _cell_font_colors(table, rules):
    """Return one colour list per displayed column, index column first.

    A cell is 'red' when its column has a rule in `rules` and the rule returns
    True for the value; every other cell is 'black'.
    """
    n_rows = len(table)
    colors = [['black'] * n_rows]  # the leading 'status' (index) column
    for column_name in table.columns:
        is_high = rules.get(column_name)
        if is_high is None:
            colors.append(['black'] * n_rows)
        else:
            colors.append(['red' if is_high(value) else 'black' for value in table[column_name]])
    return colors


def generate_high_level_table():
    """Create the Plotly table figure summarising the dataset per status.

    Returns:
        go.Figure with one table: a row per status (most candidates first)
        plus a final 'Total or Max (if avg)' row. Values above the thresholds
        below are shown in red.
    """
    df = load_cached_data()
    df['exited_on'] = pd.to_datetime(df['exited_on'])
    df['entered_on'] = pd.to_datetime(df['entered_on'])

    high_level = _summarise_by_status(df)

    # Thresholds above which a value is highlighted.
    formatting_rules = {
        'applications per candidate (mid)': lambda value: value > 4,
        'stages per application (mid)': lambda value: value > 6,
        'application length in days (avg)': lambda value: value > 30,
    }
    # The table shows 1 + len(columns) columns, and colour lists are matched
    # to columns by position. There must be exactly one list per column:
    # supplying only the three rule columns coloured the first three table
    # columns instead of the ones the rules refer to.
    text_color = _cell_font_colors(high_level, formatting_rules)

    table_trace = go.Table(
        header=dict(values=['status'] + list(high_level.columns)),
        cells=dict(
            values=[high_level.index] + [high_level[col] for col in high_level.columns],
            line_color='darkslategray',
            align='left',
            font=dict(color=text_color),
            height=30
        )
    )

    figure = go.Figure(data=[table_trace])
    figure.update_layout(
        autosize=False,
        height=500,
        width=1200
    )
    return figure

In [18]:
# def generate_high_level_table():
#     df = load_cached_data()
#     #length per application_id and refined per status
#     df['exited_on'] = pd.to_datetime(df['exited_on'])
#     df['entered_on'] = pd.to_datetime(df['entered_on'])
    
#     df['length_per_application_stage'] = (df['exited_on'] - df['entered_on']).dt.days
#     df['length_per_application'] = (df.groupby('application_id')['exited_on'].transform('last') - df.groupby('application_id')['entered_on'].transform('first')).dt.days

#     df['length_per_application_stage'] = (df['exited_on'] - df['entered_on']).dt.days
#     df['length_per_application'] = (df.groupby('application_id')['exited_on'].transform('last') - df.groupby('application_id')['entered_on'].transform('first')).dt.days
#     #total number of unique stage_name per application_id per candidate_id
#     df['unique_stages_per_application_candidate'] = df.groupby(['candidate_id', 'application_id'])['stage_name'].transform('nunique')
#     #sum of different application_id per candidate_id
#     df['sum_applications_per_candidate'] = df.groupby('candidate_id')['application_id'].transform('nunique')
#     df['month'] = df['applied_at'].dt.month
#     df['year'] = df['applied_at'].dt.year
#     df['exit_flag'] = ''
#     for _, group in df.groupby(['candidate_id', 'application_id']):
#         sorted_group = group.sort_values('exited_on')
#         last_row_index = sorted_group.index[-1]
#         df.at[last_row_index, 'exit_flag'] = 'exit'
#     # High Level calculations
#     high_level = df.groupby('status').agg({
#         'candidate_id': 'nunique',
#         'application_id': 'nunique',
#         'sum_applications_per_candidate': 'median',
#         'unique_stages_per_application_candidate': 'median',
#         'length_per_application': 'mean',
#         #'stage_name': lambda x: x.mode()[0]
#         #'prev_stage_clean': lambda x: x.mode()[0]
#     })
#     high_level.columns = ['candidates', 'applications', 'applications per candidate (mid)', 
#                           'stages per application (mid)', 'application length in days (avg)']#, 'most common previous stage']
#     high_level = high_level.sort_values('candidates', ascending = False)
#     high_level['application length in days (avg)'] = high_level['application length in days (avg)'].round(0)
    
#     #percentage
#     high_level['% candidates'] = high_level['candidates'] / high_level['candidates'].sum() * 100
#     high_level['% applications'] = high_level['applications'] / high_level['applications'].sum() * 100
    
#     high_level['% candidates'] = high_level['% candidates'].round(2)
#     high_level['% applications'] = high_level['% applications'].round(2)
#     high_level = high_level.sort_values('candidates', ascending=False)
    
#     grand_total_values = {
#         'candidates': df['candidate_id'].nunique(),
#         '% candidates':'',
#         'applications': df['application_id'].nunique(),
#         '% applications':'',
#         'applications per candidate (mid)': df['sum_applications_per_candidate'].max(),
#         'stages per application (mid)': df['unique_stages_per_application_candidate'].max(),
#         'application length in days (avg)': df['length_per_application'].max(),
#         #'most common previous stage': ''
#     }
#     grand_total_row = pd.DataFrame(grand_total_values, index=['Total or Max (if avg)'])
    
#     high_level = pd.concat([high_level, grand_total_row])
#     high_level = high_level[['candidates', '% candidates','applications','% applications', 'applications per candidate (mid)', 
#                           'stages per application (mid)', 'application length in days (avg)']]#, 'most common previous stage']]

    
#     table_trace = go.Table(
#         header=dict(values=['status'] + list(high_level.columns)),
#         cells=dict(values=[high_level.index] + [high_level[col] for col in high_level.columns], height=30)
#     )
    
#     figure = go.Figure(data=[table_trace])
    
#     figure.update_layout(
#         autosize=False,
#         height=500,
#         width=1200
#     )
    
#     return figure


In [19]:
@app.callback(
    Output('high-level-table', 'figure'),
    Input('high-level-table', 'clickData'),
)
def update_high_level_table(click_data):
    """Supply the figure for the 'high-level-table' graph.

    `click_data` only serves as the trigger for this callback; its value is
    not read. The figure is rebuilt by generate_high_level_table() each time.
    """
    summary_figure = generate_high_level_table()
    return summary_figure

In [20]:
#data = load_cached_data()

In [21]:
page_layout = dbc.Container(
    [
        # Row 1: Sankey graph of the recruitment stages.
        dbc.Row([
            dbc.Col(
                dbc.Card(
                    [
                        dbc.CardHeader("Recruitment Stages Graph"),
                        dbc.CardBody([dcc.Graph(id='sankey-graph')]),
                    ],
                    className="rounded-0 border-0",
                )
            )
        ]),
        # Row 2: stage picker, its duration text and the backward/forward tables.
        dbc.Row([
            dbc.Col(
                dbc.Card(
                    [
                        dbc.CardHeader("Stage analysis"),
                        dbc.CardBody([
                            dbc.Row(
                                [
                                    dbc.Col(
                                        dcc.Dropdown(
                                            id='stage-dropdown',
                                            options=[],
                                            placeholder="Select a stage",
                                            style={'width': '100%'},
                                        ),
                                        width=6,
                                    ),
                                ],
                                className="mb-3",
                            ),
                            dbc.Row([dbc.Col(html.Div(id='stage-duration-output'))]),
                            dbc.Row([
                                dbc.Col(html.Div(id='backward-table-output')),
                                dbc.Col(html.Div(id='forward-table-output')),
                            ]),
                        ]),
                    ],
                    className="rounded-0 border-0",
                )
            )
        ]),
        # Row 3: duration graph with a selector between all and hired applications.
        dbc.Row([
            dbc.Col(
                dbc.Card(
                    [
                        dcc.Graph(id='duration-graph'),
                        dcc.RadioItems(
                            id='data-selector',
                            value='all',
                            options=[
                                {'label': 'All Applications', 'value': 'all'},
                                {'label': 'Only Hired Applications', 'value': 'selected'},
                            ],
                        ),
                    ],
                    className="rounded-0 border-0",
                )
            )
        ]),
        # Row 4: text list of high-duration stages plus the distribution graph.
        dbc.Row([
            dbc.Col(
                dbc.Card(
                    [
                        dbc.CardHeader("Stages with High Durations until 'Hired'"),
                        dbc.CardBody([
                            html.Div(id='high-duration-stages'),
                            dcc.Graph(id='distribution'),
                        ]),
                    ],
                    className="rounded-0 border-0",
                )
            )
        ]),
        # Row 5: correlation heatmap.
        dbc.Row([
            dbc.Col(
                dbc.Card(
                    [
                        dbc.CardHeader("Influential Stage Analysis"),
                        dbc.CardBody([dcc.Graph(id='correlation-heatmap')]),
                    ],
                    className="rounded-0 border-0",
                )
            )
        ]),
    ],
    fluid=True,
    style={'padding': '2rem 2rem 8rem 2rem'},
)


In [22]:
@app.callback(
    Output('sankey-graph', 'figure'),
    Input('fetch-data-button', 'n_clicks')
)
def update_sankey_graph(n_clicks):
    """Draw a Sankey diagram of stage-to-next-stage transitions.

    Args:
        n_clicks: Click count of 'fetch-data-button'. Only used as the
            trigger; the data comes from load_cached_data().

    Returns:
        go.Figure with one node per stage name and one link per observed
        (stage, next stage) pair, weighted by how many times that transition
        occurs. "Next" follows the row order within each (candidate,
        application) group.
    """
    df = load_cached_data()
    next_stage = df.groupby(['candidate_id', 'application_id'])['stage_name'].shift(-1)

    # Count every transition. Pairs used to be de-duplicated before counting,
    # which made every link value exactly 1.
    transitions = (
        pd.DataFrame({'source_stage': df['stage_name'], 'target_stage': next_stage})
        .dropna(subset=['source_stage', 'target_stage'])
        .groupby(['source_stage', 'target_stage'], sort=False)
        .size()
        .reset_index(name='count')
    )

    # One shared label -> node index mapping for both ends of a link. Source
    # and target used to be numbered from two separate category sets while the
    # labels used a third ordering, so links pointed at the wrong nodes.
    labels = df['stage_name'].dropna().unique().tolist()
    node_index = {label: position for position, label in enumerate(labels)}

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            label=labels,
        ),
        link=dict(
            source=transitions['source_stage'].map(node_index),
            target=transitions['target_stage'].map(node_index),
            value=transitions['count'],
        )
    )])

    fig.update_layout(
        font=dict(size=12),
        height=600,
        width=800,
    )

    return fig



In [23]:
def generate_data(stage_name):
    """Build the dropdown options and the analysis elements for one stage.

    Args:
        stage_name: Stage chosen in the dropdown, or None to fall back to the
            alphabetically first stage.

    Returns:
        A 5-tuple ``(options, value, duration_info, backward_table,
        forward_table)``:
        * options: dropdown options, one per distinct stage name, sorted;
        * value: the stage that was analysed;
        * duration_info: html.Div with the average stage duration;
        * backward_table: html.Table with the share of each previous stage;
        * forward_table: html.Table with the share of each next stage and the
          average time spent in the selected stage before moving there.
        When the dataset has no stage names, value and both tables are None.
    """
    df = load_cached_data()
    stage_names = sorted(df['stage_name'].dropna().unique().tolist())
    options = [{'label': stage, 'value': stage} for stage in stage_names]

    if stage_name is not None:
        stage = stage_name
    elif stage_names:
        stage = stage_names[0]
    else:
        # Nothing to analyse; avoids indexing into an empty list.
        return options, None, html.Div([html.P("No stage data available.")]), None, None

    # Neighbouring stages within each (candidate, application), by row order.
    stages_by_application = df.groupby(['candidate_id', 'application_id'])['stage_name']
    df['next_stage_clean'] = stages_by_application.shift(-1)
    df['prev_stage_clean'] = stages_by_application.shift(1)

    # Substring match as before, but literal: stage names are data, not
    # regular expressions. `.copy()` gives a frame that can safely take new
    # columns.
    analysis = df[df['stage_name'].str.contains(stage, regex=False, na=False)].copy()
    backward = analysis['prev_stage_clean'].value_counts(normalize=True).sort_values(ascending=False)
    forward = analysis['next_stage_clean'].value_counts(normalize=True).sort_values(ascending=False)

    analysis['exited_on'] = pd.to_datetime(analysis['exited_on'])
    analysis['entered_on'] = pd.to_datetime(analysis['entered_on'])
    time_in_stage = analysis['exited_on'] - analysis['entered_on']

    # Average duration per matched stage name; the figure shown is the mean of
    # those per-stage averages.
    analysis['duration'] = time_in_stage.dt.days
    avg_duration = analysis.groupby('stage_name')['duration'].mean()

    duration_info = html.Div([
        html.P(f"Average Duration: {avg_duration.mean():.2f} days"),
    ])

    # Average time in the selected stage, split by the stage that follows.
    avg_remaining_durations = time_in_stage.groupby(analysis['next_stage_clean']).mean()

    def _format_days(next_stage):
        """Whole-day average for `next_stage`, or 'N/A' when it is undefined."""
        average = avg_remaining_durations.get(next_stage)
        if pd.isna(average):
            return "N/A"
        return f"{average.days:.2f} days"

    backward_table = html.Table(
        [
            html.Thead(html.Tr([html.Th('Previous Stage'), html.Th('Count')])),
            html.Tbody([
                html.Tr([html.Td(prev_stage), html.Td(f"{share:.2%}")])
                for prev_stage, share in backward.items()
            ])
        ],
        className='table'
    )

    forward_table = html.Table(
        [
            html.Thead(html.Tr([html.Th('Next Stage'), html.Th('Count'), html.Th('Avg Remaining Duration')])),
            html.Tbody([
                html.Tr([
                    html.Td(next_stage),
                    html.Td(f"{forward[next_stage]:.2%}"),
                    html.Td(_format_days(next_stage)),
                ])
                for next_stage in forward.index
            ])
        ],
        className='table'
    )

    return options, stage, duration_info, backward_table, forward_table


In [24]:
#update values
@app.callback(
    Output('stage-dropdown', 'options'),
    Output('stage-dropdown', 'value'),
    Output('stage-duration-output', 'children'),
    Output('backward-table-output', 'children'),
    Output('forward-table-output', 'children'),
    Input('stage-dropdown', 'value'),
    Input('fetch-data-button', 'n_clicks'),
)
def update_stage_dropdown(value, n_clicks):
    """Refresh the stage dropdown and the three stage-analysis outputs.

    `value` is the dropdown selection; `n_clicks` only triggers the callback.
    The five results of generate_data() are passed on in the order it returns
    them, which is the order of the Outputs declared above.
    """
    first, second, third, fourth, fifth = generate_data(value)
    return first, second, third, fourth, fifth

In [25]:
def calculate_high_duration_stages():
    """Average whole-day stage durations for applications that reached 'Hired'.

    Considers only applications that have more than one distinct stage name
    and include a 'Hired' stage. For each of their stages except 'Hired', the
    mean of (exited_on - entered_on) is taken and truncated to whole days.

    Returns:
        dict mapping stage name to average days, ordered from longest to
        shortest. Stages whose average is undefined are left out.
    """
    log = load_cached_data()
    log['entered_on'] = pd.to_datetime(log['entered_on'])
    log['exited_on'] = pd.to_datetime(log['exited_on'])

    # Applications with several distinct stages that contain a 'Hired' row.
    has_several_stages = log.groupby('application_id')['stage_name'].transform('nunique') > 1
    hired_ids = log.loc[has_several_stages & (log['stage_name'] == 'Hired'), 'application_id']
    hired_log = log[log['application_id'].isin(hired_ids)]

    time_in_stage = hired_log['exited_on'] - hired_log['entered_on']
    days_by_stage = {}
    # sort=False keeps stages in order of first appearance, which decides the
    # order of equal averages after the stable sort below.
    for stage, lengths in time_in_stage.groupby(hired_log['stage_name'], sort=False):
        mean_length = lengths.mean()
        if stage != 'Hired' and pd.notna(mean_length):
            days_by_stage[stage] = mean_length.days

    ranked = sorted(days_by_stage.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)


In [26]:
@app.callback(
    Output('high-duration-stages', 'children'),
    Input('high-duration-stages', 'children')  # Use any input, can be a dummy element
)
def update_high_duration_stages(dummy_input):
    """List the three stages with the highest average duration.

    Args:
        dummy_input: Current children of the target element; only present so
            the callback has an Input. Its value is not read.

    Returns:
        A list of html.Div elements, one per stage, in the order produced by
        calculate_high_duration_stages() (longest first).
    """
    high_duration_stages = calculate_high_duration_stages()
    top_stages = list(high_duration_stages.items())[:3]
    # One element per entry: a single string joined with '\n' is collapsed
    # onto one line when rendered as HTML.
    return [
        html.Div(f"{rank}. Stage: {stage}, Avg Duration: {duration:.2f} days")
        for rank, (stage, duration) in enumerate(top_stages, start=1)
    ]

In [27]:
@app.callback(
    Output('distribution', 'figure'),
    Input('fetch-data-button', 'n_clicks')
)
def distrib_application_per_stage(n_clicks):
    """Funnel chart of how many distinct stages applications had, per year.

    Args:
        n_clicks: Click count of 'fetch-data-button'. Only used as the
            trigger; the data comes from load_cached_data().

    Returns:
        go.Figure with one funnel trace per distinct-stage count found in the
        data; x is the percentage of rows of each application year having
        that count.

    NOTE(review): percentages are computed over log rows, not over distinct
    applications, so applications with more rows weigh more - confirm that
    this is intended.
    """
    df = load_cached_data()
    # Parsed explicitly so `.dt` works even when the column arrives as text.
    df['year'] = pd.to_datetime(df['applied_at']).dt.year
    df['unique_stages_per_application_candidate'] = (
        df.groupby(['candidate_id', 'application_id'])['stage_name'].transform('nunique')
    )
    no_log = (
        df.groupby('year')['unique_stages_per_application_candidate']
        .value_counts(normalize=True)
        .reset_index(name='normalized_count')
    )
    no_log['normalized_count'] = (no_log['normalized_count'] * 100).round(2)

    # Stage counts and years are taken from the data instead of the fixed
    # range 1..10 and the fixed ticks 2018-2020.
    stage_counts = sorted(int(count) for count in no_log['unique_stages_per_application_candidate'].unique())
    years = sorted(int(year) for year in no_log['year'].unique())

    fig = go.Figure()

    for stage in stage_counts:
        stage_data = no_log[no_log['unique_stages_per_application_candidate'] == stage]
        stage_data = stage_data.sort_values('year')

        fig.add_trace(go.Funnel(
            name=f'{stage} stage before exit',
            y=stage_data['year'].unique(),
            x=stage_data['normalized_count'],
            textposition='inside',
            hoverinfo='text',
            opacity=0.75,
            marker=dict(line=dict(width=1, color='gray')),
            orientation='h',
        ))

    fig.update_layout(
        title='Distribution of Applications (%) per Stages in the Process over the Years - Key Clickable',
        showlegend=True,
        height=500,
        width=800,
        xaxis=dict(title='Applications that Ended at This Stage'),
        yaxis=dict(title='Year', tickmode='array', tickvals=years),
        hoverlabel=dict(font=dict(size=12)),
    )

    return fig


In [28]:
#duration with radio btn
import plotly.graph_objs as go

@app.callback(
    Output('duration-graph', 'figure'),
    Input('duration-graph', 'clickData'),
    Input('data-selector', 'value'),
    Input('fetch-data-button', 'n_clicks')
)
def update_duration_graph(click_data, data_selector, n_clicks):
    """Draw the average duration (in days) of every stage as a bar chart.

    Args:
        click_data: Plotly ``clickData`` of the graph itself. When a point was
            clicked, the bars are restricted to applications that passed
            through the clicked stage.
        data_selector: when equal to ``'selected'``, only applications with
            more than one distinct stage that reached 'Hired' are kept, and
            their per-stage averages are overlaid as red markers.
        n_clicks: click count of the fetch button; only triggers the refresh,
            its value is not used.

    Returns:
        plotly.graph_objects.Figure: the bar chart, optionally with the
        marker overlay.
    """
    df = load_cached_data()
    # The duration column must exist on every invocation, including the
    # initial call where n_clicks is None, because the aggregations below
    # read it unconditionally.
    df['entered_on'] = pd.to_datetime(df['entered_on'])
    df['exited_on'] = pd.to_datetime(df['exited_on'])
    df['duration'] = (df['exited_on'] - df['entered_on']).dt.days

    all_stages_avg_durations = df.groupby('stage_name')['duration'].mean()

    if data_selector == 'selected':
        refined_df = df[df.groupby('application_id')['stage_name'].transform('nunique') > 1]
        refined_df = refined_df[refined_df['stage_name'] == 'Hired']
        df = df[df['application_id'].isin(refined_df['application_id'])]
        selected_stages_avg_durations = df.groupby('stage_name')['duration'].mean()
    else:
        selected_stages_avg_durations = None

    # .get() also covers a clickData payload whose 'points' list is empty.
    if click_data and click_data.get('points'):
        clicked_stage = click_data['points'][0]['x']
        filtered_df = df[df['application_id'].isin(df[df['stage_name'] == clicked_stage]['application_id'])]
        avg_durations = filtered_df.groupby('stage_name')['duration'].mean()
    else:
        avg_durations = all_stages_avg_durations

    fig = px.bar(
        avg_durations,
        x=avg_durations.index,
        y=avg_durations.values,
        labels={'x': 'Stage Name', 'y': 'Average Duration (days)'},
        title='Average Duration of Stages',
    )

    if selected_stages_avg_durations is not None:
        scatter_trace = go.Scatter(
            x=selected_stages_avg_durations.index,
            y=selected_stages_avg_durations.values,
            mode='markers',
            marker=dict(size=10, color='red')
        )
        fig.add_trace(scatter_trace)

    return fig


In [29]:
#corr matrix
def calculate_correlation_matrix(df):
    """Correlate stage/application lengths with the id columns for hired rows.

    Args:
        df: DataFrame with at least the columns ``status``, ``application_id``,
            ``entered_on``, ``exited_on``, ``stage_name``,
            ``rejection_reason_id``, ``job_id``, ``source_id`` and
            ``referrer_id``. It is not modified.

    Returns:
        pandas.DataFrame: pairwise correlation of the numeric columns of
        interest (``stage_name`` is selected but dropped by
        ``numeric_only=True`` when it is non-numeric).
    """
    # Keep only rows whose status is exactly 'Hired'. The explicit copy makes
    # the column assignments below act on an independent frame, which avoids
    # pandas' SettingWithCopyWarning and never touches the caller's data.
    hired_df = df[df['status'] == 'Hired'].copy()

    hired_df['exited_on'] = pd.to_datetime(hired_df['exited_on'])
    hired_df['entered_on'] = pd.to_datetime(hired_df['entered_on'])

    # Days spent in each individual stage row.
    hired_df['length_per_application_stage'] = (hired_df['exited_on'] - hired_df['entered_on']).dt.days

    # Days between an application's first 'entered_on' and last 'exited_on',
    # taken in row order.
    # NOTE(review): this relies on rows being ordered chronologically within
    # each application - confirm against the loader.
    per_application = hired_df.groupby('application_id')
    hired_df['length_per_application'] = (
        per_application['exited_on'].transform('last')
        - per_application['entered_on'].transform('first')
    ).dt.days

    columns_of_interest = ['stage_name', 'length_per_application_stage', 'length_per_application', 'rejection_reason_id', 'job_id', 'source_id', 'referrer_id']

    return hired_df[columns_of_interest].corr(numeric_only=True)


In [30]:
# Define callback to update the correlation heatmap
@app.callback(
    Output('correlation-heatmap', 'figure'),
    Input('fetch-data-button', 'n_clicks')
)
def update_correlation_heatmap(n_clicks):
    """Redraw the correlation heatmap from the cached data.

    Args:
        n_clicks: click count of the fetch button; only triggers the refresh,
            the data is loaded the same way whatever its value.

    Returns:
        plotly.graph_objects.Figure: heatmap of the correlation matrix.
    """
    matrix = calculate_correlation_matrix(load_cached_data())

    # Rows and columns of a correlation matrix carry the same labels.
    axis_labels = matrix.columns
    heatmap = go.Heatmap(
        z=matrix.values,
        x=axis_labels,
        y=axis_labels,
        colorscale='Viridis'
    )

    fig = go.Figure(data=heatmap)
    fig.update_layout(
        title='Correlation Heatmap',
        xaxis_title='Columns',
        yaxis_title='Columns'
    )
    return fig


In [31]:
# "Best Practice Stages" section: a stage dropdown above a Gantt chart, both
# inside a single borderless card.
process = dbc.Container(
    [
        dbc.Row([
            dbc.Col(
                dbc.Card(
                    [
                        dbc.CardHeader("Best Practice Stages"),
                        dbc.CardBody([
                            # Starts with no options and no selection.
                            dcc.Dropdown(id='stage-dropdown2', options=[], value=None),
                            dcc.Graph(id='gantt-chart'),
                        ]),
                    ],
                    className="rounded-0 border-0",
                )
            )
        ]),
    ],
    fluid=True,
    style={'padding': '2rem 2rem 8rem 2rem'},
)

In [32]:
@app.callback(
    Output('stage-dropdown2', 'options'),
    Output('stage-dropdown2', 'value'),
    Output('gantt-chart', 'figure'),
    Input('stage-dropdown2', 'value'),
    Input('fetch-data-button', 'n_clicks')
)
def update_gantt_chart(selected_stage, n_clicks):
    """Show the Gantt chart of the hired application with the shortest chosen stage.

    Among applications that went through more than one distinct stage and
    reached 'Hired', the one whose ``selected_stage`` took the fewest days is
    picked and all of its stages are drawn on a timeline.

    Args:
        selected_stage: stage currently chosen in the dropdown. ``None`` or a
            value that is not among the available stages falls back to the
            first stage in alphabetical order.
        n_clicks: click count of the fetch button; only triggers the refresh,
            its value is not used.

    Returns:
        tuple: ``(options, selected_stage, figure)`` for the dropdown options,
        the dropdown value and the Gantt chart. When there is nothing to plot
        the figure is an empty placeholder carrying an explanatory title.
    """
    def placeholder(message):
        # Empty figure used whenever no application can be plotted.
        empty_fig = go.Figure()
        empty_fig.update_layout(title=message, height=400)
        return empty_fig

    df = load_cached_data()
    # Convert 'entered_on' and 'exited_on' columns to datetime
    df['entered_on'] = pd.to_datetime(df['entered_on'])
    df['exited_on'] = pd.to_datetime(df['exited_on'])
    df['length_per_application_stage'] = (df['exited_on'] - df['entered_on']).dt.days
    refined_df = df[df.groupby('application_id')['stage_name'].transform('nunique') > 1]
    refined_df = refined_df[refined_df['stage_name'] == 'Hired']
    selected_df = df[df['application_id'].isin(refined_df['application_id'])]
    stage_names = sorted(selected_df['stage_name'].unique().tolist())
    options = [{'label': stage, 'value': stage} for stage in stage_names]

    if not stage_names:
        # No hired multi-stage application: there is no stage to default to.
        return options, None, placeholder('No hired applications available')

    # Also replaces a stale selection that is absent from the refreshed data.
    if selected_stage not in stage_names:
        selected_stage = stage_names[0]

    # Shortest occurrence of the selected stage per application; applications
    # whose duration is missing cannot be ranked and are dropped.
    shortest_per_application = (
        selected_df[selected_df['stage_name'] == selected_stage]
        .groupby('application_id')['length_per_application_stage']
        .min()
        .dropna()
    )
    if shortest_per_application.empty:
        return options, selected_stage, placeholder(f'No completed "{selected_stage}" stage to display')

    stages_min = shortest_per_application.idxmin()

    selected_app_df = selected_df[selected_df['application_id'] == stages_min]

    fig = px.timeline(selected_app_df, x_start='entered_on', x_end='exited_on', y='stage_name', color='stage_name')

    fig.update_layout(
        title=f'Gantt Chart of Application {stages_min} Stages',
        xaxis_title='Dates',
        yaxis_title='Stage',
        showlegend=False,
        height=400,
    )

    return options, selected_stage, fig


In [33]:
# layout
# "Predictive" section: input controls above the hiring-probability graph.
predict = dbc.Container(
    [
        dbc.Row([
            dbc.Col(
                dbc.Card(
                    [
                        dbc.CardHeader("Predictive"),
                        dbc.CardBody([
                            html.Div([
                                # Sliders for variables
                                html.Label("Adjust variables:"),
                                dcc.Slider(
                                    id='slider-variable-1',
                                    min=0,
                                    max=1,
                                    step=0.01,
                                    value=0.5,
                                    marks={0: '0', 1: '1'},
                                ),
                                dcc.Dropdown(
                                    id='dropdown-variable-2',
                                    options=[
                                        {'label': 'Option 1', 'value': 'option1'},
                                        {'label': 'Option 2', 'value': 'option2'},
                                        # Add more options for other categories
                                    ],
                                    value='option1',
                                ),
                                # Add more sliders/dropdowns for other variables
                            ]),
                            dcc.Graph(id='hiring-probability'),
                        ]),
                    ],
                    className="rounded-0 border-0",
                )
            )
        ]),
        # Disabled row, kept for reference:
        # dbc.Row([
        #     dbc.Col(
        #         dbc.Card([
        #             dbc.CardHeader("Benchmark designing"),
        #             dbc.CardBody([
        #                 dcc.Graph(id='stage-gantt')
        #             ])
        #         ], className="rounded-0 border-0")
        #     )
        # ]),
    ],
    fluid=True,
    style={'padding': '2rem 2rem 8rem 2rem'},
)


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

def train_logistic_regression_model(df, columns_of_interest):
    """Fit a logistic regression that predicts ``status`` from the given columns.

    Args:
        df: DataFrame holding the feature columns and a ``status`` column.
        columns_of_interest: names of the columns used as features.

    Returns:
        tuple: ``(model, X_test, y_test)`` - the fitted ``LogisticRegression``
        and the held-out 20% of features and targets.
    """
    features, target = df[columns_of_interest], df['status']

    # 80/20 split; the fixed random_state keeps the split reproducible.
    train_features, held_out_features, train_target, held_out_target = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    classifier = LogisticRegression()
    classifier.fit(train_features, train_target)

    return classifier, held_out_features, held_out_target


In [35]:
@app.callback(
    Output('hiring-probability', 'figure'),
    Input('slider-variable-1', 'value'),
    Input('dropdown-variable-2', 'value'),
    # Add more inputs for other sliders/dropdowns
)
def update_hiring_probability(var1_value, var2_value):
    """Return the figure for the hiring-probability graph.

    The prediction itself is not implemented yet: the draft below is entirely
    commented out. Until it is enabled, a placeholder figure is returned so
    the callback produces a valid figure instead of failing on an undefined
    ``fig``.

    Args:
        var1_value: current value of the slider (not used yet).
        var2_value: current value of the dropdown (not used yet).

    Returns:
        plotly.graph_objects.Figure: placeholder figure with an explanatory
        annotation.
    """
    # TODO: enable the draft below once the feature set is settled.
    # # Load your data
    # df = load_cached_data()
    #
    # # Extract the categorical variable for encoding
    # categorical_feature = df[['stage_name']]

    # # Initialize the OneHotEncoder
    # encoder = OneHotEncoder()

    # # Fit and transform the encoder on the categorical feature
    # encoded_stage_name = encoder.fit_transform(categorical_feature)

    # # Retrieve the feature names from the encoder
    # encoded_feature_names = encoder.get_feature_names_out(['stage_name'])

    # # Convert the encoded result to a DataFrame
    # encoded_stage_name_df = pd.DataFrame(encoded_stage_name.toarray(), columns=encoded_feature_names)

    # # Concatenate the encoded result with your original data
    # df_encoded = pd.concat([df, encoded_stage_name_df], axis=1)

    # # Get the column names from your DataFrame
    # df_columns = df_encoded.columns.tolist()

    # # Define your columns of interest using all columns from df_encoded
    # columns_of_interest = df_columns
    # #columns_of_interest = ['stage_name_Application Review', 'stage_name_Application Stage', 'stage_name_Face to Face – fly in candidate', 'stage_name_Hired', 'stage_name_Hiring Manager Review', 'stage_name_Interview 1 - F2F', 'stage_name_Interview 1 - F2F - delete', 'stage_name_Interview 1 - Phone/Video', 'stage_name_Interview 2 - F2F', 'stage_name_Interview 2 - Phone/Video', 'stage_name_Interview 3 - F2F', 'stage_name_Interview 3 - Phone/Video', 'stage_name_Offer', 'stage_name_Recruiter Screening', 'stage_name_Recruiter aplication', 'stage_name_Recruiter application', 'stage_name_Recruiter applicaton', 'stage_name_Screened', 'stage_name_Test:SHL', 'stage_name_Test:Take Home']

    # #model, X_test, y_test = train_logistic_regression_model(df, columns_of_interest)
    # model, X_test, y_test = train_logistic_regression_model(df_encoded, columns_of_interest)

    # # Predict the hiring probability for the new data point
    # prob_hired = model.predict_proba([[var1_value, var2_value] + [0] * (len(df_columns) - 2)])[:, 1]

    # fig = go.Figure(go.Indicator(
    #     mode="number+gauge+delta",
    #     value=prob_hired[0],
    #     title={'text': "Predicted Hiring Probability"},
    #     delta={'reference': 0.5, 'position': "bottom"},
    #     gauge={
    #         'axis': {'range': [0, 1]},
    #         'bar': {'color': "darkblue"},
    #         'bgcolor': "white",
    #         'borderwidth': 2,
    #         'bordercolor': "gray",
    #         'steps': [
    #             {'range': [0, 0.3], 'color': "red"},
    #             {'range': [0.3, 0.7], 'color': "yellow"},
    #             {'range': [0.7, 1], 'color': "green"},
    #         ],
    #         'threshold': {
    #             'line': {'color': "black", 'width': 4},
    #             'thickness': 0.75,
    #             'value': 0.5
    #         }
    #     }
    # ))

    # Placeholder: no probability is shown rather than a made-up number.
    fig = go.Figure()
    fig.update_layout(
        title={'text': "Predicted Hiring Probability"},
        xaxis={'visible': False},
        yaxis={'visible': False},
        annotations=[dict(
            text="Prediction model not available yet",
            x=0.5,
            y=0.5,
            xref='paper',
            yref='paper',
            showarrow=False,
        )],
    )

    return fig


In [36]:
# # Define callback to update the hiring probability
# @app.callback(
#     Output('hiring-probability', 'children'),
#     Input('slider-variable-1', 'value'),
#     # Add more inputs for other sliders
# )
# def update_hiring_probability(var1_value, ...):
#     df = load_cached_data()
#     columns_of_interest = ['stage_name', 'length_per_application_stage', 'length_per_application', 'rejection_reason_id', 'job_id', 'source_id', 'referrer_id']
    
#     model, X_test, y_test = train_logistic_regression_model(df, columns_of_interest)
#     # Predict the hiring probability for the new data point
#     prob_hired = model.predict_proba(new_data)[:, 1]
    
#     return f"Predicted Hiring Probability: {prob_hired[0]:.2f}"


In [37]:
#card_link = dbc.CardLink("Open Slack", href="https://yourteam.slack.com/channels/<username>")
# "Share Dashboard Outputs" section: a heading row followed by a card holding
# the Slack and e-mail share buttons.
card = dbc.Container(
    [
        # Heading, half width and centred through the column offset.
        dbc.Row(
            dbc.Col(
                html.H4("Share Dashboard Outputs"),
                width={"size": 6, "offset": 3},
            ),
            className="mb-4",
        ),
        # Caption (8 of 12 columns) next to the two buttons (2 columns each).
        dbc.Row(
            dbc.Col(
                dbc.Card(
                    dbc.CardBody([
                        dbc.Row(
                            [
                                dbc.Col(html.P("Share with a team member"), width=8),
                                dbc.Col(
                                    dbc.Button(
                                        "Share via Slack",
                                        id="slack-button",
                                        color="primary",
                                        className="mr-2",
                                    ),
                                    width=2,
                                ),
                                dbc.Col(
                                    dbc.Button(
                                        "Share via Email",
                                        id="email-button",
                                        color="info",
                                    ),
                                    width=2,
                                ),
                            ],
                            align="center",
                        ),
                    ])
                )
            ),
            className="mb-4",
        ),
    ],
    fluid=True,
    style={'padding': '2rem 2rem 8rem 2rem'},
)


In [38]:
@app.callback(
    Output("slack-button", "href"),
    Output("email-button", "n_clicks"),
    Input("slack-button", "n_clicks"),
    Input("email-button", "n_clicks"),
)
def share_outputs_via_slack_and_email(slack_n_clicks, email_n_clicks):
    """Provide the Slack link and echo the e-mail button's click count.

    This is where the logic that actually sends a message via Slack or e-mail
    would go; at present nothing is sent.

    Args:
        slack_n_clicks: click count of the Slack button (not used).
        email_n_clicks: click count of the e-mail button, returned unchanged.

    Returns:
        tuple: ``(slack_href, email_n_clicks)``.
    """
    return "https://teamname.slack.com/channels/<username>", email_n_clicks

In [39]:
# Page layout: the client-side store first, then the sections in the order
# they are listed here.
app.layout = html.Div(
    children=[
        store,
        navbar,
        html.Div(id='table-container'),
        top_level,
        search_container,
        page_layout,
        process,
        predict,
        card,
    ],
    style={'padding': '0rem 0rem 4rem 0rem'},
)


In [40]:
if __name__ == '__main__':
    # The timer calls open_browser on its own thread one second after it is
    # started; the server is then launched on port 8080 with the reloader off.
    browser_timer = Timer(1, open_browser)
    browser_timer.start()
    app.run_server(debug=True, use_reloader=False, port=8080)