In [5]:
import pandas as pd
import pandas_profiling
import numpy as np
import json
import plotly.graph_objects as go
import plotly.express as px
from plotly_calplot import calplot
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import timedelta
from datetime import datetime
import calendar
from IPython.display import HTML, display
import dash
from dash import dash_table
from dash.dash_table.Format import Group
from dash import dcc
from dash import html
from dash.dependencies import Input, Output, State, ClientsideFunction, MATCH, ALL, ClientsideFunction, Output, Input
from dash.exceptions import PreventUpdate
# from dash_extensions import Download
# from dash_extensions.snippets import send_data_frame
import dash_bootstrap_components as dbc

In [12]:
# Load both sheets through a single workbook handle. The context manager
# closes the file once both sheets are read (passing open(...) to read_excel
# left two file handles open and parsed the workbook twice).
with pd.ExcelFile('Case_Recruitment_Dataset.xlsx') as workbook:
    df1 = pd.read_excel(workbook, sheet_name='dataset 1')
    df2 = pd.read_excel(workbook, sheet_name='dataset 2')

# Default (inner) merge: only application_ids present in both sheets are kept.
df = pd.merge(df1, df2, on="application_id")

In [13]:
def preprocess_data(df):
    """Cast column types and derive per-application metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns application_id, candidate_id,
        stage_name, status, source_id, job_id, referrer_id,
        rejection_reason_type_id, rejection_reason_id, isDelete,
        applied_at, entered_on and exited_on.

    Returns
    -------
    pandas.DataFrame
        A new frame (``astype`` returns a new object, so the argument is not
        modified) with the derived columns entry_log_wait,
        length_per_application_stage, sum_applications_per_candidate,
        unique_stages_per_application_candidate, month and year, sorted by
        application_id, candidate_id, applied_at, entered_on, exited_on.
    """
    df = df.astype({'application_id': str, 'candidate_id': str, 'stage_name': str, 'status': str})

    # Identifier-like columns: anything that is not a number becomes NaN.
    numeric_columns = ['source_id', 'job_id', 'referrer_id', 'rejection_reason_type_id', 'rejection_reason_id']
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
    df['isDelete'] = df['isDelete'].astype(bool)

    # Parse every timestamp column. applied_at is used in date arithmetic and
    # with .dt below, so it must be datetime as well, not only entered_on and
    # exited_on.
    for column in ('applied_at', 'entered_on', 'exited_on'):
        df[column] = pd.to_datetime(df[column])

    # Whole days between applying and entering the stage
    # (wait time until the application is acknowledged).
    df['entry_log_wait'] = (df['entered_on'] - df['applied_at']).dt.days
    # Whole days spent in the stage; NaN where exited_on is missing.
    df['length_per_application_stage'] = (df['exited_on'] - df['entered_on']).dt.days
    # Number of distinct application_id values per candidate_id.
    df['sum_applications_per_candidate'] = df.groupby('candidate_id')['application_id'].transform('nunique')
    # Number of distinct stage_name values per (candidate_id, application_id).
    df['unique_stages_per_application_candidate'] = (
        df.groupby(['candidate_id', 'application_id'])['stage_name'].transform('nunique')
    )
    df['month'] = df['applied_at'].dt.month
    df['year'] = df['applied_at'].dt.year

    return df.sort_values(['application_id', 'candidate_id', 'applied_at', 'entered_on', 'exited_on'])

In [14]:
# Bind the preprocessed frame to a name so the derived columns can be reused
# by later cells instead of being discarded after display.
df_processed = preprocess_data(df)
df_processed

Unnamed: 0,application_id,entered_on,exited_on,stage_name,candidate_id,applied_at,status,source_id,referrer_id,rejection_reason_type_id,rejection_reason_id,job_id,isDelete,entry_log_wait,length_per_application_stage,sum_applications_per_candidate,unique_stages_per_application_candidate,month,year
0,0x00000A0957632650C26B6CE288D2DC27,2020-12-12 01:10:02,2021-01-12 04:05:52,Application Stage,0xFAE0DFE79B43DC2C7C5B8F5244C89336,2020-12-12 01:10:00,rejected,2,,1.0,8358.0,1520993,False,0,31.0,1,1,12,2020
1,0x000102EA1A28971B920C6E709C157862,2020-09-16 06:02:07,2020-11-23 08:19:10,Application Stage,0x9A5F90D80572267A92ECEC39DF3EBCD2,2020-09-16 06:02:00,rejected,2,,1.0,29203.0,1219812,False,0,68.0,1,1,9,2020
2,0x0003956D92899154617462F79CA6F10B,2020-03-06 03:54:07,2020-03-24 15:04:29,Recruiter application,0xDD174037AAB6E8A2F830F887C77CA551,2020-03-06 03:54:00,rejected,171201,,1.0,29203.0,1232995,False,0,18.0,1,1,3,2020
3,0x00056C23816F8B28F7CF6D4121BFC017,2020-08-12 05:20:08,2020-08-21 10:21:13,Application Stage,0xC4869487F29544266CE09F432FC36CE4,2020-08-12 05:20:00,rejected,171201,,1.0,29203.0,1219812,False,0,9.0,2,1,8,2020
4,0x00067D38BC6FF4483A759CB9153D86AD,2020-12-15 05:05:02,2021-01-12 04:45:00,Application Stage,0x2810621482EB46B67E577EA03D257FA9,2020-12-15 05:05:00,rejected,168017,,1.0,8358.0,1520993,False,0,27.0,2,1,12,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20692,0xFFE2935EF8FAFC90FEFF52C96FEC3F37,2020-01-20 17:38:02,2020-01-30 07:23:42,Recruiter application,0x40442C5A2DF1773A5B40961450E42584,2020-01-20 17:38:00,rejected,171200,,1.0,9222.0,1264110,False,0,9.0,1,1,1,2020
20693,0xFFE4C4C69BDF182F26024D74C34AF925,2020-05-11 10:06:05,NaT,Recruiter application,0xCCAE4EFABED4BCEAA37E5ABE1E56B647,2020-05-11 10:06:00,active,171201,,,,915774,False,0,,2,1,5,2020
20694,0xFFEA13546FB3CE42CA0AF0EAC44E8537,2020-02-04 19:53:04,2020-03-08 13:38:30,Recruiter application,0x4B11DAD1DCC3223A40B7AC94671A327A,2020-02-04 19:53:00,rejected,169394,,1.0,9222.0,563420,False,0,32.0,1,1,2,2020
20695,0xFFF5CC55E0A7AF98EF42B607D94E36EE,2020-03-07 21:57:05,NaT,Recruiter application,0x090F4C5D351B709A5BBFC20B1D795ACD,2020-03-07 21:57:00,active,171201,,,,1280008,False,0,,9,1,3,2020


In [6]:
#web.open_new_tab('http://127.0.0.1:8050/')
def open_browser(port=8050):
    """Open the locally served dashboard in the default web browser.

    Parameters
    ----------
    port : int, optional
        Port of the local server to open; defaults to 8050, the port passed
        to the server in this notebook.
    """
    # Imported here because the notebook's import cell does not provide the
    # module (the name `web` was never defined).
    import webbrowser

    webbrowser.open_new("http://localhost:{}".format(port))

In [7]:
# Dash application styled with the Bootstrap "UNITED" theme; the viewport
# meta tag sets the page width to the device width.
app = dash.Dash(
    __name__,
    meta_tags=[{"name": "viewport", "content": "width=device-width"}],
    external_stylesheets=[dbc.themes.UNITED],
)
# Title of the served page.
app.title = 'Data Visualization Case (Laetitia H.)'

In [10]:
# Current local date as YYYY-MM-DD. Evaluated once, when this cell runs.
today_date = format(datetime.now(), "%Y-%m-%d")

In [11]:
# Sticky top navigation bar: logo, dashboard name and the date computed at
# start-up share one centred row, followed by the navbar toggler.
navbar = dbc.Navbar(
    sticky="top",
    dark=False,
    color="light",
    children=dbc.Container(
        children=[
            dbc.Row(
                align="center",
                className="my-row",
                children=[
                    # Logo image.
                    dbc.Col(
                        children=html.Img(src="images.jpg", height="20px"),
                        width="auto",
                        align="center",
                    ),
                    # Dashboard name.
                    dbc.Col(
                        children=dbc.NavbarBrand(
                            children="Data Visualization Case (Laetitia H.)",
                            className="ml-2",
                        ),
                        width="auto",
                        align="center",
                    ),
                    # Date string held in today_date.
                    dbc.Col(
                        children=html.Div(children=today_date, id="today-date"),
                        width="auto",
                        align="center",
                    ),
                ],
            ),
            dbc.NavbarToggler(id="navbar-toggler"),
        ],
    ),
)

In [15]:
# Page body: a single card holding the Sankey diagram. The graph's figure is
# supplied by the callback that targets the 'sankey-graph' component.
page_layout = dbc.Container(
    [
        dbc.Card(
            [
                # Header describes the card's actual content (it previously
                # carried an unrelated "Sales YtD" label).
                dbc.CardHeader("Application stage flow"),
                dbc.CardBody([dcc.Graph(id='sankey-graph')]),
            ],
            className="rounded-0 border-0",
        ),
    ],
    fluid=True,
    style={'padding': '2rem'},
)

In [23]:
def update_sankey_graph(df):
    """Build the Sankey figure of stage-to-stage transitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed frame containing the columns candidate_id,
        application_id, entered_on, exited_on, stage_name and
        unique_stages_per_application_candidate.

    Returns
    -------
    plotly.graph_objects.Figure
        Sankey diagram with one node per stage name and one link per distinct
        (stage, next stage) pair. A link's value is the number of times that
        transition occurs across all applications.
    """
    ordered = df[df['unique_stages_per_application_candidate'] >= 1].sort_values(
        ['candidate_id', 'application_id', 'entered_on', 'exited_on']
    )
    # One chronologically ordered list of stage names per application.
    stage_sequences = ordered.groupby('application_id')['stage_name'].apply(list)

    # Count each transition once per occurrence, so links are aggregated and
    # weighted instead of being repeated with a constant value of 1.
    link_weights = {}
    for stages in stage_sequences:
        for transition in zip(stages, stages[1:]):
            link_weights[transition] = link_weights.get(transition, 0) + 1

    # Sorted labels give a node order that is stable from run to run.
    labels = sorted({stage for transition in link_weights for stage in transition})
    position = {label: index for index, label in enumerate(labels)}

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            label=labels,
            pad=15,
            thickness=20
        ),
        link=dict(
            source=[position[origin] for origin, _ in link_weights],
            target=[position[destination] for _, destination in link_weights],
            value=list(link_weights.values())
        )
    )])

    fig.update_layout(
        title='Application Process stages in sequence per application from the data given',
        font=dict(size=12)
    )

    return fig


# The layout contains no component with id 'df', and a DataFrame cannot be
# passed through a component property. The callback is therefore triggered by
# the graph's own id (so it fires when the page loads) and reads the
# module-level frame directly.
@app.callback(
    Output('sankey-graph', 'figure'),
    [Input('sankey-graph', 'id')]
)
def _render_sankey_graph(_graph_id):
    """Dash callback: draw the Sankey diagram from the module-level ``df``."""
    return update_sankey_graph(preprocess_data(df))



In [24]:
# Root layout: navigation bar on top, page body underneath.
app.layout = html.Div(
    children=[navbar, page_layout],
    style={'padding': '0rem 0rem 2rem 0rem'},
)

In [25]:
# Start the Dash server on port 8050 when this cell runs as the main module.
if __name__ == '__main__':
    # Disabled: would open the dashboard in a browser after a 1-second delay.
    # NOTE(review): Timer is not imported in the notebook's import cell;
    # presumably threading.Timer — confirm before re-enabling.
    #Timer(1, open_browser).start();
    # debug=True turns on Dash debug mode; use_reloader=False keeps the
    # server in the current process (presumably because it runs inside a
    # notebook kernel — confirm).
    app.run_server(debug=True, use_reloader=False, port=8050)

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app '__main__'
 * Debug mode: on
