In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns 

import plotly.express as px
import plotly.graph_objects as go
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

# Load Dataset

In [3]:
# Read the survey export. The salary header in the CSV carries surrounding
# spaces (' SalaryUSD '), so it is renamed to 'SalaryUSD' right after loading.
df = (
    pd.read_csv('Data_Professional_Salary_Survey_Responses.csv')
    .rename(columns={' SalaryUSD ': 'SalaryUSD'})
)
df

Unnamed: 0,Survey Year,Timestamp,SalaryUSD,Country,PostalCode,PrimaryDatabase,YearsWithThisDatabase,OtherDatabases,EmploymentStatus,JobTitle,...,NewestVersionInProduction,OldestVersionInProduction,PopulationOfLargestCityWithin20Miles,EmploymentSector,LookingForAnotherJob,CareerPlansThisYear,Gender,OtherJobDuties,KindsOfTasksPerformed,Counter
0,2021,12/10/2020 8:22:43,65000,Sweden,,Microsoft SQL Server,4,Microsoft SQL Server,Full time employee,"Developer: Business Intelligence (SSRS, PowerB...",...,SQL Server 2016,SQL Server 2016,"<= 20,000 (town)",Private business,Not Asked,"Stay with the same employer, same role",Male,Not Asked,Not Asked,1
1,2021,12/10/2020 8:23:22,145000,United States,76063,Microsoft SQL Server,15,Azure SQL DB (any flavor),Full time employee,DBA (Production Focus - build & troubleshoot s...,...,SQL Server 2019,SQL Server 2014,1M+ (metropolis),Private business,Not Asked,"Stay with the same employer, same role",Male,Not Asked,Not Asked,1
2,2021,12/10/2020 8:23:38,105000,United States,43240,Microsoft SQL Server,12,"PostgreSQL, Azure SQL DB (any flavor)",Full time employee,DBA (General - splits time evenly between writ...,...,SQL Server 2017,SQL Server 2008R2,300K-1M (large city),Private business,Not Asked,"Stay with the same employer, same role",Male,Not Asked,Not Asked,1
3,2021,12/10/2020 8:23:48,46482,United Kingdom,,Microsoft SQL Server,10,Azure SQL DB (any flavor),Full time employee,DBA (Production Focus - build & troubleshoot s...,...,SQL Server 2019,SQL Server 2012,300K-1M (large city),"Education (K-12, college, university)",Not Asked,"Stay with the same employer, same role",Male,Not Asked,Not Asked,1
4,2021,12/10/2020 8:24:04,98800,United States,468,Microsoft SQL Server,5,DB2,Full time employee,"Developer: Business Intelligence (SSRS, PowerB...",...,SQL Server 2019,SQL Server 2012,100K-299K (city),Private business,Not Asked,"Stay with the same employer, same role",Male,Not Asked,Not Asked,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10336,2017,1/14/2017 15:50:24,36549,United Kingdom,Not Asked,Microsoft SQL Server,3,,Full time employee,DBA,...,Not Asked,Not Asked,Not Asked,Private business,No,Not Asked,Not Asked,Not Asked,Not Asked,1
10337,2017,1/15/2017 1:01:21,65000,Saudi Arabia,Not Asked,Microsoft SQL Server,7,"Oracle, MySQL/MariaDB, Microsoft Access, SQLite",Full time employee,DBA,...,Not Asked,Not Asked,Not Asked,Private business,"Yes, but only passively (just curious)",Not Asked,Not Asked,Not Asked,Not Asked,1
10338,2017,1/15/2017 3:25:47,85000,Netherlands,Not Asked,Microsoft SQL Server,17,Oracle,Full time employee,Other,...,Not Asked,Not Asked,Not Asked,Private business,"Yes, but only passively (just curious)",Not Asked,Not Asked,Not Asked,Not Asked,1
10339,2017,1/15/2017 3:37:35,90000,United States,Not Asked,Microsoft SQL Server,8,MySQL/MariaDB,Full time employee,DBA,...,Not Asked,Not Asked,Not Asked,Private business,"Yes, actively looking for something else",Not Asked,Not Asked,Not Asked,Not Asked,1


# Count of Missing Values (NaN) per Column

In [4]:
# Number of NaN cells in each column of the freshly loaded frame.
df.isna().sum()

Survey Year                                0
Timestamp                                  0
SalaryUSD                                  0
Country                                    0
PostalCode                              1803
PrimaryDatabase                            0
YearsWithThisDatabase                      0
OtherDatabases                          1944
EmploymentStatus                           0
JobTitle                                   0
ManageStaff                                0
YearsWithThisTypeOfJob                     0
HowManyCompanies                           0
OtherPeopleOnYourTeam                      0
CompanyEmployeesOverall                    0
DatabaseServers                            0
Education                                  0
EducationIsComputerRelated              1212
Certifications                             0
HoursWorkedPerWeek                         0
TelecommuteDaysPerWeek                     0
NewestVersionInProduction                 14
OldestVers

# Data Cleaning:

#### Replace the 'Not Asked' placeholder with np.nan

In [5]:
# Placeholder strings that should be treated as missing data.
missing_val = ['Not Asked']
# Swap every placeholder for NaN so pandas' missing-value tooling sees it.
df = df.replace(to_replace=missing_val, value=np.nan)

#### Count of NaNs after the replacement

In [6]:
# Number of NaN cells per column once 'Not Asked' has been converted to NaN.
df.isna().sum()

Survey Year                                0
Timestamp                                  0
SalaryUSD                                  0
Country                                    0
PostalCode                              4701
PrimaryDatabase                            0
YearsWithThisDatabase                      0
OtherDatabases                          1944
EmploymentStatus                           0
JobTitle                                   0
ManageStaff                                0
YearsWithThisTypeOfJob                     0
HowManyCompanies                        5992
OtherPeopleOnYourTeam                      0
CompanyEmployeesOverall                 7715
DatabaseServers                         1747
Education                               3470
EducationIsComputerRelated              4682
Certifications                          3470
HoursWorkedPerWeek                      3470
TelecommuteDaysPerWeek                  3470
NewestVersionInProduction               6885
OldestVers

#### Remove columns in which more than 30% of the values are NaN

In [7]:
# Hard-coded list of columns removed from the frame before further cleaning.
df = df.drop(
    columns=[
        'PostalCode',
        'HowManyCompanies',
        'CompanyEmployeesOverall',
        'Education',
        'EducationIsComputerRelated',
        'Certifications',
        'HoursWorkedPerWeek',
        'TelecommuteDaysPerWeek',
        'NewestVersionInProduction',
        'OldestVersionInProduction',
        'PopulationOfLargestCityWithin20Miles',
        'OtherJobDuties',
        'KindsOfTasksPerformed',
        'LookingForAnotherJob',
    ]
)

In [8]:
# Number of NaN cells per column for the columns that are left in `df`.
df.isna().sum()

Survey Year                  0
Timestamp                    0
SalaryUSD                    0
Country                      0
PrimaryDatabase              0
YearsWithThisDatabase        0
OtherDatabases            1944
EmploymentStatus             0
JobTitle                     0
ManageStaff                  0
YearsWithThisTypeOfJob       0
OtherPeopleOnYourTeam        0
DatabaseServers           1747
EmploymentSector             0
CareerPlansThisYear       2898
Gender                    2898
Counter                      0
dtype: int64

#### Fill null values

In [9]:
# These three columns get their gaps filled with the column's most frequent value.
for column in ('OtherDatabases', 'DatabaseServers', 'CareerPlansThisYear'):
    most_common = df[column].mode()[0]
    df[column] = df[column].fillna(most_common)

# Gender is not imputed from the mode: both a missing value and the literal
# string 'None' are mapped to 'Unknown'.
df['Gender'] = df['Gender'].fillna('Unknown').replace(['None'], 'Unknown')

#### Convert to numeric

In [10]:
# Convert SalaryUSD to float64 by stripping thousands separators.
# Going through `astype(str)` first keeps this cell re-runnable: once the
# column is already float, the `.str` accessor alone would raise
# AttributeError. The former trailing `pd.to_numeric` call was a no-op on a
# float column and has been removed.
df['SalaryUSD'] = (
    df['SalaryUSD']
    .astype(str)
    .str.replace(',', '', regex=False)  # literal comma, not a regex pattern
    .str.strip()
    .astype(float)
)
df.head()

Unnamed: 0,Survey Year,Timestamp,SalaryUSD,Country,PrimaryDatabase,YearsWithThisDatabase,OtherDatabases,EmploymentStatus,JobTitle,ManageStaff,YearsWithThisTypeOfJob,OtherPeopleOnYourTeam,DatabaseServers,EmploymentSector,CareerPlansThisYear,Gender,Counter
0,2021,12/10/2020 8:22:43,65000.0,Sweden,Microsoft SQL Server,4,Microsoft SQL Server,Full time employee,"Developer: Business Intelligence (SSRS, PowerB...",Yes,4,,10,Private business,"Stay with the same employer, same role",Male,1
1,2021,12/10/2020 8:23:22,145000.0,United States,Microsoft SQL Server,15,Azure SQL DB (any flavor),Full time employee,DBA (Production Focus - build & troubleshoot s...,No,25,,10,Private business,"Stay with the same employer, same role",Male,1
2,2021,12/10/2020 8:23:38,105000.0,United States,Microsoft SQL Server,12,"PostgreSQL, Azure SQL DB (any flavor)",Full time employee,DBA (General - splits time evenly between writ...,Yes,6,1.0,10,Private business,"Stay with the same employer, same role",Male,1
3,2021,12/10/2020 8:23:48,46482.0,United Kingdom,Microsoft SQL Server,10,Azure SQL DB (any flavor),Full time employee,DBA (Production Focus - build & troubleshoot s...,No,5,,10,"Education (K-12, college, university)","Stay with the same employer, same role",Male,1
4,2021,12/10/2020 8:24:04,98800.0,United States,Microsoft SQL Server,5,DB2,Full time employee,"Developer: Business Intelligence (SSRS, PowerB...",No,5,,10,Private business,"Stay with the same employer, same role",Male,1


# Country Selection Options List

In [11]:
# Group on Country purely to obtain the distinct country names as the index
# of `countries`.
countries = df[['Country']].groupby(['Country']).count()

# Options for the country multi-select dropdown: a "Select All" entry that
# uses the sentinel value -1, followed by one entry per country name.
optionsCountry = [{'label': "Select All", 'value': -1}] + [
    {'label': name, 'value': name} for name in countries.index
]

countries

Albania
Argentina
Armenia
Australia
Austria
...
United States
Uruguay
Vanuatu
Venezuela
Vietnam


### Job Title and Gender selection lists

In [12]:
# Distinct job titles, in order of first appearance (feeds the job dropdown).
jobTitle = pd.unique(df['JobTitle'])

# Distinct gender values, in order of first appearance (feeds the radio buttons).
genderList = pd.unique(df['Gender'])

# Bootstrap Dashboard App

In [13]:
import dash
import dash_bootstrap_components as dbc
#from dash.dependencies import Input, Output
from dash import Input, Output, dcc, html

# The page body is produced by a routing callback, so components such as
# "graphCountry", "Experince", "gender-slider" and "slct_job" are absent from
# the initial layout. `suppress_callback_exceptions=True` tells Dash not to
# reject callbacks that reference those not-yet-rendered ids.
app = dash.Dash(
    external_stylesheets=[dbc.themes.BOOTSTRAP],
    suppress_callback_exceptions=True,
)

# Inline CSS for the sidebar: fixed positioning with zero top/left/bottom
# offsets and a fixed width.
SIDEBAR_STYLE = {
    "position": "fixed",
    **dict.fromkeys(("top", "left", "bottom"), 0),
    "width": "16rem",
    "padding": "2rem 1rem",
    "background-color": "#f8f9fa",
}

# Inline CSS for the main content area: the left margin (18rem) is wider than
# the sidebar (16rem), which places the content to the sidebar's right; the
# remaining entries add spacing.
CONTENT_STYLE = dict(
    zip(
        ("margin-left", "margin-right", "padding"),
        ("18rem", "2rem", "2rem 1rem"),
    )
)

# Sidebar: page navigation links followed by the three filter controls whose
# values are read by the country bar-chart callback.
sidebar = html.Div(
    style=SIDEBAR_STYLE,
    children=[
        html.H2("Sidebar", className="display-4"),
        html.Hr(),
        dbc.Nav(
            [
                dbc.NavLink(text, href=target, active="exact")
                for text, target in (
                    ("Home", "/"),
                    ("Page 1", "/page-1"),
                    ("Page 2", "/page-2"),
                )
            ],
            vertical=True,
            pills=True,
        ),
        html.Hr(),

        # Column used to group the rows of the bar chart.
        html.Label('Group By'),
        dcc.Dropdown(
            id="groupby",
            options=[
                {'label': text, 'value': column}
                for text, column in (
                    ('Country', 'Country'),
                    ('Survey Year', 'Survey Year'),
                    ('Employment Sector', 'EmploymentSector'),
                    ('Employment Status', 'EmploymentStatus'),
                    ('Manage Staff', 'ManageStaff'),
                    ('Gender', 'Gender'),
                    ('Career Plans This Year', 'CareerPlansThisYear'),
                )
            ],
            value='Country',
        ),

        # Aggregation applied to each group.
        html.Label('Operation'),
        dcc.Dropdown(
            id="operation",
            options=[
                {'label': f'{name}()', 'value': name}
                for name in ('sum', 'count')
            ],
            value='sum',
        ),

        # Country filter; the initial value -1 is the "Select All" sentinel.
        html.Label('Country'),
        dcc.Dropdown(
            id="country",
            options=optionsCountry,
            value=-1,
            multi=True,
        ),
    ],
)

# Placeholder that the routing callback fills with the current page.
content = html.Div(id="page-content", style=CONTENT_STYLE)

# Root layout: URL tracker + sidebar + page placeholder.
app.layout = html.Div([dcc.Location(id="url"), sidebar, content])


def _experience_page():
    """Return the children of ``/page-2``: heading, gender radio buttons,
    job-title dropdown and the histogram placeholder.

    The component ids ("gender-slider", "slct_job", "Experince") are kept
    exactly as spelled because the histogram callback refers to them.
    """
    return [
        html.H1('Experince years histogram in a specified job title',
                style={'textAlign': 'center'}),
        html.Br(),
        dcc.RadioItems(
            id="gender-slider",
            value="Female",
            options=[{"label": gender, "value": gender} for gender in genderList],
        ),
        html.Br(),
        dcc.Dropdown(
            id="slct_job",
            options=[{"label": job, "value": job} for job in jobTitle],
            multi=False,
            value="Manager",
        ),
        dcc.Graph(id="Experince"),
    ]


@app.callback(Output("page-content", "children"), [Input("url", "pathname")])
def render_page_content(pathname):
    """Route the current URL path to the matching page content.

    Parameters
    ----------
    pathname : str or None
        Value of the ``pathname`` property of the ``dcc.Location`` component.

    Returns
    -------
    A Dash component (or list of components) placed inside ``page-content``:
    the country graph for ``/``, a placeholder paragraph for ``/page-1``, the
    job-experience view for ``/page-2``, and a 404 panel for anything else.
    """
    if pathname == "/":
        return dcc.Graph(id="graphCountry")
    if pathname == "/page-1":
        return html.P("This is the content of page 1. Yay!")
    if pathname == "/page-2":
        return _experience_page()

    # Unknown path: show a 404 panel. `dbc.Jumbotron` was removed in
    # dash-bootstrap-components 1.0, so the panel is a plain Div styled with
    # Bootstrap utility classes; this works with old and new releases alike.
    return html.Div(
        [
            html.H1("404: Not found", className="text-danger"),
            html.Hr(),
            html.P(f"The pathname {pathname} was not recognised..."),
        ],
        className="p-3 bg-light rounded-3",
    )

@app.callback(Output('graphCountry', 'figure'), [Input("country", "value"),Input("groupby", "value"),Input("operation", "value"),])
def update_figure(val_country, val_groupby, val_operation):
    """Build the bar chart of SalaryUSD aggregated by the chosen column.

    Parameters
    ----------
    val_country : int, str, list or None
        Value of the multi-select country dropdown. It starts as the scalar
        sentinel ``-1`` ("Select All") and becomes a list once the user
        interacts with it; ``None`` or an empty list means nothing is selected.
    val_groupby : str or None
        Column of ``df`` to group by.
    val_operation : str or None
        ``'sum'`` or ``'count'``.

    Returns
    -------
    plotly figure
        Bar chart with one bar per group and the aggregated ``SalaryUSD`` on y.

    Raises
    ------
    dash.exceptions.PreventUpdate
        When the group-by or operation dropdown has been cleared, so the
        previous figure stays on screen instead of the callback failing.
    """
    from dash.exceptions import PreventUpdate

    if val_groupby is None or val_operation not in ('sum', 'count'):
        raise PreventUpdate

    # Normalise the dropdown value to a list: it may be a scalar (initial
    # state), a list (after interaction) or None (cleared).
    if val_country is None:
        selected = []
    elif isinstance(val_country, (list, tuple)):
        selected = list(val_country)
    else:
        selected = [val_country]

    data = df
    # No selection, or "Select All" among the selections, means no filtering.
    if selected and -1 not in selected:
        data = data[data["Country"].isin(selected)]

    # Aggregate only the plotted column; aggregating the whole frame would
    # also sum/concatenate every unrelated (mostly string) column.
    data = data.groupby(val_groupby)["SalaryUSD"].agg(val_operation).reset_index()

    return px.bar(
        data,
        x=val_groupby,
        y="SalaryUSD",
    )

############# Job Experience ###############
@app.callback(
    Output('Experince', 'figure'),
    [Input("gender-slider", "value"), Input("slct_job", "value")],
)
def update_Experince(slctdGender, slctdJob):
    """Return a histogram of ``YearsWithThisTypeOfJob`` for the rows of ``df``
    whose Gender equals ``slctdGender`` and whose JobTitle equals ``slctdJob``.
    """
    # One combined boolean mask instead of two successive filters.
    matches = (df["Gender"] == slctdGender) & (df["JobTitle"] == slctdJob)
    return px.histogram(df[matches], x="YearsWithThisTypeOfJob")
############ End Job Experience ##############
    

# Start the Dash server on port 8888 when this code is executed as the main
# module.
# NOTE(review): 8888 is also the default port of a Jupyter server — confirm
# it is free in the environment where this notebook runs.
if __name__ == "__main__":
    app.run_server(port=8888)


ModuleNotFoundError: No module named 'dash_bootstrap_components'