In [1]:
import pandas as pd
# Read the Kaggle cybersecurity/infosec salaries CSV and preview the first rows.
df = pd.read_csv('/kaggle/input/global-salaries-in-cybersecurity-infosec/salaries.csv')
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,EX,FT,Information Security Officer,160000,USD,160000,US,100,US,M
1,2023,EX,FT,Information Security Officer,100000,USD,100000,US,100,US,M
2,2023,SE,FT,Security Engineer,247250,USD,247250,US,0,US,M
3,2023,SE,FT,Security Engineer,160000,USD,160000,US,0,US,M
4,2023,SE,FT,Security Engineer,224250,USD,224250,US,0,US,M


In [2]:
# Count the distinct values in each column to see which are low-cardinality categoricals.
df.nunique()

work_year                4
experience_level         4
employment_type          4
job_title              125
salary                 922
salary_currency         21
salary_in_usd         1206
employee_residence      63
remote_ratio             3
company_location        58
company_size             3
dtype: int64

In [3]:
# Print per-column dtypes and non-null counts, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4501 entries, 0 to 4500
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           4501 non-null   int64 
 1   experience_level    4501 non-null   object
 2   employment_type     4501 non-null   object
 3   job_title           4501 non-null   object
 4   salary              4501 non-null   int64 
 5   salary_currency     4501 non-null   object
 6   salary_in_usd       4501 non-null   int64 
 7   employee_residence  4501 non-null   object
 8   remote_ratio        4501 non-null   int64 
 9   company_location    4501 non-null   object
 10  company_size        4501 non-null   object
dtypes: int64(4), object(7)
memory usage: 386.9+ KB


In [4]:
from plotly.express import bar
# Draw one bar chart of value frequencies per column. job_title, salary, and
# salary_in_usd are skipped here and handled by the later plots.
for column in df.columns:
    if column not in {'job_title', 'salary', 'salary_in_usd'}:
        # Pin the frame's column names to (column, 'count') explicitly. A bare
        # value_counts().to_frame().reset_index() only produces a 'count' column
        # on pandas >= 2.0; older versions name the columns 'index' and <column>,
        # which breaks the x=/y= lookups below.
        counts = df[column].value_counts().rename_axis(column).reset_index(name='count')
        # Currency and country columns are dominated by a single value, so a
        # log-scaled y-axis keeps the remaining bars visible.
        bar(data_frame=counts, x=column, y='count',
            log_y=(column in {'salary_currency', 'employee_residence', 'company_location'})).show()

In [5]:
from plotly.express import parallel_categories
# Link pay currency, employee residence, and company location for each record;
# ribbons are colored by the salary in USD.
parallel_categories(
    data_frame=df,
    dimensions=['salary_currency', 'employee_residence', 'company_location'],
    color='salary_in_usd',
    height=800,
)

Almost all of the jobs in this dataset are in the US and pay in USD.

In [6]:
from plotly.express import scatter
# One marker per (job title, experience level) pair, colored by the mean USD salary.
scatter(
    data_frame=(
        df[['job_title', 'experience_level', 'salary_in_usd']]
        .groupby(by=['job_title', 'experience_level'])
        .mean()
        .reset_index()
    ),
    x='experience_level',
    y='job_title',
    color='salary_in_usd',
    height=1200,
)

This chart is a little ungainly, but it captures the diversity of job titles, the range of pay in USD, and the fact that every company wants senior talent.

In [7]:
# Same per-(job title, experience level) means as before, but with salary on the
# y-axis; the job title moves to the hover label.
scatter(
    data_frame=(
        df[['job_title', 'experience_level', 'salary_in_usd']]
        .groupby(by=['job_title', 'experience_level'])
        .mean()
        .reset_index()
    ),
    x='experience_level',
    y='salary_in_usd',
    color='salary_in_usd',
    hover_name='job_title',
)

If we plot the same data with different dimensions, we get a sense of how experience levels are compensated relative to one another across job titles.

In [8]:
from plotly.express import violin
# Salary distribution per experience level on the raw rows, split by company size.
violin(
    data_frame=df,
    x='experience_level',
    y='salary_in_usd',
    color='company_size',
    hover_name='job_title',
)

A violin plot shows us the same information at a lower level of detail, but it allows us to break out company size.

In [9]:
from plotly.express import strip
# Mean USD salary per (job title, experience level, company size), drawn as one
# facet column per company size.
strip(
    data_frame=(
        df[['job_title', 'experience_level', 'company_size', 'salary_in_usd']]
        .groupby(by=['job_title', 'experience_level', 'company_size'])
        .mean()
        .reset_index()
    ),
    x='experience_level',
    y='salary_in_usd',
    facet_col='company_size',
    hover_name='job_title',
    stripmode='overlay',
)

This plot breaks out the mean salary by company size and experience level. Unfortunately, strip plots don't use continuous colors, so we're stuck with a monochrome plot.

In [10]:
from plotly.express import strip
# Same per-(job title, experience level, company size) means, laid out with
# salary on the x-axis, company size on the y-axis, and color for experience level.
strip(
    data_frame=(
        df[['job_title', 'experience_level', 'company_size', 'salary_in_usd']]
        .groupby(by=['job_title', 'experience_level', 'company_size'])
        .mean()
        .reset_index()
    ),
    x='salary_in_usd',
    y='company_size',
    color='experience_level',
    hover_name='job_title',
    stripmode='overlay',
    height=800,
)

We can only use color for one of our categorical variables, and that can't be job title because there are too many distinct titles. Since we also have only one continuous variable (salary), our options for displaying the data are limited.