## List Tables
Exploring the tables in the 'Data Nerd Jobs' data connection.

In [None]:
\list

## Explore 'keywords_all' Column
Analyzing the 'keywords_all' column in the 'data_nerd_jobs' table.

In [None]:
SELECT
  keyword.element,      -- one output row per distinct keyword value
  COUNT(*) AS count     -- number of (posting, keyword) pairs for that value
FROM
  `public_job_listings.data_nerd_jobs`
CROSS JOIN
  UNNEST(keywords_all.list) AS keyword   -- fan each posting out to one row per list entry
GROUP BY
  keyword.element
ORDER BY
  count DESC
LIMIT 10

## Plot Median Salary for Top 10 Skills
Visualizing the median salary for the top 10 skills with salary data.

In [None]:
WITH posting_skills AS (
  -- One row per (posting, skill) pair, restricted to postings that report a yearly salary.
  SELECT
    kw.element AS skill,
    salary_year
  FROM `public_job_listings.data_nerd_jobs`
  CROSS JOIN UNNEST(keywords_all.list) AS kw
  WHERE salary_year IS NOT NULL
),
most_common_skills AS (
  -- The ten skills that occur in the most salaried postings.
  SELECT skill
  FROM posting_skills
  GROUP BY skill
  ORDER BY COUNT(*) DESC
  LIMIT 10
),
skill_stats AS (
  -- Both measures are window functions, so every input row carries its skill's
  -- median and count; the outer DISTINCT collapses them to one row per skill.
  SELECT
    skill,
    PERCENTILE_CONT(salary_year, 0.5) OVER (PARTITION BY skill) AS median_salary,
    COUNT(salary_year) OVER (PARTITION BY skill) AS salary_count
  FROM posting_skills
  WHERE skill IN (SELECT skill FROM most_common_skills)
)
SELECT DISTINCT
  skill,
  median_salary,
  salary_count
FROM skill_stats
ORDER BY median_salary DESC

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Dark theme: seaborn's grid style first, then matplotlib's dark palette on top.
sns.set(style='darkgrid')
plt.style.use('dark_background')

# Skill summary frame; this cell reads its `skill` and `median_salary` columns.
# NOTE(review): presumably the result of the top-10 SQL cell above — confirm the variable name.
df = sql_df_fkcs

# Bars run from the highest median salary (top) to the lowest (bottom).
skills_by_salary = df.sort_values('median_salary', ascending=False)['skill']

plt.figure(figsize=(10, 8))
salary_plot = sns.barplot(
    data=df,
    x='median_salary',
    y='skill',
    order=skills_by_salary,
    palette='Blues_r',
)
salary_plot.set_title('Median Salary by Skill')
salary_plot.set_xlabel('Median Salary')
salary_plot.set_ylabel('Skill')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style='darkgrid')
plt.style.use('dark_background')

# `sql_df_fkcs` is read directly here; the cell uses its `skill` and `median_salary` columns.
# Skills are listed from highest to lowest median salary.
plt.figure(figsize=(10, 8))
barplot = sns.barplot(
    x='median_salary',
    y='skill',
    data=sql_df_fkcs,
    palette='Blues_r',
    order=sql_df_fkcs.sort_values('median_salary', ascending=False)['skill'],
    ax=plt.gca(),
)
barplot.set(
    title='Median Salary for Top 10 Skills',
    xlabel='Median Salary',
    ylabel='Skill',
)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
sns.set(style='darkgrid')
plt.style.use('dark_background')

# Skill summary frame; this cell reads its `skill`, `median_salary` and `salary_count` columns.
df = sql_df_fkcs

# Sort once so the bar order and the per-bar label lookup below share one row order.
ranked = df.sort_values('median_salary', ascending=False).reset_index(drop=True)

# Plotting the median salary for the top 10 skills with modifications
plt.figure(figsize=(10, 8))
salary_plot = sns.barplot(x='median_salary', y='skill', data=ranked, palette='Blues_r', order=ranked['skill'])

# Formatting the x-axis as whole dollars with thousands separators
fmt = '${x:,.0f}'
tick = mtick.StrMethodFormatter(fmt)
salary_plot.xaxis.set_major_formatter(tick)
plt.xlim(80000, 160000)

# Removing the axis labels
salary_plot.set(xlabel=None, ylabel=None)

# Setting the title and making it bigger
plt.title('Median Salary by Skill', fontsize=20)

# Adding the count of job postings at the end of the bars.
# The bar width is the median salary, so the posting count has to come from the
# `salary_count` column instead. Horizontal bars are centred on the integer category
# positions 0..n-1, so the rounded bar centre is the bar's row index in `ranked`.
for p in salary_plot.patches:
    bar_center = p.get_y() + p.get_height() / 2
    postings = ranked.iloc[int(round(bar_center))]['salary_count']
    salary_plot.annotate(f'{postings} postings',
                   (p.get_width(), bar_center),
                   ha='left', va='center',
                   size=12, color='white',
                   xytext=(5, 0),
                   textcoords='offset points')

plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

sns.set(style='darkgrid')
plt.style.use('dark_background')

# Skill summary frame; this cell reads its `skill`, `median_salary` and `salary_count` columns.
df = sql_df_fkcs

# Sort once and reuse the result for both the bar order and the label lookup.
# Indexing the unsorted `df` by bar position only works while `df` happens to be in
# exactly the plotted order, which is not guaranteed when medians tie.
ranked = df.sort_values('median_salary', ascending=False).reset_index(drop=True)

# Plotting the median salary for the top 10 skills with modifications
plt.figure(figsize=(10, 8))
salary_plot = sns.barplot(x='median_salary', y='skill', data=ranked, palette='Blues_r', order=ranked['skill'])

# Formatting the x-axis with dollar sign and comma for thousands
fmt = '${x:,.0f}'
tick = mtick.StrMethodFormatter(fmt)
salary_plot.xaxis.set_major_formatter(tick)

# Setting the x-axis limits
salary_plot.set_xlim(85000, 155000)

# Removing the axis labels
salary_plot.set_xlabel('')
salary_plot.set_ylabel('')

# Setting the title and making it bigger
salary_plot.set_title('Median Salary by Skill', fontsize=20)

# Adding the count of job postings near the end of each bar, in black.
# Horizontal bars are centred on the integer category positions 0..n-1, so the rounded
# bar centre is the bar's row index in `ranked`.
for p in salary_plot.patches:
    width = p.get_width()
    height = p.get_height()
    y_position = p.get_y() + height / 2
    count = ranked.iloc[int(round(y_position))]['salary_count']
    # Draw the label inside the bar when it is long enough, otherwise just past its end.
    x_position = width - 5000 if width > 90000 else width + 5000
    salary_plot.annotate(f'{count} postings',
                   (x_position, y_position),
                   ha = 'center', va = 'center',
                   color = 'black',
                   size=10)

plt.show()

## Explore Salary vs Skills
Visualizing how the top skills relate to salary in a scatter plot.

In [None]:
WITH SkillSalary AS (
  -- One row per (posting, skill) pair for postings that report a yearly salary.
  SELECT
    keyword.element AS skill,
    salary_year
  FROM
    `public_job_listings.data_nerd_jobs`,
    UNNEST(keywords_all.list) AS keyword
  WHERE
    salary_year IS NOT NULL
    -- Replaces the former TopSkills semi-join. With its LIMIT commented out that CTE
    -- returned every skill, so `skill IN (...)` only ever removed NULL skills; the
    -- same exclusion is now stated directly.
    AND keyword.element IS NOT NULL
),
MedianSalaries AS (
  -- Both measures are window functions, so every input row carries its skill's
  -- median and count; the outer DISTINCT collapses them to one row per skill.
  SELECT
    skill,
    PERCENTILE_CONT(salary_year, 0.5) OVER (PARTITION BY skill) AS median_salary,
    COUNT(salary_year) OVER (PARTITION BY skill) AS salary_count
  FROM
    SkillSalary
)
SELECT
  DISTINCT skill,
  median_salary,
  salary_count
FROM
  MedianSalaries
ORDER BY
  median_salary DESC

In [None]:
import plotly.express as px

# Skill summary frame; this cell reads its `skill`, `median_salary` and `salary_count` columns.
# NOTE(review): `sql_df_fkcs` is also the frame the top-10 bar charts read — confirm it
# holds the all-skills query result at this point.
df = sql_df_fkcs

# Create the scatter plot: one point per skill, median salary against posting count.
# `skill` is a categorical column, so Plotly Express colours it with a discrete colour
# sequence; a continuous colour scale would be ignored. The Viridis colours are passed
# as the discrete sequence so that palette is actually applied (it cycles if there are
# more skills than colours).
fig = px.scatter(df, x='median_salary', y='salary_count',
                 hover_data=['skill'],
                 title='Scatterplot of Median Salary vs. Job Postings Count',
                 labels={'median_salary': 'Median Salary', 'salary_count': 'Job Postings Count'},
                 color='skill',
                 color_discrete_sequence=px.colors.sequential.Viridis)

# Show the plot
fig.show()

In [None]:
!pip install -q plotly
import plotly.express as px

# Load the dataframe
df = sql_df_1

# Create the scatter plot
fig = px.scatter(df, x='median_salary', y='salary_count',
                 hover_data=['skill'],
                 title='Median Salary vs. Job Posting Count',
                 labels={'median_salary': 'Median Salary', 'salary_count': 'Job Posting Count'},
                 template='plotly_dark')

# Show the plot
fig.show()

In [None]:
WITH skill_title_salaries AS (
  -- One row per (posting, skill) pair, carrying the posting's job title,
  -- restricted to postings that report a yearly salary.
  SELECT
    kw.element AS skill,
    job_title_final,
    salary_year
  FROM `public_job_listings.data_nerd_jobs`
  CROSS JOIN UNNEST(keywords_all.list) AS kw
  WHERE salary_year IS NOT NULL
),
skill_title_stats AS (
  -- Median salary and row count per (skill, job title) pair, computed as window
  -- functions; the outer DISTINCT collapses the repeats to one row per pair.
  SELECT
    skill,
    job_title_final,
    PERCENTILE_CONT(salary_year, 0.5) OVER (PARTITION BY skill, job_title_final) AS median_salary,
    COUNT(salary_year) OVER (PARTITION BY skill, job_title_final) AS salary_count
  FROM skill_title_salaries
)
SELECT DISTINCT
  skill,
  job_title_final,
  median_salary,
  salary_count
FROM skill_title_stats
ORDER BY median_salary DESC

In [None]:
import plotly.express as px

# Per-(skill, job title) summary; this cell reads its `skill`, `job_title_final`,
# `median_salary` and `salary_count` columns.
df = sql_df_zhgi

# Create the interactive scatter plot with a slicer for 'job_title_final'
fig = px.scatter(df, x='median_salary', y='salary_count',
                 color='job_title_final',
                 hover_data=['skill', 'job_title_final'],
                 title='Skill Count vs. Median Salary by Job Title',
                 labels={'median_salary': 'Median Salary', 'salary_count': 'Skill Count'},
                 template='plotly_dark')

# Add a dropdown to filter by job title.
# Colouring by `job_title_final` makes Plotly Express emit one trace per job title, and
# the `visible` list of an `update` button is indexed by trace, not by dataframe row.
# The masks are therefore built from the figure's traces so that each one has exactly
# one entry per trace.
job_titles = [trace.name for trace in fig.data]
fig.update_layout(
    updatemenus=[
        dict(
            buttons=[
                dict(args=[{'visible': [True] * len(job_titles)}],
                     label='All',
                     method='update'),
                *[dict(args=[{'visible': [name == jt for name in job_titles]}],
                       label=jt,
                       method='update') for jt in job_titles]
            ],
            direction='down',
            pad={'r': 10, 't': 10},
            showactive=True,
            x=0.1,
            xanchor='left',
            y=1.15,
            yanchor='top'
        ),
    ],
    showlegend=False
)

# Show the plot
fig.show()

In [None]:
# Work on a copy: assigning new columns to `sql_df_zhgi` itself would silently change
# the query result that other cells read, and make this cell's effect depend on how
# often it has been run.
df = sql_df_zhgi.copy()

# Min-max normalize 'salary_count' and 'median_salary' over the whole frame (0 = smallest, 1 = largest)
df['normalized_count'] = (df['salary_count'] - df['salary_count'].min()) / (df['salary_count'].max() - df['salary_count'].min())
df['normalized_salary'] = (df['median_salary'] - df['median_salary'].min()) / (df['median_salary'].max() - df['median_salary'].min())

# Skill multiplier: product of the two normalized measures, so it is high only when a
# (skill, job title) pair is both frequent and well paid
df['skill_multiplier'] = df['normalized_count'] * df['normalized_salary']

# Display the dataframe with the new 'skill_multiplier' column
df[['skill', 'job_title_final', 'median_salary', 'salary_count', 'skill_multiplier']]

In [None]:
import numpy as np

# Start from a copy of the query result so the columns added below do not modify `sql_df_zhgi` itself
df = sql_df_zhgi.copy()

# Define a function to normalize the columns within each job title group
def normalize_within_group(data, key, target):
    """Min-max normalize ``data[target]`` separately within each ``key`` group.

    Args:
        data: DataFrame holding both the grouping and the target column.
        key: Column name (or list of names) to group by.
        target: Name of the numeric column to normalize.

    Returns:
        A Series aligned with ``data``'s index. Within each group the smallest
        value maps to 0.0 and the largest to 1.0. A group whose values are all
        equal (including a single-row group) maps to 0.0 rather than the NaN
        that 0/0 would produce.
    """
    grouped = data.groupby(key)[target]
    group_min = grouped.transform('min')
    group_range = grouped.transform('max') - group_min
    # A zero range means every value equals the group minimum; dividing by 1 instead
    # keeps the numerator's 0 and avoids 0/0. NaN ranges are left as they are.
    safe_range = group_range.where(group_range != 0, 1)
    normalized = (data[target] - group_min) / safe_range
    return normalized

# Scale both measures to the 0-1 range inside each job title, so that titles with many
# postings do not dominate titles with few.
for normalized_column, source_column in (
    ('normalized_count', 'salary_count'),
    ('normalized_salary', 'median_salary'),
):
    df[normalized_column] = normalize_within_group(df, 'job_title_final', source_column)

# Skill multiplier: product of the two per-title normalized measures.
df['skill_multiplier'] = df['normalized_count'].mul(df['normalized_salary'])

# Show the inputs next to the derived skill_multiplier column.
df[['skill', 'job_title_final', 'median_salary', 'salary_count', 'skill_multiplier']]

In [None]:
import pandas as pd
import plotly.express as px


def get_top_skills(df, job_title):
    """Return the rows of ``df`` for ``job_title`` with the ten highest ``skill_multiplier`` values.

    Args:
        df: DataFrame with at least ``job_title_final`` and ``skill_multiplier`` columns.
        job_title: Value of ``job_title_final`` to select.

    Returns:
        At most ten rows, sorted by ``skill_multiplier`` in descending order.
    """
    filtered_df = df[df['job_title_final'] == job_title]
    top_skills = filtered_df.sort_values(by='skill_multiplier', ascending=False).head(10)
    return top_skills


# Collect the top skills of every job title.
# NOTE(review): `df` is not defined in this cell; it must already hold a frame with a
# `skill_multiplier` column when this cell runs.
# `DataFrame.append` was removed in pandas 2.0; a single `pd.concat` over the per-title
# frames yields the same rows in the same order without re-copying inside a loop.
top_skills_df = pd.concat(
    [get_top_skills(df, job_title) for job_title in df['job_title_final'].unique()]
)

# Create the interactive bar graph with a slicer for 'job_title_final'
fig = px.bar(top_skills_df, x='skill', y='skill_multiplier', color='job_title_final',
             hover_data=['median_salary', 'salary_count'],
             title='Top 10 Skills Based on Skill Multiplier by Job Title',
             labels={'skill_multiplier': 'Skill Multiplier', 'skill': 'Skill'},
             template='plotly_dark')

# Add a dropdown to filter by job title.
# Colouring by `job_title_final` makes Plotly Express emit one trace per job title, and
# the `visible` list of an `update` button is indexed by trace, not by dataframe row.
# The masks are therefore built from the figure's traces.
job_titles = [trace.name for trace in fig.data]
buttons = [
    dict(label='All',
         method='update',
         args=[{'visible': [True] * len(job_titles)},
               {'title': 'Top 10 Skills Based on Skill Multiplier for All Job Titles'}]),
]
buttons += [
    dict(label=job_title,
         method='update',
         args=[{'visible': [name == job_title for name in job_titles]},
               {'title': f'Top 10 Skills Based on Skill Multiplier for {job_title}'}])
    for job_title in job_titles
]

fig.update_layout(
    updatemenus=[
        dict(
            buttons=buttons,
            direction='down',
            pad={'r': 10, 't': 10},
            showactive=True,
            x=0.1,
            xanchor='left',
            y=1.15,
            yanchor='top'
        ),
    ],
    showlegend=False
)

# Show the plot
fig.show()