### Listing Tables in Data Connection

Exploring the tables available in the 'Data Nerd Jobs' data connection.

In [None]:
-- Listing all tables in the 'Data Nerd Jobs' data connection
-- NOTE(review): `\list` is not standard SQL; presumably a meta-command handled by the notebook's SQL cell runner -- confirm it is supported for this connection
\list

In [None]:
-- Column-level metadata for the 'data_nerd_jobs' table, read from the
-- dataset's INFORMATION_SCHEMA.COLUMNS view (one row per column)
SELECT
    *
FROM
    `public_job_listings.INFORMATION_SCHEMA.COLUMNS` AS cols
WHERE
    cols.table_name = 'data_nerd_jobs';

### Sample Data for Salary Columns

Displaying a sample of data from the columns whose names contain the word 'salary'.

In [None]:
# Boolean mask over the schema rows: True where the column name mentions 'salary'
# (case-insensitive; missing names are treated as non-matches)
is_salary_column = sql_df_lckb['column_name'].str.contains('salary', case=False, na=False)

# Keep the matching schema rows and show each column name with its data type
salary_columns = sql_df_lckb.loc[is_salary_column]
salary_columns[['column_name', 'data_type']]

In [None]:
-- Ten sample rows of the salary-related columns, restricted to rows in which
-- job_salary, salary_pay, salary_rate, salary_avg, salary_max and salary_min
-- are all populated (salary_year and salary_hour are not filtered)
SELECT
    job_salary,
    salary_pay,
    salary_rate,
    salary_avg,
    salary_max,
    salary_min,
    salary_year,
    salary_hour
FROM
    `public_job_listings.data_nerd_jobs`
WHERE
    job_salary IS NOT NULL
    AND salary_pay IS NOT NULL
    AND salary_rate IS NOT NULL
    AND salary_avg IS NOT NULL
    AND salary_max IS NOT NULL
    AND salary_min IS NOT NULL
LIMIT 10;

In [None]:
# Plotting libraries used for the exploratory analysis
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: seaborn's darkgrid theme is applied first, then matplotlib's
# dark background style on top of it (the order of these two calls matters)
sns.set(style='darkgrid')
plt.style.use('dark_background')

# Summary statistics (pandas .describe()) for the yearly and hourly salary columns
descriptive_stats_salary_year, descriptive_stats_salary_hour = (
    sql_df_sjlc[column].describe() for column in ('salary_year', 'salary_hour')
)

# Show both summaries as the cell output
(descriptive_stats_salary_year, descriptive_stats_salary_hour)

In [None]:
-- Yearly and hourly salary values for every posting that reports at least one
-- of the two (rows where both are NULL are excluded)
SELECT
    salary_year,
    salary_hour
FROM
    `public_job_listings.data_nerd_jobs`
WHERE
    NOT (salary_year IS NULL AND salary_hour IS NULL);

In [None]:
# Plotting libraries used for the exploratory analysis
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: seaborn's darkgrid theme first, then matplotlib's dark
# background style layered on top (call order is significant)
sns.set(style='darkgrid')
plt.style.use('dark_background')

# Work on an independent copy of the SQL result so later cells do not touch it
salary_df = sql_df_ottn.copy()

# Summary statistics (pandas .describe()) for each salary column
descriptive_stats_salary_year = salary_df.loc[:, 'salary_year'].describe()
descriptive_stats_salary_hour = salary_df.loc[:, 'salary_hour'].describe()

# Show both summaries as the cell output
(descriptive_stats_salary_year, descriptive_stats_salary_hour)

In [None]:
# One 60-bin histogram per salary column: (column, figure title, x-axis label)
histogram_specs = [
    ('salary_year', 'Distribution of Yearly Salaries', 'Yearly Salary ($)'),
    ('salary_hour', 'Distribution of Hourly Wages', 'Hourly Wage ($)'),
]

for column, title, x_label in histogram_specs:
    plt.figure(figsize=(10, 6))
    # Missing values are dropped before binning
    sns.histplot(salary_df[column].dropna(), bins=60, kde=False, color='skyblue')
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Yearly salaries: 60-bin histogram of the non-missing values, with plain
# (non-scientific) tick labels on the x-axis
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(salary_df['salary_year'].dropna(), bins=60, kde=False, color='skyblue', ax=ax)
ax.set_title('Distribution of Yearly Salaries')
ax.set_xlabel('Yearly Salary ($)')
ax.set_ylabel('Frequency')
ax.ticklabel_format(style='plain', axis='x')  # Disable scientific notation
plt.show()

# Hourly wages: 60-bin histogram of the non-missing values (default tick format)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(salary_df['salary_hour'].dropna(), bins=60, kde=False, color='skyblue', ax=ax)
ax.set_title('Distribution of Hourly Wages')
ax.set_xlabel('Hourly Wage ($)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
-- Share of postings per 'search_country' that report a non-null 'salary_year',
-- expressed as a fraction (non-null count / total count), highest share first
WITH country_counts AS (
    SELECT
        search_country,
        COUNT(salary_year) AS postings_with_salary_year,
        COUNT(*) AS total_postings
    FROM
        `public_job_listings.data_nerd_jobs`
    GROUP BY
        search_country
)
SELECT
    search_country,
    postings_with_salary_year / total_postings AS percent_non_null_salary_year
FROM
    country_counts
ORDER BY
    percent_non_null_salary_year DESC;

In [None]:
# First 20 rows of the per-country result, taken in the frame's existing order
# NOTE(review): percent_salary_year_df is not defined in the visible cells --
# presumably the output variable of the per-country SQL cell; confirm
percent_salary_year_top20 = percent_salary_year_df.iloc[:20]

# Horizontal bar chart: one bar per country, bar length = share of postings
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(
    data=percent_salary_year_top20,
    x='percent_non_null_salary_year',
    y='search_country',
    palette='Blues_r',
    ax=ax,
)
ax.set_title('Percent of Job Postings that Include Annual Salary')
ax.set_xlabel('')  # No x-axis label
ax.set_ylabel('')  # No y-axis label
# Values are fractions, so 1 is rendered as 100%
ax.xaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1))
plt.show()

### Median Annual Salary by Job Title

Calculating the median annual salary for each job title in the dataset.

In [None]:
-- Calculating the median annual salary based on the 'job_title_final' column.
--
-- For each job title the non-null salaries are sorted into an array of length n
-- and the two zero-based positions DIV(n - 1, 2) and DIV(n, 2) are averaged:
--   * odd n  -> both positions are the single middle element, so the result is
--               that element (n = 1 yields the only value rather than NULL);
--   * even n -> the positions are the two middle elements, so the result is
--               their mean.
-- Rows with NULL salary_year are removed in WHERE, so ARRAY_AGG never sees a
-- NULL and COUNT(salary_year) equals the array length.
SELECT job_title_final,
       SAFE_DIVIDE(
           ARRAY_AGG(salary_year ORDER BY salary_year)[SAFE_OFFSET(DIV(COUNT(salary_year) - 1, 2))]
           + ARRAY_AGG(salary_year ORDER BY salary_year)[SAFE_OFFSET(DIV(COUNT(salary_year), 2))],
           2
       ) AS median_salary_year
FROM `public_job_listings.data_nerd_jobs`
WHERE salary_year IS NOT NULL
GROUP BY job_title_final
ORDER BY median_salary_year DESC;

In [None]:
# Independent copy of the SQL result, so plotting never touches the original frame
median_salary_df = sql_df_jnpw.copy()

# Horizontal bar chart: one bar per job title, bar length = median annual salary
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=median_salary_df,
    x='median_salary_year',
    y='job_title_final',
    palette='Blues_r',
    ax=ax,
)
ax.set_title('Median Annual Salary by Job Title')
ax.set_xlabel('')  # No x-axis label
ax.set_ylabel('')  # No y-axis label
plt.show()