# Notebook to Create SQL Database for tutorial
Main Database Build

### Import libraries
NOTE: The `%%bigquery` cell magic is used to execute SQL and save the result to a pandas DataFrame.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from google.cloud import bigquery
pd.set_option('display.max_columns', None)

%load_ext google.cloud.bigquery
# for large result sets, download the query results through the BigQuery Storage API for improved speed using the following command
# %bigquery --project job-listings-366015 --use_bqstorage_api 

## Import Hourly Data

In [2]:
%%bigquery gsearch_jobs_wide
-- Job postings whose salary_rate is 'hour'.
-- Restrict search_time to calendar year 2023. A half-open range is used
-- because BETWEEN is inclusive on both ends and would also admit rows
-- stamped exactly 2024-01-01 00:00:00.
SELECT *
FROM `job-listings-366015.gsearch_job_listings_clean.gsearch_jobs_wide`
WHERE salary_rate = 'hour'
  AND search_time >= '2023-01-01'
  AND search_time < '2024-01-01'

Query is running:   0%|          |

Downloading:   0%|          |

In [3]:
# Number of rows returned by the hourly-rate query
gsearch_jobs_wide.shape[0]

46477

### Columns to name:  
ActivityID (Integer): Unique identifier for each activity.  
NerdID (Integer): Identifier for professionals (analysts, engineers, scientists).  
NerdRole (String): Role of the professional (e.g., Data Analyst, Data Scientist).  
ProjectID (Integer): Identifier for projects worked on.  
ClientCompany (String): Name of the client company.  
HoursSpent (Decimal): Hours logged per activity.  
ActivityDate (Date): Date when the activity was recorded.  
ToolUsed (String): Software or tool used (e.g., Python, SQL, Tableau).

In [4]:
# Keep only the source columns that feed the invoice fact table
job_invoice_fact = gsearch_jobs_wide.loc[
    :,
    ['job_title', 'job_title_final', 'company_name', 'search_time', 'keywords_all', 'salary_hour'],
]

## Activity ID

In [5]:
# Sort by search_time and renumber the index 0..n-1 in that order
job_invoice_fact = (
    job_invoice_fact
    .sort_values(by=['search_time'])
    .reset_index(drop=True)
)

# activity_id is the (fresh) row index offset by 100000, so the first row gets 100000
job_invoice_fact = job_invoice_fact.assign(activity_id=job_invoice_fact.index + 100000)

## Nerd ID

In [6]:
# Build a composite key from job_title, company_name, salary_hour and keywords_all;
# rows sharing the same key are treated as the same "nerd".
# Null titles/companies are replaced with '' first: string + NaN yields NaN, and a
# NaN key gets category code -1, which would lump every such row into one nerd_id.
nerd_key = (
    job_invoice_fact['job_title'].fillna('')
    + '_' + job_invoice_fact['company_name'].fillna('')
    + '_' + job_invoice_fact['salary_hour'].astype(str)
    + '_' + job_invoice_fact['keywords_all'].astype(str)
)

# Map each distinct key to an integer code and store it as nerd_id.
# The key is held in a local variable so the column never temporarily contains strings.
job_invoice_fact['nerd_id'] = nerd_key.astype('category').cat.codes


## Nerd Role

In [7]:
# job_title_final becomes the nerd_role column
job_invoice_fact = job_invoice_fact.rename({'job_title_final': 'nerd_role'}, axis='columns')

## Client Company

In [8]:
# company_name becomes the project_company column
job_invoice_fact = job_invoice_fact.rename({'company_name': 'project_company'}, axis='columns')

## Project ID

In [9]:
# Build a composite key from nerd_role, job_title and project_company;
# rows sharing the same key are treated as the same project.
# Nulls are replaced with '' first: string + NaN yields NaN, and a NaN key gets
# category code -1, which would lump every such row into a single project_id.
project_key = (
    job_invoice_fact['nerd_role'].fillna('')
    + '_' + job_invoice_fact['job_title'].fillna('')
    + '_' + job_invoice_fact['project_company'].fillna('')
)

# Map each distinct key to an integer code and store it as project_id
job_invoice_fact['project_id'] = project_key.astype('category').cat.codes

# job_title is not needed after the ids have been derived
job_invoice_fact = job_invoice_fact.drop(columns=['job_title'])

## Hours Spent

In [10]:
# Synthesize an hours_spent value for every row.
#
# Each nerd_id is given a budget of HOURS_PER_YEAR hours that is split evenly over
# the rows (jobs) belonging to that nerd_id; the per-row value is then drawn from a
# normal distribution centred on that share with a standard deviation of 10% of it.
# Individual values are therefore only approximately bounded by HOURS_PER_YEAR.
HOURS_PER_YEAR = 2080   # presumably 52 weeks x 40 hours -- TODO confirm
HOURS_REL_STD = 0.1     # standard deviation as a fraction of the mean
HOURS_SEED = 42         # fixed seed so Restart & Run All reproduces the same table

hours_rng = np.random.default_rng(HOURS_SEED)

# number of rows per nerd_id, aligned to the frame (replaces value_counts + merge + drop)
jobs_completed = job_invoice_fact.groupby('nerd_id')['nerd_id'].transform('size')

# average hours available per job for the row's nerd_id
avg_hours_per_job = (HOURS_PER_YEAR / jobs_completed).to_numpy()

# one vectorized draw for all rows instead of a row-wise apply
hours_drawn = hours_rng.normal(loc=avg_hours_per_job, scale=avg_hours_per_job * HOURS_REL_STD)

# round to the nearest whole hour (not "up") and never emit less than 1 hour
job_invoice_fact['hours_spent'] = np.clip(np.rint(hours_drawn), 1, None).astype(int)

## Activity Date

In [11]:
# search_time becomes activity_date; the time-of-day part is discarded by going
# through .dt.date and the column is then converted back to a datetime dtype
job_invoice_fact = (
    job_invoice_fact
    .rename(columns={'search_time': 'activity_date'})
    .assign(activity_date=lambda frame: pd.to_datetime(frame['activity_date'].dt.date))
)

## Tool Used

In [12]:
# Flatten each keywords_all value into a plain list of its 'element' entries;
# a None value becomes an empty list
keyword_lists = job_invoice_fact['keywords_all'].map(
    lambda kw: [] if kw is None else [entry['element'] for entry in kw['list']]
)

# project_tool is the first keyword of the row, or None when the row has no keywords
job_invoice_fact['project_tool'] = keyword_lists.map(
    lambda tools: tools[0] if tools else None
)

# keywords_all is no longer needed once project_tool has been derived
job_invoice_fact = job_invoice_fact.drop(columns=['keywords_all'])


## Hourly Rate

In [13]:
# salary_hour becomes the hours_rate column
job_invoice_fact = job_invoice_fact.rename({'salary_hour': 'hours_rate'}, axis='columns')

## Convert to SQLite file

In [18]:
import sqlite3
from contextlib import closing

# Select the output columns in their final order
invoices_fact = job_invoice_fact[[
    'activity_id', 'activity_date', 'project_id', 'project_company', 'project_tool',
    'nerd_id', 'nerd_role', 'hours_spent', 'hours_rate',
]]

# Write the table to a local SQLite file, replacing any existing invoices_fact table.
# closing() guarantees the connection is closed even when to_sql raises.
with closing(sqlite3.connect('invoices_2023.db')) as conn:
    invoices_fact.to_sql('invoices_fact', conn, if_exists='replace', index=False)


## Upload to BigQuery

In [19]:
import pandas as pd
from pandas_gbq import to_gbq

# Upload the invoice fact table to BigQuery, replacing the destination table if it exists
to_gbq(
    dataframe=invoices_fact,
    destination_table='sql_tutorial_jobs.invoices_fact',
    project_id='job-listings-366015',
    if_exists='replace',
)

100%|██████████| 1/1 [00:00<00:00, 10866.07it/s]
