## Main Preprocessing Program
This program removes duplicate job listings and uses tokenization to search every job listing for 221 different words and/or token sequences. The results are packaged into a DataFrame, along with other important features, and exported as a CSV file for analysis.


In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!pip install -U word2number
!python -m spacy download en_core_web_sm # small english model

In [None]:
import spacy
import json
import time
import re
from word2number import w2n
from spacy.matcher import Matcher
from spacy import displacy
import numpy as np
import pandas as pd
import psycopg2
import getpass
from sqlalchemy.engine.url import URL
from sqlalchemy import create_engine
nlp = spacy.load('en_core_web_sm')

In [None]:
# By default pandas displays at most 20 columns of a DataFrame (display.max_columns).
# Raise that display limit so wide frames are shown in full, then print the new value to confirm it.

pd.options.display.max_columns = 500
print(pd.options.display.max_columns)

In [None]:
def query_database(sql_query, user):
    '''
    This function takes a SQL query and runs it against the postgresql database.
    It returns a dataframe containing the result of that query.
    The database password is requested interactively via getpass.
    
    Args:
        sql_query: Str
            Contains the SQL query you want to query the database with.
        user: Str
            The database user name to connect as.
    
    Returns:
        pandas.DataFrame
            Contains result of your query.
    '''
    
    database = "my_database"
    password = getpass.getpass("Enter password: ")

    connection = psycopg2.connect(
        database = database,
        user = user,
        host = 'my_host',
        password = password)
    
    # Always release the connection, even when the query itself fails;
    # otherwise a bad query leaves the connection open.
    try:
        return pd.read_sql_query(sql_query, connection)
    finally:
        connection.close()

In [None]:
def get_search_list():  
    '''
    This function returns a fresh instance of the Main Dictionary (all words/items have 
    "found" set to False).
    The Main Dictionary contains all the words we want to find in the job listing description.
    Many words have variations and these variations need to be accounted for. Some words/items
    contain multiple tokens such as "scikit-learn". We need to seperate one, two, and three
    token variations because they each require different search methods. The goal of this
    data structure is to contain all the components necessary to do the search, contain 
    the result of the search, and be in an efficient format for time complexity. 
    '''
    
    main_dictionary = {
        "programming_query_languages": {
            "python": {
                'one_token': ["python"], 
                'found': False
            }, 
            "r": {
                'one_token': ["r"],
                'found': False
            },
            "c++": {
                'one_token': ["c++", "cpp"],
                'found': False
            },
            "rust": {
                'one_token': ["rust", "rustlang"],
                'found': False
            },
            "sql": {
                'one_token': ["sql"],
                'found': False
            },
            "java": {
                'one_token': ["java"],
                'found': False
            },
            "javascript": {
                'one_token': ["javascript", "js", "ecmascript"],
                'found': False
            },
            "c#": {
                'two_token': [["c", "#"]],
                'found': False
            }, 
            "spark": {
                'one_token': ["spark"], # could be a false positive
                'found': False
            },  
            "scala": {
                'one_token': ["scala"],
                'found': False
            },
            "xml": {
                'one_token': ["xml"],
                'found': False
            },
            "vba": {
                'one_token': ["vba"],
                'found': False
            },
            "julia": {
                'one_token': ["julia"],
                'found': False
            },
            "typescript": {
                'one_token': ["typescript", "ts"], 
                'found': False
            },
            "swift": {
                'one_token': ["typescript"], 
                'found': False
            },
            "php": {
                'one_token': ["php"], 
                'found': False
            },
            "ruby": {
                'one_token': ["ruby"], 
                'found': False
            },
            "perl": {
                'one_token': ["perl"], 
                'found': False
            },
            "matlab": {
                'one_token': ["matlab"], 
                'found': False
            },
            "graphql": {
                'one_token': ["graphql"], 
                'found': False
            },
        },
        "libraries_and_packages": {
            "numpy": {
                'one_token': ["numpy", "np"], # not sure if np is necessary
                'found': False
            },
            "pandas": {
                'one_token': ["pandas", "pd"], # not sure if pd is necessary
                'found': False
            },
            "spacy": {
                'one_token': ["spacy"],
                'found': False
            },
            "scikit-learn": {
                'one_token': ["sklearn", "scikitlearn"],
                'two_token': [["scikit", "learn"]],
                'three_token': [["scikit", "-", "learn"]],
                'found': False
            }, 
            "pytorch": {
                'one_token': ["pytorch", "torch"], 
                'found': False
            },
            "tensorflow": {
                'one_token': ["tensorflow", "tf"], 
                'found': False
            },
            "dask": {
                'one_token': ["dask"], 
                'found': False
            },
            "pyspark": {
                'one_token': ["pyspark"], 
                'found': False
            },
            "scipy": {
                'one_token': ["scipy"], 
                'found': False
            },
            "keras": {
                'one_token': ["keras"], 
                'found': False
            },
            "matplotlib": {
                'one_token': ["matplotlib"],
                'found': False
            },
            "seaborn": {
                'one_token': ["seaborn"], 
                'found': False
            },
            "bokeh": {
                'one_token': ["bokeh"], 
                'found': False
            },
            "plotly": {
                'one_token': ["plotly"],
                'found': False
            },
            "ggplot": {
                'one_token': ["ggplot"],
                'found': False
            },
        },
        "cloud_technologies": {
            "aws": {
                'one_token': ["aws"], 
                'three_token': [["amazon", "web", "service"], ["amazon", "web", "services"]],
                'found': False
            },
            "gcp": {
                'one_token': ["gcp"], 
                'three_token': [["google", "cloud", "platform"]],
                'found': False
            },
            "azure": {
                'one_token': ["azure"],
                'found': False
            },
            "ibm_cloud": {
                'two_token': [["ibm", "cloud"], ["ibm", "bluemix"]],
                'found': False
            },
            "heroku": {
                'one_token': ["heroku"],
                'found': False
            },
            "salesforce": {
                'one_token': ["salesforce"],
                'found': False
            },
        },
        "data_visualization_tools": {
            "tableau": {
                'one_token': ["tableau"],
                'found': False
            },
            "power_bi": {
                'two_token': [["power", "bi"]],
                'found': False
            },
            "qlikview": {
                'one_token': ["qlikview"],
                'found': False
            },
            "d3.js": {
                'one_token': ["d3.js"],
                'found': False
            },
            "grafana": {
                'one_token': ["grafana"],
                'found': False
            },
            "looker": {
                'one_token': ["looker"],
                'found': False
            },
            "metabase": {
                'one_token': ["metabase"],
                'found': False
            },
        },
        "database_solutions": {
            "nosql": {
                'one_token': ["nosql"],
                'found': False
            },
            "mysql": {
                'one_token': ["mysql"],
                'found': False
            },
            "sql_server": {
                'two_token': [["sql", "server"]],
                'found': False
            },
            "oracle": {
                'one_token': ["oracle"],
                'found': False
            },
            "postgresql": {
                'one_token': ["postgresql", "postgres"],
                'found': False
            },
            "mongodb": {
                'one_token': ["mongodb"],
                'found': False
            },
            "cassandra": {
                'one_token': ["cassandra"],
                'found': False
            },
            "elasticsearch": {
                'one_token': ["elasticsearch"],
                'found': False
            },
            "redis": {
                'one_token': ["redis"],
                'found': False
            },
            "neo4j": {
                'one_token': ["neo4j"],
                'found': False
            },
        },
        "containers": {
            "docker": {
                'one_token': ["docker"],
                'found': False
            },
            "kubernetes": {
                'one_token': ["kubernetes", "k8s"],
                'found': False
            },
            "openshift": {
                'one_token': ["openshift"],
                'found': False
            },
        },
        "distributed_solutions": {
            "hadoop": {
                'one_token': ["hadoop"],
                'found': False
            },
            "hive": {
                'one_token': ["hive"],
                'found': False
            },
            "mapreduce": {
                'one_token': ["mapreduce"],
                'found': False
            },
            "flink": {
                'one_token': ["flink"],
                'found': False
            },
            "kafka": {
                'one_token': ["kafka"],
                'found': False
            },
            "apache_beam": {
                'two_token': [["apache", "beam"]],
                'found': False
            },
        },
        "mlops": {
            "vertexai": {
                'one_token': ["vertexai"],
                'found': False
            },
            "sagemaker": {
                'one_token': ["sagemaker"],
                'found': False
            },
        },
        "version_control": {
            "git": {
                'one_token': ["git"],
                'found': False
            },
            "cvs": {
                'one_token': ["cvs"],
                'three_token': [["concurrent", "versions", "system"]],
                'found': False
            },
        },
        "os": {
            "windows": {
                'one_token': ["windows"],
                'found': False
            },
            "linux": {
                'one_token': ["linux"],
                'found': False
            },
            "macos": {
                'one_token': ["macos"],
                'found': False
            },
        },
        "project_management_frameworks": {
            "agile": {
                'one_token': ["agile"],
                'found': False
            },
            "scrum": {
                'one_token': ["scrum"],
                'found': False
            },
        },
        "subjects": {
            "technical_field": {
                "two_token": [["technical", "field"]],
                "found": False
            },
            "quantitative_field": {
                "two_token": [["quantitative", "field"]],
                "found": False
            },
            "data_science": {
                "two_token": [["data", "science"]],
                "found": False
            },
            "analytics": {
                "one_token": ["analytics"],
                "found": False
            },
            "machine_learning": {
                "one_token": ["ml"],
                "two_token": [["machine", "learning"]],
                "found": False
            },
            "artificial_intelligence": {
                "one_token": ["ai"],
                "two_token": [["artificial", "intelligence"]],
                "found": False
            },
            "statistics": {
                "one_token": ["statistics"],
                "found": False
            },
            "mathematics": {
                "one_token": ["mathematics"],
                "found": False
            },
            "computer_science": {
                "one_token": ["cs"],
                "two_token": [["computer", "science"]],
                "found": False
            },
            "engineering": {
                "one_token": ["engineering"],
                "found": False
            },
            "electrical_engineering": {
                "two_token": [["electrical", "engineering"]],
                "found": False
            },
            "computer_engineering": {
                "two_token": [["computer", "engineering"]],
                "found": False
            },
            "finance": {
                "one_token": ["finance"],
                "found": False
            },
            "psychology": {
                "one_token": ["psychology"],
                "found": False
            },
            "economics": {
                "one_token": ["economics"],
                "found": False
            },
            "information_technology": {
                "two_token": [["information", "technology"]],
                "found": False
            },
            "probability": {
                "one_token": ["probability"],
                "found": False
            },
            "operational_research": {
                "two_token": [["operational", "research"]],
                "found": False
            },
            "geography": {
                "one_token": ["geography"],
                "found": False
            },
            "physics": {
                "one_token": ["physics"],
                "found": False
            },
        },
        "education_levels": {
            "phd": {
                "one_token": ["doctorate", "phd", "ph.d."],
                "found": False
            },
            "masters": {
                "one_token": ["master", "masters", "ms", "m.s."],
                "found": False
            },
            "bachelors": {
                "one_token": ["bachelor", "bachelors", "bs", "ba", "b.s.", "b.a."],
                "found": False
            },
        },
        "reporting_packages": {
            "crystal_reports": {
                "two_token": [["crystal", "reports"]],
                "found": False
            },
            "ssrs": {
                "one_token": ["ssrs"],
                "found": False
            },
            "excel": {
                "one_token": ["excel"],
                "found": False
            },
        },
        "job_titles": {
            "data_scientist": {
                "two_token": [["data", "scientist"]],
                "found": False
            },
            "data_analyst": {
                "two_token": [["data", "analyst"]],
                "found": False
            },
            "machine_learning_engineer": {
                "three_token": [["machine", "learning", "engineer"]],
                "found": False
            },
            "computer_programmer": {
                "two_token": [["computer", "programmer"]],
                "found": False
            },
            "software_engineer": {
                "two_token": [["software", "engineer"]],
                "found": False
            },
            "statistician": {
                "one_token": ["statistician"],
                "found": False
            },
            "mathematician": {
                "one_token": ["mathematician"],
                "found": False
            },
            "business_intelligence_analyst": {
                "three_token": [["business", "intelligence", "analyst"]],
                "found": False
            },
            "engineer": {
                "one_token": ["engineer"],
                "found": False
            },
            "database_administrator": {
                "two_token": [["database", "administrator"]],
                "found": False
            },
        },
        "statistics": {
            "hypothesis_testing": {
                "two_token": [["hypothesis", "testing"]],
                "found": False
            },
            "variance": {
                "one_token": ["variance"],
                "found": False
            },
            "correlation": {
                "one_token": ["correlation"],
                "found": False
            },
            "standard_deviation": {
                "two_token": [["standard", "deviation"]],
                "found": False
            },
            "univariate": {
                "one_token": ["univariate"],
                "found": False
            },
            "multivariate": {
                "one_token": ["multivariate"],
                "found": False
            },
            "qualitative_variables": {
                "two_token": [["qualitative", "variables"]],
                "found": False
            },
            "quantitative_variables": {
                "two_token": [["quantitative", "variables"]],
                "found": False
            },
            "descriptive_statistics": {
                "two_token": [["descriptive", "statistics"]],
                "found": False
            },
            "inferential_statistics": {
                "two_token": [["inferential", "statistics"]],
                "found": False
            },
            "test_statistic": {
                "two_token": [["test", "statistic"]],
                "found": False
            },
            "chi_squared": {
                "two_token": [["chi", "square"], ["chi", "squared"]],
                "three_token": [["chi", "-", "square"], ["chi", "-", "squared"]],
                "found": False
            },
            "p_value": {
                "two_token": [["p", "value"], ["p", "values"]],
                "three_token": [["p", "-", "value"], ["p", "-", "values"]],
                "found": False
            },
            "t_test": {
                "two_token": [["t", "test"], ["t", "tests"]],
                "three_token": [["t", "-", "test"], ["t", "-", "tests"]],
                "found": False
            },
            "z_test": {
                "two_token": [["z", "test"], ["z", "tests"]],
                "three_token": [["z", "-", "test"], ["z", "-", "tests"]],
                "found": False
            },
            "normal_distributions": {
                "two_token": [["normal", "distribution"], ["normal", "distributions"]],
                "found": False
            },
            "central_limit_theorem": {
                "one_token": ["clt"],
                "three_token": [["central", "limit", "theorem"]],
                "found": False
            },
            "poisson_distributions": {
                "two_token": [["poisson", "distribution"], ["poisson", "distributions"]],
                "found": False
            },
            "confidence_intervals": {
                "two_token": [["confidence", "interval"], ["confidence", "intervals"]],
                "found": False
            },
            "type_i_errors": {
                "three_token": [["type", "i", "error"], ["type", "i", "errors"]],
                "found": False
            },
            "type_ii_errors": {
                "three_token": [["type", "ii", "error"], ["type", "ii", "errors"]],
                "found": False
            },
            "maximum_likelihood_estimation": {
                "three_token": [["maximum", "likelihood", "estimation"]],
                "found": False
            },
            "a_b_testing": {
                'three_token': [["a", "/", "b"]],
                "found": False
            },
            "conditional_probability": {
                "two_token": [["conditional", "probability"], ["conditional", "probabilities"]],
                "found": False
            },
            "bayes_rule": {
                "two_token": [["bayes", "rule"], ["bayes'", "rule"]],
                "found": False
            },
            "random_variables": {
                "two_token": [["random", "variable"], ["random", "variables"]],
                "found": False
            },
            "discrete_variable": {
                "two_token": [["discrete", "variable"], ["discrete", "variables"]],
                "found": False
            },
            "continuous_variable": {
                "two_token": [["continuous", "variable"], ["continuous", "variables"]],
                "found": False
            },
            "probability_distributions": {
                "two_token": [["probability", "distribution"], ["probability", "distributions"]],
                "found": False
            },
            "markov_chains": {
                "two_token": [["markov", "chains"]],
                "found": False
            },
            "confusion_matrix": {
                "two_token": [["confusion", "matrix"]],
                "found": False
            },
            
        },
        "computer_science": {
            "time_complexity": {
                "two_token": [["time", "complexity"]],
                "three_token": [["big", "-", "o"]],
                "found": False
            },
            "data_structures": {
                "two_token": [["data", "structures"]],
                "found": False
            },
            "recursion": {
                "one_token": ["recursion"],
                "found": False
            },
            "algorithms": {
                "one_token": ["algorithms"],
                "found": False
            },
            "stacks": {
                "one_token": ["stacks"],
                "found": False
            },
            "queues": {
                "one_token": ["queues"],
                "found": False
            },
            "heaps": {
                "one_token": ["heaps"],
                "found": False
            },
            "linked_lists": {
                "two_token": [["linked", "lists"]],
                "found": False
            },
            "hash_maps": {
                "two_token": [["hash", "maps"]],
                "found": False
            },
            "binary_search_trees": {
                "three_token": [["binary", "search", "trees"]],
                "found": False
            },
        },
        "skills": {
            "scripting": {
                "one_token": ["scripting"],
                "found": False
            },
            "designing": {
                "one_token": ["designing"],
                "found": False
            },
            "software_development": {
                "two_token": [["software", "development"]],
                "found": False
            },
            "software_engineering": {
                "two_token": [["software", "engineering"]],
                "found": False
            },
            "programming": {
                "one_token": ["programming"],
                "found": False
            },
            "querying": {
                "one_token": ["querying"],
                "found": False
            },
            "full_stack": {
                "two_token": [["full", "stack"]],
                "three_token": [["full", "-", "stack"]],
                "found": False
            },
            "automation": {
                "one_token": ["automation"],
                "found": False
            },
            "communication": {
                "one_token": ["communication"],
                "found": False
            },
            "optimization": {
                "one_token": ["optimization", "optimizing"],
                "found": False
            },
            "etl": {
                "one_token": ["etl"],
                "found": False
            },
            "problem_solving": {
                "two_token": [["problem", "solving"]],
                "found": False
            },
            "forecasting": {
                "one_token": ["forecasting"],
                "found": False
            },
        },
        "machine_learning": {
            "data_mining": {
                "two_token": [["data", "mining"]],
                "found": False
            },
            "nlp": {
                "one_token": ["nlp"],
                'three_token': [["natural", "language", "processing"]],
                "found": False
            },
            "regression": {
                "one_token": ["regression"],
                "found": False
            },
            "classification": {
                "one_token": ["classification"],
                "found": False
            },
            "naive_bayes": {
                "two_token": [["naive", "bayes"]],
                "found": False
            },
            "clustering": {
                "one_token": ["clustering"],
                "found": False
            },
            "decision_tree": {
                "two_token": [["decision", "tree"], ["decision", "trees"]],
                "found": False
            },
            "random_forest": {
                "two_token": [["random", "forest"], ["random", "forests"]],
                "found": False
            },
            "gradient_boosting": {
                "two_token": [["gradient", "boost"], ["gradient", "boosting"]],
                "found": False
            },
            "ensemble": {
                "one_token": ["ensemble", "ensembles"],
                "found": False
            },
            "boosting": {
                "one_token": ["boosting", "ensembles"],
                "found": False
            },
            "mixture_of_experts": {
                "one_token": ["moe"],
                "three_token": [["mixture", "of", "experts"]],
                "found": False
            },
            "neural_network": {
                "two_token": [["neural", "network"], ["neural", "networks"]],
                "found": False
            },
            "anomaly_detection": {
                "two_token": [["anomaly", "detection"]],
                "found": False
            },
            "preprocessing": {
                "one_token": ["preprocessing"],
                "found": False
            },
            "normalization": {
                "one_token": ["normalization"],
                "found": False
            },
            "standardization": {
                "one_token": ["standardization"],
                "found": False
            },
            "support_vector_machines": {
                "one_token": ["svm"],
                "three_token": [["support", "vector", "machine"], ["support", "vector", "machines"]],
                "found": False
            },
            "recommender_systems": {
                "two_token": [["recommender", "system"], ["recommender", "systems"]],
                "found": False
            },
            "computer_vision": {
                "two_token": [["computer", "vision"], ["computer", "visioning"]],
                "found": False
            },
            "convolutional_neural_netorks": {
                "one_token": ["cnn"],
                "three_token": [["convolutional", "neural", "network"], ["convolutional", "neural", "networks"]],
                "found": False
            },
            "recurrent_neural_networks": {
                "one_token": ["rnn"],
                "three_token": [["recurrent", "neural", "network"], ["recurrent", "neural", "networks"]],
                "found": False
            },
            "graph_neural_networks": {
                "one_token": ["gnn"],
                "three_token": [["graph", "neural", "network"], ["graph", "neural", "networks"]],
                "found": False
            },
            "transfer_learning": {
                "two_token": [["transfer", "learning"]],
                "found": False
            },
            "belief_networks": {
                "two_token": [["belief", "network"], ["belief", "networks"]],
                "found": False
            },
            "gradient_descent": {
                "two_token": [["gradient", "descent"]],
                "found": False
            },
            "overfitting": {
                "one_token": ["overfit", "overfitting"],
                "found": False
            },
            "underfitting": {
                "one_token": ["underfit", "underfitting"],
                "found": False
            },
            "regularization": {
                "one_token": ["regularization"],
                "found": False
            },
            "cross_validation": {
                "two_token": [["cross", "validation"]],
                "found": False
            },
            "bagging": {
                "one_token": ["bagging"],
                "found": False
            },
            "bootstrapping": {
                "one_token": ["bootstrapping"],
                "found": False
            },
            "hyperparameter_tuning": {
                "two_token": [["hyperparameter", "tuning"]],
                "found": False
            },
            "principal_component_analysis": {
                "one_token": ["pca"],
                "three_token": [["principle", "component", "analysis"]],
                "found": False
            },
            "dimensionality_reduction": {
                "two_token": [["dimensionality", "reduction"]],
                "found": False
            },
            "logistic_regression": {
                "two_token": [["logistic", "regression"]],
                "found": False
            },
            "sentiment_analysis": {
                "two_token": [["sentiment", "analysis"]],
                "found": False
            },
        },
        "social_groups": {
            "clients": {
                "one_token": ["clients"],
                "found": False
            },
            "stakeholders": {
                "one_token": ["stakeholders"],
                "found": False
            },
            "teammates": {
                "one_token": ["teammates"],
                "found": False
            },
            "partners": {
                "one_token": ["partners"],
                "found": False
            },
        },
        "modeling": {
            "quantitative_modeling": {
                "two_token": [["quantitative", "models"], ["quantitative", "modeling"]],
                "found": False
            },
            "market_mix_modeling": {
                "one_token": ["mmm"],
                "three_token": [["market", "mix", "models"], ["market", "mix", "modeling"]],
                "found": False
            },
            "large_language_models": {
                "one_token": ["llm"],
                "three_token": [["large", "language", "models"], ["large", "language", "modeling"]],
                "found": False
            },
            "predictive_modeling": {
                "two_token": [["predictive", "models"], ["predictive", "modeling"]],
                "found": False
            },
            "statistical_modeling": {
                "two_token": [["statistical", "models"], ["statistical", "modeling"]],
                "found": False
            },
            "stochastic_modeling": {
                "two_token": [["stochastic", "models"], ["stochastic", "modeling"]],
                "found": False
            },
            "generative_modeling": {
                "two_token": [["generative", "models"], ["generative", "modeling"]],
                "found": False
            },
            "transformer_modeling": {
                "two_token": [["transformer", "models"], ["transformer", "modeling"]],
                "found": False
            },
        },
        "other": {
            "startup": {
                "one_token": ["startup"],
                "found": False
            },
            "remote": {
                "one_token": ["remote"],
                "two_token": [["fully", "remote"]],
                "three_token": [["fully", "-", "remote"], ["remote", "-", "based"], ["work", "from", "home"]],
                "found": False
            },
            "in-office": {
                "three_token": [["in", "-", "office"], ["office", "-", "based"], ["in", "-", "person"]],
                "found": False
            },
            "full-time": {
                "two_token": [["full", "time"]],
                "three_token": [["full", "-", "time"]],
                "found": False
            },
            "part-time": {
                "two_token": [["part", "time"]],
                "three_token": [["full", "-", "time"]],
                "found": False
            },
            "hybrid": {
                "one_token": ["hybrid"],
                "two_token": [["hybrid", "role"]],
                "found": False
            },
        }
    }
    return main_dictionary

In [None]:
def get_number_of_items(dictionary):
    '''
    Convenience function that counts the words/items held in the Main Dictionary.

    Args:
        dictionary : Dict
            Maps each category to the collection of items it contains.

    Returns:
        Int
            Number of items summed over every category.
    '''
    return sum(1 for category in dictionary for _ in dictionary[category])

In [None]:
def display_contents(dictionary):
    '''
    Convenience function that prints the Main Dictionary as an outline:
    each category name followed by " :" on its own line, then one
    tab-prefixed line per word/item in that category.

    Args:
        dictionary : Dict
            Maps each category to the collection of items it contains.
    '''
    for category, items in dictionary.items():
        print(category, ':')
        for item in items:
            print("\t", item)

In [None]:
def find_tokens(dictionary, doc):
    '''
    Flags every Main Dictionary item whose one-, two- or three-token pattern
    occurs in the job description, then returns the Main Dictionary.

    Token text is lower-cased before comparison, so the patterns stored in the
    dictionary must already be lower-case. The dictionary is modified in place
    (its "found" flags are switched to True) and the same object is returned.

    Args:
        dictionary : Dict
            A fresh instance of the Main Dictionary
        doc : spacy.tokens.doc.Doc
            The tokenized job description

    Returns:
        Dict
            Main Dictionary with variable "found" set to "True" for each word found during the search.
    '''

    # Lower-case the document once instead of once per dictionary item.
    lowered = [token.text.lower() for token in doc]

    for size, key in ((1, 'one_token'), (2, 'two_token'), (3, 'three_token')):
        # Every run of `size` consecutive tokens, as a set for O(1) lookups.
        ngrams = {tuple(lowered[start:start + size])
                  for start in range(len(lowered) - size + 1)}
        if not ngrams:
            # The document is shorter than `size` tokens: nothing can match.
            continue

        for category in dictionary.values():
            for entry in category.values():
                if entry['found'] or key not in entry:
                    continue
                # one_token patterns are plain strings; longer patterns are
                # lists of strings. Normalise both to tuples for the lookup.
                if any(
                    ((pattern,) if isinstance(pattern, str) else tuple(pattern)) in ngrams
                    for pattern in entry[key]
                ):
                    entry['found'] = True
    return dictionary

In [None]:
def get_search_results(df):
    '''
    Searches every job listing for the Main Dictionary items and packages the
    results into a dataframe, which is returned to the caller.

    Progress is printed every 100 listings and the total elapsed time is
    printed at the end.

    Args:
        df : pandas.core.frame.DataFrame
            The dataframe containing the job listing data scraped from Google.
            Must provide the columns "job_id", "title", "search_term",
            "location", "schedule_type", "work_from_home" and "description".

    Returns:
        pandas.core.frame.DataFrame
            One row per job listing: the identifying columns copied from df,
            followed by one column per Main Dictionary item telling whether
            that item was found in the listing's description.
    '''

    passthrough_columns = ["job_id", "title", "search_term", "location",
                           "schedule_type", "work_from_home"]

    start_time = time.time()

    nlp = spacy.load('en_core_web_sm')

    # Collect plain dicts and build the frame once at the end. Appending to a
    # DataFrame row by row copies the whole frame each time, and
    # DataFrame.append no longer exists in pandas 2.x.
    # NOTE(review): building the frame in one go keeps the flags as Python
    # booleans; confirm nothing downstream relied on the dtype produced by the
    # former row-wise append.
    rows = []

    for counter, (_, row) in enumerate(df.iterrows(), start=1):
        record = {column: row[column] for column in passthrough_columns}

        # find_tokens() flips the "found" flags in place, so every listing
        # needs its own fresh copy of the Main Dictionary.
        search_list_results = find_tokens(get_search_list(), nlp(row['description']))

        # Each column is a word, and each value is whether that word was found in that job listing
        for category in search_list_results:
            for item in search_list_results[category]:
                record[item] = search_list_results[category][item]["found"]

        rows.append(record)

        if counter % 100 == 0:
            print(counter, "job descriptions searched")

    results = pd.DataFrame(rows)

    elapsed_time = time.time() - start_time
    print("Elapsed time:", elapsed_time, "seconds")

    return results

In [None]:
def classify_title_based_on_parts(title):
    '''
    This function classifies titles based on combinations of individual words
    found anywhere in the title, regardless of their order or adjacency.
    So no sequences of words like ["machine", "learning", "engineering"].
    This is used to label titles like "Software Engineer-Machine Learning".

    Words are compared in lower case. The rules are:
        data_scientist : "data" and scientist/science, without business, intelligence or engineer(ing)
        ml_engineer    : "machine", "learning" and engineer(ing)
        data_analyst   : "data" and an analyst word, without business, intelligence or engineer(ing)
        data_engineer  : "data" and engineer(ing), without an analyst word, "machine" or "software"
        bi_analyst     : never assigned here (its rule is disabled), always False

    Args:
        title : spacy.tokens.doc.Doc
            The tokenized job title

    Returns:
        Dict
            Returns which job roles were found in the title
    '''

    words = {token.text.lower() for token in title}

    has_data = "data" in words
    has_scientist = not words.isdisjoint({"scientist", "science"})
    has_analyst = not words.isdisjoint({"analyst", "analytics", "analysis", "analysts"})
    has_machine = "machine" in words
    has_learning = "learning" in words
    has_engineer = not words.isdisjoint({"engineer", "engineering"})
    has_business = "business" in words
    has_intelligence = "intelligence" in words
    has_software = "software" in words

    ds_label = (has_data and has_scientist
                and not (has_business or has_intelligence or has_engineer))
    mle_label = has_machine and has_learning and has_engineer
    da_label = (has_data and has_analyst
                and not (has_business or has_intelligence or has_engineer))
    # The original combined the last two conditions with `*` instead of `&`;
    # plain boolean logic expresses the intended rule.
    de_label = (has_data and has_engineer
                and not (has_analyst or has_machine or has_software))
    # NOTE: the BI-analyst rule (analyst + business, without intelligence or
    # engineer) is disabled, so this label is always False.
    bia_label = False

    return {"data_scientist": ds_label,
            "data_analyst": da_label,
            "data_engineer": de_label,
            "ml_engineer": mle_label,
            "bi_analyst": bia_label}

In [None]:
def classify_title(title):
    '''
    Labels a job listing as data scientist, data analyst, data engineer,
    ml engineer, and/or bi analyst, based on the job listing's title.
    A single job listing can have zero, one, or multiple title labels.

    Adjacent two- and three-word sequences of the lower-cased title are
    compared against the patterns of each label. Only when no label matches
    this way is classify_title_based_on_parts() consulted as a fallback.

    Args:
        title : spacy.tokens.doc.Doc
            The tokenized job title

    Returns:
        Dict
            Maps each label to its patterns and a "found" flag telling
            whether that job role was found in the title
    '''

    title_labels = {
        "data_scientist": {
            "two_token": [["data", "scientist"], ["data", "science"], ["data", "scientists"]],
            "found": False
        },
        "data_analyst": {
            "two_token": [["data", "analyst"], ["data", "analysis"], ["data", "analytics"], ["data", "analysts"]],
            "found": False
        },
        "data_engineer": {
            "two_token": [["data", "engineer"], ["data", "engineering"]],
            "found": False
        },
        "ml_engineer": {
            # "software engineer" / "software engineering" are deliberately
            # not part of these patterns.
            "two_token": [["ml", "engineer"], ["ml", "engineering"],
                          ["ai", "engineer"], ["ai", "engineering"], ["ai", "developer"]],
            "three_token": [["machine", "learning", "engineer"], ["machine", "learning", "engineering"],
                            ["deep", "learning", "engineer"], ["artificial", "intelligence", "engineer"]],
            "found": False
        },
        "bi_analyst": {
            "two_token": [["bi", "analyst"], ["bi", "analysis"], ["bi", "analytics"], ["intelligence", "analyst"],
                          ["intelligence", "analysis"], ["intelligence", "analytics"], ["business", "intelligence"],
                          ["business", "analyst"], ["business", "analysis"], ["business", "analytics"], ["business", "analysts"]],
            "three_token": [["business", "intelligence", "analyst"], ["business", "intelligence", "analysis"],
                            ["business", "intelligence", "analytics"]],
            "found": False
        },
    }

    # Lower-cased words of the title and every adjacent pair / triple of them
    words = [token.text.lower() for token in title]
    pairs = [words[k:k + 2] for k in range(len(words) - 1)]
    triples = [words[k:k + 3] for k in range(len(words) - 2)]

    for spec in title_labels.values():
        pair_hit = any(pair in spec.get("two_token", ()) for pair in pairs)
        triple_hit = any(triple in spec.get("three_token", ()) for triple in triples)
        spec["found"] = pair_hit or triple_hit

    # No sequence matched at all: fall back to the looser word-combination rules
    if not any(spec["found"] for spec in title_labels.values()):
        fallback = classify_title_based_on_parts(title)
        for label in ("data_scientist", "data_analyst", "data_engineer", "ml_engineer", "bi_analyst"):
            title_labels[label]["found"] = fallback[label]

    return title_labels

In [None]:
def get_state_xor_country(location):
    '''
    This function takes a location and parses out the state code.
    If there is no state code but there is a state name, the state name is mapped to
    its state code.
    If there is no state code and no state name, it is either a country, "anywhere", or null.

    Resolution order:
        1. Non-string input (e.g. NaN/None) and blank strings give NaN.
        2. If the location contains a standalone two-uppercase-letter token, the
           last such token is returned. It is not validated against the list
           of state codes.
        3. Otherwise the leading word(s) are looked up as a state name: the
           first two words when the location has more than one word (more than
           three words when the text contains "other"), else the first word
           only. A known state name is mapped to its code; anything else is
           returned unchanged.

    Args:
        location : Str
            The location of the job for a particular job listing

    Returns:
        Str
            Returns the state code, or the unmapped leading word(s) of the
            location (NaN when the location is missing or blank)
    '''

    pattern = r"\b[A-Z]{2}\b"

    state_mapping = {
        'Alabama': 'AL',
        'Alaska': 'AK',
        'Arizona': 'AZ',
        'Arkansas': 'AR',
        'California': 'CA',
        'Colorado': 'CO',
        'Connecticut': 'CT',
        'Delaware': 'DE',
        'Florida': 'FL',
        'Georgia': 'GA',
        'Hawaii': 'HI',
        'Idaho': 'ID',
        'Illinois': 'IL',
        'Indiana': 'IN',
        'Iowa': 'IA',
        'Kansas': 'KS',
        'Kentucky': 'KY',
        'Louisiana': 'LA',
        'Maine': 'ME',
        'Maryland': 'MD',
        'Massachusetts': 'MA',
        'Michigan': 'MI',
        'Minnesota': 'MN',
        'Mississippi': 'MS',
        'Missouri': 'MO',
        'Montana': 'MT',
        'Nebraska': 'NE',
        'Nevada': 'NV',
        'New Hampshire': 'NH',
        'New Jersey': 'NJ',
        'New Mexico': 'NM',
        'New York': 'NY',
        'North Carolina': 'NC',
        'North Dakota': 'ND',
        'Ohio': 'OH',
        'Oklahoma': 'OK',
        'Oregon': 'OR',
        'Pennsylvania': 'PA',
        'Rhode Island': 'RI',
        'South Carolina': 'SC',
        'South Dakota': 'SD',
        'Tennessee': 'TN',
        'Texas': 'TX',
        'Utah': 'UT',
        'Vermont': 'VT',
        'Virginia': 'VA',
        'Washington': 'WA',
        'West Virginia': 'WV',
        'Wisconsin': 'WI',
        'Wyoming': 'WY',
    }

    if not isinstance(location, str):
        return np.nan

    # Run the regex once. Any match is exactly two letters long, so the last
    # match is returned as-is. The former check against a list of state codes
    # could never run, and that list was None anyway (list.append returns None).
    codes = re.findall(pattern, location)
    if codes:
        return codes[-1]

    words = location.split()
    if not words:
        # Blank location: nothing to parse (this used to raise IndexError).
        return np.nan

    # Locations containing "other" carry extra trailing words, so they need
    # more than three words before the first two are treated as the name.
    # NOTE(review): presumably this targets suffixes like "(+1 other)"; confirm against the data.
    word_threshold = 3 if "other" in location else 1
    if len(words) > word_threshold:
        name = words[0] + " " + words[1]
    else:
        name = words[0]

    return state_mapping.get(name, name)

In [None]:
def find_job_levels(title):
    '''
    Assigns job levels based on words and numbers in the title.

    Every lower-cased word of the title is compared against each level's
    "one_token" patterns, and every adjacent pair of words against its
    "two_token" patterns.

    Args:
        title : spacy.tokens.doc.Doc
            The tokenized job title

    Returns:
        Dict
            Maps each job level to its patterns and a "found" flag telling
            whether that level was found in the title
    '''

    job_levels = {
        "junior": {
            "one_token": ["junior", "jr"],
            "found": False
        },
        "senior": {
            "one_token": ["senior", "sr"],
            "found": False
        },
        "principal": {
            "one_token": ["principal"],
            "found": False
        },
        "lead": {
            "one_token": ["lead"],
            "found": False
        },
        "i": {
            "one_token": ["i", "1"], # maybe remove the number
            "found": False
        },
        "ii": {
            "one_token": ["ii", "2"],
            "found": False
        },
        "iii": {
            "one_token": ["iii", "3"],
            "found": False
        },
        "vice_president": {
            "one_token": ["vp"],
            "two_token": [["vice", "president"]],
            "found": False
        },
        "entry_level": {
            "two_token": [["entry", "level"]],
            "found": False
        },
        "internship": {
            "one_token": ["intern", "internship"],
            "found": False
        },
    }

    # Lower-cased words of the title and every adjacent pair of them
    words = [token.text.lower() for token in title]
    pairs = [words[k:k + 2] for k in range(len(words) - 1)]

    for spec in job_levels.values():
        single_hit = any(word in spec.get("one_token", ()) for word in words)
        pair_hit = any(pair in spec.get("two_token", ()) for pair in pairs)
        spec["found"] = single_hit or pair_hit

    return job_levels

In [None]:
def get_title_labels(title):
    '''
    Returns the title labels for a job title.

    Punctuation is replaced by spaces before the title is tokenized and
    classified with classify_title().

    Args:
        title : Str
            The title string

    Returns:
        Tuple
            Boolean values telling which job roles were identified in the
            title, in the order: data scientist, data analyst, data engineer,
            ml engineer, bi analyst
    '''

    # Every character that is not a letter, digit or whitespace becomes a space.
    cleaned_title = re.sub(r"[^a-zA-Z0-9\s]", " ", title)

    labels = classify_title(nlp(cleaned_title))

    role_order = ("data_scientist", "data_analyst", "data_engineer", "ml_engineer", "bi_analyst")
    return tuple(labels[role]["found"] for role in role_order)

In [None]:
def get_job_levels(title):

    '''
    This function returns the job levels for a job title.

    As in get_title_labels(), punctuation is replaced by spaces before the
    title is tokenized, so that forms such as "Jr.", "Entry-Level" or
    "Vice-President" are split into the plain words the level patterns expect.

    Args:
        title : Str
            The title string

    Returns:
        Tuple
            Returns tuple of boolean values corresponding to which job levels were identified in the title,
            in the order: junior, senior, principal, lead, i, ii, iii,
            vice president, entry level, internship
    '''

    # Replaces all characters in the string title that are not alphanumeric or whitespace with a space.
    # NOTE(review): spaCy appears to keep some abbreviations (e.g. "Jr.") as a
    # single token including the period, which would never equal "jr" - confirm.
    title = re.sub(r"[^a-zA-Z0-9\s]", " ", title)

    job_levels = find_job_levels(nlp(title))
    return (
        job_levels["junior"]["found"],
        job_levels["senior"]["found"],
        job_levels["principal"]["found"],
        job_levels["lead"]["found"],
        job_levels["i"]["found"],
        job_levels["ii"]["found"],
        job_levels["iii"]["found"],
        job_levels["vice_president"]["found"],
        job_levels["entry_level"]["found"],
        job_levels["internship"]["found"]
    )

In [None]:
# Where the scraped job listings live and which database user reads them.
schema = 'my_schema'
table = 'my_table'
username = 'username'

# Load the entire table into a dataframe; query_database() asks for the
# password interactively.
sql_query = 'SELECT * FROM {}.{};'.format(schema, table)
df = query_database(sql_query, username)
# (rows, columns) of the raw data
df.shape

In [None]:
# Remove duplicate job listings with the same description, location, and
# company name, keeping the first occurrence of each.
unique_rows = df.drop_duplicates(subset=['description', 'location', 'company_name'], keep="first")
unique_rows = unique_rows.reset_index(drop=True)
unique_rows.shape

In [None]:
# Search each de-duplicated listing for the 221 words pertaining to the data
# industry, then renumber the rows of the result from zero.
results = get_search_results(unique_rows).reset_index(drop=True)
results.shape

In [None]:
# Get job labels: get_title_labels() returns one tuple of five flags per
# title; zip(*...) transposes those tuples into one sequence per label column.
title_flags = results['title'].apply(get_title_labels)
(
    results['ds_label'],
    results['da_label'],
    results['de_label'],
    results['mle_label'],
    results['bia_label'],
) = zip(*title_flags)


In [None]:
# Get states xor countries: reduce each listing's location to a state code,
# or to the leading word(s) of the location when no state is recognised.
locations = results['location']
results['state_xor_country'] = locations.apply(get_state_xor_country)


In [None]:
# Get job levels: get_job_levels() returns one tuple of ten flags per title;
# zip(*...) transposes those tuples into one sequence per level column.
level_flags = results['title'].apply(get_job_levels)
(
    results['junior'],
    results['senior'],
    results['principal'],
    results['lead'],
    results['level_i'],
    results['level_ii'],
    results['level_iii'],
    results['vice_president'],
    results['entry_level'],
    results['internship'],
) = zip(*level_flags)


In [None]:
# Get number of job labels: how many of the five title labels each listing received.
job_label_columns = ['ds_label', 'da_label', 'de_label', 'mle_label', 'bia_label']
results['num_job_labels'] = results[job_label_columns].astype(int).sum(axis=1)


In [None]:
# Get number of education labels: how many of the three degree flags each listing has.
education_columns = ['bachelors', 'masters', 'phd']
results['num_edu_labels'] = results[education_columns].astype(int).sum(axis=1)


In [None]:
# Export the finished results as a csv file for analysis; index=False leaves
# the dataframe's row index out of the file.
file_path = '/my_filepath/results.csv'
results.to_csv(file_path, index=False)
