In [1]:
import numpy as np
import pickle
import os

In [2]:
# Save the file in the same folder as the script
def save(final_file, job_dict):
    """Pickle ``job_dict`` to ``<final_file>.p`` and report the outcome on stdout.

    Args:
        final_file: Target path without the ``.p`` extension (it is appended here).
        job_dict: Object to serialize. ``None`` is rejected with a message and
            nothing is written.

    Returns:
        None. Any exception raised while writing is caught and printed
        rather than propagated.
    """
    if job_dict is None:
        print("Error: job_dict is None. Nothing to save.")
        return

    try:
        target_path = f'{final_file}.p'
        with open(target_path, 'wb') as handle:
            pickle.dump(job_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print(f"Data successfully saved to {target_path}")
    except Exception as err:
        print(f"Error saving file: {err}")

# Load the file from the same folder as the script
def load(file_name):
    """Unpickle and return the object stored at ``file_name``.

    Args:
        file_name: Full path of the pickle file, including its extension.

    Returns:
        The unpickled object, or an empty dict when the file does not exist
        or reading it raises an exception. A status message is printed in
        every case.
    """
    if os.path.exists(file_name):
        try:
            with open(file_name, 'rb') as handle:
                payload = pickle.load(handle)
            print(f"Data successfully loaded from {file_name}")
            return payload
        except Exception as err:
            print(f"Error loading file: {err}")
            return {}

    # Nothing on disk yet: report it and hand back an empty container.
    print(f"{file_name} does not exist. Starting fresh.")
    return {}

In [3]:
# Load the pickled job dictionary; load() returns {} if the file is missing or unreadable.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
loaded_job_dict = load("job_dict_full_usa.p")

Data successfully loaded from job_dict_full_usa.p


In [5]:
# Peek at a single entry to check the structure of the loaded data.
# load() falls back to an empty dict when the file is missing or unreadable,
# so guard against having nothing to preview (next() on an empty iterator
# would raise StopIteration).
if loaded_job_dict:
    first_key = next(iter(loaded_job_dict))
    first_value = loaded_job_dict[first_key]

    print(f"URL: {first_key}")
    print(f"Data: {first_value}")
else:
    first_key = first_value = None
    print("loaded_job_dict is empty. Nothing to preview.")

URL: https://www.linkedin.com/jobs/view/4030963908/?eBP=CwEAAAGSRiAlnNccT2c14KIeiUtm-RaNdHLhuYx36SW8dLApHAc2Fwv46AAtOhxQDgMSxM0665BkEaYiaRoe7PvPPo2LKbErhgwsblbH78A9S7JrwMNIoLUx7ctdjhnhfbFIpv6kzqnKgHuFlLpNqL5j_f2gWd5tsuyddLj1_uATdcsE3AcB6G0k-Fp0ZuKQTMZSUnUJ8sS2JwiGEYkVYq0DSdv4NOeNn0S5wRwheRwcMdnFlMCzagsfV8Jg0ph5E4kD1c783ncF67m91iJUhzCneujDF-azXUmVb7Cbbpd8e5yMDNwWaTCkoU1zLBnFmiiXLzdgJyNs47HPZUgmnGydp9pU-CCeIOOOv9Y19mmSXaxOEh6VsrKAxrv2KOKuh0tjxs2-uVVB-slTLxDJlHn19RYvJt6wnfpwWZhscmGs_Mw94FRIj_E-XesTQmgbEIJv_HBVNgncF4-62k2aV7RpQHGjb_81wA&refId=vwSTfUVIpwofAdTjA650%2Bg%3D%3D&trackingId=HdHITNx6OWnhDY0QiTuLIw%3D%3D&trk=flagship3_search_srp_jobs
Data: {'title': 'Director Search Engine Optimization', 'location': 'Wayne, PA (Hybrid)', 'description': "About the job\nSEO Director - Technical Focus\n\nWe're seeking an experienced SEO Director to lead our agency's search engine optimization efforts, with a particular emphasis on technical SEO. In this role, you'll drive organic growth for our client

### `JobLocationParser` Class

Parses job location details and categorizes them into city, state, country, and place of work.

---

#### `__init__()` Method

Initializes US state abbreviations, Canadian province abbreviations, and supported countries.

---

#### `extract_place_of_work(location)` Method

Extracts and removes a parenthesized work type (`(Hybrid)`, `(On-site)`, `(Remote)`) from the location string. Returns the work type (`NA` if none is found) and the cleaned location.

---

#### `is_state_or_province(location_part)` Method

Determines whether a location part is a US state abbreviation or a Canadian province (full name or abbreviation). Returns a tuple of the state/province abbreviation and the inferred country, or `(None, None)` if no match is found.

---

#### `categorize_location(location)` Method

Splits the location into city, state, and country. Checks for a valid state/province and adds country information if applicable. Returns the categorized data.

---

#### `process_job_locations(job_dict)` Method

Processes job location data in `job_dict`, categorizing each job's location into `city`, `state`, `country`, and `place_of_work`. Updates the job data with the new fields.


In [15]:
import re

class JobLocationParser:
    """Categorize free-text job locations into city, state/province, country and place of work.

    Recognizes two-letter US postal abbreviations and Canadian provinces
    (by full name or abbreviation). Any component that cannot be determined
    is reported as ``"All"``; an absent work type is reported as ``"NA"``.
    """

    def __init__(self):
        """Set up the lookup tables used for matching location parts."""
        # Two-letter US postal abbreviations (the list also carries DC and
        # non-state codes such as PR and GU).
        self.us_states = ['IA', 'KS', 'UT', 'VA', 'NC', 'NE', 'SD', 'AL', 'ID', 'FM', 'DE', 'AK', 'CT', 'PR', 'NM', 'MS', 'PW', 'CO', 'NJ', 'FL', 'MN', 'VI', 'NV', 'AZ', 'WI', 'ND', 'PA', 'OK', 'KY',
                          'RI', 'NH', 'MO', 'ME', 'VT', 'GA', 'GU', 'AS', 'NY', 'CA', 'HI', 'IL', 'TN', 'MA', 'OH', 'MD', 'MI', 'WY', 'WA', 'OR', 'MH', 'SC', 'IN', 'LA', 'MP', 'DC', 'MT', 'AR', 'WV', 'TX']
        # Canadian province/territory name -> abbreviation.
        self.can_province_abbrev = {
            'Alberta': 'AB',
            'British Columbia': 'BC',
            'Manitoba': 'MB',
            'New Brunswick': 'NB',
            'Newfoundland and Labrador': 'NL',
            'Northwest Territories': 'NT',
            'Nova Scotia': 'NS',
            'Nunavut': 'NU',
            'Ontario': 'ON',
            'Prince Edward Island': 'PE',
            'Quebec': 'QC',
            'Saskatchewan': 'SK',
            'Yukon': 'YT',
            'Labrador': 'NL',  # Labrador part of Newfoundland and Labrador
            'Newfoundland': 'NL'  # Newfoundland part of Newfoundland and Labrador
        }
        # Country names recognized verbatim in a location string.
        self.countries = ['USA', 'Canada']

    def extract_place_of_work(self, location):
        """Pull a parenthesized work type out of ``location``.

        Args:
            location: Location text such as ``"Wayne, PA (Hybrid)"``.

        Returns:
            ``(work_type, cleaned_location)`` where ``work_type`` is
            ``"Hybrid"``, ``"On-site"`` or ``"Remote"`` (first one found, in
            that order) and the marker has been removed from the location;
            ``("NA", location)`` when no marker is present.
        """
        work_types = ['Hybrid', 'On-site', 'Remote']
        for work_type in work_types:
            marker = f"({work_type})"
            if marker in location:
                # Remove place of work from location
                return work_type, location.replace(marker, "").strip()
        return "NA", location

    def is_state_or_province(self, location_part):
        """Match ``location_part`` against the known states and provinces.

        The comparison is exact (case-sensitive): US entries match by
        abbreviation only, Canadian entries by full name or abbreviation.

        Returns:
            ``(abbreviation, country)`` on a match, otherwise ``(None, None)``.
        """
        if location_part in self.us_states:
            return location_part, 'USA'
        for province, abbrev in self.can_province_abbrev.items():
            if location_part == province or location_part == abbrev:
                return abbrev, 'Canada'
        return None, None

    def categorize_location(self, location):
        """Break a location string into city, state, country and place of work.

        Args:
            location: Text such as ``"Wayne, PA (Hybrid)"``. ``None`` or any
                non-string value is treated as an empty location.

        Returns:
            Dict with keys ``city``, ``state``, ``country`` (each ``"All"``
            when unknown) and ``place_of_work`` (``"NA"`` when unknown).
        """
        # Missing or non-text locations fall through to the defaults instead of raising.
        if not isinstance(location, str):
            location = ""

        # Step 1: Extract place of work (Hybrid/On-site/Remote) and clean location
        place_of_work, location = self.extract_place_of_work(location)

        # Step 2: Split by comma to extract city, state/region, and potentially country
        parts = [part.strip() for part in location.split(",")]
        city, state, country = "All", "All", "All"  # Default values

        if len(parts) >= 2:  # "City, Region[, Country]"
            city = parts[0] or "All"
            region = parts[1]

            # Try the whole region first so multi-word provinces such as
            # "British Columbia" match; then fall back to its first word,
            # which covers a state followed by extra text ("PA 19087").
            # The emptiness check avoids indexing into an empty split result.
            matched_state, inferred_country = self.is_state_or_province(region)
            if matched_state is None and region:
                matched_state, inferred_country = self.is_state_or_province(region.split()[0])

            if matched_state:
                state = matched_state
                country = inferred_country
            else:
                # No state/province: accept an explicit country in any later part.
                country = next((part for part in parts[1:] if part in self.countries), "All")
        else:  # Only one part present (maybe just a state, province or country)
            only_part = parts[0]
            matched_state, inferred_country = self.is_state_or_province(only_part)

            if matched_state:
                # Single part is a valid state or province, no city
                state = matched_state
                country = inferred_country
            elif only_part in self.countries:
                country = only_part
            elif only_part:
                # No valid state or province: keep it as the city
                city = only_part

        return {
            "city": city,
            "state": state,
            "country": country,
            "place_of_work": place_of_work
        }

    def process_job_locations(self, job_dict):
        """Add ``city``, ``state``, ``country`` and ``place_of_work`` to every job.

        Args:
            job_dict: Mapping of job key to a dict of job fields. Each job's
                ``'location'`` entry is parsed; a job without one receives the
                default values.

        Returns:
            The same ``job_dict`` object; the job dicts are modified in place.
        """
        for job_data in job_dict.values():
            job_data.update(self.categorize_location(job_data.get('location')))

        return job_dict


- Initializing the class and processing the data

In [None]:
# process_job_locations() adds the new fields to each job dict in place and returns the
# same dict object, so loaded_job_dict_loc and loaded_job_dict refer to the same data.
jlp = JobLocationParser()
loaded_job_dict_loc = jlp.process_job_locations(loaded_job_dict)

In [13]:
# Preview only the first few entries; displaying the whole dictionary floods the cell output.
dict(list(loaded_job_dict.items())[:3])

{'https://www.linkedin.com/jobs/view/4030963908/?eBP=CwEAAAGSRiAlnNccT2c14KIeiUtm-RaNdHLhuYx36SW8dLApHAc2Fwv46AAtOhxQDgMSxM0665BkEaYiaRoe7PvPPo2LKbErhgwsblbH78A9S7JrwMNIoLUx7ctdjhnhfbFIpv6kzqnKgHuFlLpNqL5j_f2gWd5tsuyddLj1_uATdcsE3AcB6G0k-Fp0ZuKQTMZSUnUJ8sS2JwiGEYkVYq0DSdv4NOeNn0S5wRwheRwcMdnFlMCzagsfV8Jg0ph5E4kD1c783ncF67m91iJUhzCneujDF-azXUmVb7Cbbpd8e5yMDNwWaTCkoU1zLBnFmiiXLzdgJyNs47HPZUgmnGydp9pU-CCeIOOOv9Y19mmSXaxOEh6VsrKAxrv2KOKuh0tjxs2-uVVB-slTLxDJlHn19RYvJt6wnfpwWZhscmGs_Mw94FRIj_E-XesTQmgbEIJv_HBVNgncF4-62k2aV7RpQHGjb_81wA&refId=vwSTfUVIpwofAdTjA650%2Bg%3D%3D&trackingId=HdHITNx6OWnhDY0QiTuLIw%3D%3D&trk=flagship3_search_srp_jobs': {'title': 'Director Search Engine Optimization',
  'location': 'Wayne, PA (Hybrid)',
  'description': "About the job\nSEO Director - Technical Focus\n\nWe're seeking an experienced SEO Director to lead our agency's search engine optimization efforts, with a particular emphasis on technical SEO. In this role, you'll drive organic growth for our clients t

- Saving the Data

In [24]:
# save() appends the ".p" extension itself, so this writes job_dict_full_usa_loc.p.
save("job_dict_full_usa_loc", loaded_job_dict_loc)

Data successfully saved to job_dict_full_usa_loc.p


In [25]:
# Read the saved file back; unlike save(), load() expects the full file name including ".p".
job_dict_full_usa_loc = load("job_dict_full_usa_loc.p")

Data successfully loaded from job_dict_full_usa_loc.p


In [26]:
# Preview only the first few entries; displaying the whole dictionary floods the cell output.
dict(list(job_dict_full_usa_loc.items())[:3])

{'https://www.linkedin.com/jobs/view/4030963908/?eBP=CwEAAAGSRiAlnNccT2c14KIeiUtm-RaNdHLhuYx36SW8dLApHAc2Fwv46AAtOhxQDgMSxM0665BkEaYiaRoe7PvPPo2LKbErhgwsblbH78A9S7JrwMNIoLUx7ctdjhnhfbFIpv6kzqnKgHuFlLpNqL5j_f2gWd5tsuyddLj1_uATdcsE3AcB6G0k-Fp0ZuKQTMZSUnUJ8sS2JwiGEYkVYq0DSdv4NOeNn0S5wRwheRwcMdnFlMCzagsfV8Jg0ph5E4kD1c783ncF67m91iJUhzCneujDF-azXUmVb7Cbbpd8e5yMDNwWaTCkoU1zLBnFmiiXLzdgJyNs47HPZUgmnGydp9pU-CCeIOOOv9Y19mmSXaxOEh6VsrKAxrv2KOKuh0tjxs2-uVVB-slTLxDJlHn19RYvJt6wnfpwWZhscmGs_Mw94FRIj_E-XesTQmgbEIJv_HBVNgncF4-62k2aV7RpQHGjb_81wA&refId=vwSTfUVIpwofAdTjA650%2Bg%3D%3D&trackingId=HdHITNx6OWnhDY0QiTuLIw%3D%3D&trk=flagship3_search_srp_jobs': {'title': 'Director Search Engine Optimization',
  'location': 'Wayne, PA (Hybrid)',
  'description': "About the job\nSEO Director - Technical Focus\n\nWe're seeking an experienced SEO Director to lead our agency's search engine optimization efforts, with a particular emphasis on technical SEO. In this role, you'll drive organic growth for our clients t

# Convert job dictionary to DataFrame, clean data, and export to CSV


In [28]:
import pandas as pd

In [29]:
# Build the export table in one chain:
#   1. one row per job, indexed by the dictionary key
#   2. discard 'posted_time' / 'applicants' when they are present
#   3. move the index into a regular column and call it 'url'
df = (
    pd.DataFrame.from_dict(job_dict_full_usa_loc, orient='index')
    .drop(columns=['posted_time', 'applicants'], errors='ignore')
    .reset_index()
    .rename(columns={'index': 'url'})
)

# Persist the table as CSV, leaving out the positional row index
df.to_csv('linkedin_jobs_filtered.csv', index=False)