In [1]:
import pandas as pd
import numpy as np
from langdetect import detect
import re
import pycountry

from openpyxl import Workbook

import os
import time
import datetime

In [2]:
from bs4 import BeautifulSoup as bs
import json

In [3]:
import glob

In [4]:
# Python3 program to Grouping list 
# elements based on frequency
from collections import Counter
import itertools

In [5]:
from googletrans import Translator, constants

In [6]:
# streamlit app modules
import streamlit as st 
import sklearn
import plotly
import matplotlib

In [7]:
from joblib import Parallel, delayed

In [8]:
print('--Python module versions')

print(f'pandas=={pd.__version__}')
print(f'numpy=={np.__version__}')
print(f're=={re.__version__}')
print(f'json=={json.__version__}')

print('--Streamlit app module versions')

print(f'streamlit=={st.__version__}')
print(f'sklearn=={sklearn.__version__}')
print(f'plotly=={plotly.__version__}')
print(f'matplotlib=={matplotlib.__version__}')

--Python module versions
pandas==2.1.2
numpy==1.26.2
re==2.2.1
json==2.0.9
--Streamlit app module versions
streamlit==1.31.0
sklearn==1.3.2
plotly==5.16.1
matplotlib==3.8.2


### Diving into nested dictionaries to find navigation based on keywords

In [9]:
# Look for keyword in a dictionary and return the dictionary key that contains the keyword
def diveDictValues(keyword = None, dict_to_search = {}):
    if keyword is None:
        return None
    if dict_to_search is False:
        return None
    
    for key in dict_to_search.keys():
        try:
            if str(dict_to_search[key]).__contains__(keyword):
                return key
        except KeyError:
            continue

    return None

# Find keyword in a dictionary or list of strings
def diveListValues(keyword = None, list_to_search = []):
    if keyword is None:
        return None
    if list_to_search is False:
        return None
    
    matched_items = []
    for i in range(len(list_to_search)):
        if type(list_to_search[i]) == dict:
            if keyword in list(list_to_search[i].keys()):
                matched_items.append(i)
        elif str(list_to_search[i]).__contains__(keyword):
            matched_items.append(i)

    return matched_items

# Get instructions to iterate deeper through a nested json text file and perform certain instructions if nest level is a dictionary or list
def ProcessHtmlText(dict_to_search = None, dict_labels = []):
    if dict_to_search is None:
        return None
    
    curr_dict_level = dict_to_search
    result = None

    try:
        for i in range(len(dict_labels)):
            label = dict_labels[i]
            if label.__contains__('list'):
                if type(curr_dict_level) == list:
                    dive_result = diveListValues(label.split('-')[1], curr_dict_level)
                    if dive_result is None:
                        continue
                    elif len(dive_result) <= 0:
                        continue
                    elif len(dive_result) > 1:
                        possible_nests_to_check = {}
                        for j in dive_result:
                            possible_nests_to_check[j] = curr_dict_level[j]
                        accepted_nests = AttemptPossibleNest(possible_nests_to_check, dict_labels[i+1:])
                        dive_result = accepted_nests[-1]
                    elif len(dive_result) == 1:
                        dive_result = dive_result[0]
 
                    curr_dict_level = curr_dict_level[dive_result]
            else:
                if type(curr_dict_level) == dict:
                    try:
                        # dive_result = diveDictValues(label, curr_dict_level)
                        # if dive_result is None:
                        #     continue
                        
                        # print(f"label: {label}; result: {dive_result}")
                        curr_dict_level = curr_dict_level[label]
                    except KeyError:
                        continue

            if label == dict_labels[-1]:
                result = curr_dict_level
    except ValueError:
        return None

    return result

# To get ahead of the arbitrary header names, attempt the neighbouring nest level to look for the interested keyword
# Similar to a search binary tree, search through the node of neighbours to find the interested keyword
def AttemptPossibleNest(possible_nest_dict = {}, next_labels = []):
    accepted_nests = []
    for key in possible_nest_dict.keys():
        nest_to_dive = possible_nest_dict[key]
        
        # print('nesting', key, next_labels, nest_to_dive)
        result_check = ProcessHtmlText(nest_to_dive, dict_labels=next_labels)
        if result_check is not None:
            accepted_nests.append(key)
    
    return accepted_nests

# Parse HTML text into XML and iterate through ids to get the expected value
def diveHtmlTags(html_body, toplevel_tag = None, dict_labels = [], multi = False, find_keyword = None):
    soup = bs(html_body, 'lxml')
    if toplevel_tag is None:
        return None
    if dict_labels is False:
        return None
    
    ids = [ids['id'] for ids in soup.find_all(toplevel_tag, id=True)]
    
    result = None
    if multi == False:
        for id_key in ids:
            long_text = soup.find(id = id_key).text
            try:
                json_dict = json.loads(long_text, strict = False)

                if json_dict is None:
                    continue
                
                result_check = ProcessHtmlText(json_dict, dict_labels=dict_labels)
                if result_check is not None:
                    if find_keyword is None:
                        result = result_check
                    elif result_check.__contains__(find_keyword):
                        # print('Found "employees"')
                        # print(result_check)
                        # print('--')
                        result = result_check
                    else:
                        result = result_check     


            except ValueError:  # includes simplejson.decoder.JSONDecodeError
                # print(f'Decoding JSON has failed at {id_key}')
                continue
    else:
        result = []
        for id_key in ids:
            long_text = soup.find(id = id_key).text
            try:
                json_dict = json.loads(long_text, strict = False)

                if json_dict is None:
                    continue
                
                result_check = ProcessHtmlText(json_dict, dict_labels=dict_labels)
                if result_check is not None:
                    print('rc')
                    print(f"id: {id_key}")
                    print(result_check)
                    print('--')
                    result.append(result_check)
            except ValueError:  # includes simplejson.decoder.JSONDecodeError
                print(f'Decoding JSON has failed at {id_key}')
                continue

    return result


### Obtaining details/information in job details from posts

<details>

<summary> List of details </summary>

    - Job Post ID
    - Post Status
    - Post Link
    - Job Title
    - Job Description
    - Description Language
    - Remote Option
    - Repost Status
    - Job Location
    - Job Top Card Details
    - Experience Level
    - Work Setting
    - Employment Type
    - Skill Requirements
    - Associated Skill Phrases
    - Unlisted Skills
    - Unlisted Skill Phrases
    - Company ID
    - Company Name
    - Company Link
    - Company Top Card Details
    - Company Size
    - Company Industry
    - Company Description
</details>

In [10]:
# Get Job Post ID
def GetJobPostingID(html_body):
    try:
        # soup = bs(html_body)
        # # id: bpr-guid-15912216
        # id_key = 'bpr-guid-15912216'
        # long_text = soup.find(id = id_key).text

        # json_dict = json.loads(long_text, strict = False)
        # return json_dict['data']['jobPostingId']
        return diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['data', 'jobPostingId'])
    except:
        return None

# Get Job Post Status
def GetPostStatus(html_body):
    try:
        # soup = bs(html_body)
        # # id: bpr-guid-15912216
        # id_key = 'bpr-guid-15912216'
        # long_text = soup.find(id = id_key).text

        # json_dict = json.loads(long_text, strict = False)
        # return json_dict['data']['jobState']
        return diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['data', 'jobState'])
    except:
        return None

# Get Job Post Date
def GetPostDate(html_body):
    try:
        soup = bs(html_body)
        post_date = soup.find('div', {'class': 'jobs-description__content jobs-description-content jobs-description__content--condensed'}).find('p', {'class': 't-black--light t-14 mt4'}).text.replace('Posted on ', '').replace('.', '')
        return datetime.datetime.strptime(post_date, "%b %d, %Y").strftime('%Y-%m-%d')
    except:
        return ''

# Get Link of Job Post on LinkedIn
def GetPostLink(url_link = None, job_id = None):
    try:
        if url_link is not None:
            return url_link
        if job_id is not None:
            return f'https://www.linkedin.com/jobs/view/{job_id}/'
    except:
        return None
    
    return None

# Get Job Title
def GetJobTitle(html_body):
    try:
        # soup = bs(html_body)
        # # id: bpr-guid-15912216
        # id_key = 'bpr-guid-15912216'
        # long_text = soup.find(id = id_key).text

        # json_dict = json.loads(long_text, strict = False)
        # return json_dict['data']['title']
        return diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['data', 'title'])
    except:
        return None

# Get Job Description
def GetJobDescription(html_body):
    try:
        # soup = bs(html_body)
        # # id: bpr-guid-15912216
        # id_key = 'bpr-guid-15912216'
        # long_text = soup.find(id = id_key).text

        # json_dict = json.loads(long_text, strict = False)
        # return json_dict['data']['description']['text']
        return diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['data', 'description', 'text'])
    except:
        return None

# Get the language of the Job Description
def GetDescriptionLanguage(jd_text):
    try:
        return detect(jd_text)
    except:
        return ''

# Get Job Remote Option
def GetRemoteWorkStatus(html_body):
    try:
        # soup = bs(html_body)
        # # id: bpr-guid-15912216
        # id_key = 'bpr-guid-15912216'
        # long_text = soup.find(id = id_key).text

        # json_dict = json.loads(long_text, strict = False)
        # return json_dict['data']['workRemoteAllowed']
        return diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['data', 'workRemoteAllowed'])
    except:
        return None

# Get Job Reposted Status
def GetRepostStatus(html_body):
    try:
        # soup = bs(html_body)
        # # id: bpr-guid-15912216
        # id_key = 'bpr-guid-15912216'
        # long_text = soup.find(id = id_key).text

        # json_dict = json.loads(long_text, strict = False)
        # return json_dict['data']['repostedJobPosting']
        return diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['data', 'repostedJobPosting'])
    except:
        return None

# Get Job Location
def GetJobCountry(html_body):
    try:
        # soup = bs(html_body)
        # # id: bpr-guid-15912216
        # id_key = 'bpr-guid-15912216'
        # long_text = soup.find(id = id_key).text

        # json_dict = json.loads(long_text, strict = False)
        # return json_dict['data']['formattedLocation']
        # return diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['data', 'formattedLocation'])

        country_code = diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['included', 'list-countryISOCode', 'countryISOCode'])
        if country_code is not None:
            country_name = pycountry.countries.get(alpha_2 = country_code).name

            return country_name
        else:
            return  diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['included', 'list-countryISOCode', 'abbreviatedLocalizedName'])
    except:
        return None

# Get Job Top Card Details
# Example: ['Remote', 'Full-time', 'Entry level']
def GetJobTopCardDetails(html_body):
    try:
        # soup = bs(html_body)
        # # id: bpr-guid-15912217
        # id_key = 'bpr-guid-15912217'
        # long_text = soup.find(id = id_key)
        # json_dict = json.loads(long_text.text, strict = False)
        # return [t['text']['text'] for t in json_dict['included'][4]['jobInsightsV2ResolutionResults'][0]['jobInsightViewModel']['description']]
        dive_result = diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['included', 'list-jobInsightsV2ResolutionResults', 'jobInsightsV2ResolutionResults','list-jobInsightViewModel', 'jobInsightViewModel', 'description'])
        # print('dr type', type(dive_result))
        # print(dive_result.keys())
        # print(dive_result)
        # print([t.keys() for t in dive_result])
        return [t['text']['text'] for t in dive_result]
    except:
        return None

# Get Experience Level
# Example: ['Internship', 'Entry', 'Associate', 'Mid-Senior', 'Director', 'Executive']
def GetJobExperienceLevel(html_body, details = []):
    try:
        if details is True:
            for keyword in ['Internship', 'Entry', 'Associate', 'Mid-Senior', 'Director', 'Executive']:
                for detail in details:
                    if keyword in detail:
                        return detail
                    
        details = GetJobTopCardDetails(html_body)
        for keyword in ['Internship', 'Entry', 'Associate', 'Mid-Senior', 'Director', 'Executive']:
            for detail in details:
                if keyword in detail:
                    return detail
    except:
        return None
    
    return None

# Get Work Setting Placement
# Example: ['Hybrid', 'Remote', 'On-site']
def GetJobWorkSetting(html_body, details = []):
    try:
        if details is True:
            for keyword in ['Hybrid', 'Remote', 'On-site']:
                for detail in details:
                    if keyword in detail:
                        return detail
                    
        details = GetJobTopCardDetails(html_body)
        for keyword in ['Hybrid', 'Remote', 'On-site']:
            for detail in details:
                if keyword in detail:
                    return detail
    except:
        return None
    
    return None

# Get Employment Type
# Example: ['Full-time', 'Part-time', 'Contract', 'Temporary', 'Volunteer', 'Internship', 'Other']
def GetJobType(html_body, details = []):
    try:
        if details is True:
            for keyword in ['Full-time', 'Part-time', 'Contract', 'Temporary', 'Volunteer', 'Internship', 'Other']:
                for detail in details:
                    if keyword in detail:
                        return detail
                    
        details = GetJobTopCardDetails(html_body)
        for keyword in ['Full-time', 'Part-time', 'Contract', 'Temporary', 'Volunteer', 'Internship', 'Other']:
            for detail in details:
                if keyword in detail:
                    return detail
    except:
        return None
    
    return None

# Get Skills Listed on Job Post
def GetJobSkills(html_body):
    try:
        soup = bs(html_body)

        return [li.find('div').get_text().strip() for li in soup.find('ul', {'class': 'job-details-skill-match-status-list'}).find_all('li')]
    except:
        return None

# Get Years of Experience Mentioned in Description
def GetJobYearsExperience(html_body, job_des = None, des_lan = 'en'):
    try:
        def GetYears_RegEx(des):
            years_text = re.findall(r"([\d+-]+)\s+(years?)", des, re.IGNORECASE)
            splitted_num = []
            for clipped in years_text:
                char_to_remove = ['+', '-', ' ']
                text_to_clean = clipped[0]

                for ch in char_to_remove:
                    for el in text_to_clean.split(ch):
                        if el.isalnum() is True:
                            splitted_num = splitted_num + [int(el)]

            return np.max([year for year in list(set(splitted_num)) if year <= 15])

        if des_lan != 'en':
            translator = Translator()
            translation_dest = 'en'
            translation = translator.translate(job_des, dest=translation_dest)
            
            job_des = translation.text

        if job_des is not None:
            return GetYears_RegEx(job_des)
        
        job_des = GetJobDescription(html_body)
        if job_des is not None:
            return GetYears_RegEx(job_des)
    except:
        return None

    return None

# Get Salary for Job Post, If Available
def GetPaySalary(html_body):
    try:
        # Get Salary Pay
        soup = bs(html_body)
        id_key = 'SALARY'

        salary_text = soup.find(id = id_key).find('div', {'data-view-name': 'job-salary-card'}).find('div', {'class': 'mt4'}).find('p', {'class': 't-16'}).text.strip()
        salary_text = salary_text.replace('(from job description)', '').strip()
        salary_text = salary_text.replace('$', '').replace('£', '').replace('€', '')

        if (salary_text).__contains__('/yr'):
            salary_text = salary_text.replace('/yr', '')
            salary_ranges = salary_text.split(' - ')
            if (salary_ranges[0]).__contains__('K'):
                salary_ranges[0] = float(salary_ranges[0].replace(',', '').replace('K', '')) * 1000
            pay_salary = str(salary_ranges[0]).replace(',', '') + '/yr'
            if len(salary_ranges) > 1:
                pay_salary = pay_salary + ' - ' 
                if (salary_ranges[1]).__contains__('K'):
                    salary_ranges[1] = float(salary_ranges[1].replace(',', '').replace('K', '')) * 1000
                pay_salary = pay_salary + str(salary_ranges[1]).replace(',', '') + '/yr'
        elif (salary_text).__contains__('/month'):
            salary_text = salary_text.replace('/month', '')
            salary_ranges = salary_text.split(' - ')
            if (salary_ranges[0]).__contains__('K'):
                salary_ranges[0] = float(salary_ranges[0].replace(',', '').replace('K', '')) * 1000
            salary_ranges[0] = float(salary_ranges[0].replace(',', '')) * 12
            pay_salary = str(salary_ranges[0]) + '/yr'
            if len(salary_ranges) > 1:
                pay_salary = pay_salary + ' - ' 
                if (salary_ranges[1]).__contains__('K'):
                    salary_ranges[1] = float(salary_ranges[1].replace(',', '').replace('K', '')) * 1000
                salary_ranges[1] = float(salary_ranges[1].replace(',', '')) * 12
                pay_salary = pay_salary + str(salary_ranges[1]).replace(',', '') + '/yr'
        elif (salary_text).__contains__('/hr'):
            salary_text = salary_text.replace('/hr', '')
            salary_ranges = salary_text.split(' - ')
            if (salary_ranges[0]).__contains__('K'):
                salary_ranges[0] = float(salary_ranges[0].replace(',', '').replace('K', '')) * 1000
            salary_ranges[0] = float(salary_ranges[0].replace(',', '')) * 1920
            pay_salary = str(salary_ranges[0]) + '/yr'
            if len(salary_ranges) > 1:
                pay_salary = pay_salary + ' - ' 
                if (salary_ranges[1]).__contains__('K'):
                    salary_ranges[1] = float(salary_ranges[1].replace(',', '').replace('K', '')) * 1000
                salary_ranges[1] = float(salary_ranges[1].replace(',', '')) * 1920
                pay_salary = pay_salary + str(salary_ranges[1]) + '/yr'
        else:
            pay_salary = salary_text.replace('K', ',000').replace(',', '')

        return pay_salary
        # return salary_text
    except:
        return None
    
# Get Country Code
def GetCountryCode(html_body):
    try:
        country_code = diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['included', 'list-countryISOCode', 'countryISOCode'])
        if type(country_code) == str:
            return country_code
    except:
        return None
    
# Get Location Localized Name
def GetLocalizedLocation(html_body):
    try:
        localized_name = diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['included', 'list-abbreviatedLocalizedName', 'abbreviatedLocalizedName'])
        if type(localized_name) == str:
            return localized_name
    except:
        return None
    
# Get Country Name of Job Post
def GetLocalizedLocation(html_body):
    try:
        localized_name = diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['included', 'list-abbreviatedLocalizedName', 'abbreviatedLocalizedName'])
        if type(localized_name) == str:
            return localized_name
    except:
        return None

# Get Company ID
def GetCompanyID(html_body):
    try:
        # soup = bs(html_body)
        # # id: bpr-guid-15912216
        # id_key = 'bpr-guid-15912216'
        # long_text = soup.find(id = id_key).text

        # json_dict = json.loads(long_text, strict = False)
        # return json_dict['data']['companyDetails']['company']
        company_text = diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['data', 'companyDetails', 'company'])
        return company_text.split('urn:li:fs_normalized_company:')[1]
    except:
        return None

# Get Company Name
def GetCompanyName(html_body):
    try:
        # soup = bs(html_body)
        # # id: bpr-guid-15912216
        # id_key = 'bpr-guid-15912216'
        # long_text = soup.find(id = id_key).text

        # json_dict = json.loads(long_text, strict = False)
        # return json_dict['included'][1]['name']
        company_name = diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['included', 'list-name', 'name'])
        if type(company_name) == str:
            return company_name
        
        return None
    except:
        return None

# Get Company Link
def GetCompanyLink(html_body):
    try:
        # soup = bs(html_body)
        # # id: bpr-guid-15912216
        # id_key = 'bpr-guid-15912216'
        # long_text = soup.find(id = id_key).text

        # json_dict = json.loads(long_text, strict = False)
        # return json_dict['included'][1]['url']
        company_link = diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['included', 'list-url', 'url', 'list-primaryDescription', 'primaryDescription', 'attributesV2', 'list-detailData', 'detailData', 'hyperlink'])
        if type(company_link) == str:
            return company_link
        
        return None
    except:
        return None
    
# Get Company Top Card Details
# Example: ['11-50 employees', 'Software Development']
def GetCompanyTopCardDetails(html_body):
    soup = bs(html_body, 'lxml')
    # id: bpr-guid-15912217
    # id_key = 'bpr-guid-15912217'
    # long_text = soup.find(id = id_key).text

    # json_dict = json.loads(long_text, strict = False)
    # return json_dict['included'][4]['jobInsightsV2ResolutionResults'][1]['insightViewModel']['text']['text'].split(' · ')
    # dive_result = diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['included', 'list-jobInsightsV2ResolutionResults', 'jobInsightsV2ResolutionResults','list-insightViewModel', 'insightViewModel', 'text', 'text'], multi = True)
    # return dive_result
    try:
        dive_result = diveHtmlTags(html_body, toplevel_tag='code', dict_labels = ['included', 'list-jobInsightsV2ResolutionResults', 'jobInsightsV2ResolutionResults', 'list-insightViewModel', 'insightViewModel', 'text', 'text'], find_keyword = 'employees')
        if type(dive_result) == list:
            if dive_result[0].__contains__('works here') or dive_result[0].__contains__('work here'):
                return None
            elif dive_result[0].__contains__('hiring for this job'):
                return None
            elif dive_result[0].__contains__('Actively recruiting'):
                return None
        elif type(dive_result) == str:
            if dive_result.__contains__('works here') or dive_result.__contains__('work here'):
                return None
            elif dive_result.__contains__('hiring for this job'):
                return None
            elif dive_result.__contains__('Actively recruiting'):
                return None
            
        if type(dive_result) == str:
            if dive_result.__contains__('employees') == False:
                return None
        
        return dive_result.split(' · ')
    except:
        return None
    # for result in dive_result:
    #     print('r')
    #     print(result)
    #     print('--')
    #     if str.lower(result).__contains__('employee'):
    #         dive_result = result

    # return dive_result.split(' · ')

# Get Employee Count of Company
def GetCompanySize(html_body, details = []):
    try:
        if details is True:
            for detail in details:
                if 'employees' in detail:
                    return detail
                    
        details = GetCompanyTopCardDetails(html_body)
        for detail in details:
            if 'employees' in detail:
                return detail
    except:
        return None
    
    return None

# Get Industry that Company Specializes In
def GetCompanyIndustry(html_body, details = []):
    try:
        if details is True:
            for detail in details:
                if 'employees' not in detail:
                    return detail
                    
        details = GetCompanyTopCardDetails(html_body)
        for detail in details:
            if 'employees' not in detail:
                return detail
    except:
        return None
    
    return None

# Get Company Description
def GetCompanyDescription(html_body):
    try:
        soup = bs(html_body)

        long_text = soup.find('div', {'class': 'jobs-company__box'}).find('div', {'class': 'inline-show-more-text inline-show-more-text--is-collapsed inline-show-more-text--is-collapsed-with-line-clamp'}).text.strip()
        return long_text
    except:
        return None

In [11]:
def GetHTMLFromFile(file_path, error_note = ''):
    try:
        f = open(file_path, "r")
        file_text = f.read()

        return file_text
    except:
        print(f'invalid file path: {file_path} [{error_note}]')

def ReadFiles(files_df = pd.DataFrame([], columns = ['Job ID', 'page_path', 'skills_path', 'page_file_date', 'skills_file_date'])):
    job_files = []
    for file in glob.glob("data/*.txt"):
        job_files.append(file)

        job_id = 0
        f = open(file, "r")
        file_text = f.read()
        job_id = GetJobPostingID(file_text)
        if job_id == None:
            print(f'id not found in {file}')
            continue

        if job_id not in files_df['Job ID'].values:
            file_entry = {'Job ID':job_id, 'page_path':'', 'skills_path':''}
            files_df = pd.concat([files_df, pd.DataFrame([file_entry])], ignore_index=True)
            files_df.reset_index()

        if file.__contains__('skills.txt'):
            file_row = files_df[files_df['Job ID'] == job_id]
            file_row['skills_path'] = file
            # datetime.datetime.fromtimestamp(os.path.getctime("data/data_00.txt")).strftime('%Y-%m-%d')
            file_row['skills_file_date'] = os.path.getctime(file)

            files_df[files_df['Job ID'] == job_id] = file_row
        else:
            file_row = files_df[files_df['Job ID'] == job_id]
            file_row['page_path'] = file
            file_row['page_file_date'] = os.path.getctime(file)

            files_df[files_df['Job ID'] == job_id] = file_row
    
    return files_df

### Parsing HTML text from .txt files saved in local storage and saving file paths in a dataframe

In [12]:
files_df_columns = ['Job ID', 'page_path', 'skills_path', 'page_file_date', 'skills_file_date']
files_df = pd.DataFrame([], columns = files_df_columns)

# Get the job post ID and file names of all .txt files in the /data folder
# Designating the file names to the identified post ID makes it easier to get job details
files_df = ReadFiles(files_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

id not found in data\data_1093.txt
id not found in data\data_1093_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

id not found in data\data_1366.txt
id not found in data\data_1366_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

id not found in data\data_1561.txt
id not found in data\data_1561_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

id not found in data\data_157.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

id not found in data\data_157_skills.txt
id not found in data\data_158.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

id not found in data\data_1586.txt
id not found in data\data_1586_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

id not found in data\data_158_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

id not found in data\data_1709.txt
id not found in data\data_1709_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,c

id not found in data\data_1711.txt
id not found in data\data_1711_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

id not found in data\data_1717.txt
id not found in data\data_1717_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

id not found in data\data_1755.txt
id not found in data\data_1755_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

id not found in data\data_1769.txt
id not found in data\data_1769_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,c

id not found in data\data_494.txt
id not found in data\data_494_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

id not found in data\data_497.txt
id not found in data\data_497_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

id not found in data\data_501.txt
id not found in data\data_501_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['skills_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

id not found in data\data_719.txt
id not found in data\data_719_skills.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_file_date'] = os.path.getctime(file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_row['page_path'] = file
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,c

### Declaring the job details class

In [13]:
class JobDetails():
    def __init__(self, id=''):
        self.job_id = id

    job_id = 0
    post_date = None
    status = None
    job_link = None
    job_title = None
    job_description = None
    post_language = None
    remote_option = None
    job_reposted = None
    job_topcarddetails = None
    job_experiencelevel = None
    job_experienceinyears = None
    job_worksetting = None
    job_employmenttype = None
    job_location = None
    job_country_code = None
    job_location_localized = None
    job_paysalary = None
    job_requiredskills = None
    job_associatedskillphrases = None
    job_unlistedskills = None
    job_unlistedskillphrases = None

### Declaring the company details class

In [14]:
class CompanyDetails():
    def __init__(self, id=''):
        self.company_id = id

    company_id = None
    company_name = None
    company_link = None
    company_topcarddetails = None
    company_size = None
    company_industry = None
    company_description = None

### Dataframe columns to store job and company details

In [15]:
df_columns = ['Job ID', 'Post Date', 'Status', 'Job Link', 'Job Title', 'Job Description', 'Post Language', 'Remote', 'Reposted', 'Top Card Details', 'Experience Level', 'Experience in Years', 'Work Setting', 'Job Type', 'Job Country', 'Job Skills', 'Pay Salary', 'Country Code', 'Localized Location', 'Company ID', 'Company Name', 'Company Link', 'Company Card Details', 'Company Size', 'Industry', 'Company Description']

In [16]:
# Get details from .txt files and prepare dictionary to store in a dataframe
def GetJobDetails(html_body, html_skills_body):
    posting_id = GetJobPostingID(html_body)
    post_status = GetPostStatus(html_body)
    post_date = GetPostDate(html_body)
    job_title = GetJobTitle(html_body)
    job_description = GetJobDescription(html_body)
    desc_lan = GetDescriptionLanguage(GetJobDescription(html_body))
    remote_status = GetRemoteWorkStatus(html_body)
    repost_status = GetRepostStatus(html_body)
    job_country = GetJobCountry(html_body)
    pay_salary = GetPaySalary(html_body)
    company_id = GetCompanyID(html_body)
    localized_location = None
    if GetCountryCode(html_body) is None:
        localized_location = GetLocalizedLocation(html_body)
        job_country = localized_location
    else:
        # if (str(GetLocalizedLocation(html_body)).replace(job_country, '').strip() != '') | (str(GetLocalizedLocation(html_body)).replace(job_country, '').strip() != ','):
        #     print(f'1a: {localized_location}')
        #     localized_location = str(GetLocalizedLocation(html_body)).replace(job_country, '').strip()
        # else:
        #     print(f'1b: {localized_location}')
        #     localized_location = GetLocalizedLocation(html_body).strip()
        if (str(GetLocalizedLocation(html_body)).replace(job_country, '') != '') | (str(GetLocalizedLocation(html_body)).replace(job_country, '') != ','):
            localized_location = str(GetLocalizedLocation(html_body)).replace(job_country, '').strip()
        else:
            localized_location = GetLocalizedLocation(html_body).strip()

        try:
            if localized_location[-1] == ',':
                localized_location = localized_location[:-1]
        except:
            localized_location = localized_location
        
    job_topcard_details = GetJobTopCardDetails(html_body)
    company_topcard_details = GetCompanyTopCardDetails(html_body)
    # company_topcard_details = []
    job_skills = GetJobSkills(html_skills_body)

    dict_job = {'Job ID': GetJobPostingID(html_body),
                'Post Date': GetPostDate(html_body),
                'Status': GetPostStatus(html_body),
                'Job Link': GetPostLink(url_link=None, job_id=GetJobPostingID(html_body)),
                'Job Title': GetJobTitle(html_body),
                'Job Description': GetJobDescription(html_body),
                'Post Language': GetDescriptionLanguage(GetJobDescription(html_body)),
                'Remote': GetRemoteWorkStatus(html_body),
                'Reposted': GetRepostStatus(html_body),
                'Top Card Details': GetJobTopCardDetails(html_body),
                'Experience Level': GetJobExperienceLevel(html_body, details = job_topcard_details),
                'Experience in Years': GetJobYearsExperience(html_body, job_description, desc_lan),
                'Work Setting': GetJobWorkSetting(html_body, details = job_topcard_details),
                'Job Type': GetJobType(html_body, details = job_topcard_details),
                'Job Country': job_country,
                'Job Skills': GetJobSkills(html_skills_body),
                'Pay Salary': GetPaySalary(html_body),
                'Country Code': GetCountryCode(html_body),
                'Localized Location': localized_location,
                'Company ID': GetCompanyID(html_body),
                'Company Name': GetCompanyName(html_body),
                'Company Link': GetCompanyLink(html_body),
                'Company Card Details': GetCompanyTopCardDetails(html_body),
                'Company Size': GetCompanySize(html_body, details = company_topcard_details),
                'Industry': GetCompanyIndustry(html_body, details = company_topcard_details),
                'Company Description': GetCompanyDescription(html_body)
                }
    return dict_job

# Declaring dataframe columns to store job details

# ['Job ID', 'Post Date', 'Status', 'Job Link', 'Job Title', 'Job Description', 
# 'Post Language', 'Remote', 'Reposted', 'Top Card Details', 'Experience Level', 
# 'Experience in Years', 'Work Setting', 'Job Type', 'Job Country', 'Job Skills', 'Pay Salary', 'Country Code', 'Localized Location',
# 'Company ID', 'Company Name', 'Company Link', 'Company Card Details', 'Company Size', 
# 'Industry', 'Company Description']
df_columns = ['Job ID', 'Post Date', 'Status', 'Job Link', 'Job Title', 'Job Description', 'Post Language', 'Remote', 'Reposted', 'Top Card Details', 'Experience Level', 'Experience in Years', 'Work Setting', 'Job Type', 'Job Country', 'Job Skills', 'Pay Salary', 'Country Code', 'Localized Location', 'Company ID', 'Company Name', 'Company Link', 'Company Card Details', 'Company Size', 'Industry', 'Company Description']
jobs_df = pd.DataFrame([], columns = df_columns)

def ReadFilesandGetJobDetails(files_df, file_index, df):
    index = file_index
    html_body = GetHTMLFromFile(files_df.loc[index]['page_path'], error_note=f'id:{index}, page content')
    html_skills = GetHTMLFromFile(files_df.loc[index]['skills_path'], error_note=f'id:{index}, skill content')

    job_entry = GetJobDetails(html_body, html_skills)
    if (job_entry['Post Date'] is None) | (job_entry['Post Date'] == ''):
        job_entry['Post Date'] = list(files_df[files_df['Job ID'] == job_entry['Job ID']]['page_file_date'])[0]
    return job_entry
    # df = pd.concat([df, pd.DataFrame([job_entry])], ignore_index=True)
    # df.reset_index()
    
    # return df

# 
results = Parallel(n_jobs=4)(delayed(ReadFilesandGetJobDetails)(files_df, index, jobs_df) for index in files_df.index)
for result in results:
    jobs_df = pd.concat([jobs_df, pd.DataFrame([result])], ignore_index=True)
    jobs_df.reset_index()

  jobs_df = pd.concat([jobs_df, pd.DataFrame([result])], ignore_index=True)


In [17]:
jobs_df

Unnamed: 0,Job ID,Post Date,Status,Job Link,Job Title,Job Description,Post Language,Remote,Reposted,Top Card Details,...,Pay Salary,Country Code,Localized Location,Company ID,Company Name,Company Link,Company Card Details,Company Size,Industry,Company Description
0,3772602742,1701295492.606442,LISTED,https://www.linkedin.com/jobs/view/3772602742/,Financial Data Analyst,"About Team\n\nWe are Quadcode, a fintech compa...",en,True,,"[Remote, Full-time, Mid-Senior level]",...,,,EMEA,42345997,Quadcode,https://www.linkedin.com/company/quadcodecaree...,"[501-1,000 employees, Software Development]","501-1,000 employees",Software Development,Quadcode is a fintech company specializing in ...
1,3768710281,1701295631.953394,LISTED,https://www.linkedin.com/jobs/view/3768710281/,Consultant in Data Analysis and Scientific Pro...,Your Responsibilities\n\nWe are currently look...,en,False,,"[On-site, Full-time, Mid-Senior level]",...,,DE,Berlin,10198832,Climate Analytics,https://www.linkedin.com/company/climate-analy...,"[51-200 employees, Climate Data and Analytics]",51-200 employees,Climate Data and Analytics,Climate Analytics is a global climate science ...
2,3773349346,1701295734.449802,LISTED,https://www.linkedin.com/jobs/view/3773349346/,Data-Analyst (m/w/d) in SaaS-Startup,"Wir sind cinify, ein junges Start-up, dass den...",de,True,,"[Remote, Full-time, Entry level]",...,,DE,,88277276,cinify,https://www.linkedin.com/company/cinify-io/life,"[1-10 employees, Data Infrastructure and Analy...",1-10 employees,Data Infrastructure and Analytics,cinify wurde Anfang des Jahres 2023 gegründet ...
3,3773351712,1701295567.078773,LISTED,https://www.linkedin.com/jobs/view/3773351712/,"Data Engineer/Analyst - DBT, AWS, Redshift, Py...",Orcan Intelligence are currently partnered wit...,en,False,,"[On-site, Contract]",...,,DE,Berlin,9387927,CareerAddict,https://www.linkedin.com/company/career-addict...,"[11-50 employees, Staffing and Recruiting]",11-50 employees,Staffing and Recruiting,CareerAddict is a leading career advice platfo...
4,3775291259,1701295891.976814,LISTED,https://www.linkedin.com/jobs/view/3775291259/,Junior Business Intelligence Consultant (M/W/D),Wir sind die Positive Thinking Company. Wir su...,de,False,,"[On-site, Full-time, Mid-Senior level]",...,,DE,Berlin,11444799,Positive Thinking Company by CBTW,https://www.linkedin.com/company/the-positive-...,"[1,001-5,000 employees, IT Services and IT Con...","1,001-5,000 employees",IT Services and IT Consulting,𝐀𝐛𝐨𝐮𝐭 𝐏𝐨𝐬𝐢𝐭𝐢𝐯𝐞 𝐓𝐡𝐢𝐧𝐤𝐢𝐧𝐠 𝐂𝐨𝐦𝐩𝐚𝐧𝐲As the technolo...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1750,3850929186,2024-03-10,LISTED,https://www.linkedin.com/jobs/view/3850929186/,Manager-Commercial Acquisition & Engagement M...,You Lead the Way. We’ve Got Your Back.\nWith t...,en,False,,"[Hybrid, Full-time, Mid-Senior level]",...,,IN,Gurugram,1277,American Express,https://www.linkedin.com/company/american-expr...,"[10,001+ employees, Financial Services]","10,001+ employees",Financial Services,
1751,3844057602,2024-03-06,LISTED,https://www.linkedin.com/jobs/view/3844057602/,Analyst-Product Development,You Lead the Way. We’ve Got Your Back.\n\nWith...,en,False,,"[Hybrid, Full-time, Mid-Senior level]",...,,GB,London,1277,American Express,https://www.linkedin.com/company/american-expr...,"[10,001+ employees, Financial Services]","10,001+ employees",Financial Services,
1752,3844884784,2024-03-07,LISTED,https://www.linkedin.com/jobs/view/3844884784/,Manager - Data Governance & Management,You Lead the Way. We’ve Got Your Back.\n\nWith...,en,False,,"[$90,000/yr - $165,000/yr, Hybrid, Full-time, ...",...,90000/yr - 165000/yr,US,"New York, NY",1277,American Express,https://www.linkedin.com/company/american-expr...,"[10,001+ employees, Financial Services]","10,001+ employees",Financial Services,
1753,3855874747,2024-03-15,LISTED,https://www.linkedin.com/jobs/view/3855874747/,Analyst-Risk Management,You Lead the Way. We’ve Got Your Back.\n\nWith...,en,False,,"[Hybrid, Full-time, Entry level]",...,,IN,Gurgaon,1277,American Express,https://www.linkedin.com/company/american-expr...,"[10,001+ employees, Financial Services]","10,001+ employees",Financial Services,


### Import data from Kaggle: lukebarousse/data-analyst-job-postings-google-search

In [18]:
gsearch_jobs_df = pd.read_csv('gsearch_jobs.csv', encoding='utf8')
gsearch_jobs_df.columns

Index(['Unnamed: 0', 'index', 'title', 'company_name', 'location', 'via',
       'description', 'extensions', 'job_id', 'thumbnail', 'posted_at',
       'schedule_type', 'work_from_home', 'salary', 'search_term', 'date_time',
       'search_location', 'commute_time', 'salary_pay', 'salary_rate',
       'salary_avg', 'salary_min', 'salary_max', 'salary_hourly',
       'salary_yearly', 'salary_standardized', 'description_tokens'],
      dtype='object')

In [19]:
# ['Job ID', 'Post Date', 'Status', 'Job Link', 'Job Title', 'Job Description', 
# 'Post Language', 'Remote', 'Reposted', 'Top Card Details', 'Experience Level', 
# 'Experience in Years', 'Work Setting', 'Job Type', 'Job Country', 'Job Skills', 'Pay Salary',
# 'Company ID', 'Company Name', 'Company Link', 'Company Card Details', 'Company Size', 
# 'Industry', 'Company Description']

In [20]:
# def ReadKaggleFilesandGetJobDetails(gsearch_df, file_index, df):
#     index = file_index
#     gsearch_job = gsearch_df.loc[index]

#     job_description = gsearch_job['description']
#     desc_lan = GetDescriptionLanguage(gsearch_job['description'])
#     job_topcard_details = (gsearch_job['extensions'].strip('][').replace("'", "").split(', '))[1:]
#     pay_salary = None
#     try:
#         if (gsearch_job['salary_pay']).__contains('a year'):
#             pay_salary = gsearch_job['salary_pay'].replace('–', ' - ').replace('a year', '').replace('$', '').replace('£', '').replace('€', '')
#             salary_ranges = pay_salary.split(' - ')
#             if (salary_ranges[0]).__contains('K'):
#                 salary_ranges[0] = float(salary_ranges[0]) * 1000
#             pay_salary = salary_ranges[0] + '/yr'
#             if len(salary_ranges > 1):
#                 pay_salary = pay_salary + ' - ' 
#                 if (salary_ranges[1]).__contains('K'): 
#                     salary_ranges[1] = float(salary_ranges[1]) * 1000
#                 pay_salary = pay_salary + salary_ranges[1] + '/yr'
#         elif (gsearch_job['salary_pay']).__contains('a month'):
#             pay_salary = gsearch_job['salary_pay'].replace('–', ' - ').replace('a year', '').replace('$', '').replace('£', '').replace('€', '')
#             salary_ranges = pay_salary.split(' - ')
#             if (salary_ranges[0]).__contains('K'):
#                 salary_ranges[0] = float(salary_ranges[0]) * 1000
#             salary_ranges[0] = float(salary_ranges[0]) * 12
#             pay_salary = salary_ranges[0] + '/yr'
#             if len(salary_ranges > 1):
#                 pay_salary = pay_salary + ' - ' 
#                 if (salary_ranges[1]).__contains('K'): 
#                     salary_ranges[1] = float(salary_ranges[1]) * 1000
#                 salary_ranges[1] = float(salary_ranges[1]) * 12
#                 pay_salary = pay_salary + salary_ranges[1] + '/yr'
#         elif (gsearch_job['salary_pay']).__contains('an hour'):
#             pay_salary = gsearch_job['salary_pay'].replace('–', ' - ').replace('a year', '').replace('$', '').replace('£', '').replace('€', '')
#             salary_ranges = pay_salary.split(' - ')
#             if (salary_ranges[0]).__contains('K'):
#                 salary_ranges[0] = float(salary_ranges[0]) * 1000
#             salary_ranges[0] = float(salary_ranges[0]) * 1920
#             pay_salary = salary_ranges[0] + '/yr'
#             if len(salary_ranges > 1):
#                 pay_salary = pay_salary + ' - ' 
#                 if (salary_ranges[1]).__contains('K'): 
#                     salary_ranges[1] = float(salary_ranges[1]) * 1000
#                 salary_ranges[1] = float(salary_ranges[1]) * 1920
#                 pay_salary = pay_salary + salary_ranges[1] + '/yr'
#         else:
#             pay_salary = gsearch_job['salary_pay'].replace('–', ' - ').replace('K', ',000')
#     except:
#         pay_salary = None

#     job_entry = {'Job ID': gsearch_job['job_id'],
#             # 'Post Date': datetime.datetime.strptime(gsearch_job['date_time'][:10], '%d/%m/%Y').strftime('%Y-%m-%d'),
#             'Post Date': datetime.datetime.strptime(gsearch_job['date_time'][:10], '%Y-%m-%d').strftime('%Y-%m-%d'),
#             'Status': None,
#             'Job Link': None,
#             'Job Title': gsearch_job['title'].strip(),
#             'Job Description': gsearch_job['description'],
#             'Post Language': GetDescriptionLanguage(gsearch_job['description']),
#             'Remote': gsearch_job['work_from_home'],
#             'Reposted': None,
#             'Top Card Details': gsearch_job['extensions'][1:],
#             'Experience Level': None,
#             'Experience in Years': GetJobYearsExperience(None, gsearch_job['description'], desc_lan),
#             'Work Setting': GetJobWorkSetting(None, details = job_topcard_details),
#             'Job Type': GetJobType(None, details = job_topcard_details),
#             'Job Country': str(gsearch_job['location']).strip(),
#             'Job Skills': None,
#             'Pay Salary': pay_salary,
#             'Company ID': None,
#             'Company Name': gsearch_job['company_name'],
#             'Company Link': None,
#             'Company Card Details': None,
#             'Company Size': None,
#             'Industry': None,
#             'Company Description': None
#             }

#     return job_entry
#     # df = pd.concat([df, pd.DataFrame([job_entry])], ignore_index=True)
#     # df.reset_index()
    
#     # return df

# # 
# # results = Parallel(n_jobs=4)(delayed(ReadKaggleFilesandGetJobDetails)(gsearch_jobs_df, index, jobs_df) for index in gsearch_jobs_df.index)
# # for result in results:
# #     jobs_df = pd.concat([jobs_df, pd.DataFrame([result])], ignore_index=True)
# #     jobs_df.reset_index()

# for index in gsearch_jobs_df.index:
#     gsearch_job = gsearch_jobs_df.loc[index]

#     job_description = gsearch_job['description']
#     desc_lan = GetDescriptionLanguage(gsearch_job['description'])
#     job_topcard_details = (gsearch_job['extensions'].strip('][').replace("'", "").split(', '))[1:]
#     pay_salary = None
#     try:
#         pay_salary = gsearch_job['salary_pay'].replace('–', ' - ').replace('K', ',000')
#     except:
#         pay_salary = None

#     job_entry = {'Job ID': gsearch_job['job_id'],
#             # 'Post Date': datetime.datetime.strptime(gsearch_job['date_time'][:10], '%d/%m/%Y').strftime('%Y-%m-%d'),
#             'Post Date': datetime.datetime.strptime(gsearch_job['date_time'][:10], '%Y-%m-%d').strftime('%Y-%m-%d'),
#             'Status': None,
#             'Job Link': None,
#             'Job Title': gsearch_job['title'].strip(),
#             'Job Description': gsearch_job['description'],
#             'Post Language': GetDescriptionLanguage(gsearch_job['description']),
#             'Remote': gsearch_job['work_from_home'],
#             'Reposted': None,
#             'Top Card Details': gsearch_job['extensions'][1:],
#             'Experience Level': None,
#             'Experience in Years': GetJobYearsExperience(None, gsearch_job['description'], desc_lan),
#             'Work Setting': GetJobWorkSetting(None, details = job_topcard_details),
#             'Job Type': GetJobType(None, details = job_topcard_details),
#             'Job Country': str(gsearch_job['location']).strip(),
#             'Job Skills': None,
#             'Pay Salary': pay_salary,
#             'Company ID': None,
#             'Company Name': gsearch_job['company_name'],
#             'Company Link': None,
#             'Company Card Details': None,
#             'Company Size': None,
#             'Industry': None,
#             'Company Description': None
#             }
    
#     jobs_df = pd.concat([jobs_df, pd.DataFrame([job_entry])], ignore_index=True)
#     jobs_df.reset_index()

### Getting incomplete data in columns where it is available in other rows that has commonalities

In [21]:
def FillIncompleteDetails(file_index, df):
    index = file_index
    row = jobs_df.loc[index]

    if row['Company Card Details'] is not None:
        return row

    company_name = row['Company Name']
    company_details_df = jobs_df[(jobs_df['Company Name'] == company_name) & (jobs_df['Company Name'] is not None)]

    info_index = None
    for cd_index in company_details_df.index:
        if (company_details_df.loc[cd_index]['Company Name'] is not None) & (company_details_df.loc[cd_index]['Company Card Details'] is not None):
            info_index = cd_index
            break
    
    if info_index is None:
        return row

    if row['Company Name'] is None:
        row['Company Name'] = company_details_df.loc[info_index]['Company Name']
    if row['Company ID'] is None:
        row['Company ID'] = company_details_df.loc[info_index]['Company ID']
    if row['Company Link'] is None:
        row['Company Link'] = company_details_df.loc[info_index]['Company Link']
    if row['Company Card Details'] is None:
        row['Company Card Details'] = company_details_df.loc[info_index]['Company Card Details']
    if row['Company Size'] is None:
        row['Company Size'] = company_details_df.loc[info_index]['Company Size']
    if row['Industry'] is None:
        row['Industry'] = company_details_df.loc[info_index]['Industry']
    if row['Company Description'] is None:
        row['Company Description'] = company_details_df.loc[info_index]['Company Description']
    
    return row

# 
# results = Parallel(n_jobs=4)(delayed(FillIncompleteDetails)(index, jobs_df) for index in jobs_df.index)
# for result in results:
#     df_index = jobs_df[jobs_df == result['Job ID']].index[0]
#     jobs_df.loc[df_index] = result

for index in jobs_df.index:
    row = jobs_df.loc[index]

    if row['Company Card Details'] is not None:
        continue

    company_name = row['Company Name']
    company_details_df = jobs_df[(jobs_df['Company Name'] == company_name) & (jobs_df['Company Name'] is not None)]

    info_index = None
    for cd_index in company_details_df.index:
        if (company_details_df.loc[cd_index]['Company Name'] is not None) & (company_details_df.loc[cd_index]['Company Card Details'] is not None):
            info_index = cd_index
            break
    
    if info_index is None:
        continue

    if row['Company Name'] is None:
        row['Company Name'] = company_details_df.loc[info_index]['Company Name']
    if row['Company ID'] is None:
        row['Company ID'] = company_details_df.loc[info_index]['Company ID']
    if row['Company Link'] is None:
        row['Company Link'] = company_details_df.loc[info_index]['Company Link']
    if row['Company Card Details'] is None:
        row['Company Card Details'] = company_details_df.loc[info_index]['Company Card Details']
    if row['Company Size'] is None:
        row['Company Size'] = company_details_df.loc[info_index]['Company Size']
    if row['Industry'] is None:
        row['Industry'] = company_details_df.loc[info_index]['Industry']
    if row['Company Description'] is None:
        row['Company Description'] = company_details_df.loc[info_index]['Company Description']
    
    jobs_df.loc[index] = row

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['Company Card Details'] = company_details_df.loc[info_index]['Company Card Details']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['Company Size'] = company_details_df.loc[info_index]['Company Size']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['Industry'] = company_details_df.loc[info_index]['Industry']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable

### Save the dataframe into an excel file

In [22]:
jobs_df.to_excel('linkedin_job_details.xlsx')  

### Declaring skills class

In [23]:
class SkillDetails():
    def __init__(self, skill_name = ''):
        self.skill_name = skill_name

    skill_name = None
    common_phrases = None

### Declaring dataframe columns for skill details

In [24]:
skills_columns = ['skill name', 'common phrases']

In [25]:
top_skills_dict = {}

for keyword_skill in list(set().union(*jobs_df[pd.isna(jobs_df['Job Skills']) == False]['Job Skills'].values)):
    skills = []
    
    if keyword_skill.__contains__('('):
        skills = keyword_skill.split('(')

        skills = [skill.rstrip() for skill in skills]
        skills = [skill.replace(')', '') for skill in skills]
    else:
        skills = [keyword_skill]
    
    for skill in skills:
        if keyword_skill not in top_skills_dict.keys():
            top_skills_dict[keyword_skill] = skills
        else:
            top_skills_dict[keyword_skill] = list(set().union(top_skills_dict[keyword_skill], skills))
    
skills_columns = ['skill name', 'common phrases']

top_skills_df = pd.DataFrame([], columns=skills_columns)

for key in top_skills_dict.keys():
    dict_entry = {}
    dict_entry['skill name'] = key
    dict_entry['common phrases'] = top_skills_dict[key]
    top_skills_df = pd.concat([top_skills_df, pd.DataFrame([dict_entry])], ignore_index=True)
    top_skills_df.reset_index()

top_skills_df

Unnamed: 0,skill name,common phrases
0,Wireless Engineering,[Wireless Engineering]
1,Game Engines,[Game Engines]
2,Audience Analysis,[Audience Analysis]
3,Sampling,[Sampling]
4,Proprietary Trading,[Proprietary Trading]
...,...,...
2110,Customer Support,[Customer Support]
2111,Building Performance,[Building Performance]
2112,User Documentation,[User Documentation]
2113,Stochastic Methods,[Stochastic Methods]


### Assigning common phrases to listed skills and identify unlisted skills from job description

In [26]:
jobs_df['Associated Skill Phrases'] = ''
jobs_df['Unlisted Skill'] = ''
jobs_df['Unlisted Skill Phrases'] = ''

def get_listed_skill_phrases(row, skills_df):
    associated_skills = []

    if row['Job Skills'] is None:
        return row

    for skill in row['Job Skills']:
        # print(skill)
        # print(top_skills_df[top_skills_df['skill name'] == skill]['common phrases'].values)
        # print(associated_skills)
        associated_skills.append(top_skills_df[top_skills_df['skill name'] == skill]['common phrases'].tolist()[0])
        # associated_skills.append(top_skills_df[top_skills_df['skill name'] == skill]['common phrases'].values[0])
        # associated_skills = associated_skills + top_skills_df[top_skills_df['skill name'] == skill]['common phrases'].values
    # return list(set().union(*associated_skills))
    
    row['Associated Skill Phrases'] = list(set().union(*associated_skills))
   
    return row
    
def get_unlisted_skill_phrases(row, skills_df):
    unlisted_skills = []
    unlisted_skill_phrases = []

    for index in skills_df.index:
        if row['Job Skills'] is not None:
            if skills_df.loc[index]['skill name'] in row['Job Skills']:
                continue
        
        for phrase in skills_df.loc[index]['common phrases']:
            if phrase in row['Job Description'].split(' '):
                unlisted_skills.append(skills_df.loc[index]['skill name'])
                unlisted_skill_phrases.append(phrase)
    
    # ret_dict = {}
    # ret_dict['Unlisted Skill'] = unlisted_skills
    # ret_dict['Unlisted Skill Phrases'] = unlisted_skill_phrases
    # return ret_dict

    row['Unlisted Skill'] = unlisted_skills
    row['Unlisted Skill Phrases'] = unlisted_skill_phrases

    return row

def GetAllSkillInformation(top_skills_df, index, df):
    skill_row = df.loc[index]
    skill_row['Associated Skill Phrases'] = get_listed_skill_phrases(skill_row, top_skills_df)
    unlisted_skills_dict = get_unlisted_skill_phrases(skill_row, top_skills_df)
    skill_row['Unlisted Skill'] = unlisted_skills_dict['Unlisted Skill']
    skill_row['Unlisted Skill Phrases'] = unlisted_skills_dict['Unlisted Skill Phrases']

    return skill_row

# results = Parallel(n_jobs=4)(delayed(GetAllSkillInformation)(top_skills_df, index, jobs_df) for index in jobs_df.index)
# for result in results:
#     df_index = jobs_df[jobs_df == result['Job ID']].index[0]
#     jobs_df.loc[df_index] = result

for index in jobs_df.index:
    skill_row = jobs_df.loc[index]
    jobs_df.loc[index] = get_listed_skill_phrases(skill_row, top_skills_df)
    jobs_df.loc[index] = get_unlisted_skill_phrases(skill_row, top_skills_df)


jobs_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['Associated Skill Phrases'] = list(set().union(*associated_skills))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['Unlisted Skill'] = unlisted_skills
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['Unlisted Skill Phrases'] = unlisted_skill_phrases


Unnamed: 0,Job ID,Post Date,Status,Job Link,Job Title,Job Description,Post Language,Remote,Reposted,Top Card Details,...,Company ID,Company Name,Company Link,Company Card Details,Company Size,Industry,Company Description,Associated Skill Phrases,Unlisted Skill,Unlisted Skill Phrases
0,3772602742,1701295492.606442,LISTED,https://www.linkedin.com/jobs/view/3772602742/,Financial Data Analyst,"About Team\n\nWe are Quadcode, a fintech compa...",en,True,,"[Remote, Full-time, Mid-Senior level]",...,42345997,Quadcode,https://www.linkedin.com/company/quadcodecaree...,"[501-1,000 employees, Software Development]","501-1,000 employees",Software Development,Quadcode is a fintech company specializing in ...,"[Attention to Detail, English, Analytical Skil...","[SQL, Software as a Service (SaaS), Python (Pr...","[SQL, SaaS, Python]"
1,3768710281,1701295631.953394,LISTED,https://www.linkedin.com/jobs/view/3768710281/,Consultant in Data Analysis and Scientific Pro...,Your Responsibilities\n\nWe are currently look...,en,False,,"[On-site, Full-time, Mid-Senior level]",...,10198832,Climate Analytics,https://www.linkedin.com/company/climate-analy...,"[51-200 employees, Climate Data and Analytics]",51-200 employees,Climate Data and Analytics,Climate Analytics is a global climate science ...,"[Problem Solving, Data Analysis, Visualization...","[Analytics, Java, Application Programming Inte...","[Analytics, Java, API]"
2,3773349346,1701295734.449802,LISTED,https://www.linkedin.com/jobs/view/3773349346/,Data-Analyst (m/w/d) in SaaS-Startup,"Wir sind cinify, ein junges Start-up, dass den...",de,True,,"[Remote, Full-time, Entry level]",...,88277276,cinify,https://www.linkedin.com/company/cinify-io/life,"[1-10 employees, Data Infrastructure and Analy...",1-10 employees,Data Infrastructure and Analytics,cinify wurde Anfang des Jahres 2023 gegründet ...,"[SQL, Dashboards, Cascading Style Sheets, Prob...","[Software, SAS (Software), Figma (Software), H...","[Software, Software, Software, Software, Softw..."
3,3773351712,1701295567.078773,LISTED,https://www.linkedin.com/jobs/view/3773351712/,"Data Engineer/Analyst - DBT, AWS, Redshift, Py...",Orcan Intelligence are currently partnered wit...,en,False,,"[On-site, Contract]",...,9387927,CareerAddict,https://www.linkedin.com/company/career-addict...,"[11-50 employees, Staffing and Recruiting]",11-50 employees,Staffing and Recruiting,CareerAddict is a leading career advice platfo...,"[English, Airflow, Cloud Infrastructure, Datab...",[],[]
4,3775291259,1701295891.976814,LISTED,https://www.linkedin.com/jobs/view/3775291259/,Junior Business Intelligence Consultant (M/W/D),Wir sind die Positive Thinking Company. Wir su...,de,False,,"[On-site, Full-time, Mid-Senior level]",...,11444799,Positive Thinking Company by CBTW,https://www.linkedin.com/company/the-positive-...,"[1,001-5,000 employees, IT Services and IT Con...","1,001-5,000 employees",IT Services and IT Consulting,𝐀𝐛𝐨𝐮𝐭 𝐏𝐨𝐬𝐢𝐭𝐢𝐯𝐞 𝐓𝐡𝐢𝐧𝐤𝐢𝐧𝐠 𝐂𝐨𝐦𝐩𝐚𝐧𝐲As the technolo...,"[Jedox, SAP, BI, BPCS, Statutory Accounting Pr...",[],[]


### Saving skill phrases and unlisted skills to excel file

In [27]:
jobs_df.to_excel('linkedin_job_details.xlsx')  

In [28]:
top_skills_df['Listed frequency'] = 0
top_skills_df['Unlisted frequency'] = 0

def group_list(lst):
     
    return list(zip(Counter(lst).keys(), Counter(lst).values()))
     
combined_listed_skill_lists = list(itertools.chain.from_iterable(jobs_df[pd.isna(jobs_df['Job Skills']) == False]['Job Skills'].values))
combined_unlisted_skill_lists = list(itertools.chain.from_iterable(jobs_df['Unlisted Skill'].values))

listed_skills_grouped = group_list(combined_listed_skill_lists)
unlisted_skills_grouped = group_list(combined_unlisted_skill_lists)

for skill_tuple in listed_skills_grouped:
    skill_row = top_skills_df[top_skills_df['skill name'] == skill_tuple[0]]

    skill_row['Listed frequency'] = skill_tuple[1]
    top_skills_df.loc[skill_row.index] = skill_row


for skill_tuple in unlisted_skills_grouped:
    skill_row = top_skills_df[top_skills_df['skill name'] == skill_tuple[0]]

    skill_row['Unlisted frequency'] = skill_tuple[1]
    top_skills_df.loc[skill_row.index] = skill_row

top_skills_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  skill_row['Listed frequency'] = skill_tuple[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  skill_row['Unlisted frequency'] = skill_tuple[1]


Unnamed: 0,skill name,common phrases,Listed frequency,Unlisted frequency
0,Wireless Engineering,[Wireless Engineering],1,0
1,Game Engines,[Game Engines],2,0
2,Audience Analysis,[Audience Analysis],1,0
3,Sampling,[Sampling],1,0
4,Proprietary Trading,[Proprietary Trading],1,0
...,...,...,...,...
2110,Customer Support,[Customer Support],3,0
2111,Building Performance,[Building Performance],2,0
2112,User Documentation,[User Documentation],1,0
2113,Stochastic Methods,[Stochastic Methods],1,0


### Instantiating Google Translator API

In [29]:
# init the Google API translator
translator = Translator()
translation_dest = 'en'

### Translating non-English job description to English for keyword search function

In [30]:
df_translated = jobs_df.copy()
df_translated = df_translated[['Job ID', 'Job Title', 'Job Description', 'Post Language', 'Remote', 'Top Card Details', 'Job Country', 'Job Skills', 'Company Name', 'Company Card Details', 'Industry', 'Company Description']]

def translate_phrases(phrase_to_translate, translate_dest = 'en'):
    if type(phrase_to_translate) != str:
        return phrase_to_translate
    
    translation = translator.translate(phrase_to_translate, dest=translate_dest)
    return translation.text

df_translated['final'] = ''
for index in df_translated.index:
    row = df_translated.loc[index]
    if row['Post Language'] == translation_dest:
        row['final'] = ', '.join([str(el) for el in df_translated.loc[index].tolist()])
        if row['Remote'] == True:
            row['final'] = row['final'] + ', Remote'
    else:
        row['final'] = ', '.join([str(translate_phrases(el, translation_dest)) for el in df_translated.loc[index].tolist()])
        if row['Remote'] == True:
            row['final'] = row['final'] + ', Remote'
    
    df_translated.loc[index] = row

jobs_df['translated_en'] = df_translated['final']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['final'] = ', '.join([str(el) for el in df_translated.loc[index].tolist()])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['final'] = row['final'] + ', Remote'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['final'] = ', '.join([str(translate_phrases(el, translation_dest)) for el in df_translated.loc[index].tolist()])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-

### Saving jobs dataframe and skills dataframe to excel files

In [31]:
jobs_df.to_excel('linkedin_job_details.xlsx') 
top_skills_df.to_excel('linkedin_job_skills.xlsx') 