<a href="https://colab.research.google.com/github/nzcbass/Pythontoopenai/blob/main/analysingJsonoutput_working_code_50_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import json

# Parsed-CV export to read (absolute path on the notebook's filesystem).
json_file_path1 = "/content/CV_parsed_by_CV_Parser_Premium-15.json"

# Load the whole JSON document into a Python dictionary.
with open(json_file_path1, 'r') as file:
    resume_data = json.load(file)

# Tag every professional experience with a sequential label: Job_1, Job_2, ...
experiences = resume_data['profile']['professional_experiences']
for position, experience in enumerate(experiences, start=1):
    experience['job_label'] = f'Job_{position}'
job_count = len(experiences) + 1  # one past the last label number that was used

# Write the labelled document to a separate file; the input file is not modified.
output_json_path = "/content/jobslabeled.json"
with open(output_json_path, 'w') as output_file:
    json.dump(resume_data, output_file, indent=2)

print(f"Job labeling completed. Labeled data saved to {output_json_path}")

# Now 'resume_data' contains the contents of the JSON file
# print(json.dumps(resume_data, indent=2))


Job labeling completed. Labeled data saved to /content/jobslabeled.json


In [35]:
import json
import pandas as pd
import re

def json_to_dataframe(json_file_path):
    """Load the professional experiences of a parsed CV into a flat DataFrame.

    Reads the JSON file at ``json_file_path``, takes the list stored under
    ``profile -> professional_experiences`` and flattens it with
    ``pd.json_normalize`` (nested keys become dotted column names).

    Returns the flattened frame without the ``description`` column; when the
    data has no such column the frame is returned with all of its columns.
    """
    with open(json_file_path, 'r') as handle:
        experiences = json.load(handle)['profile']['professional_experiences']

    flattened = pd.json_normalize(experiences)

    # errors='ignore' makes the drop a no-op when 'description' is absent.
    return flattened.drop(columns='description', errors='ignore')

def split_and_create_columns(location):
    """Break a location string into cleaned tokens keyed as ``substring_N``.

    The text is split on every run of non-word characters, each piece is passed
    through ``remove_special_terms``, and the multi-word country names
    "South Africa", "Saudi Arabia" and "United Arab Emirates" are re-joined into
    the slot of their first word; the slots of their remaining words are set to
    an empty string so the number of slots does not change.

    Returns a dict mapping ``substring_1`` ... ``substring_N`` to the tokens,
    or ``pd.NA`` when ``location`` is missing.
    """
    if not pd.notna(location):
        return pd.NA

    tokens = [remove_special_terms(piece.strip())
              for piece in re.split(r'[^\w]+', location)]

    # (lower-cased words in order, replacement written into the first word's slot)
    multi_word_countries = (
        (("south", "africa"), "South Africa"),
        (("saudi", "arabia"), "Saudi Arabia"),
        (("united", "arab", "emirates"), "United Arab Emirates"),
    )

    for last in range(len(tokens)):
        for words, merged_name in multi_word_countries:
            first = last - len(words) + 1
            if first < 0:
                continue  # not enough preceding tokens for this name
            if [tok.lower() for tok in tokens[first:last + 1]] != list(words):
                continue
            tokens[first] = merged_name
            # Blank the trailing words of the name; the list length is unchanged.
            tokens[first + 1:last + 1] = [""] * (last - first)

    return {f'substring_{n}': tok for n, tok in enumerate(tokens, start=1)}


def remove_special_terms(substring):
    """Strip address filler words (street, plaza, po, box, ...) from ``substring``.

    Every term is matched case-insensitively and only as a whole word. Matches
    are replaced with an empty string and the result is stripped of leading and
    trailing whitespace before it is returned.
    """
    # NOTE(review): each term is passed through re.escape, so "new zealand|po"
    # only matches that literal text including the pipe - confirm this is intended.
    special_terms = ["plaza", "street", "st", "road", "rd", "avenue", "close", "motorway", "highway",
                     "po", "PO", "new zealand|po", "office", "drive", "cor.", "ave.", "ave", "level", "lvl",
                     "box", "lv", "village", "vellage", "building", "bldg", "city","albany"]

    # One alternation of the escaped terms, anchored on word boundaries.
    alternation = '|'.join(map(re.escape, special_terms))
    whole_word = re.compile(r'\b(?:' + alternation + r')\b', flags=re.IGNORECASE)

    return whole_word.sub('', substring).strip()

def find_city(substring):
    """Look up ``substring`` in the ``City`` column of the global ``df1`` table.

    Parameters
    ----------
    substring : str, list, or missing value
        A single token to look up, or a list of candidate city names.

    Returns
    -------
    list
        For a list input: every ``City`` value of ``df1`` that is contained in
        the list (exact, case-sensitive membership), in ``df1`` row order.
    str
        For a scalar input: the first ``City`` value that equals the token,
        compared case-insensitively.
    pd.NA
        When the input is missing or no city matches.
    """
    # Lists are handled before the missing-value check: pd.notna() on a list
    # returns an array, and using that array in an ``if`` raises for 2+ items.
    if isinstance(substring, list):
        return df1[df1['City'].isin(substring)]['City'].tolist()

    if pd.isna(substring):
        return pd.NA

    # Escape the token so it is compared literally; otherwise characters such
    # as '.', '(' or '*' would be interpreted as regular-expression syntax.
    pattern = f"^{re.escape(str(substring))}$"
    matching_city = df1[df1['City'].str.match(pattern, case=False, na=False)]

    if not matching_city.empty:
        return matching_city['City'].values[0]

    return pd.NA

def get_country_matched(row):
    """Derive the country (or countries) for one row of substring columns.

    Every string found in the row's ``substring_*`` cells (lists of strings
    are flattened) is treated as a candidate city and looked up by exact
    match in the ``City`` column of the global ``df1`` table.

    Returns
    -------
    str
        With exactly one distinct candidate: the ``Country`` of its first
        matching ``df1`` row. With several candidates: the ``Country`` values
        of all matching ``df1`` rows joined with ``', '``.
        ``'Review CV'`` when nothing matched and none of the cells is a float
        or ``pd.NA``.
    pd.NA
        When nothing matched and at least one cell is a float (e.g. NaN) or
        ``pd.NA``.
    """
    # Read the column names from the row itself so the function only needs the
    # row it was given (plus the df1 lookup table).
    substring_cols = [col for col in row.index if 'substring_' in str(col)]

    cities = []
    for col in substring_cols:
        value = row[col]
        if isinstance(value, list):
            cities.extend(city for city in value if isinstance(city, str))
        elif isinstance(value, str):
            cities.append(value)

    unique_cities = set(cities)
    if len(unique_cities) == 1:
        matches = df1.loc[df1['City'] == next(iter(unique_cities)), 'Country']
        # A lone candidate that is not a known city must not raise IndexError.
        if not matches.empty:
            return matches.values[0]
    elif unique_cities:
        matches = df1.loc[df1['City'].isin(unique_cities), 'Country'].tolist()
        # With no match, fall through instead of returning an empty string
        # that the later "Review Resume" fill would not catch.
        if matches:
            return ', '.join(matches)

    # Nothing matched. isinstance() needs real types, so use type(pd.NA)
    # rather than the pd.NA singleton itself.
    missing_types = (float, type(pd.NA))
    if any(isinstance(row[col], missing_types) for col in substring_cols):
        return pd.NA
    return 'Review CV'


if __name__ == "__main__":
    json_file_path2 = "jobslabeled.json"
    df = json_to_dataframe(json_file_path2)
    df['location'] = df['location'].str.lower()

    # Expand each location into substring_N columns. A missing location comes
    # back as pd.NA instead of a dict; treat it as "no tokens" so the row gets
    # empty substring cells rather than being expanded into an extra column.
    location_parts = df['location'].apply(split_and_create_columns)
    substring_frame = pd.DataFrame(
        [parts if isinstance(parts, dict) else {} for parts in location_parts],
        index=df.index,
    )
    df = df.join(substring_frame)

    df1 = pd.read_excel("/content/Updated_World_Cities.xlsx")

    # Look up a city for every substring column. find_city returns pd.NA for
    # "no match"; the result is stored as-is because casting it to str would
    # turn pd.NA into the literal text '<NA>'.
    for col in df.filter(like='substring_').columns:
        df[f'{col}_city'] = df[col].apply(find_city)

    # Both the raw substring_N and the substring_N_city columns are passed on.
    df['Country Matched'] = df.apply(lambda row: get_country_matched(row.filter(like='substring_')), axis=1)

    # When several countries were matched, keep the last one listed.
    df['Final_country_matched'] = df['Country Matched'].apply(lambda x: x.split(', ')[-1] if pd.notna(x) and ', ' in x else x)

    # Clean the location text: drop the word "city", digits and any character
    # that is not a letter or whitespace, then trim.
    df['location'] = df['location'].str.replace(r'\b(city)\b|\d', '', case=False, regex=True)
    df['location'] = df['location'].str.replace(r'[^a-zA-Z\s]', '', regex=True)
    df['location'] = df['location'].str.strip()
    # Assign the result back: in-place calls on a selected column are chained
    # assignment, which is not guaranteed to update df.
    df['location'] = df['location'].replace('', pd.NA)

    # Flag rows that still have no country.
    df['Final_country_matched'] = df['Final_country_matched'].fillna('Review Resume')

    csv_output_path = "output_data.csv"
    df.to_csv(csv_output_path, index=False)
    print(df)


   is_current  duration_in_months                                company  \
0       False                  30  All trades laboure hire ltd/Ele group   
1       False                  52      Hilmarcs Construction Corporation   
2       False                  12      Hilmarcs Construction Corporation   
3       False                   2      Hilmarcs Construction Corporation   
4       False                  14                            MDC / DDTKI   

                                        location               title  \
0                               ryman healthcare   Carpenter/builder   
1  okada manila admintiger paraaque  philippines   foreman carpenter   
2                 pilippine arena bocaue bulacan  Lead Man Carpenter   
3                  alphaland makati  philippines  Lead Man Carpenter   
4            glorietta  and  makati  philippines           Carpenter   

  job_label  start_date.year  start_date.month  end_date.year  end_date.month  \
0     Job_1             2017 