In [1]:
import os
import json
from openai import OpenAI
import pandas as pd
import ast

In [21]:
# Shared OpenAI client used by parse_cv_with_chatgpt. The key is read from the
# OPENAI_API_KEY environment variable so no secret is stored in the notebook.
# When the variable is unset, None is passed and the SDK is left to resolve
# (or reject) the missing key itself.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def _request_completion(system_message, user_message):
    """Send one system + user exchange to the chat model and return the reply text.

    Uses the module-level ``client`` with the same model ("gpt-4") and
    temperature (0.1) for every request.

    Args:
        system_message: Content of the system message.
        user_message: Content of the user message.

    Returns:
        The ``content`` of the first choice's message. Callers should be ready
        for a non-string value (e.g. ``None``) if the model returns no text.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
        temperature=0.1,
    )
    return response.choices[0].message.content


def _load_json_reply(reply):
    """Parse a model reply as JSON, tolerating ``None`` and Markdown code fences.

    Args:
        reply: Raw reply text, or ``None``.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the (unfenced) reply is empty or not valid JSON.
    """
    # A None reply becomes "" so it fails as JSONDecodeError, not TypeError.
    text = (reply or "").strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0].strip()
    return json.loads(text)


def parse_cv_with_chatgpt(cv_text):
    """Extract structured fields from raw CV text using the chat model.

    The model is asked to return a JSON object with a fixed set of keys. If
    the first reply cannot be parsed as JSON, the model is asked once more to
    correct its own output.

    Args:
        cv_text: Plain text of a single CV.

    Returns:
        The decoded JSON value from the model, or ``None`` when neither the
        first reply nor the corrected reply is valid JSON (a message is
        printed in that case).
    """
    prompt = f"""
    Extract the following information from the CV text below and format it as a JSON object. Use the keys:
    - Name
    - Nationality Vietnamese ->Viet Nam, if null return Viet Nam as default
    - DOB year only
    - Sex
    - Seniority (if present)
    - About
    - Education (include university and years)
    - Languages
    - Programing Language: e.g.. Python, C++
    - Tool: e.g.Grafana,Prometheus,Graylog,...
    - Automation/Orchestration tool: e.g AWS CloudFormation,Ansible,Jenkins,Terraform,...
    - Microservice:e.g Docker,ECS,ECR,Kurbernetes
    - OS: e.g Windows,Linux
    - Database:e.g MySQL,Postgres,MongoDB,Oracle,DMS,Athena,Reshift,...
    - Public Cloud:e.g AWS,Azure,GCP,...
    - Projects Experience (project name,customer, position, responsibilities,technologies used)
    - Certifications (if any)

    If a field is missing, set it to null. Return only valid JSON.

    CV Text:
    {cv_text}
    """

    response_content = _request_completion(
        "You are a helpful assistant that extracts unstructured data from CVs.",
        prompt,
    )
    try:
        return _load_json_reply(response_content)
    except json.JSONDecodeError:
        # Fall through to a single correction attempt below.
        pass

    # If JSON parsing fails, prompt the model to correct its output
    correction_prompt = f"The following output was not valid JSON:\n{response_content}\nPlease provide a corrected JSON."
    corrected_content = _request_completion(
        "You are a helpful assistant that extracts structured data from CVs.",
        correction_prompt,
    )
    try:
        return _load_json_reply(corrected_content)
    except json.JSONDecodeError:
        print("Failed to parse JSON response after correction attempt.")
        return None
def main():
    """Parse every ``.txt`` CV in the ``output`` directory and write ``cv_data.csv``.

    Each file is read as UTF-8 and passed to ``parse_cv_with_chatgpt``; every
    truthy result becomes one row of the CSV. Prints a status message and
    returns ``None`` in all cases, including when the directory is missing or
    no CV produced data.
    """
    cv_dir = "output"
    output_file = "cv_data.csv"
    data = []

    # isdir rather than exists: a regular file named "output" cannot be listed.
    if not os.path.isdir(cv_dir):
        print(f"Directory '{cv_dir}' does not exist.")
        return

    # os.listdir returns entries in arbitrary order; sort so the CSV row order
    # is the same on every run.
    for filename in sorted(os.listdir(cv_dir)):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(cv_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            cv_text = f.read()
        # The file is closed before the model call so no handle is held open
        # for the duration of the network request.
        result = parse_cv_with_chatgpt(cv_text)
        if result:
            data.append(result)

    if data:
        df = pd.DataFrame(data)
        df.to_csv(output_file, index=False)
        print(f"CSV file '{output_file}' created successfully.")
    else:
        print("No data to write to CSV.")

# Run the batch extraction only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

CSV file 'cv_data.csv' created successfully.


In [24]:
# Load "cv_data.csv" (the file name main() writes to) and display it as the
# cell's result so the extracted fields can be inspected.
pd.read_csv("cv_data.csv")

Unnamed: 0,Name,Nationality,DOB,Sex,Seniority,About,Education,Languages,Programing Language,Tool,Automation/Orchestration tool,Microservice,OS,Database,Public Cloud,Projects Experience,Certifications
0,Vu Viet Anh,Viet Nam,13/05/1993,Male,7+ years,4+ years of experience in Operation System. Kn...,"{'university': 'Academy of Fice', 'years': '20...","['Vietnamese', 'English']",,"['Redmine', 'Grafana', 'Jira', 'Zabbix', 'MS O...","['Windows', 'Linux']",,"['Windows', 'Linux']","['SQL Server', 'My SQL']",,"[{'project name': 'FCCL', 'customer': 'Japanes...",
1,Bui Duc Anh,Viet Nam,8/24/1990 0:00,Male,,2 years of experience in the Cloud Computing. ...,{'university': 'HoChiMinh City University of I...,English,"['Python', 'Bash Script']","['Grafana', 'Prometheus', 'NodeExporter', 'AWS...","['AWS Codepipeline', 'CodeBuild', 'GitlabCI', ...","['Docker', 'Kurbernetes']","['Windows', 'Linux', 'Ubuntu']",PostgresSQL,AWS,"[{'project name': 'USHM', 'customer': 'Japan c...",
2,To Quy Thanh,Viet Nam,2003,Male,< 1 year,Skilled in developing Bidata platform on AWS. ...,{'university': 'HAUI Hanoi University of Indus...,"['Vietnamese', 'English']","['Python', 'C++', 'SQL', 'Bash Script', 'Scala...","['Databrick', 'Apache Spark', 'AWS Glue', 'AWS...",,"['Kafka', 'Docker']","['Windows', 'Linux']","['MySQL', 'DynamoDB']",['AWS'],"[{'project name': 'USMH', 'customer': 'Japanes...",
3,Pham Viet Hieu,Viet Nam,1993,Male,5 years,8 years of experience in the computer software...,{'university': 'Hanoi University of Science an...,"['Vietnamese', 'Japanese', 'English']","['Python', 'Bash Script']","['Grafana', 'Prometheus', 'Graylog']","['AWS CloudFormation', 'Ansible', 'Jenkins', '...","['Docker', 'ECS', 'ECR', 'Kurbernetes']","['Windows', 'Linux']","['MySQL', 'MongoDB', 'Oracle', 'DMS', 'Athena'...","['AWS', 'Azure', 'GCP']","[{'project name': 'Cloud Group Leader', 'custo...","['AWS Certified DevOps Engineer Professional',..."
4,Doan Dinh Vu Cong,Viet Nam,1/1/1997 0:00,Male,3.5 years,"I am a responsible, dedicated and proactive pe...","{'university': 'FPT University', 'years': '201...","['Vietnamese', 'Japanese', 'English']","['Java Spring Boost', 'Java Core', 'C#']","['Visual Studio Code', 'Eclipse', 'IntelliJ', ...",['Selenium'],,['Windows'],"['MS SQL', 'My SQL', 'Postgre SQL']",['Nifcloud'],"[{'project name': 'FCCL_Monitor_D3', 'customer...",
5,Le Huu Minh,Viet Nam,12/7/1994 0:00,Male,6+ years,#NAME?,"{'university': 'FPT Polytechnic', 'years': 'Now'}","['Vietnamese', 'English']",,"['Redmine', 'Jira', 'CheckMK', 'Zabbix', 'Graf...",,,"['Windows', 'Linux']","['SQL Server', 'MS SQL']",['AWS'],"[{'project name': 'FCCL', 'customer': 'Japanes...",
6,Le Sy Quang,Viet Nam,24/02/1993,Male,7+ years,"4+ years of experience in Developer, 2+ years ...",{'university': 'Hanoi University of Mining and...,"['Vietnamese', 'English']",,['Redmine'],['Postman'],,,"['SQL Server', 'My SQL', 'PostgreSQL']",,"[{'project name': 'FCCL', 'customer': 'Japanes...",
7,Ngo Thach Anh,Viet Nam,11/2/1994 0:00,Male,2 years,+ 2 year of experience in Operation System\r\n...,{'university': 'Nam Dinh Industrial College Ni...,"['Vietnamese', 'Japanese', 'English']",,"['Redmine', 'Grafana', 'IntelliJ', 'Visual Stu...",,,"['Windows', 'Linux']","['My SQL', 'Oracle']",,"[{'project name': 'FCCL', 'customer': 'Japanes...",
8,Nguyen Phi Hai Nam,Viet Nam,6/4/2001 0:00,Male,3+ years,2+ years of experience in Website Testing Mobi...,"{'university': 'Hanoi University', 'years': '2...","{'Vietnamese': 'Native', 'English': 'C1 Advanc...",,"Redmine, Backlog, MS Office","Postman, Katalon Studio, Test Cafe",,"Windows, Centos 7","SQL Server, My SQL, PostgreSQL, DBeaver, MongoDB",AWS,[{'project name': 'EXV_Installation_Checksheet...,
9,NGUYEN THI HONG NHUNG,Viet Nam,29/11/2000,Female,2 years,Over 2 years of experience working as a sub-PM...,{'University': 'University of Languages & Inte...,"['Vietnamese', 'Japanese', 'English']",['HTML/CSS'],"['Redmine', 'Backlog', 'Jira', 'MS Office', 'd...",,,"['Windows', 'MacOSX']",['MySQL'],,[{'project name': 'System and Infra Monitoring...,"['Japanese-Language Proficiency Test N2', 'ITI..."


In [18]:
import pandas as pd
import ast

def _is_missing_entry(entry):
    """Return True for None, NaN/NA, and empty or otherwise falsy values.

    Strings and containers are checked by length, so ``pd.isna`` is never
    asked for the truth value of an element-wise (array) result.
    """
    if entry is None:
        return True
    if isinstance(entry, (str, list, tuple, dict, set)):
        return len(entry) == 0
    try:
        return bool(pd.isna(entry)) or not entry
    except (TypeError, ValueError):
        # Array-like or exotic objects: treat as present and let the caller
        # fall through to its "unsupported shape" branch.
        return False


def standardize_languages(entry):
    """Normalise one 'Languages' cell into a ``{language: level}`` dict.

    Accepted shapes (either as Python objects or as their string repr, which
    is how they come back from a CSV):

    * dict                      -> returned unchanged (values are the levels)
    * list of dicts             -> uses each item's language/level keys,
                                   matched case-insensitively
    * list of strings           -> every language mapped to ``None``
    * plain string              -> split on commas, each name mapped to ``None``
    * missing / empty / other   -> ``{}``

    Args:
        entry: Raw cell value (str, NaN, None, dict, list, ...).

    Returns:
        A dict mapping language name to proficiency level (``None`` when the
        level is unknown).
    """
    # Handle missing or empty entries
    if _is_missing_entry(entry):
        return {}

    parsed = entry
    if isinstance(entry, str):
        text = entry.strip()
        if not text:
            return {}
        # Try to safely evaluate the string to a Python object
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = text  # Not a Python literal: treat it as plain text

    # If it's a dictionary, keep the original values (levels)
    if isinstance(parsed, dict):
        return parsed

    if isinstance(parsed, list):
        # List of dicts, e.g. [{'Language': 'Vietnamese', 'Level': 'Native'}, ...]
        if all(isinstance(item, dict) for item in parsed):
            result = {}
            for item in parsed:
                # Key casing in the source data is not consistent, so compare
                # keys case-insensitively.
                lowered = {str(key).lower(): value for key, value in item.items()}
                language = lowered.get("language")
                if language:
                    result[language] = lowered.get("level")
            return result
        # List of strings, e.g. ['Vietnamese', 'English']
        if all(isinstance(item, str) for item in parsed):
            return {lang.strip(): None for lang in parsed if lang.strip()}
        return {}

    # Plain string: one language, or several separated by commas, no levels
    if isinstance(parsed, str):
        names = (part.strip() for part in parsed.split(","))
        return {name: None for name in names if name}

    return {}

# Load the name/language table and normalise its 'Languages' column in a
# single chained expression.
df = pd.read_csv('name_lang_data.csv').assign(
    Languages=lambda frame: frame['Languages'].apply(standardize_languages)
)

# Persist the normalised table (without the index column), then print it.
df.to_csv('name_lang.csv', index=False)
print(df)


                  Name                                          Languages
0            Vu Viet A              {'Vietnamese': None, 'English': None}
1            Bui Duc A                                  {'English': None}
2             To Quy T              {'Vietnamese': None, 'English': None}
3          Pham Viet H  {'Vietnamese': None, 'Japanese': None, 'Englis...
4       Doan Dinh Vu C  {'Vietnamese': None, 'Japanese': None, 'Englis...
5             Le Huu M              {'Vietnamese': None, 'English': None}
6              Le Sy Q              {'Vietnamese': None, 'English': None}
7          Ngo Thach A  {'Vietnamese': None, 'Japanese': None, 'Englis...
8     Nguyen Phi Hai N  {'Vietnamese': 'Native', 'English': 'C1 Advanc...
9    NGUYEN THI HONG N  {'Vietnamese': None, 'Japanese': None, 'Englis...
10        Nguyen Hai M                  {'Korean': None, 'English': None}
11      Nguyen Quang H                                  {'English': None}
12     Pham Thi Minh L  {'Vietnamese':