In [9]:
import pandas as pd
import os
import re
import csv

In [10]:
# Folder holding the source CV workbooks -- change this to your own folder path.
input_folder = "EN"
# Folder that receives the extracted section text files.
output_folder = "input"

# Section headings searched for in every workbook, listed in the expected order.
section_names = [
    "PERSONAL DETAIL",
    "ABOUT",
    "EDUCATIONAL BACKGROUND",
    "CERTIFICATION",
    "LANGUAGE",
    "TECHNICAL SKILL",
    "EXPERIENCE",
]

In [11]:
# Unaccented base letter -> every precomposed Vietnamese variant that maps to it.
_ACCENT_GROUPS = {
    'a': 'àáạảãâầấậẩẫăằắặẳẵ',
    'A': 'ÀÁẠẢÃÂẦẤẬẨẪĂẰẮẶẲẴ',
    'e': 'èéẹẻẽêềếệểễ',
    'E': 'ÈÉẸẺẼÊỀẾỆỂỄ',
    'o': 'òóọỏõôồốộổỗơờớợởỡ',
    'O': 'ÒÓỌỎÕÔỒỐỘỔỖƠỜỚỢỞỠ',
    'i': 'ìíịỉĩ',
    'I': 'ÌÍỊỈĨ',
    'u': 'ùúụủũưừứựửữ',
    'U': 'ÙÚỤỦŨƯỪỨỰỬỮ',
    'y': 'ỳýỵỷỹ',
    'Y': 'ỲÝỴỶỸ',
    'd': 'đ',
    'D': 'Đ',
}

# Combining marks Vietnamese uses when text arrives in decomposed form:
# grave, acute, circumflex, tilde, breve, hook above, horn, dot below.
_COMBINING_MARKS = '\u0300\u0301\u0302\u0303\u0306\u0309\u031b\u0323'

# Single translation table, built once. The combining marks are merged last so
# they always map to None (i.e. are deleted), whatever the groups above contain.
_ACCENT_TABLE = str.maketrans({
    **{variant: base
       for base, variants in _ACCENT_GROUPS.items()
       for variant in variants},
    **dict.fromkeys(_COMBINING_MARKS),
})


def remove_accent(text: str) -> str:
    """Removes Vietnamese accents from a string.

    Each accented Vietnamese letter is replaced by its unaccented ASCII base
    letter (for example ``ễ`` -> ``e``, ``Đ`` -> ``D``). Letters written in
    decomposed form (a base letter followed by combining marks) are handled
    too: the combining marks listed in ``_COMBINING_MARKS`` are deleted, which
    leaves only the base letter. All other characters are returned unchanged.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with Vietnamese diacritics removed.
    """
    # One pass over the string instead of one regex substitution per letter group.
    return text.translate(_ACCENT_TABLE)

In [12]:
# Function to find the start row of a section
def find_section_row(section_name, frame=None):
    """Return the index label of the first row that contains ``section_name``.

    The search is a case-sensitive, literal substring match over every cell of
    the frame. ``section_name`` is NOT interpreted as a regular expression, so
    characters such as ``.``, ``+`` or ``(`` match only themselves.

    Args:
        section_name: Text to look for inside the cells.
        frame: DataFrame of string cells to search. When omitted, the
            notebook-level ``data`` frame is used, so existing one-argument
            calls keep working.

    Returns:
        The index label of the first matching row, or ``None`` when no row
        matches or the frame is empty.
    """
    if frame is None:
        frame = data  # notebook-level frame set by the extraction loop
    if frame.empty:
        return None

    # Boolean Series: True for every row where at least one cell contains the text.
    row_has_match = frame.apply(
        lambda column: column.str.contains(section_name, case=True, na=False, regex=False)
    ).any(axis=1)

    matching_labels = row_has_match[row_has_match].index
    if len(matching_labels) == 0:
        return None  # section not found
    return matching_labels[0]

def _anonymize_personal_detail(section_content):
    """Reduce personal data in the PERSONAL DETAIL text.

    Lines starting with ``Name`` keep every name part except the last, which is
    cut down to its first character. Lines starting with ``DOB`` keep only the
    first four-digit number found (the year), or nothing if there is none.
    All other lines are returned unchanged.
    """
    modified_lines = []
    for line in section_content.split('\n'):
        # parts[0] is the label ("Name"/"DOB" plus any punctuation), parts[1] the value.
        parts = line.split(maxsplit=1)
        if line.startswith('Name'):
            if len(parts) == 2:
                name_parts = parts[1].split()
                if name_parts:
                    # Keep first parts and initial of last name
                    anonymized = ' '.join(name_parts[:-1] + [name_parts[-1][0]])
                    line = f"{parts[0]}    {anonymized}"
        elif line.startswith('DOB'):
            if len(parts) == 2:
                # Extract just the year (search once, reuse the match)
                year_match = re.search(r'\b\d{4}\b', parts[1])
                year = year_match.group() if year_match else ''
                line = f"{parts[0]}    {year}"
        modified_lines.append(line)
    return '\n'.join(modified_lines)


# Create the destination up front so open() below cannot fail on a missing folder.
os.makedirs(output_folder, exist_ok=True)

# sorted() makes the processing (and log) order deterministic across platforms.
for filename in sorted(os.listdir(input_folder)):
    # Only process Excel workbooks. Names starting with "~$" are the lock files
    # Excel creates next to an open workbook; they are not readable workbooks.
    if not filename.endswith('.xlsx') or filename.startswith('~$'):
        continue

    input_file = os.path.join(input_folder, filename)
    output_file = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}_sections.txt")

    # Read without a header since this is structured data, not a table.
    raw = pd.read_excel(input_file, header=None)

    # Convert every cell to text, but turn missing cells into '' instead of the
    # literal string 'nan'. The previous approach stripped the substring 'nan'
    # from the joined text afterwards, which also corrupted real words
    # (e.g. "maintenance" -> "maintece").
    # NOTE: ``data`` must stay a notebook-level name -- find_section_row reads it.
    data = raw.astype(str).where(raw.notna(), '')

    # Strip leading/trailing spaces
    data = data.map(lambda x: x.strip() if isinstance(x, str) else x)

    # Find start rows for all sections, dropping the ones that are missing
    sections = {name: find_section_row(name) for name in section_names}
    sections = {name: row for name, row in sections.items() if row is not None}
    sorted_sections = sorted(sections.items(), key=lambda x: x[1])  # Sort by start row

    # The closing declaration marks the end of the last section; fall back to
    # the end of the sheet when it is absent.
    end_row = find_section_row('I declare that the above information is accurate.')
    if end_row is None:
        end_row = data.shape[0]

    # Process each section
    output_content = []
    for i, (section_name, start_row) in enumerate(sorted_sections):
        # A section runs until the next section starts (or until end_row for the last one)
        next_start_row = sorted_sections[i + 1][1] if i + 1 < len(sorted_sections) else end_row
        section_data = data.iloc[start_row:next_start_row, :]

        # Join the cells of each row with single spaces. Iterating tuples (rather
        # than DataFrame.apply) also works when the slice has no rows at all.
        row_lines = [' '.join(cells) for cells in section_data.itertuples(index=False, name=None)]

        # Rows that are entirely empty are skipped here, as the filter intends.
        # NOTE(review): replace('- ', '') also removes "- " in the middle of a
        # line (e.g. "2019 - 2020"), not only leading bullets -- confirm this is wanted.
        section_content = '\n'.join(
            line.replace('- ', '').replace(' – ', '~').strip()
            for line in row_lines
            if line.strip()
        ).strip()

        # Remove Vietnamese accents
        section_content = remove_accent(section_content)

        # Anonymize PERSONAL DETAIL section
        if section_name == 'PERSONAL DETAIL':
            section_content = _anonymize_personal_detail(section_content)

        # Append the section content to the output
        output_content.append(f"{section_content}\n")

    # Save the output to a text file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(output_content)

    print(f"Extracted content has been saved to {output_file}")

Extracted content has been saved to input\Copy of CV-Vu-Viet-Anh_sections.txt
Extracted content has been saved to input\Copy of VTI - CV Bui Duc Anh_sections.txt
Extracted content has been saved to input\CV - TO QUY THANH_sections.txt
Extracted content has been saved to input\CV Pham Viet Hieu - Cloud SA1_sections.txt
Extracted content has been saved to input\CV-Doan-Dinh-Vu-Cong_sections.txt
Extracted content has been saved to input\CV-Le Huu Minh_sections.txt
Extracted content has been saved to input\CV-Le Sy Quang_sections.txt
Extracted content has been saved to input\CV-Ngo-Thach-Anh_sections.txt
Extracted content has been saved to input\CV-Nguyen Phi Hai Nam_sections.txt
Extracted content has been saved to input\CV-Nguyen Thi Hong Nhung_sections.txt
Extracted content has been saved to input\CV-NguyenHaiMy-EN_sections.txt
Extracted content has been saved to input\CV-NguyenQuangHoa_sections.txt
Extracted content has been saved to input\CV-Pham-Thi-Minh-Luong-PM _sections.txt
Extract

  warn(f"Unknown type for {prop.name}")


Extracted content has been saved to input\VTI_CV_Thai-Xuan-Phuong_DevOps_sections.txt


tiêu đề đang liền với văn bản