In [197]:
import re
from typing import List, Tuple
import pdfplumber

class ResumeParser:
    def __init__(self):
        self.text_items = []
        self.lines = []
        self.sections = []

    def read_pdf(self, pdf_file):
        """
        Step 1. Read the text items from a PDF file
        """
        with pdfplumber.open(pdf_file) as pdf:
            page = pdf.pages[0]
            # self.text_items = page.extract_text(x_tolerance=1, y_tolerance=1)
            self.text_items = page.extract_words(extra_attrs=['fontname'])

    def group_text_items_into_lines(self):
        """
        Step 2. Group text items into lines
        """
        line = []
        for item in self.text_items:
            if line and self.distance(line[-1], item) > self.avg_char_width * 1.5:
                self.lines.append(line)
                line = []
            line.append(item)
        if line:
            self.lines.append(line)

    def distance(self, item1, item2):
        return abs(item2['x0'] - item1['x1'])
    
    def is_bold(self, item):
        return "Bold" in item['fontname']

    @property
    def avg_char_width(self):
        # total_width = sum(item['x1'] - item['x0'] for item in self.text_items if not item['upright'])
        # total_chars = sum(len(item['text']) for item in self.text_items if not item['upright'])
        total_width = sum(item['x1'] - item['x0'] for item in self.text_items if self.is_bold(item))
        total_chars = sum(len(item['text']) for item in self.text_items if self.is_bold(item))
        return total_width / total_chars if total_chars else 0

    def group_lines_into_sections(self):
        """
        Step 3. Group lines into sections
        """
        current_section = []
        for line in self.lines:
            if self.is_section_title(line[0]):
                if current_section:
                    self.sections.append(current_section)
                current_section = [line]
            else:
                current_section.append(line)
        if current_section:
            self.sections.append(current_section)
        

    def is_section_title(self, text_item):
        if self.is_bold(text_item) and text_item['text'].isupper():
            return True
        return any(keyword in text_item['text'].lower() for keyword in self.section_titles)

    section_titles = [
        'about', 'profile', 'objective', 'education', 'certifications', 'experience', 'skills',
        'projects', 'honors', 'awards', 'involvement', 'volunteer'
    ]

    def extract_resume_from_sections(self):
        """
        Step 4. Extract resume from sections
        """
        resume = {}
        
        resume.update(self.extract_profile(self.sections[0]))
        
        for section in self.sections:
            section_title = section[0][0]['text'].lower()
            if section_title == 'profile':
                resume.update(self.extract_profile(section))
            elif section_title == 'education':
                resume['education'] = self.extract_education(section)
            elif section_title == 'experience':
                resume['experience'] = self.extract_experience(section)
            elif section_title == 'skills':
                resume['skills'] = self.extract_skills(section)
            # Add more section extractors as needed

        return resume
    
    def merge_line(self, line: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Merge words in a line into a single text item.
        """
        merged_text = ' '.join(item['text'] for item in line)
        return {
            'text': merged_text,
            'fontname': line[0]['fontname'],  # Assuming the same font for simplicity
            'x0': line[0]['x0'],
            'x1': line[-1]['x1'],
            'top': line[0]['top'],
            'bottom': line[0]['bottom'],
            'upright': line[0]['upright']
        }
        
    def extract_profile(self, section):
        profile = {}
        profile_section = []
        for line in section:
            profile_section.append(self.merge_line(line))
        
            
        for item in profile_section:
            score, attribute = max((self.score_feature(item, attr), attr) for attr in self.profile_attrs)
            if score > 0:
                profile[attribute] = item['text']
        return profile

    profile_attrs = ['name', 'email', 'phone', 'location', 'url'] #, 'summary']

    def score_feature(self, text_item, attr):
        score = 0
        for feature, weight in self.profile_features[attr]:
            if callable(feature):
                if feature(text_item):
                    score += weight
            else:
                if feature.match(text_item['text']):
                    score += weight
        return score

    profile_features = {
        'name': [
            (re.compile(r'^[a-zA-Z\s\.]+$'), 3),
            (lambda item: 'Bold' in item['fontname'], 2),
            (lambda item: item['text'].isupper(), 2),
            (lambda item: '@' in item['text'], -4),
            (lambda item: any(char.isdigit() for char in item['text']), -4),
            (lambda item: ',' in item['text'], -4),
            (lambda item: '/' in item['text'], -4),
        ],
        'email': [
            (re.compile(r'\S+@\S+\.\S+'), 10),
        ],
        'phone': [
            (re.compile(r'\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{4}'), 10),
        ],
        'location': [
            (re.compile(r'[A-Z][a-z]+, [A-Z]{2}'), 10),
        ],
        'url': [
            (re.compile(r'https?://\S+'), 10),
        ],
        
        # Add more feature sets as needed
    }

    def extract_education(self, section):
        education = []
        subsections = self.split_into_subsections(section[1:])
        for subsection in subsections:
            degree = self.extract_degree(subsection)
            if degree:
                education.append(degree)
        return education

    def extract_degree(self, subsection):
        degree = {}
        for line in subsection:
            if self.is_school(line[0]):
                degree['school'] = self.merge_line(line)['text']
            elif self.is_degree(line[0]):
                degree['degree'] = self.merge_line(line)['text']
            elif self.is_gpa(line[0]):
                degree['gpa'] = self.merge_line(line)['text']
            elif self.is_date(line[0]):
                degree['date'] = self.merge_line(line)['text']
            else:
                degree['description'] = self.merge_line(line)['text']
        return degree

    def is_school(self, text_item):
        return any(keyword in text_item['text'].lower() for keyword in ['university', 'college', 'school'])

    def is_degree(self, text_item):
        return any(keyword in text_item['text'].lower() for keyword in ['bachelor', 'master', 'associate'])

    def is_gpa(self, text_item):
        return re.match(r'[0-9]\.\d{1,2}', text_item['text']) is not None

    def is_date(self, text_item):
        return any(keyword in text_item['text'].lower() for keyword in ['year', 'month', 'present'])

    def split_into_subsections(self, lines):
        subsections = []
        current_subsection = []
        for i in range(len(lines)):
            if i == len(lines) - 1 or self.is_new_subsection(lines[i], lines[i + 1]):
                current_subsection.append(lines[i])
                subsections.append(current_subsection)
                current_subsection = []
            else:
                current_subsection.append(lines[i])
        return subsections

    def is_new_subsection(self, line1, line2):
        return any(self.is_bold(item) for item in line2) and self.line_gap(line1, line2) > self.avg_line_gap * 1.4

    @property
    def avg_line_gap(self):
        gaps = []
        for i in range(len(self.lines) - 1):
            line1, line2 = self.lines[i], self.lines[i + 1]
            gap = line2[0]['bottom'] - line1[-1]['top']
            gaps.append(gap)
        return sum(gaps) / len(gaps) if gaps else 0

    def line_gap(self, line1, line2):
        return line2[0]['bottom'] - line1[-1]['top']

    def extract_experience(self, section):
        experiences = []
        subsections = self.split_into_subsections(section[1:])
        for subsection in subsections:
            experience = self.extract_experience_subsection(subsection)
            if experience:
                experiences.append(experience)
        return experiences

    def extract_experience_subsection(self, subsection):
        experience = {}
        for line in subsection:
            if self.is_company(line[0]):
                experience['company'] = line[0]['text']
            elif self.is_job_title(line[0]):
                experience['job_title'] = line[0]['text']
            elif self.is_date(line[0]):
                experience['date'] = line[0]['text']
            else:
                if 'description' not in experience:
                    experience['description'] = []
                experience['description'].append(line[0]['text'])
        return experience

    def is_company(self, text_item):
        return self.is_bold(text_item) or not self.is_job_title(text_item)

    def is_job_title(self, text_item):
        return any(keyword in text_item['text'].lower() for keyword in ['analyst', 'engineer', 'intern', 'manager', 'developer'])

    def extract_skills(self, section):
        skills = []
        for line in section[1:]:
            for item in line:
                if item['upright']:
                    skills.append(item['text'])
        return skills

# Usage example
parser = ResumeParser()
parser.read_pdf('../data/cv/CV_SonBao_DS.pdf')
parser.group_text_items_into_lines()
parser.group_lines_into_sections()
resume = parser.extract_resume_from_sections()
from pprint import pprint
pprint(resume)

{'education': [{'degree': 'Bachelor Degree in Data Science (2020-2024)',
                'description': 'Cumulative GPA: 7.6/10',
                'school': 'University of Economics, Ho Chi Minh City - Viet '
                          'Nam'}],
 'name': 'PHAN TRAN SON BAO',
 'phone': '0938009294 | sonbao0901@gmail.com | https://github.com/sonbao0901 |',
 'skills': ['Language:',
            'Python',
            '(pandas,',
            'numpy,',
            'sklearn,',
            'tensoflow,..),',
            'SQL.',
            'Visualization:',
            'Power',
            'BI,',
            'Python',
            '(matplotlib,',
            'seaborn).',
            'Others:',
            'Excel',
            '(pivot',
            'table,',
            'visualize),',
            'Word,',
            'PowerPoint.',
            'Pursing:',
            'Statistic,',
            'Machine',
            'Learning,',
            'Deep',
            'Learning.'],
 'url': 'https://www.linked

In [191]:
parser.split_into_subsections(parser.lines)

[[[{'text': 'PHAN',
    'x0': 200.47,
    'x1': 251.14000000000001,
    'top': 32.545999999999935,
    'doctop': 32.545999999999935,
    'bottom': 50.545999999999935,
    'upright': True,
    'height': 18.0,
    'width': 50.670000000000016,
    'direction': 'ltr',
    'fontname': 'Helvetica-Bold'},
   {'text': 'TRAN',
    'x0': 256.144,
    'x1': 306.43199999999996,
    'top': 32.545999999999935,
    'doctop': 32.545999999999935,
    'bottom': 50.545999999999935,
    'upright': True,
    'height': 18.0,
    'width': 50.287999999999954,
    'direction': 'ltr',
    'fontname': 'Helvetica-Bold'},
   {'text': 'SON',
    'x0': 311.364,
    'x1': 350.33599999999996,
    'top': 32.545999999999935,
    'doctop': 32.545999999999935,
    'bottom': 50.545999999999935,
    'upright': True,
    'height': 18.0,
    'width': 38.97199999999998,
    'direction': 'ltr',
    'fontname': 'Helvetica-Bold'},
   {'text': 'BAO',
    'x0': 355.34,
    'x1': 395.264,
    'top': 32.545999999999935,
    'doctop':

In [160]:
section = parser.sections[0]
profile_attrs = ['name', 'email', 'phone', 'location', 'url'] #, 'summary']

def score_feature(text_item, attr):
    score = 0
    for feature, weight in profile_features[attr]:
        if callable(feature):
            if feature(text_item):
                score += weight
        else:
            if feature.match(text_item['text']):
                score += weight
    return score

profile_features = {
    'name': [
        (re.compile(r'^[a-zA-Z\s\.]+$'), 3),
        (lambda item: 'Bold' in item['fontname'], 2),
        (lambda item: item['text'].isupper(), 2),
        (lambda item: '@' in item['text'], -4),
        (lambda item: any(char.isdigit() for char in item['text']), -4),
        (lambda item: ',' in item['text'], -4),
        (lambda item: '/' in item['text'], -4),
    ],
    'email': [
        (re.compile(r'\S+@\S+\.\S+'), 10),
    ],
    'phone': [
        (re.compile(r'\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{4}'), 10),
    ],
    'location': [
        (re.compile(r'[A-Z][a-z]+, [A-Z]{2}'), 10),
    ],
    'url': [
        (re.compile(r'https?://\S+'), 10),
    ],
}

for item in section[1]:
    print(item["text"], score_feature(item, 'email'))

0938009294 0
| 0
sonbao0901@gmail.com 10
| 0
https://github.com/sonbao0901 0
| 0


In [174]:
parser.lines

[{'text': 'P H A N T R A N S O N B A O',
  'fontname': 'Helvetica-Bold',
  'size': 18.0,
  'x0': 200.47,
  'x1': 395.264,
  'top': 32.545999999999935,
  'bottom': 50.545999999999935,
  'upright': True},
 {'text': '0 9 3 8 0 0 9 2 9 4 | s o n b a o 0 9 0 1 @ g m a i l . c o m | h t t p s : / / g i t h u b . c o m / s o n b a o 0 9 0 1 |',
  'fontname': 'Helvetica',
  'size': 11.039999999999964,
  'x0': 119.33,
  'x1': 477.0604,
  'top': 56.065279999999916,
  'bottom': 67.10527999999988,
  'upright': True},
 {'text': 'h t t p s : / / w w w . l i n k e d i n . c o m / i n / p h a n t r a n s o n b a o /',
  'fontname': 'Helvetica',
  'size': 11.039999999999964,
  'x0': 188.95,
  'x1': 407.02312,
  'top': 68.78527999999994,
  'bottom': 79.8252799999999,
  'upright': True},
 {'text': 'A b o u t',
  'fontname': 'Helvetica-Bold',
  'size': 12.0,
  'x0': 24.0,
  'x1': 58.452000000000005,
  'top': 97.54399999999998,
  'bottom': 109.54399999999998,
  'upright': True},
 {'text': 'I a m a s e n i 

In [128]:
with open('sections.txt', 'w') as f:
    for line in parser.sections[0]:
        f.write(' '.join(item['text'] for item in line))
        f.write('\n\n')

In [136]:
with pdfplumber.open('../data/cv/CV_SonBao_DS.pdf') as pdf:
    page = pdf.pages[0]
    text_items = page.extract_text_lines(extra_attrs=['fontname'], return_chars=False)
# pdf = pdfplumber.open('../data/cv/CV_SonBao_DS.pdf')
# page = pdf.pages[0]
# img = page.to_image()
# img.draw_rects(text_items)
text_items

[{'text': 'PHAN TRAN SON BAO',
  'x0': 200.47,
  'top': 32.545999999999935,
  'x1': 395.264,
  'bottom': 50.545999999999935},
 {'text': '0938009294 | sonbao0901@gmail.com | https://github.com/sonbao0901 |',
  'x0': 119.33,
  'top': 56.065279999999916,
  'x1': 477.0604,
  'bottom': 67.10527999999988},
 {'text': 'https://www.linkedin.com/in/phantransonbao/',
  'x0': 188.95,
  'top': 68.78527999999994,
  'x1': 407.02312,
  'bottom': 79.8252799999999},
 {'text': 'About',
  'x0': 24.0,
  'top': 97.54399999999998,
  'x1': 58.452000000000005,
  'bottom': 109.54399999999998},
 {'text': 'I am a senior student at the University of Economics in Ho Chi Minh City, majoring in Data Science. With a big passion for working with',
  'x0': 24.0,
  'top': 116.86784,
  'x1': 559.84888,
  'bottom': 125.98784},
 {'text': 'data and models and I am eager to gain more experiences from realistic problems in a company. I am a dedicated person and a',
  'x0': 24.0,
  'top': 128.89783999999997,
  'x1': 533.19368,
