In [1]:
!pip install PyPDF2
import PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
def read_pdf(file_path):
    """Extract the text of the first page of a PDF file.

    Only page index 0 is read; any further pages are ignored.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The text extracted from the first page, or ``None`` when the file
        cannot be opened or parsed (the error is printed, not raised).
    """
    output = None
    try:
        # The context manager closes the handle even when parsing fails;
        # a manual close() after the parsing calls is skipped on any exception.
        with open(file_path, 'rb') as pdf_obj:
            pdf_reader = PyPDF2.PdfReader(pdf_obj)
            first_page = pdf_reader.pages[0]
            output = first_page.extract_text()
    except Exception as e:
        # Best-effort by design: report the problem and fall through to None.
        print(f"Error reading file '{file_path}': {str(e)}")
    return output

In [3]:
from google.colab import drive
# Mount Google Drive so the resume files under /content/drive are readable (Colab only).
drive.mount('/content/drive')
# `test` holds the raw first-page text of the sample resume, or None if read_pdf failed.
test = read_pdf('/content/drive/MyDrive/resume-match/data/resume/Mani_Kanta_Gogula_Resume.pdf')

Mounted at /content/drive


### PREPARE TRAINING DATASET
resume1 = read_pdf('data/Mani_Kanta_Gogula_Resume.pdf')
#resume1
with open('resume1.txt', 'w') as f:
    f.write(resume1)

### TEXT CLEANING

In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
def clean_text(s):
  """Lower-case, tokenize, filter and lemmatize a piece of text.

  Tokens that are English stop words or single punctuation characters are
  dropped; the remaining tokens are lemmatized and joined with single spaces.

  Args:
    s: The raw text to clean.

  Returns:
    The cleaned text as one space-separated string.
  """
  tokens = word_tokenize(s.lower())
  # Build the exclusion set and the lemmatizer once per call rather than once
  # per token: both are loop-invariant.
  excluded = set(stopwords.words('english') + list(string.punctuation))
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in excluded]
  return ' '.join(tokens)

In [6]:
clean_test = clean_text(test)

In [7]:
clean_test

'mani kanta gogula ph:413 -315-1056 |mgogula46 gmail.com linkedin education m.s data business analytics| university massachusetts amherst |gpa 3.85/4 jan 22 – dec 23 course data science fundamental business intelligence analytics machine learning quantitative analysis data mining b.tech electronics communication engineering| jawaharlal nehru technological university aug 15 june19 relevant coursework mathematical method probability statistic managerial economics financial analysis professional experience business analyst intern product ops analytics experian may 2023 aug 2023 • developed executed sql query aggregate analyze interpret business data enabling comprehensive quantitative qualitative analysis insight support decision -making contributed success experian consumer service ec product service • partnered closely product technology customer care bizops external partner track address resolve customer escalation resulting 25 decrease customer support ticket 10 increase first -call r

### EXTRACTING DATA

In [12]:
import re
import spacy
import spacy_transformers

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [9]:
# Stock spaCy English pipeline; applied to the cleaned text (see `test_doc` and `to_JSON`).
nlp = spacy.load("en_core_web_sm")

In [11]:
# NOTE(review): `import spacy_transformers` sits in an earlier cell of this notebook;
# on a fresh top-to-bottom run this install has to execute before that import.
!pip install spacy-transformers

Collecting spacy-transformers
  Downloading spacy_transformers-1.3.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (197 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/197.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/197.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.9/197.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<4.37.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (313 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.0/

In [13]:
# Second spaCy pipeline, loaded from the mounted Drive folder; extract_education
# filters its entities by the 'DEGREE' label.
# NOTE(review): presumably a custom-trained NER model ('model-best') -- confirm.
nlp_trained = spacy.load('/content/drive/MyDrive/resume-match/output/model-best')

In [14]:
# The stock pipeline runs on the cleaned (lower-cased, stop-word-free) text ...
test_doc = nlp(clean_test)
# ... while the Drive-loaded pipeline runs on the raw extracted text.
test_doc_trained = nlp_trained(test)

In [15]:
def extract_names(nlp_text):  #need some filter
    """Return the text of the first entity labelled 'PERSON'.

    Args:
        nlp_text: An object exposing ``ents``, an iterable of entities that
            each have ``text`` and ``label_`` attributes.

    Returns:
        The text of the first 'PERSON' entity, or ``None`` when there is
        none (indexing ``[0]`` on the empty list would raise ``IndexError``).
    """
    return next((e.text for e in nlp_text.ents if e.label_ == 'PERSON'), None)

In [None]:
extract_names(test_doc)

'mani kanta gogula'

In [16]:
def extract_education(nlp_text):
  """Collect the text of every entity labelled 'DEGREE', in document order.

  Args:
    nlp_text: An object exposing ``ents``, an iterable of entities that each
      have ``text`` and ``label_`` attributes.

  Returns:
    A list of entity texts; empty when no entity carries the 'DEGREE' label.
  """
  degrees = []
  for entity in nlp_text.ents:
    if entity.label_ == 'DEGREE':
      degrees.append(entity.text)
  return degrees

In [None]:
extract_education(test_doc_trained)

['M.S.  Data  & Business  Analytics|',
 'B.Tech.  Electronics  and Communication  Engineering|  Jawaharlal Nehru  Technological  University']

In [17]:
def extract_emails(text):
    """Find e-mail addresses in a piece of text.

    The local part may contain word characters plus ``.``, ``%``, ``+`` and
    ``-``, so an address such as ``john.doe@example.com`` is not truncated to
    ``doe@example.com``. The domain must end in an alphabetic label of at
    least two letters, so punctuation that follows the address (``,``, ``.``,
    ``|`` ...) is not swallowed into the match.

    Args:
        text: The text to search.

    Returns:
        A list of matched addresses in order of appearance (possibly empty).
    """
    email_pattern = r'[\w.%+-]+@(?:[\w-]+\.)+[A-Za-z]{2,}'
    return re.findall(email_pattern, text)

In [None]:
extract_emails(test)

['mgogula46@gmail.com']

In [18]:
def extract_phones(text):
    """Find phone-number-like substrings and reduce each to its digits.

    Args:
        text: The text to search.

    Returns:
        A list with one string per regex match, each containing only the
        numeric characters of that match (separators, '+' and brackets removed).
    """
    phone_pattern = r'[+]?[0-9]{0,3}\s?[(]?[1-9][0-9]{2}[)\-\s]*[0-9]{3}[\-\s]?[0-9]{4}'
    # Strip everything that is not numeric from each raw match.
    return [
        "".join(filter(str.isnumeric, raw_match))
        for raw_match in re.findall(phone_pattern, text)
    ]

In [None]:
extract_phones(test)

['4133151056']

In [19]:
def extract_links(text):
    """Find URL-like substrings in a piece of text.

    A link starts with ``http://``, ``https://`` or ``www.`` and runs up to
    the next whitespace character.

    Args:
        text: The text to search.

    Returns:
        A list of matched substrings in order of appearance (possibly empty).
    """
    link_pattern = r'(?:https?://|www\.)\S+'
    # The pattern has no capturing group, so each hit is the whole match.
    return [hit.group(0) for hit in re.finditer(link_pattern, text)]

In [None]:
extract_links(test)

[]

In [20]:
def extract_entities(nlp_text): #education/previous company
    """Collect the text of every entity labelled 'GPE' or 'ORG'.

    Args:
        nlp_text: An object exposing ``ents``, an iterable of entities that
            each have ``text`` and ``label_`` attributes.

    Returns:
        A list of entity texts in document order (possibly empty).
    """
    wanted_labels = ['GPE', 'ORG']
    found = []
    for entity in nlp_text.ents:
        if entity.label_ in wanted_labels:
            found.append(entity.text)
    return found

In [None]:
extract_entities(test_doc)

['m.s data business', '|gpa', 'ec', 'kanban']

In [None]:
# Section titles that may appear in a resume (title case, as written).
RESUME_SECTIONS = [
    # general sections
    'Contact Information', 'Objective', 'Summary', 'Education',
    'Experience', 'Skills', 'Projects', 'Certifications', 'Licenses',
    'Awards', 'Honors', 'Publications', 'References',
    # skill-related sections
    'Technical Skills', 'Computer Skills', 'Programming Languages',
    'Software Skills', 'Soft Skills', 'Language Skills',
    'Professional Skills', 'Transferable Skills',
    # experience-related sections
    'Work Experience', 'Professional Experience', 'Employment History',
    'Internship Experience', 'Volunteer Experience',
    'Leadership Experience', 'Research Experience', 'Teaching Experience',
]

# Experience-related section titles, stored lower-cased.
EXPERIENCE_SECTIONS = [
    title.lower()
    for title in (
        'Experience',
        'Work Experience',
        'Professional Experience',
        'Employment History',
        'Internship Experience',
        'Volunteer Experience',
        'Leadership Experience',
        'Research Experience',
        'Teaching Experience',
    )
]

In [21]:
def extract_experience(nlp_text):
    """Return everything from the first 'experience' token to the end.

    The match is on the exact token text 'experience'. Nothing ends the
    section: all tokens after the first match are included.

    Args:
        nlp_text: An iterable of tokens, each with a ``text`` attribute.

    Returns:
        The selected token texts joined by single spaces, or an empty string
        when no token equals 'experience'.
    """
    words = [token.text for token in nlp_text]
    try:
        start = words.index('experience')
    except ValueError:
        # No section marker found: nothing to return.
        return ""
    return " ".join(words[start:])

In [None]:
extract_experience(test_doc)

'experience business analyst intern product ops analytics experian may 2023 aug 2023 • developed executed sql query aggregate analyze interpret business data enabling comprehensive quantitative qualitative analysis insight support decision -making contributed success experian consumer service ec product service • partnered closely product technology customer care bizops external partner track address resolve customer escalation resulting 25 decrease customer support ticket 10 increase first -call resolution rate • analyzed 50 + customer escalated ticket monthly identify top 3 product improvement opportunity presented finding senior leadership • instrumental facilitating ad - hoc business analysis delivering invaluable insight strategic recommendation drive optimization strategy collaborated seamlessly stakeholder pinpoint avenue refining product streamlining operational workflow enhancing overall business process • created comprehensive analysis playbook encompassing diverse service le

In [22]:
def extract_position_year(text):
  """Find "<two words>, <year> - <year|present>" spans in a piece of text.

  Example: ``"Data Analyst, 2019 - 2021"``. The range separator may be an
  ASCII hyphen, an en dash or an em dash, and the word "present" is matched
  case-insensitively, so raw resume text such as "2020 – Present" is found.

  Args:
    text: The text to search.

  Returns:
    A list of ``(position, start_year, end)`` tuples, where ``position`` is
    the two words before the comma and ``end`` is a four-digit year or the
    word "present" exactly as written in the text. Empty when nothing matches.
  """
  position_year_search_pattern = (
      r"(\b\w+\b\s+\b\w+\b),\s+(\d{4})\s*[-\u2013\u2014]\s*(\d{4}|\bpresent\b)"
  )
  position_year = re.findall(position_year_search_pattern, text, flags=re.IGNORECASE)
  return position_year

In [None]:
extract_position_year(test)

[]

In [23]:
def extract_keywords(nlp_text):
  """Collect the text of every token tagged as a noun or proper noun.

  Args:
    nlp_text: An iterable of tokens, each with ``text`` and ``pos_`` attributes.

  Returns:
    A list of token texts whose ``pos_`` is 'NOUN' or 'PROPN', in document
    order; duplicates are kept.
  """
  keyword_tags = ['NOUN', 'PROPN']
  keywords = []
  for token in nlp_text:
    if token.pos_ in keyword_tags:
      keywords.append(token.text)
  return keywords

In [None]:
extract_keywords(test_doc)

['mani',
 'gogula',
 'ph:413',
 '-315',
 '|mgogula46',
 'gmail.com',
 'linkedin',
 'education',
 'm.s',
 'data',
 'business',
 'university',
 'massachusetts',
 '|gpa',
 'jan',
 'dec',
 'course',
 'data',
 'science',
 'business',
 'intelligence',
 'analytics',
 'machine',
 'analysis',
 'data',
 'mining',
 'electronics',
 'communication',
 'engineering|',
 'jawaharlal',
 'nehru',
 'university',
 'aug',
 'june19',
 'coursework',
 'method',
 'probability',
 'economics',
 'analysis',
 'experience',
 'business',
 'analyst',
 'intern',
 'product',
 'ops',
 'analytics',
 'aug',
 'query',
 'aggregate',
 'analyze',
 'business',
 'data',
 'analysis',
 'insight',
 'support',
 'decision',
 '-making',
 'success',
 'consumer',
 'service',
 'ec',
 'product',
 'service',
 'product',
 'technology',
 'customer',
 'care',
 'partner',
 'track',
 'address',
 'customer',
 'escalation',
 'decrease',
 'customer',
 'support',
 'ticket',
 'increase',
 '-call',
 'resolution',
 'rate',
 'customer',
 'ticket',
 'pr

In [24]:
def to_JSON(text):
  """Assemble every extracted resume field into one dictionary.

  Despite the name, the result is a plain dict; serialising it is left to
  the caller.

  Args:
    text: The raw resume text.

  Returns:
    A dict with the keys 'resume', 'clean_text', 'name', 'email', 'phone',
    'link', 'keywords', 'education' and 'experience'.
  """
  cleaned = clean_text(text)
  doc = nlp(cleaned)

  record = {'resume': text, 'clean_text': cleaned}
  # Name comes from the pipeline run on the cleaned text.
  record['name'] = extract_names(doc)
  # The regex-based extractors work on the raw text.
  record['email'] = extract_emails(text)
  record['phone'] = extract_phones(text)
  record['link'] = extract_links(text)
  record['keywords'] = extract_keywords(doc)
  # Education uses the second pipeline, applied to the raw text.
  record['education'] = extract_education(nlp_trained(text))
  record['experience'] = extract_experience(doc)
  return record

In [28]:
import json
# Serialise the extracted fields of the sample resume with keys in sorted order.
json_object = json.dumps(to_JSON(test), sort_keys=True)
# Write the JSON string to Drive; "w+" truncates any existing file first.
with open("/content/drive/MyDrive/resume-match/data/extracted_resume.json", "w+") as outfile:
  outfile.write(json_object)