In [1]:
#Importing necessary libraries
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import io 

In [2]:
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as fh:
        # iterate over all pages of PDF document
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            # creating a resoure manager
            resource_manager = PDFResourceManager()
            
            # create a file handle
            fake_file_handle = io.StringIO()
            
            # creating a text converter object
            converter = TextConverter(
                                resource_manager, 
                                fake_file_handle, 
                                codec='utf-8', 
                                laparams=LAParams()
                        )

            # creating a page interpreter
            page_interpreter = PDFPageInterpreter(
                                resource_manager, 
                                converter
                            )

            # process current page
            page_interpreter.process_page(page)
            
            # extract text
            text = fake_file_handle.getvalue()
            yield text

            # close open handles
            converter.close()
            fake_file_handle.close()

# calling above function and extracting text
file_path="try_resume.pdf"
text=''
for page in extract_text_from_pdf(file_path):
    text += ' ' + page

In [3]:
text

" Harsh Gundecha\xa0\n+917567934387 | \u200bharsh.gundecha@gmail.com\xa0\n\nPROFESSIONAL EXPERIENCE\xa0\nVuclip\u200b - Software Engineer | Intern\xa0\n\nPune | JAN 2020 - PRESENT\xa0\n\n● Working on offers/subscription and related areas.\xa0\n●\n\nTech Stack :  Springboot MicroServices, Config Server, Queues, Cloud\xa0\nManaged SQL/NoSql DB, System Design, Design Patterns.\xa0\n\nGridle\u200b - Software Engineer | Intern\xa0\n\nAhmedabad | MAY - JUNE 2020\xa0\n\n● Worked on an investment based project with two other team\xa0\n\nmembers. My contribution was to make APIs and general\xa0\nimprovement to the system.\xa0\nTech Stack : Django with Rest API Framework, Docker, Postman,\xa0\nPandas\xa0\n\n●\n\nHacktofni Infotech\u200b - Jr. Web dev. | Tutor\xa0\n\nSurat | MAY 2017 - MAY 2018\xa0\n\n●\n\n●\n\nJoined as an intern at the beginning of  final year of bachelor's degree\xa0\n& learned PHP & related techs ahead of college course.\xa0\nLater I learned the Codeigniter framework & used t

In [4]:
import spacy
from spacy.matcher import Matcher

# load pre-trained model
nlp = spacy.load('en_core_web_sm')

# initialize matcher with a vocab
matcher = Matcher(nlp.vocab)

def extract_name(resume_text):
    nlp_text = nlp(resume_text)
    
    # First name and Last name are always Proper Nouns
    pattern = [[{'POS': 'PROPN'}, {'POS': 'PROPN'}]]
    
    matcher.add(resume_text,pattern)
    #matcher.add(pattern)
    
    matches = matcher(nlp_text)
    
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return span.text

In [5]:
ex_text=extract_name(text)#calling a function
ex_text

'Harsh Gundecha'

In [6]:
import re

def extract_mobile_number(text):
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), text)
    
    if phone:
        number = ''.join(phone[0])
        if len(number) > 10:
            return '+' + number
        else:
            return number

In [7]:
ex_mob=extract_mobile_number(text)
ex_mob

'9175679343'

In [8]:
import re

def extract_email(email):
    email = re.findall("([^@|\s]+@[^@]+\.[^@|\s]+)", email)
    if email:
        try:
            return email[0].split()[0].strip(';')
        except IndexError:
            return None

In [9]:
ex_email=extract_email(text)
ex_email

'\u200bharsh.gundecha@gmail.com'

In [10]:
import pandas as pd
import spacy

# load pre-trained model
nlp = spacy.load('en_core_web_sm')
noun_chunks = nlp(text)

def extract_skills(resume_text):
    nlp_text = nlp(resume_text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    
    # reading the csv file
    data = pd.read_csv("skills.csv") 
    print(data.columns.values[0].split(','))
    # extract values
    skills =data.columns.values[0].split(',')
    #print(skills)
    skillset = []
    
    # check for one-grams (example: python)
    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)
    
    # check for bi-grams and tri-grams (example: machine learning)
    for token in noun_chunks:
        token = token.text.lower().strip()
        if token in skills:
            skillset.append(token)
    
    return [i.capitalize() for i in set([i.lower() for i in skillset])]


In [11]:
ex_skills=extract_skills(text)
ex_skills

['technical skills', 'ajenti', 'django-suit', 'django-xadmin', 'flask-admin', 'flower', 'grappelli', 'wooey', 'algorithms', 'pypattyrn', 'python-patterns', 'sortedcontainers', 'django-simple-captcha', 'django-simple-spam-blocker', 'django-compressor', 'django-pipeline', 'django-storages', 'fanstatic', 'fileconveyor', 'flask-assets', 'jinja-assets-compressor', 'webassets', 'audiolazy', 'audioread', 'beets', 'dejavu', 'django-elastic-transcoder', 'eyed3', 'id3reader', 'm3u8', 'mingus', 'pyaudioanalysis', 'pydub', 'pyechonest', 'talkbox', 'timeside', 'tinytag', 'authomatic', 'django-allauth', 'django-oauth-toolkit', 'flask-oauthlib', 'oauthlib', 'python-oauth2', 'python-social-auth', 'rauth', 'sanction', 'jose', 'pyjwt', 'python-jws', 'python-jwt', 'bitbake', 'buildout', 'platformio', 'pybuilder', 'scons', 'django-cms', 'djedi-cms', 'feincms', 'kotti', 'mezzanine', 'opps', 'plone', 'quokka', 'wagtail', 'widgy', 'beaker', 'diskcache', 'django-cache-machine', 'django-cacheops', 'django-view

['Config',
 'Django',
 'Mysql',
 'Apis',
 'Twitter',
 'Rest',
 'Nosql',
 'Javascript',
 'Docker',
 'Sql',
 'Postgresql',
 'Cloud',
 'System',
 'Algorithms',
 'Java',
 'Api',
 'Content',
 'Pandas',
 'Js',
 'Design',
 'Improvement',
 'Php']

In [12]:
import re
import spacy
from nltk.corpus import stopwords

# load pre-trained model
nlp = spacy.load('en_core_web_sm')

# Grad all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education Degrees
EDUCATION = [
            'BE','B.E.', 'B.E', 'BS', 'B.S','BCA','BTech', 
            'ME', 'M.E', 'M.E.', 'MS', 'M.S','MScIT', 
            'BTECH', 'B.Tech', 'M.TECH', 'MTECH', 
            'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII'
        ]

def extract_education(resume_text):
    #nlp_text = nlp(resume_text)
    document = nlp(resume_text)
    nlp_text=[sent.text.strip() for sent in document.sents]

    edu = []
    # Extract education degree
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Replace all special symbols
            tex = re.sub(r'[?|$|.|!|,]', r'', tex)
            tex=str(tex)
            print(tex)
                
            if tex in EDUCATION:
                edu.append(tex)

    
    # Extract year
#     education = []
#     for key in edu.keys():
#         year = re.search(re.compile(r'(((20|19)(\d{2})))'), edu[key])
#         if year:
#             education.append((key, ''.join(year[0])))
#         else:
#             education.append(key)
    return edu

In [13]:
ex_ed=extract_education(text)
ex_ed

Harsh
Gundecha
+917567934387

​harshgundecha@gmailcom
PROFESSIONAL
EXPERIENCE
Vuclip​
-
Software
Engineer

Intern
Pune

JAN
2020
-
PRESENT
●
Working
on
offers/subscription
and
related
areas
●
Tech
Stack
:
Springboot
MicroServices
Config
Server
Queues
Cloud
Managed
SQL/NoSql
DB
System
Design
Design
Patterns
Gridle​
-
Software
Engineer

Intern
Ahmedabad

MAY
-
JUNE
2020
●
Worked
on
an
investment
based
project
with
two
other
team
members
My
contribution
was
to
make
APIs
and
general
improvement
to
the
system
Tech
Stack
:
Django
with
Rest
API
Framework
Docker
Postman
Pandas
●
Hacktofni
Infotech​
-
Jr
Web
dev

Tutor
Surat

MAY
2017
-
MAY
2018
●
●
Joined
as
an
intern
at
the
beginning
of
final
year
of
bachelor's
degree
&
learned
PHP
&
related
techs
ahead
of
college
course
Later
I
learned
the
Codeigniter
framework
&
used
the
same
for
clients
projects
as
well
as
guided
~​50​
final
year
students
in
multiple
batches
along
with
developing
my
own
graduation
project(e-learn)
PROJECTS
E-learn​
-
PHP



['MScIT', 'BCA']

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
import docx2txt
cv=CountVectorizer()
jd=docx2txt.process('rp2.docx')

In [15]:
text=[text,jd]
count_matrix=cv.fit_transform(text)

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
print('Similarity Score: ')
print(cosine_similarity(count_matrix))
matchpercentage=cosine_similarity(count_matrix)[0][1]*100
matchpercentage=round(matchpercentage,2)
print('Your resume matches about '+str(matchpercentage)+ '% of the job description')

Similarity Score: 
[[1.        0.3930612]
 [0.3930612 1.       ]]
Your resume matches about 39.31% of the job description
