In [1]:
import docx2txt

def get_resume(path="Manan_Bhatt_Resume (1).docx"):
    """Extract the text of a .docx resume as a single space-separated string.

    Args:
        path: Location of the .docx file to read. Defaults to the resume
            file this notebook was originally written against.

    Returns:
        str: The document text with tabs replaced by spaces, empty lines
        dropped, and the remaining lines joined by single spaces.
    """
    raw_text = docx2txt.process(path)
    # Only truly empty lines are skipped; whitespace-only lines are kept.
    cleaned_lines = (
        line.replace('\t', ' ') for line in raw_text.split('\n') if line
    )
    return ' '.join(cleaned_lines)

In [2]:
import spacy
from spacy.matcher import Matcher

# Load spaCy's 'en_core_web_sm' pipeline; the extractor functions in this
# notebook call `nlp` as a module-level global.
nlp = spacy.load('en_core_web_sm')
# Token-pattern matcher bound to the pipeline's vocabulary; get_name
# registers its pattern on this shared instance and runs it.
matcher = Matcher(nlp.vocab)

def get_name(resume):
    """Guess the candidate's name as the first pair of adjacent proper nouns.

    Args:
        resume: Resume text to analyse.

    Returns:
        str | None: Text of the first match reported by the matcher for two
        consecutive tokens tagged PROPN, or None when nothing matches.
    """
    # Register the pattern only once: calling Matcher.add again with an
    # existing key extends its pattern list, so re-adding on every call
    # would accumulate duplicates on the shared module-level matcher.
    if 'NAME' not in matcher:
        matcher.add('NAME', [[{'POS': 'PROPN'}, {'POS': 'PROPN'}]])

    doc = nlp(resume)
    for _match_id, start, end in matcher(doc):
        # Only the first match is used.
        return doc[start:end].text
    return None

In [3]:
import re

def get_email_address(resume):
    """Return the first e-mail-like token found in the resume text.

    Args:
        resume: Resume text to search. Empty or None input yields None.

    Returns:
        str | None: The first ``local@domain.tld``-shaped token with any
        leading/trailing ';' removed, or None when no such token exists.
    """
    if not resume:
        return None
    # Raw string avoids invalid-escape warnings. No part of the address may
    # contain whitespace, '@' or '|', so a match can no longer run across
    # several words and be cut down to a dot-less fragment afterwards.
    match = re.search(r"[^@|\s]+@[^@|\s]+\.[^@|\s]+", resume)
    if match is None:
        return None
    return match.group(0).strip(';')

In [4]:
import pandas as pd
import spacy

def get_skills(resume, skills_file="skill_keywords.csv"):
    """Collect the known skills that are mentioned in the resume text.

    The skill vocabulary is the set of column names of ``skills_file``.
    Both individual non-stop-word tokens and whitespace-stripped noun chunks
    are compared against it using exact, case-sensitive string equality.

    Args:
        resume: Resume text to analyse.
        skills_file: CSV file whose header row lists the known skills.
            Defaults to the file the notebook originally used.

    Returns:
        set[str]: Every token or noun chunk that is a known skill.
    """
    doc = nlp(resume)

    # Only the header is needed, so skip parsing the data rows (nrows=0);
    # a set makes each membership test constant-time.
    known_skills = set(pd.read_csv(skills_file, nrows=0).columns)

    candidates = [token.text for token in doc if not token.is_stop]
    # Noun chunks let multi-word skills match as a single candidate.
    candidates.extend(chunk.text.strip() for chunk in doc.noun_chunks)

    return {candidate for candidate in candidates if candidate in known_skills}

In [5]:
import re
import spacy
from nltk.corpus import stopwords

# NOTE(review): `nlp` is already loaded in an earlier cell (In [2]); this
# rebinds it to a second, freshly loaded pipeline, while `matcher` was built
# from the first pipeline's vocab — confirm the reload is intended.
nlp = spacy.load('en_core_web_sm')

# NLTK's English stop-word list as a set; get_degree uses it to reject
# candidate tokens. Presumably the NLTK 'stopwords' corpus must already be
# downloaded for this to succeed — verify in the target environment.
STOPWORDS = set(stopwords.words('english'))

# Degree abbreviations that get_degree looks for in the resume text.
DEGREE_TYPES = [
            'PhD', 'MBA', 'BA', 'BS', 'MA', 'MS'
        ]

def get_degree(resume):
    """Find degree abbreviations in the resume and an associated year.

    Each whitespace-separated word is stripped of the characters
    ``? | $ . ! ,`` and compared case-insensitively against DEGREE_TYPES;
    words that appear verbatim in STOPWORDS are ignored. For every degree
    found, a four-digit year starting with 19 or 20 is searched for in the
    sentence containing the degree and in the sentence that follows it.

    Args:
        resume: Resume text to analyse.

    Returns:
        list: One entry per distinct degree word, in order of first
        appearance: a ``(degree, year)`` tuple when a year was found,
        otherwise the bare degree string.
    """
    sentences = [sent.text.strip() for sent in nlp(resume).sents]

    # Candidates are upper-cased before comparison, so the reference set
    # must be upper-cased too; otherwise a mixed-case entry never matches.
    known_degrees = {entry.upper() for entry in DEGREE_TYPES}

    degree = {}
    for index, sentence in enumerate(sentences):
        for word in sentence.split():
            word = re.sub(r'[?|$.!,]', '', word)
            if word.upper() in known_degrees and word not in STOPWORDS:
                # A degree in the final sentence has no following sentence.
                if index + 1 < len(sentences):
                    following = sentences[index + 1]
                else:
                    following = ''
                # The space keeps digits from fusing across the boundary.
                degree[word] = (sentence + ' ' + following).strip()

    year_pattern = re.compile(r'(?:20|19)\d{2}')
    degree_year = []
    for degree_name, context in degree.items():
        year = year_pattern.search(context)
        if year:
            degree_year.append((degree_name, year.group(0)))
        else:
            degree_year.append(degree_name)

    return degree_year

In [6]:
# Run every extractor on the resume text.
resume = get_resume()

name = get_name(resume)
email_address = get_email_address(resume)
skills = get_skills(resume)
degree = get_degree(resume)

# Append the extracted fields to the profile file. The context manager
# closes the handle even if a write fails, and an explicit encoding keeps
# non-ASCII characters from depending on the platform's default encoding.
with open("bias_free_profile.txt", "a", encoding="utf-8") as profile:
    # get_name / get_email_address return None when nothing is found, which
    # plain string concatenation would reject with a TypeError.
    profile.write(f"Name: {name or 'N/A'}\n")
    profile.write(f"Email: {email_address or 'N/A'}\n")
    profile.write(f"Skills: {skills}\n")
    profile.write(f"Degree: {degree}\n")