### Extracting text from a PDF file

In [None]:
!pip install pdfminer.six

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pdfminer.six
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 26.4 MB/s 
[?25hCollecting cryptography>=36.0.0
  Downloading cryptography-38.0.3-cp36-abi3-manylinux_2_24_x86_64.whl (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 71.0 MB/s 
Installing collected packages: cryptography, pdfminer.six
Successfully installed cryptography-38.0.3 pdfminer.six-20221105


In [None]:
from pdfminer.high_level import extract_text
 
def extract_text_from_pdf(pdf_path):
    """Return the text of the PDF at ``pdf_path``.

    Thin wrapper around ``pdfminer.high_level.extract_text``; the string
    produced by that call is returned unchanged.
    """
    pdf_text = extract_text(pdf_path)
    return pdf_text
 
txt = extract_text_from_pdf("/content/Shraddha-Surywanshi-Resume-Final.pdf")
txt

'                   SHRADDHA SURYAWANSHI \n\nshraddha3m@gmail.com     8378032152 \n\nGithub \n\n           LinkedIn   \n\nEDUCATION  \n\nSYMBIOSIS INSTITUTE OF TECHNOLOGY   \n\nB.TECH in Information Technology  \n (CGPA: 8.363/10.0)  \n\nPROJECTS  \n\nNGO EVENT MANAGEMENT WEBSITE,  \nCo-developer  \n\nPune, MH, IN  \n\n2019-2023  \n\nApril, 2020   \n\n•  A website where an NGO can better reach donors by advertising their clothes/books donation drives.   \n\n•  Makes it convenient for donors by allowing them to request door step pickup to the NGO.  \n\n•  Uses JSP, Servlets, MySQL, HTML, CSS, JS.  \n\nDATA VISUALISATION WEBSITE Co-developer  \n\nOngoing  \n\nOctober 2021-   \n\n•  Helps an NGO to view their kitchen operations (material used and bought per month) efficiency visually in the \n\nform of simple charts.  \n\n•  Can help analyse and adjust raw materials usage and thereby reduce wastage.  \n\n•  Uses React, Bootstrap, node.js, MySQL.  \n\nETCH-A-SKETCH  \nDeveloper  \n\n•  A s

### Extracting text from a word document

In [None]:
!pip install docx2txt
import docx2txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3981 sha256=c1fd937350784a6e55a3ba2c4e59cfeac31fbbc0d7b8908fe4305f3a0f15f853
  Stored in directory: /root/.cache/pip/wheels/b7/20/b2/473e3aea9a0c0d3e7b2f7bd81d06d0794fec12752733d1f3a8
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8


In [None]:
def doctotext(m):
    """Flatten the document at path ``m`` into one space-separated string.

    The text returned by ``docx2txt.process`` is split on newlines, empty
    lines are dropped, tabs are replaced by single spaces, and the remaining
    lines are joined with a single space.
    """
    raw_text = docx2txt.process(m)

    cleaned_lines = []
    for raw_line in raw_text.split('\n'):
        if not raw_line:
            # skip empty lines so they do not add extra separators
            continue
        cleaned_lines.append(raw_line.replace('\t', ' '))

    return ' '.join(cleaned_lines)

txt = doctotext('/content/sample_data/Shraddha-Surywanshi-Resume-Final.docx')
txt

'SHRADDHA SURYAWANSHI                               shraddha3m@gmail.com     8378032152          Github           LinkedIn                                                SHRADDHA SURYAWANSHI                               shraddha3m@gmail.com     8378032152          Github           LinkedIn      EDUCATION      SYMBIOSIS INSTITUTE OF TECHNOLOGY   Pune, MH, IN  B.TECH in Information Technology    (CGPA: 8.363/10.0)  2019-2023    PROJECTS     NGO EVENT MANAGEMENT WEBSITE,  April, 2020   Co-developer  A website where an NGO can better reach donors by advertising their clothes/books donation drives.   Makes it convenient for donors by allowing them to request door step pickup to the NGO.  Uses JSP, Servlets, MySQL, HTML, CSS, JS.    DATA VISUALISATION WEBSITE Co-developer  Ongoing  October 2021-   Helps an NGO to view their kitchen operations (material used and bought per month) efficiency visually in the form of simple charts.  Can help analyse and adjust raw materials usage and thereby re

### Extracting names using NLTK

In [None]:
!pip install nltk
!pip install numpy # (also required by nltk, for running the following code)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import spacy
import nltk

nltk.download("punkt")
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
def extract_names(txt):
    """Collect every chunk that NLTK's named-entity chunker labels PERSON.

    ``txt`` is split into sentences; each sentence is word-tokenized,
    POS-tagged and chunked. The tokens of each PERSON chunk are joined with
    spaces. Returns the resulting strings in order of appearance (duplicates
    are kept).
    """
    person_names = []

    for sentence in nltk.sent_tokenize(txt):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.ne_chunk(tagged_tokens):
            # Only nodes exposing label() can carry an entity type.
            is_person = hasattr(node, 'label') and node.label() == 'PERSON'
            if not is_person:
                continue
            # Each leaf is indexable; element 0 is the token text.
            words = [leaf[0] for leaf in node.leaves()]
            person_names.append(' '.join(words))

    return person_names
 

person_names = extract_names(txt)
person_names

['Github LinkedIn SHRADDHA',
 'Github LinkedIn',
 'React',
 'Bootstrap',
 'Python',
 'React',
 'Agile',
 'Udemy Software',
 'Harvard']

In [None]:
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Extracting names using spaCy

In [None]:
from spacy.matcher import Matcher

def extract_name(text):
    """Return the first pair of consecutive proper nouns in ``text``.

    The text is parsed with the ``en_core_web_sm`` spaCy pipeline and scanned
    with a ``Matcher`` for two adjacent tokens whose POS is ``PROPN`` (treated
    here as "first name, last name"). The text of the first match is
    returned; ``None`` is returned when nothing matches.
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    name_matcher = Matcher(nlp.vocab)
    # first name + last name = two proper nouns in a row
    name_matcher.add('NAME', [[{'POS': 'PROPN'}, {'POS': 'PROPN'}]])

    hits = name_matcher(doc)
    if not hits:
        return None

    _, first_start, first_end = hits[0]
    return doc[first_start:first_end].text
print('Name: ',extract_name(txt))

Name:  SHRADDHA SURYAWANSHI


In [None]:
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize
import nltk

!wget 'https://nlp.stanford.edu/software/stanford-ner-2018-10-16.zip'
!unzip stanford-ner-2018-10-16.zip

nltk.download('punkt')



--2022-11-13 18:00:54--  https://nlp.stanford.edu/software/stanford-ner-2018-10-16.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://downloads.cs.stanford.edu/nlp/software/stanford-ner-2018-10-16.zip [following]
--2022-11-13 18:00:54--  https://downloads.cs.stanford.edu/nlp/software/stanford-ner-2018-10-16.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 180358328 (172M) [application/zip]
Saving to: ‘stanford-ner-2018-10-16.zip’


2022-11-13 18:01:25 (5.72 MB/s) - ‘stanford-ner-2018-10-16.zip’ saved [180358328/180358328]

Archive:  stanford-ner-2018-10-16.zip
   creating: stanford-ner-2018-10-16/
  inflating: stanford-ner-2018-10-16/R

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Build a Stanford NER tagger from the classifier file and the jar located
# under /content/stanford-ner-2018-10-16 (the directory unpacked by the
# download cell above).
# NOTE(review): the tagger presumably needs a Java runtime on the machine - confirm.
st = StanfordNERTagger('/content/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
                       '/content/stanford-ner-2018-10-16/stanford-ner.jar',
                       encoding='utf-8')


# Tokenize the resume text held in `txt`, tag every token, then print both
# the token list and the tagger's output for inspection.
tokenized_text = word_tokenize(txt)
classified_text = st.tag(tokenized_text)
print(tokenized_text)
print(classified_text)

['SHRADDHA', 'SURYAWANSHI', 'shraddha3m', '@', 'gmail.com', '8378032152', 'Github', 'LinkedIn', 'SHRADDHA', 'SURYAWANSHI', 'shraddha3m', '@', 'gmail.com', '8378032152', 'Github', 'LinkedIn', 'EDUCATION', 'SYMBIOSIS', 'INSTITUTE', 'OF', 'TECHNOLOGY', 'Pune', ',', 'MH', ',', 'IN', 'B.TECH', 'in', 'Information', 'Technology', '(', 'CGPA', ':', '8.363/10.0', ')', '2019-2023', 'PROJECTS', 'NGO', 'EVENT', 'MANAGEMENT', 'WEBSITE', ',', 'April', ',', '2020', 'Co-developer', 'A', 'website', 'where', 'an', 'NGO', 'can', 'better', 'reach', 'donors', 'by', 'advertising', 'their', 'clothes/books', 'donation', 'drives', '.', 'Makes', 'it', 'convenient', 'for', 'donors', 'by', 'allowing', 'them', 'to', 'request', 'door', 'step', 'pickup', 'to', 'the', 'NGO', '.', 'Uses', 'JSP', ',', 'Servlets', ',', 'MySQL', ',', 'HTML', ',', 'CSS', ',', 'JS', '.', 'DATA', 'VISUALISATION', 'WEBSITE', 'Co-developer', 'Ongoing', 'October', '2021-', 'Helps', 'an', 'NGO', 'to', 'view', 'their', 'kitchen', 'operations', '

### Extracting phone number

In [None]:
import re
import subprocess  
 
# Compiled once instead of on every call. The pattern has six capture groups:
# three optional prefix groups, a 3-digit group, a 4-digit group and an
# optional extension (introduced by '#', 'x', 'ext' or 'extension').
PHONE_REG = re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?')


def extract_mobile_number(resume_text):
    """Return the first phone number found in ``resume_text``.

    Args:
        resume_text: Text to search. ``None`` or an empty string yields ``None``.

    Returns:
        The captured digit groups of the first match concatenated into one
        string (separators such as spaces, dots and dashes are not part of
        any group and are therefore dropped), or ``None`` if nothing matches.
    """
    if not resume_text:
        return None

    matches = PHONE_REG.findall(resume_text)
    if not matches:
        return None

    # findall yields one tuple of capture groups per match; groups that did
    # not participate are '' and vanish in the join. The previous
    # "len(number) > 10" branch returned the same value on both arms, so it
    # has been removed.
    return ''.join(matches[0])

print(extract_mobile_number(txt))

8378032152


### Extracting email ids

In [None]:
import re
 
from pdfminer.high_level import extract_text
 
# Case-insensitive: the character classes only list a-z, so without the flag
# an address containing capitals (e.g. "John.Doe@Example.COM") is not matched.
EMAIL_REG = re.compile(r'[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+', re.IGNORECASE)


def extract_emails(resume_text):
    """Return every e-mail address found in ``resume_text``.

    Args:
        resume_text: Text to search.

    Returns:
        A list of the matched addresses in order of appearance, with their
        original casing; duplicates are kept. Empty when nothing matches.
    """
    return EMAIL_REG.findall(resume_text)

extract_emails(txt)

['shraddha3m@gmail.com', 'shraddha3m@gmail.com']

In [None]:
!pip install requests

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Extracting skills using the APILayer Skills API

In [None]:
import nltk
import requests
 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def skill_exists(skill, api_key=None):
    """Check whether ``skill`` is a known skill according to the APILayer Skills API.

    Args:
        skill: Candidate skill name; sent as the ``q`` query parameter.
        api_key: APILayer key. When omitted, the ``APILAYER_API_KEY``
            environment variable is used. The key is deliberately not stored
            in the notebook: a committed key is readable by anyone who can
            see the file.

    Returns:
        ``True`` when the request succeeds with status 200, the response is a
        non-empty sequence, and its first entry equals ``skill`` ignoring
        case; ``False`` for any other 200 response.

    Raises:
        RuntimeError: if no API key is available, or if the service answers
            with a non-200 status (the message comes from the response's
            ``message`` field when present, otherwise the raw body).
    """
    import os  # local import keeps this cell self-contained

    if api_key is None:
        api_key = os.environ.get('APILAYER_API_KEY')
    if not api_key:
        raise RuntimeError(
            'No API key: pass api_key= or set the APILAYER_API_KEY environment variable'
        )

    # `params` lets requests URL-encode the query (spaces, '&', '#', ...),
    # and the timeout stops a stalled connection from hanging the notebook.
    response = requests.get(
        'https://api.apilayer.com/skills',
        params={'q': skill},
        headers={'apikey': api_key},
        timeout=10,
    )

    # Error responses are not guaranteed to be JSON; do not let the decode
    # error hide the real status.
    try:
        result = response.json()
    except ValueError:
        result = None

    if response.status_code == 200:
        return bool(result) and result[0].lower() == skill.lower()

    message = result.get('message') if isinstance(result, dict) else response.text
    raise RuntimeError(message)

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def extract_skills(input_text):
    """Return the set of words and 2-/3-word phrases in ``input_text`` that are known skills.

    The text is word-tokenized; English stop words (exact, case-sensitive
    match) and non-alphabetic tokens are removed. Every remaining token and
    every bigram/trigram built from the remaining tokens is lower-cased and
    checked with ``skill_exists``.

    Args:
        input_text: Text to scan.

    Returns:
        A set containing each matching token/phrase in its original casing
        (so both ``'Data'`` and ``'DATA'`` can appear).

    Raises:
        Whatever ``skill_exists`` raises when a lookup fails.
    """
    # A set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))

    # Drop stop words, then punctuation/number tokens.
    filtered_tokens = [
        word for word in word_tokenize(input_text)
        if word not in stop_words and word.isalpha()
    ]

    # Bigrams and trigrams (such as "artificial intelligence").
    bigrams_trigrams = [' '.join(gram) for gram in nltk.everygrams(filtered_tokens, 2, 3)]

    # Each lookup is a network call, so query every distinct lower-cased
    # candidate once and reuse the answer for repeats and case variants.
    lookup_cache = {}
    found_skills = set()
    for candidate in filtered_tokens + bigrams_trigrams:
        key = candidate.lower()
        if key not in lookup_cache:
            lookup_cache[key] = skill_exists(key)
        if lookup_cache[key]:
            found_skills.add(candidate)

    return found_skills

extract_skills(txt)

['SHRADDHA SURYAWANSHI', 'SHRADDHA SURYAWANSHI Github', 'SURYAWANSHI Github', 'SURYAWANSHI Github LinkedIn', 'Github LinkedIn', 'Github LinkedIn SHRADDHA', 'LinkedIn SHRADDHA', 'LinkedIn SHRADDHA SURYAWANSHI', 'SHRADDHA SURYAWANSHI', 'SHRADDHA SURYAWANSHI Github', 'SURYAWANSHI Github', 'SURYAWANSHI Github LinkedIn', 'Github LinkedIn', 'Github LinkedIn EDUCATION', 'LinkedIn EDUCATION', 'LinkedIn EDUCATION SYMBIOSIS', 'EDUCATION SYMBIOSIS', 'EDUCATION SYMBIOSIS INSTITUTE', 'SYMBIOSIS INSTITUTE', 'SYMBIOSIS INSTITUTE OF', 'INSTITUTE OF', 'INSTITUTE OF TECHNOLOGY', 'OF TECHNOLOGY', 'OF TECHNOLOGY Pune', 'TECHNOLOGY Pune', 'TECHNOLOGY Pune MH', 'Pune MH', 'Pune MH IN', 'MH IN', 'MH IN Information', 'IN Information', 'IN Information Technology', 'Information Technology', 'Information Technology CGPA', 'Technology CGPA', 'Technology CGPA PROJECTS', 'CGPA PROJECTS', 'CGPA PROJECTS NGO', 'PROJECTS NGO', 'PROJECTS NGO EVENT', 'NGO EVENT', 'NGO EVENT MANAGEMENT', 'EVENT MANAGEMENT', 'EVENT MANAGE

{'Agile',
 'Bootstrap',
 'CSS',
 'Can',
 'Coursera',
 'DATA',
 'Data',
 'Data structures',
 'EDUCATION',
 'EVENT MANAGEMENT',
 'Github',
 'HTML',
 'Information Technology',
 'Java',
 'JavaScript',
 'LinkedIn',
 'MANAGEMENT',
 'MySQL',
 'NGO',
 'Networking',
 'OS',
 'PROJECTS',
 'Python',
 'SKILLS',
 'Servlets',
 'Software',
 'Software development',
 'TECHNOLOGY',
 'Technology',
 'Udemy',
 'VISUALISATION',
 'WEBSITE',
 'advertising',
 'agile',
 'cells',
 'development',
 'donors',
 'drives',
 'form',
 'materials',
 'operations',
 'raw materials',
 'reach',
 'software',
 'software development',
 'step',
 'structures',
 'website'}

In [None]:
# OUTPUT FOR SKILLS

# ['SHRADDHA SURYAWANSHI', 'SHRADDHA SURYAWANSHI Github', 'SURYAWANSHI Github', 'SURYAWANSHI Github LinkedIn', 'Github LinkedIn', 'Github LinkedIn EDUCATION', 'LinkedIn EDUCATION', 'LinkedIn EDUCATION SYMBIOSIS', 'EDUCATION SYMBIOSIS', 'EDUCATION SYMBIOSIS INSTITUTE', 'SYMBIOSIS INSTITUTE', 'SYMBIOSIS INSTITUTE OF', 'INSTITUTE OF', 'INSTITUTE OF TECHNOLOGY', 'OF TECHNOLOGY', 'OF TECHNOLOGY Information', 'TECHNOLOGY Information', 'TECHNOLOGY Information Technology', 'Information Technology', 'Information Technology CGPA', 'Technology CGPA', 'Technology CGPA PROJECTS', 'CGPA PROJECTS', 'CGPA PROJECTS NGO', 'PROJECTS NGO', 'PROJECTS NGO EVENT', 'NGO EVENT', 'NGO EVENT MANAGEMENT', 'EVENT MANAGEMENT', 'EVENT MANAGEMENT WEBSITE', 'MANAGEMENT WEBSITE', 'MANAGEMENT WEBSITE Pune', 'WEBSITE Pune', 'WEBSITE Pune MH', 'Pune MH', 'Pune MH IN', 'MH IN', 'MH IN April', 'IN April', 'IN April A', 'April A', 'April A website', 'A website', 'A website NGO', 'website NGO', 'website NGO better', 'NGO better', 'NGO better reach', 'better reach', 'better reach donors', 'reach donors', 'reach donors advertising', 'donors advertising', 'donors advertising donation', 'advertising donation', 'advertising donation drives', 'donation drives', 'donation drives Makes', 'drives Makes', 'drives Makes convenient', 'Makes convenient', 'Makes convenient donors', 'convenient donors', 'convenient donors allowing', 'donors allowing', 'donors allowing request', 'allowing request', 'allowing request door', 'request door', 'request door step', 'door step', 'door step pickup', 'step pickup', 'step pickup NGO', 'pickup NGO', 'pickup NGO Uses', 'NGO Uses', 'NGO Uses JSP', 'Uses JSP', 'Uses JSP Servlets', 'JSP Servlets', 'JSP Servlets MySQL', 'Servlets MySQL', 'Servlets MySQL HTML', 'MySQL HTML', 'MySQL HTML CSS', 'HTML CSS', 'HTML CSS JS', 'CSS JS', 'CSS JS DATA', 'JS DATA', 'JS DATA VISUALISATION', 'DATA VISUALISATION', 'DATA VISUALISATION WEBSITE', 'VISUALISATION WEBSITE', 'VISUALISATION WEBSITE Ongoing', 
'WEBSITE Ongoing', 'WEBSITE Ongoing October', 'Ongoing October', 'Ongoing October Helps', 'October Helps', 'October Helps NGO', 'Helps NGO', 'Helps NGO view', 'NGO view', 'NGO view kitchen', 'view kitchen', 'view kitchen operations', 'kitchen operations', 'kitchen operations material', 'operations material', 'operations material used', 'material used', 'material used bought', 'used bought', 'used bought per', 'bought per', 'bought per month', 'per month', 'per month efficiency', 'month efficiency', 'month efficiency visually', 'efficiency visually', 'efficiency visually form', 'visually form', 'visually form simple', 'form simple', 'form simple charts', 'simple charts', 'simple charts Can', 'charts Can', 'charts Can help', 'Can help', 'Can help analyse', 'help analyse', 'help analyse adjust', 'analyse adjust', 'analyse adjust raw', 'adjust raw', 'adjust raw materials', 'raw materials', 'raw materials usage', 'materials usage', 'materials usage thereby', 'usage thereby', 'usage thereby reduce', 'thereby reduce', 'thereby reduce wastage', 'reduce wastage', 'reduce wastage Uses', 'wastage Uses', 'wastage Uses React', 'Uses React', 'Uses React Bootstrap', 'React Bootstrap', 'React Bootstrap MySQL', 'Bootstrap MySQL', 'Bootstrap MySQL Developer', 'MySQL Developer', 'MySQL Developer A', 'Developer A', 'Developer A like', 'A like', 'A like software', 'like software', 'like software grid', 'software grid', 'software grid practice', 'grid practice', 'grid practice DOM', 'practice DOM', 'practice DOM manipulation', 'DOM manipulation', 'DOM manipulation using', 'manipulation using', 'manipulation using JavaScript', 'using JavaScript', 'using JavaScript Grid', 'JavaScript Grid', 'JavaScript Grid cells', 'Grid cells', 'Grid cells change', 'cells change', 'cells change desired', 'change desired', 'change desired colour', 'desired colour', 'desired colour hovered', 'colour hovered', 'colour hovered Uses', 'hovered Uses', 'hovered Uses JavaScript', 'Uses JavaScript', 'Uses 
JavaScript HTML', 'JavaScript HTML', 'JavaScript HTML CSS', 'HTML CSS', 'HTML CSS SKILLS', 'CSS SKILLS', 'CSS SKILLS AND', 'SKILLS AND', 'SKILLS AND RELEVANT', 'AND RELEVANT', 'AND RELEVANT COURSEWORK', 'RELEVANT COURSEWORK', 'RELEVANT COURSEWORK Java', 'COURSEWORK Java', 'COURSEWORK Java JSP', 'Java JSP', 'Java JSP Servlets', 'JSP Servlets', 'JSP Servlets MySQL', 'Servlets MySQL', 'Servlets MySQL Python', 'MySQL Python', 'MySQL Python HTML', 'Python HTML', 'Python HTML CSS', 'HTML CSS', 'HTML CSS JS', 'CSS JS', 'CSS JS React', 'JS React', 'JS React OS', 'React OS', 'React OS Networking', 'OS Networking', 'OS Networking Data', 'Networking Data', 'Networking Data structures', 'Data structures', 'Data structures Agile', 'structures Agile', 'structures Agile software', 'Agile software', 'Agile software development', 'software development', 'software development DBMS', 'development DBMS', 'development DBMS CERTIFICATIONS', 'DBMS CERTIFICATIONS', 'DBMS CERTIFICATIONS Udemy', 'CERTIFICATIONS Udemy', 'CERTIFICATIONS Udemy Software', 'Udemy Software', 'Udemy Software development', 'Software development', 'Software development agile', 'development agile', 'development agile practices', 'agile practices', 'agile practices Coursera', 'practices Coursera', 'practices Coursera Python', 'Coursera Python', 'Coursera Python Everybody', 'Python Everybody', 'Python Everybody Coursera', 'Everybody Coursera', 'Everybody Coursera Harvard', 'Coursera Harvard', 'Coursera Harvard complete', 'Harvard complete', 'Harvard complete guide', 'complete guide', 'complete guide Udemy', 'guide Udemy', 'guide Udemy ongoing', 'Udemy ongoing']

# {'Agile',
#  'Bootstrap',
#  'CSS',
#  'Can',
#  'Coursera',
#  'DATA',
#  'Data',
#  'Data structures',
#  'EDUCATION',
#  'EVENT MANAGEMENT',
#  'Github',
#  'HTML',
#  'Information Technology',
#  'Java',
#  'JavaScript',
#  'LinkedIn',
#  'MANAGEMENT',
#  'MySQL',
#  'NGO',
#  'Networking',
#  'OS',
#  'PROJECTS',
#  'Python',
#  'SKILLS',
#  'Servlets',
#  'Software',
#  'Software development',
#  'TECHNOLOGY',
#  'Technology',
#  'Udemy',
#  'VISUALISATION',
#  'WEBSITE',
#  'advertising',
#  'agile',
#  'cells',
#  'development',
#  'donors',
#  'drives',
#  'form',
#  'materials',
#  'operations',
#  'raw materials',
#  'reach',
#  'software',
#  'software development',
#  'step',
#  'structures',
#  'website'}

### Extracting qualifications

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import re
from nltk.corpus import stopwords


# Grab all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education Degrees
# Degree / board abbreviations recognised as education keywords.
# 'SSC' and 'HSC' are separate entries: without the comma between them Python
# concatenates the adjacent literals into the single string 'SSCHSC'.
EDUCATION = [
    'BE', 'B.E.', 'B.E', 'BS', 'B.S',
    'ME', 'M.E', 'M.E.', 'M.B.A', 'MBA', 'MS', 'M.S',
    'BTECH', 'B.TECH', 'M.TECH', 'MTECH',
    'SSLC', 'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII',
]

def extract_education(resume_text):
    """Find degree abbreviations in ``resume_text`` and, where possible, a nearby year.

    The text is split into sentences with the ``en_core_web_sm`` spaCy
    pipeline. Each whitespace-separated token has the characters
    ``? | $ . ! ,`` removed; it counts as a degree when its upper-cased form
    is in ``EDUCATION`` and the token itself is not in ``STOPWORDS``. A year
    (four digits starting with 19 or 20) is then searched for in the sentence
    containing the degree plus the sentence after it.

    Args:
        resume_text: Text to scan.

    Returns:
        A list with one entry per distinct degree token: ``(degree, year)``
        when a year was found, otherwise just ``degree``.
    """
    nlp = spacy.load("en_core_web_sm")
    sentences = [sent.text.strip() for sent in nlp(resume_text).sents]

    # degree token -> text in which to look for the year
    edu = {}
    for index, sentence in enumerate(sentences):
        # The last sentence has no successor; indexing past the end used to
        # raise IndexError when a degree appeared there.
        following = sentences[index + 1] if index + 1 < len(sentences) else ''
        for token in sentence.split():
            # Strip special symbols so e.g. "B.TECH," is compared as "BTECH".
            token = re.sub(r'[?|$|.|!|,]', r'', token)
            if token.upper() in EDUCATION and token not in STOPWORDS:
                edu[token] = f'{sentence} {following}'

    # Four-digit year not embedded in a longer digit run. The previous
    # pattern used "\d{}", which matches a literal "{}" and so never found
    # a year.
    year_pattern = re.compile(r'(?<!\d)(?:19|20)\d{2}(?!\d)')

    education = []
    for degree, context in edu.items():
        year = year_pattern.search(context)
        if year:
            education.append((degree, year.group(0)))
        else:
            education.append(degree)
    return education
print('Qualification: ',extract_education(txt))

Qualification:  ['BTECH']


### Extracting universities

In [None]:
import pandas as pd

In [None]:
def extract_university(text, file):
    """Return the university names from a CSV that occur in ``text``.

    Args:
        text: Text to search (compared case-insensitively, with runs of
            spaces collapsed to one).
        file: Path or file-like object accepted by ``pandas.read_csv``. It is
            read without a header row and column 1 is used as the list of
            names.

    Returns:
        A list of the lower-cased names found in ``text``, in CSV order.
        A name listed several times in the CSV is reported several times.
    """
    df = pd.read_csv(file, header=None)
    # Missing cells come back as NaN (a float); skip them so .lower() is safe.
    universities = [str(name).lower() for name in df[1].dropna()]

    # Normalise the text once rather than once per university.
    haystack = re.sub(' +', ' ', text.lower())

    # Names are data, not patterns: a plain substring test avoids treating
    # characters such as "(", "+" or "." in a name as regex syntax.
    return [name for name in universities if name in haystack]
extract_university(txt, '/content/sample_data/world-universities.csv')

[]