In [103]:
import pandas as pd
import re
import nltk
import spacy
from num2words import num2words
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import pos_tag
from spacy.pipeline import EntityRecognizer

In [104]:
# Read the JSON file into a DataFrame
df = pd.read_json('Resume.json', lines=True)

# Save the DataFrame to CSV
df.to_csv('dataframe.csv', index=None)

# Print the shape of the dataset
print(f"Shape of the dataset: {df.shape}")

# Display the first few rows of the DataFrame
df.head()

Shape of the dataset: (200, 2)


Unnamed: 0,content,annotation
0,Govardhana K\nSenior Software Engineer\n\nBeng...,"[{'label': ['Companies worked at'], 'points': ..."
1,"Harini Komaravelli\nTest Analyst at Oracle, Hy...","[{'label': ['Companies worked at'], 'points': ..."
2,Hartej Kathuria\nData Analyst Intern - Oracle ...,"[{'label': ['Skills'], 'points': [{'start': 22..."
3,Ijas Nizamuddin\nAssociate Consultant - State ...,"[{'label': ['Skills'], 'points': [{'start': 46..."
4,"Imgeeyaul Ansari\njava developer\n\nPune, Maha...","[{'label': ['Skills'], 'points': [{'start': 18..."


In [105]:
df['content'][0]

'Govardhana K\nSenior Software Engineer\n\nBengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/\nb2de315d95905b68\n\nTotal IT experience 5 Years 6 Months\nCloud Lending Solutions INC 4 Month • Salesforce Developer\nOracle 5 Years 2 Month • Core Java Developer\nLanguages Core Java, Go Lang\nOracle PL-SQL programming,\nSales Force Developer with APEX.\n\nDesignations & Promotions\n\nWilling to relocate: Anywhere\n\nWORK EXPERIENCE\n\nSenior Software Engineer\n\nCloud Lending Solutions -  Bangalore, Karnataka -\n\nJanuary 2018 to Present\n\nPresent\n\nSenior Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2016 to December 2017\n\nStaff Consultant\n\nOracle -  Bangalore, Karnataka -\n\nJanuary 2014 to October 2016\n\nAssociate Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2012 to December 2013\n\nEDUCATION\n\nB.E in Computer Science Engineering\n\nAdithya Institute of Technology -  Tamil Nadu\n\nSeptember 2008 to June 2012\n\nhttps://www.

In [106]:
df['annotation'][0]

[{'label': ['Companies worked at'],
  'points': [{'start': 1749, 'end': 1754, 'text': 'Oracle'}]},
 {'label': ['Companies worked at'],
  'points': [{'start': 1696, 'end': 1701, 'text': 'Oracle'}]},
 {'label': ['Companies worked at'],
  'points': [{'start': 1417, 'end': 1422, 'text': 'Oracle'}]},
 {'label': ['Skills'],
  'points': [{'start': 1356,
    'end': 1792,
    'text': 'Languages: Core Java, Go Lang, Data Structures & Algorithms, Oracle\nPL-SQL programming, Sales Force with APEX.\nTools: RADTool, Jdeveloper, NetBeans, Eclipse, SQL developer,\nPL/SQL Developer, WinSCP, Putty\nWeb Technologies: JavaScript, XML, HTML, Webservice\n\nOperating Systems: Linux, Windows\nVersion control system SVN & Git-Hub\nDatabases: Oracle\nMiddleware: Web logic, OC4J\nProduct FLEXCUBE: Oracle FLEXCUBE Versions 10.x, 11.x and 12.x'}]},
 {'label': ['Companies worked at'],
  'points': [{'start': 1209, 'end': 1214, 'text': 'Oracle'}]},
 {'label': ['Skills'],
  'points': [{'start': 1136,
    'end': 1247,


## Data Cleaning

### Text Cleaning

In [107]:
def clean_text(resume):
    # Convert the text to lowercase
    resume = resume.lower()
    
    # Remove newlines
    resume = re.sub("\n", ' ', resume)
    
    # Remove special characters
    resume = re.sub(r'[,•()➢❑]', ' ', resume)
    
    # Remove extra whitespaces, dashes, and dots
    resume = re.sub(r'\s\s+|\s-\s|\.\s', ' ', resume)
    
    # Tokenize the text into words
    tokenized_words = resume.split(" ")
    
    length = len(tokenized_words)
    
    # Convert digits to words
    for i in range(length):
        if tokenized_words[i].isdigit():
            tokenized_words[i] = num2words(tokenized_words[i])
        
    # Remove stopwords
    sw = set(stopwords.words('english'))
    tokens_without_sw = []
    for w in tokenized_words:
        if w not in sw:
            tokens_without_sw.append(w)
    
    # Join the tokens back into a string
    final_resume = " ".join(tokens_without_sw)
    
    return final_resume


### Word Tagging

In [108]:
def tag_words(text):
    # Load the English language model in spaCy
    nlp = spacy.load("en_core_web_sm")
    
    # Process the text with the language model
    words = nlp(text)
    
    tagged = []
    
    # Iterate over each word in the processed text
    for word in words:
        # Print the word and its part-of-speech tag
        print(word, word.pos_)
        
        # Append the word and its part-of-speech tag to the tagged list
        tagged.append((word.text, word.pos_))
        
    return tagged

### Clean content column

In [109]:
content_resumes = df['content']

for i in range(0,200):
    content_resumes[i] = clean_text(content_resumes[i])

In [110]:
df['content'][0]

'govardhana k senior software engineer bengaluru karnataka karnataka email indeed: indeed.com/r/govardhana-k/ b2de315d95905b68 total experience five years six months cloud lending solutions inc four month salesforce developer oracle five years two month core java developer languages core java go lang oracle pl-sql programming sales force developer apex  designations & promotions willing relocate: anywhere work experience senior software engineer cloud lending solutions  bangalore karnataka  january two thousand and eighteen present present senior consultant oracle  bangalore karnataka  november two thousand and sixteen december two thousand and seventeen staff consultant oracle  bangalore karnataka  january two thousand and fourteen october two thousand and sixteen associate consultant oracle  bangalore karnataka  november two thousand and twelve december two thousand and thirteen education b.e computer science engineering adithya institute technology  tamil nadu september two thousand

In [111]:
df['content'][1]

"harini komaravelli test analyst oracle hyderabad hyderabad telangana email indeed: indeed.com/r/harini- komaravelli/2659eee82e435d1b six yrs experience manual automation testing  work experience qa analyst oracle test analyst oracle hyderabad infosys ltd  hyderabad telangana  november two thousand and eleven february two thousand and sixteen hyderabad nov two thousand and eleven feb17 two thousand and sixteen worked tata consultancy services hyderabad feb twenty-four apr eleven two thousand and seventeen currently working test analyst oracle hyderabad qa analyst six years experience oracle education mca osmania university b.sc computer science osmania university skills functional testing blue prism qtp additional information area expertise: familiar agile methodologies  knowledge energy petroleum & health care domains  involved preparation test scenarios  preparing test data test cases  https://www.indeed.com/r/harini-komaravelli/2659eee82e435d1b?isid=rex-download&ikw=download-top&co=