In [15]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import nltk
import warnings
warnings.filterwarnings('ignore')
from sklearn import metrics
from nltk.corpus import stopwords

In [16]:
df_orig = pd.read_csv("resume.csv")
df = df_orig.copy(deep = True)


In [17]:
df.head()

Unnamed: 0,Id,Category,Resume
0,1,Data Science,Skills * Programming Languages: Python (pandas...
1,2,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,3,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,4,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,5,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


### Data cleaning

In [18]:
df.columns    #list of columns

Index(['Id', 'Category', 'Resume'], dtype='object')

In [19]:
df.columns.isnull()     #no null values

array([False, False, False])

In [20]:
df.shape  #checking number of rows,column

(169, 3)

In [21]:
print ("Displaying the distinct categories of resume -") # display unique domains
print (df['Category'].unique())

Displaying the distinct categories of resume -
['Data Science' 'HR' 'Advocate' 'Arts' 'Web Designing'
 'Mechanical Engineer' 'Sales' 'Health and fitness' 'Civil Engineer'
 'Java Developer' 'Business Analyst' 'SAP Developer' 'Automation Testing'
 'Electrical Engineering' 'Operations Manager' 'Python Developer'
 'DevOps Engineer' 'Network Security Engineer' 'PMO' 'Database' 'Hadoop'
 'ETL Developer' 'DotNet Developer' 'Blockchain' 'Testing']


In [22]:
# display unique domains and count of resumes on each domain
print ("Displaying the distinct categories of resume and the number of records belonging to each category -")
print (df['Category'].value_counts())

Displaying the distinct categories of resume and the number of records belonging to each category -
Java Developer               14
Database                     11
HR                           11
Advocate                     10
Data Science                 10
DotNet Developer              7
DevOps Engineer               7
Testing                       7
Automation Testing            7
Hadoop                        7
Civil Engineer                6
SAP Developer                 6
Python Developer              6
Arts                          6
Health and fitness            6
Business Analyst              6
Network Security Engineer     5
Blockchain                    5
Electrical Engineering        5
ETL Developer                 5
Web Designing                 5
Sales                         5
Mechanical Engineer           5
Operations Manager            4
PMO                           3
Name: Category, dtype: int64


In [23]:
#count of words in the dataset before data cleaning
with open('resume.csv', 'r') as file:
    file_contents = file.read()

    print('Total words:   ', len(file_contents.split()))

Total words:    70903


In [24]:
df['newer_res'] = '' # creating new columne to keep the cleaned 

In [25]:
def cleanResume(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', ' ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*') #removing stopwords
    resumeText = pattern.sub('', resumeText)
    resumeText = re.sub(r"[A-Za-z]",
                    lambda x :  x.group(0).lower()
                                if x.group(0).isupper()       #uppercase to lowercase
                                else x.group(0).lower(),
                    resumeText)
    stop_words_lst = ['description', 'mumbai', 'chennai', 'location', 'karate', 'january', 'august', 'nagpur', 'months',
                 'solapur', 'maharashtra', 'march', 'district', 'using']
    for w in stop_words_lst:                                    #removing custom stopwords
        pattern = r'\b'+w+r'\b'
        resumeText = re.sub(pattern, '', resumeText)
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    
    return resumeText

In [26]:
df['newer_res'] = df.Resume.apply(lambda x: cleanResume(x))
print (df['newer_res'][29])

good grasping quality skillful work education details 2013 2018 b a ll b law universityadvocateskill details good knowledge typing well many activities exprience less 1 year monthscompany details company session court forward thinking individual refined interpersonal multitasking skills looking join progressive organization provide assistance legal work company session court provide legal assistance legal work


In [27]:
df.head()

Unnamed: 0,Id,Category,Resume,newer_res
0,1,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...
1,2,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may 2013 may 2017 b e uit rg...
2,3,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas interest deep learning control system de...
3,4,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...
4,5,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...


In [28]:
df.to_csv("clean_data1.csv")
print("Cleaned dataset written to cleaned_data.csv")

Cleaned dataset written to cleaned_data.csv


### checking the similarity between resumes

In [29]:
import pandas as pd
df = pd.read_csv('clean_data1.csv')
df = df.drop(['Resume'],axis=1)
df.rename(columns={'newer_res':'Resume'},inplace=True)



#some cleaning
from io import StringIO
col = ['Category', 'Resume']
df = df[col]
df = df[pd.notnull(df['Resume'])]
df.columns = ['Category', 'Resume']
df['category_id'] = df['Category'].factorize()[0]
category_id_df = df[['Category', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Category']].values)

df.head()

Unnamed: 0,Category,Resume,category_id
0,Data Science,skills programming languages python pandas num...,0
1,Data Science,education details may 2013 may 2017 b e uit rg...,0
2,Data Science,areas interest deep learning control system de...,0
3,Data Science,skills r python sap hana tableau sap hana sql ...,0
4,Data Science,education details mca ymcaust faridabad haryan...,0


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(df.Resume)
tv_matrix = tv_matrix.toarray()

vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,000,01,017,02,03,04,04th,05,050education,06th,...,zaggle,zambia,zd,zenoss,zensar,zero,zhypility,zone,zookeeper,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159,160,161,162,163,164,165,166,167,168
0,1.000000,0.060589,0.087076,0.184051,0.078899,0.070170,0.167567,0.211439,0.149972,0.220879,...,0.055454,0.084431,0.082902,0.014766,0.044136,0.014555,0.032168,0.021809,0.026250,0.045193
1,0.060589,1.000000,0.222637,0.082508,0.319099,0.258969,0.100533,0.160732,0.096685,0.084797,...,0.073640,0.034692,0.068899,0.017243,0.064493,0.012226,0.049128,0.010630,0.041113,0.022575
2,0.087076,0.222637,1.000000,0.134254,0.388007,0.331546,0.125224,0.186753,0.165612,0.116805,...,0.092669,0.059924,0.092379,0.055321,0.086665,0.036770,0.094834,0.019193,0.098570,0.078586
3,0.184051,0.082508,0.134254,1.000000,0.132761,0.101396,0.169515,0.136583,0.124031,0.213332,...,0.080841,0.044282,0.102082,0.023625,0.053415,0.015710,0.073343,0.035143,0.045567,0.103992
4,0.078899,0.319099,0.388007,0.132761,1.000000,0.430545,0.134901,0.127127,0.090245,0.154944,...,0.072474,0.042294,0.058018,0.026132,0.097590,0.016448,0.094807,0.019835,0.076543,0.054531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,0.014555,0.012226,0.036770,0.015710,0.016448,0.028027,0.024079,0.019486,0.019605,0.017420,...,0.045481,0.015713,0.034382,0.054716,0.054557,1.000000,0.084857,0.162969,0.036818,0.089070
165,0.032168,0.049128,0.094834,0.073343,0.094807,0.082039,0.022213,0.039393,0.036863,0.039325,...,0.050338,0.055728,0.031572,0.118499,0.066164,0.084857,1.000000,0.163190,0.076076,0.152753
166,0.021809,0.010630,0.019193,0.035143,0.019835,0.015802,0.011867,0.015047,0.025624,0.034861,...,0.026726,0.016004,0.023555,0.063845,0.041062,0.162969,0.163190,1.000000,0.033070,0.143931
167,0.026250,0.041113,0.098570,0.045567,0.076543,0.137386,0.024374,0.048912,0.035002,0.038890,...,0.099028,0.040752,0.043937,0.095973,0.095397,0.036818,0.076076,0.033070,1.000000,0.060922


### Modelling

In [32]:
df = pd.read_csv('clean_data1.csv')
df = df.drop(['Resume'],axis=1)
df.rename(columns={'newer_res':'Resume'},inplace=True)
resume_punc = df["Resume"].copy(deep  = True)
df.head()
#resume_punc

Unnamed: 0.1,Unnamed: 0,Id,Category,Resume
0,0,1,Data Science,skills programming languages python pandas num...
1,1,2,Data Science,education details may 2013 may 2017 b e uit rg...
2,2,3,Data Science,areas interest deep learning control system de...
3,3,4,Data Science,skills r python sap hana tableau sap hana sql ...
4,4,5,Data Science,education details mca ymcaust faridabad haryan...


In [33]:
import string
def rem_punc(s):
    punc = string.punctuation
    return [i for i in s if i not in punc]

In [34]:
#Remove punctaution for further processing
for ind,i in enumerate(df.itertuples()):
    token = nltk.word_tokenize(i[4])
    #print(token)
    df["Resume"][ind] = " ".join(rem_punc(token))

In [35]:
import string
from wordcloud import STOPWORDS
def rem_punc(s):
    punc = string.punctuation
    return [i for i in s if i not in punc]

def rem_sw(s):
    sw = set(STOPWORDS)
    return [i for i in s if i not in sw]

def preprocess(eval_res):
    try:
        eval_res = eval(eval_res).decode()
    except:
        pass
    eval_res = eval_res.encode("ASCII","ignore").decode()
    length = len(eval_res)
    eval_res = " ".join(eval_res.split("\n"))
    token = rem_sw(nltk.word_tokenize(eval_res)) #Removing punctaution later since we need punctaution for sentence tokenization
    eval_res = " ".join(token).lower()
    return eval_res


### Cleaning data and adding in ID for category

In [36]:
from io import StringIO
col = ['Category', 'Resume']
df = df[col]
df = df[pd.notnull(df['Resume'])]
df.columns = ['Category', 'Resume']
df['category_id'] = df['Category'].factorize()[0]
category_id_df = df[['Category', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Category']].values)

df.head()

Unnamed: 0,Category,Resume,category_id
0,Data Science,skills programming languages python pandas num...,0
1,Data Science,education details may 2013 may 2017 b e uit rg...,0
2,Data Science,areas interest deep learning control system de...,0
3,Data Science,skills r python sap hana tableau sap hana sql ...,0
4,Data Science,education details mca ymcaust faridabad haryan...,0


### Vectorizing docs¶

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1,2), stop_words='english')
features = tfidf.fit_transform(df.Resume).toarray()
labels = df.category_id
features.shape

(169, 1583)

### Applying Naive Bayes

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

x_train, x_test, y_train, y_test = train_test_split(df['Resume'], df['Category'], random_state = 0)

#print(x_train)

count_vect = CountVectorizer() # bag-of-ngrams model , based on frequency count
x_train_counts = count_vect.fit_transform(x_train)

tfidf_transformer = TfidfTransformer() #passing the word:word count
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

classifier = MultinomialNB().fit(x_train_tfidf, y_train)

### Testing it on other pdf resume

In [39]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convertPDFtoText(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    string = retstr.getvalue()
    retstr.close()
    return string

In [40]:
test_resume = convertPDFtoText("sample.pdf")
print(test_resume)

SHRINU KUSHAGRA 

 
 

 
 

email: shrinukushagra@gmail.com 
Phone: +91-9735301541 

C-230, AZAD HALL OF RESIDENCE 
Indian Institute of Technology 
 
KHARAGPUR, INDIA 
  
ACADEMEIC PROFILE 

Year 
2007-Present 

 

 

2007 

2005 

 

Degree/Certificate 
B.Tech in Computer 
Science and 
Engineering 
Senior 
Secondary(12th) 
CBSE (10th) 

Institute / School 
Indian Institute of 
Technology , 
Kharagpur 
Central Academy ,Kota   86.60%  

Marks Obtained 
CGPA : 8.16 
(out of 10) 

Delhi Public School, 
Patna 

91.00% 

ACADEMIC ACHIEVEMENTS 

•  Secured an ALL INDIA rank of 402 among approx 300,000 (a percentile of 99.86%) students in 

Joint Entrance Examination (JEE)-2007. 

•  Secured an ALL INDIA rank of 35 among approx 600,000 (a percentile of 99.99%) students in the 

All India Engineering Entrance Examination (AIEEE)-2007. 

•  Selected for the final round of OVERNITE ,the ACM certified programming  contest  organised by 

KSHITIJ (the annual techno-manegemnt fest of IIT KHARAGPUR)

In [41]:
from nltk.probability import FreqDist
from string import punctuation
import math
def summarize(doc,words):
    score={}
    fd = FreqDist(words)
    for i,t in enumerate(doc):
        score[i] = 0
        for j in nltk.word_tokenize(t):
            if j in fd:
                score[i]+=fd[j]
    
    r = sorted(list(score.items()),key=lambda x:x[1],reverse=True)[:math.floor(0.60*len(doc))]
    r.sort(key=lambda x:x[0])
    l = [doc[i[0]] for i in r]
    return "\n\n".join(l)

#### We pass the resume extracted from the pdf using OCR through preprocess function to bring it down to the same state as the trained data, and use this for classification and summarization


### Summary of Test Resume

In [42]:
resume = preprocess(test_resume)#remove stop words etc
sent = nltk.sent_tokenize(test_resume)
puncu = punctuation
word_token = nltk.word_tokenize(test_resume)#tokenize preprocessed text for scoring

print(summarize(sent,test_resume))

SHRINU KUSHAGRA 

 
 

 
 

email: shrinukushagra@gmail.com 
Phone: +91-9735301541 

C-230, AZAD HALL OF RESIDENCE 
Indian Institute of Technology 
 
KHARAGPUR, INDIA 
  
ACADEMEIC PROFILE 

Year 
2007-Present 

 

 

2007 

2005 

 

Degree/Certificate 
B.Tech in Computer 
Science and 
Engineering 
Senior 
Secondary(12th) 
CBSE (10th) 

Institute / School 
Indian Institute of 
Technology , 
Kharagpur 
Central Academy ,Kota   86.60%  

Marks Obtained 
CGPA : 8.16 
(out of 10) 

Delhi Public School, 
Patna 

91.00% 

ACADEMIC ACHIEVEMENTS 

•  Secured an ALL INDIA rank of 402 among approx 300,000 (a percentile of 99.86%) students in 

Joint Entrance Examination (JEE)-2007.

•  Secured an ALL INDIA rank of 35 among approx 600,000 (a percentile of 99.99%) students in the 

All India Engineering Entrance Examination (AIEEE)-2007.

•  Selected for the final round of OVERNITE ,the ACM certified programming  contest  organised by 

KSHITIJ (the annual techno-manegemnt fest of IIT KHARAGPUR) 



### Predicted Label for Test Resume

In [43]:
print(classifier.predict(count_vect.transform([test_resume])))

['Java Developer']


### Checking Accuracy of Naive Bayes Model


In [44]:
df = pd.read_csv('clean_data1.csv')
df = df.drop(['Resume'],axis=1)
df.rename(columns={'newer_res':'Resume'},inplace=True)
df.head()

Unnamed: 0.1,Unnamed: 0,Id,Category,Resume
0,0,1,Data Science,skills programming languages python pandas num...
1,1,2,Data Science,education details may 2013 may 2017 b e uit rg...
2,2,3,Data Science,areas interest deep learning control system de...
3,3,4,Data Science,skills r python sap hana tableau sap hana sql ...
4,4,5,Data Science,education details mca ymcaust faridabad haryan...


In [45]:
from io import StringIO
col = ['Category', 'Resume']
df = df[col]
df = df[pd.notnull(df['Resume'])]
df.columns = ['Category', 'Resume']
df['category_id'] = df['Category'].factorize()[0]
category_id_df = df[['Category', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Category']].values)

In [46]:
res_train, res_test, cat_train, cat_test = train_test_split(df['Resume'], df['Category'], test_size=0.3, random_state = 15)

vectorizer= CountVectorizer()
res_counts= vectorizer.fit_transform(res_train)

tfidf= TfidfTransformer()
res_tfidf= tfidf.fit_transform(res_counts)


classifier=MultinomialNB().fit(res_tfidf, cat_train)

In [47]:
predicted = []
for i in res_test:
    predicted.append((classifier.predict(vectorizer.transform([i])))[0])

In [48]:
a= pd.DataFrame(cat_test)
a['predicted'] = predicted
a.head()

Unnamed: 0,Category,predicted
9,Data Science,Data Science
26,Advocate,Advocate
165,Testing,Java Developer
71,Java Developer,Java Developer
154,DotNet Developer,DotNet Developer


In [49]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
accuracy=accuracy_score(a.Category, a.predicted)
print("Accuracy from Naive bayes:",accuracy)

Accuracy from Naive bayes: 0.5686274509803921
