#Import Dependancies

In [None]:
import numpy as np
import pandas as pd

#Data Exploration

In [None]:
df = pd.read_csv('ResumeData.csv')

In [None]:
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [None]:
df.shape

(962, 2)

In [None]:
df['Category'].value_counts()

Category
Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Blockchain                   40
ETL Developer                40
Operations Manager           40
Data Science                 40
Sales                        40
Mechanical Engineer          40
Arts                         36
Database                     33
Electrical Engineering       30
Health and fitness           30
PMO                          30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
SAP Developer                24
Civil Engineer               24
Advocate                     20
Name: count, dtype: int64

In [None]:
df['Category'].unique()

array(['Data Science', 'HR', 'Advocate', 'Arts', 'Web Designing',
       'Mechanical Engineer', 'Sales', 'Health and fitness',
       'Civil Engineer', 'Java Developer', 'Business Analyst',
       'SAP Developer', 'Automation Testing', 'Electrical Engineering',
       'Operations Manager', 'Python Developer', 'DevOps Engineer',
       'Network Security Engineer', 'PMO', 'Database', 'Hadoop',
       'ETL Developer', 'DotNet Developer', 'Blockchain', 'Testing'],
      dtype=object)

In [None]:
df['Category'][0:20]

0     Data Science
1     Data Science
2     Data Science
3     Data Science
4     Data Science
5     Data Science
6     Data Science
7     Data Science
8     Data Science
9     Data Science
10    Data Science
11    Data Science
12    Data Science
13    Data Science
14    Data Science
15    Data Science
16    Data Science
17    Data Science
18    Data Science
19    Data Science
Name: Category, dtype: object

In [None]:
df['Resume'][0:5]

0    Skills * Programming Languages: Python (pandas...
1    Education Details \r\nMay 2013 to May 2017 B.E...
2    Areas of Interest Deep Learning, Control Syste...
3    Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4    Education Details \r\n MCA   YMCAUST,  Faridab...
Name: Resume, dtype: object

#Text Preprocessing

In [None]:
import re
def preprocessing(txt):
    preprocess = re.sub('http\S+\s', ' ', txt)
    preprocess = re.sub('RT|cc', ' ', preprocess)
    preprocess = re.sub('#\S+\s', ' ', preprocess)
    preprocess = re.sub('@\S+', '  ', preprocess)
    preprocess = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', preprocess)
    preprocess = re.sub(r'[^\x00-\x7f]', ' ', preprocess)
    preprocess = re.sub('\s+', ' ', preprocess)
    return preprocess

In [None]:
df['Resume'] = df['Resume'].apply(lambda x: preprocessing(x))

In [None]:
df['Resume'][0]

'Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, NaÃ¯ve Bayes, KNN, Random Forest, Decision Trees, Boosting techniques, Cluster Analysis, Word Embedding, Sentiment Analysis, Natural Language processing, Dimensionality reduction, Topic Modelling (LDA, NMF), PCA & Neural Nets. * Database Visualizations: Mysql, SqlServer, Cassandra, Hbase, ElasticSearch D3.js, DC.js, Plotly, kibana, matplotlib, ggplot, Tableau. * Others: Regular Expression, HTML, CSS, Angular 6, Logstash, Kafka, Python Flask, Git, Docker, computer vision - Open CV and understanding of Deep learning.Education Details \r\n\r\nData Science Assurance Associate \r\n\r\nData Science Assurance Associate - Ernst & Young LLP\r\nSkill Details \r\nJAVASCRIPT- Exprience - 24 months\r\njQuery- Exprience - 24 months\r\nPython- Exprience - 24 monthsCompany Details \r\ncompany - Ernst & Young LLP\r\ndescription - Fraud Investigatio

#Encode Target Column

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df['Category'])
df['Category'] = le.transform(df['Category'])
df.Category.unique()

array([ 6, 12,  0,  1, 24, 16, 22, 14,  5, 15,  4, 21,  2, 11, 18, 20,  8,
       17, 19,  7, 13, 10,  9,  3, 23])

In [34]:
category_mapping = {
    15: "Java Developer",
    23: "Testing",
    8: "DevOps Engineer",
    20: "Python Developer",
    24: "Web Designing",
    12: "HR",
    13: "Hadoop",
    3: "Blockchain",
    10: "ETL Developer",
    18: "Operations Manager",
    6: "Data Science",
    22: "Sales",
    16: "Mechanical Engineer",
    1: "Arts",
    7: "Database",
    11: "Electrical Engineering",
    14: "Health and fitness",
    19: "PMO",
    4: "Business Analyst",
    9: "DotNet Developer",
    2: "Automation Testing",
    17: "Network Security Engineer",
    21: "SAP Developer",
    5: "Civil Engineer",
    0: "Advocate",
}

#Create Vectors for Resume Textual Data

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizor = TfidfVectorizer(stop_words='english')
vectorizor.fit(df['Resume'])
encoded_data  = vectorizor.transform(df['Resume'])

#Train Test Split

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(encoded_data, df['Category'], test_size=0.2, random_state=21)

In [23]:
X_train.shape

(769, 7351)

In [24]:
X_test.shape

(193, 7351)

#Model Training

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
KNN_Classifier = KNeighborsClassifier()
KNN_Classifier.fit(X_train,y_train)
ypred = KNN_Classifier.predict(X_test)
print(accuracy_score(y_test,ypred))

0.9792746113989638


#Store Model as well as Vectorizer

In [41]:
import pickle
pickle.dump(vectorizor,open('vectorizor.pkl','wb'))
pickle.dump(KNN_Classifier, open('KNN_Classifier.pkl', 'wb'))

In [42]:
import pickle

KNN_Classifier = pickle.load(open('KNN_Classifier.pkl','rb'))
vectorizor = pickle.load(open('vectorizor.pkl','rb'))
sample_resume="""Contact  mehulvaidya94@gmail.com www.linkedin.com/in/mehul-vaidya(LinkedIn) Top Skills Recurrent Neural Networks (RNN) ChatGPT Convolutional Neural Networks(CNN) Mehul Vaidya Full Stack Developer | BI Developer | M.Tech. IIT Kharagpur (CSE) |ML | DL | NLP | Gen AI Pune, Maharashtra, India Summary Full Stack ( Spring Boot + Angular) Developer (Citi).Experienced BI developer (Persistent System).M.Tech. Computer Science IIT Kharagpur B.Tech. Computer Science VIT PuneImplemented various Gen AI, ML , DL and NLP applications.SkillsProgramming Languages: Java, PythonWeb Technologies: HTML, CSS, Javascript, AngularBackend Frameworks: Spring BootData Management: T-SQL, SQL, PowerBI, Data WarehousingMachine Learning: Scikit-learn, Pandas, NumPyDeep Learning: TensorFlow 2.0 NLP : SpacyGenerative AI: LangChain, Streamlit Algorithms & Data Structures Experience Citi Technology Analyst July 2022 - Present (2 years) Working as DevOps and Spring Boot / Python Developer Persistent Systems Module Lead July 2016 - July 2020 (4 years 1 month) Worked as BI developer. (DB creation / ETL setup / Reports creation/ Dataanalysis) Education Indian Institute of Technology, Kharagpur Master of Technology - MTech, Computer Science · (2020 - 2022) Vishwakarma Institute Of Technology Bachelor of Technology - BTech, Computer Engineering · (2012 - 2016) PTVA's Sathaye College Higher Secondary School Certificate (HSC) VPMs Vidya Mandir Secondary School Certificate (SSC) """
preprocessed_resume = preprocessing(sample_resume)
input_features = vectorizor.transform([preprocessed_resume])
prediction_id = KNN_Classifier.predict(input_features)[0]

In [43]:
category_name = category_mapping.get(prediction_id, "Unknown Category")
print("Predicted Category:", category_name , " Category ID :" ,prediction_id )

Predicted Category: Data Science  Category ID : 6
