**Initial Setup**

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# data imported from https://www.kaggle.com/datasets/gauravduttakiit/resume-dataset?resource=download
data = pd.read_csv("UpdatedResumeDataSet.csv") 

#clean the data:
data = data.dropna()

print("size:", data.shape)
data.describe()

size: (962, 2)


Unnamed: 0,Category,Resume
count,962,962
unique,25,166
top,Java Developer,"Technical Skills Web Technologies: Angular JS,..."
freq,84,18


**Populate X, y, and testing variables**

In [2]:
# bag of words to convert the resumes to data

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vect = CountVectorizer()
X = vect.fit_transform(data["Resume"]) #vectorize by frequency of words

tfidf = TfidfTransformer()
X = tfidf.fit_transform(X) #remove filler words like "the"

# y variable 
data = data[['Category']].copy() 
data['Category_encoded'] = data["Category"].astype(str)
le = preprocessing.LabelEncoder()
data['Category_encoded'] = le.fit_transform(data["Category"])
y = data["Category_encoded"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

**Train and Test Model**

In [3]:
from sklearn.metrics import accuracy_score

model = KNeighborsClassifier(n_neighbors = 11)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
accuracy_score(y_test, predictions)

0.8969072164948454

In [4]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.00      0.00      0.00         1
           2       1.00      1.00      1.00         4
           3       1.00      1.00      1.00         4
           4       0.00      0.00      0.00         1
           5       1.00      0.71      0.83         7
           6       0.80      1.00      0.89         4
           7       1.00      1.00      1.00         3
           8       1.00      1.00      1.00         3
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         6
          11       1.00      1.00      1.00         1
          12       0.80      0.67      0.73         6
          13       1.00      1.00      1.00         2
          14       1.00      0.75      0.86         4
          15       1.00      0.93      0.96        14
          16       1.00      1.00      1.00         2
          18       0.38    

In [5]:
import joblib
joblib.dump(model,  "resume-classifier")

['resume-classifier']

In [6]:
data = data.drop_duplicates('Category')
data

Unnamed: 0,Category,Category_encoded
0,Data Science,6
40,HR,12
84,Advocate,0
104,Arts,1
140,Web Designing,24
185,Mechanical Engineer,16
225,Sales,22
265,Health and fitness,14
295,Civil Engineer,5
319,Java Developer,15
