In [None]:
import pandas as pd
import numpy as np
import pdfplumber
import warnings
warnings.filterwarnings('ignore')

In [None]:
import kagglehub

# Download the latest version of the "snehaanbhawal/resume-dataset" dataset and
# keep the value returned by dataset_download in `path` for the cells below.
path = kagglehub.dataset_download("snehaanbhawal/resume-dataset")

# Show where the downloaded files are reported to live.
print("Path to dataset files:", path)

In [None]:
import os
# Descend one level: `path` becomes the first entry os.listdir reports inside it.
# NOTE(review): os.listdir makes no ordering promise, so index 0 may select a
# different entry on another machine — confirm the intended sub-folder is chosen.
# The cell reads and overwrites `path`, so re-running it descends another level.
path = os.path.join(path,os.listdir(path)[0])

In [None]:
# Descend one more level, again taking the first entry os.listdir reports.
# NOTE(review): same caveats as any listdir()[0] pick — order is not guaranteed
# and re-running the cell changes `path` again.
path = os.path.join(path,os.listdir(path)[0])

In [None]:
# Display the current value of `path`.
path

Preprocessing and Creating the Dataset
==

In [None]:
import re
def preprocess(text):
  """Normalise raw text to lowercase words separated by single spaces.

  The text is lower-cased, newlines and every character other than a-z or a
  space are replaced by a space, runs of spaces are collapsed to one, and
  leading/trailing spaces are removed.

  Args:
    text: The string to clean.

  Returns:
    The cleaned string (possibly empty).
  """
  cleaned = text.lower()
  # Applied in order: newlines -> space, non-letters -> space, squeeze spaces.
  substitutions = (
    (r'\n', ' '),
    (r'[^a-z ]', " "),
    (r' +', ' '),
  )
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned.strip()

def extract_text(file_path):
  """Extract and normalise the text of a PDF file.

  Opens `file_path` with pdfplumber, collects the text of every page that
  yields any, joins the pages and passes the result through `preprocess`.

  Args:
    file_path: Path of the PDF file to read.

  Returns:
    The preprocessed text; an empty string when no page yields text.
  """
  page_texts = []
  with pdfplumber.open(file_path) as pdf:
    for page in pdf.pages:
      page_text = page.extract_text()
      # extract_text() can return nothing for a page (e.g. image-only); skip it.
      if page_text:
        page_texts.append(page_text)
  # Join pages with a newline (turned into a space by preprocess) so the last
  # word of one page is not fused with the first word of the next page.
  return preprocess("\n".join(page_texts))

In [None]:
import random
data = []
# Each sub-folder of `path` is treated as one label; every PDF inside it becomes
# one row. sorted() makes the row order reproducible, because os.listdir does
# not guarantee any ordering.
for folder in sorted(os.listdir(path)):
    folder_path = os.path.join(path,folder)
    # Skip stray files at the top level; only directories hold resumes.
    if not os.path.isdir(folder_path):
        continue
    for file in sorted(os.listdir(folder_path)):
        # extract_text opens the file as a PDF, so ignore anything else.
        if not file.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path,file)
        text = extract_text(file_path)
        data.append({
            "label" : folder,
            "text" : text
        })

# Explicit columns keep the frame well-formed even if no PDF was found.
df = pd.DataFrame(data, columns=["label", "text"])
# Persist the extracted dataset: the next cell loads 'resume_data.csv', and
# nothing else in the notebook creates that file.
df.to_csv('resume_data.csv', index=False)
df.head()

In [None]:
# Rebind `df` to the contents of resume_data.csv in the working directory and
# preview the first rows.
df = pd.read_csv('resume_data.csv')
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Inspect the `text` value of the row whose index label is 0.
df.text[0]

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Drop every row that has a missing value. Rebinding the name (instead of
# inplace=True) follows pandas guidance and keeps the cell safe to re-run.
df = df.dropna()

Using FastText
==

In [None]:
# Build one training line per row: "__label__<label> <text>".
df['texts'] = "__label__" + df['label'] + " " + df['text']

In [None]:
# Preview the frame including the new `texts` column.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split of the fastText-formatted lines, stratified on the label column
# and seeded so the same rows land in each part on every run.
train, test = train_test_split(
    df['texts'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# Write one line per sample, without index or header row, to the files that
# the fastText training and evaluation cells read.
train.to_csv("train.txt",index = False,header = False)
test.to_csv("test.txt",index = False,header = False)

In [None]:
import fasttext

In [None]:
# Train a supervised fastText classifier on train.txt with learning rate 0.5,
# 25 epochs and wordNgrams=2.
model = fasttext.train_supervised(input = "train.txt",lr=0.5, epoch=25, wordNgrams=2)

In [None]:
# Show the labels held by the trained model.
model.get_labels()

In [None]:
# Evaluate the model on test.txt; the three entries of the result are printed
# below as sample count, precision and recall.
result = model.test("test.txt")
print("Samples:", result[0])
print("Precision:", result[1])
print("Recall:", result[2])

In [None]:
text = "i have experience of 5 years in sql and python"
# The resume text produced by extract_text() goes through preprocess(), so the
# query is cleaned the same way before prediction (this also drops the digit
# and any newline, which the raw string would otherwise carry into the model).
labels, probs = model.predict([preprocess(text)], k=1)

# predict() was given a one-element list with k=1, so take the first label and
# probability of the first (only) input; strip the fastText label prefix.
domain = labels[0][0].replace("__label__", "")
confidence = probs[0][0]

print(domain, confidence)

Label Encoding
==

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the label vocabulary, then store each row's integer code in `labels`.
encoder = LabelEncoder()
encoder.fit(df['label'])
df['labels'] = encoder.transform(df['label'])
df.head()

LinearSVC, Random Forest, and Logistic Regression with a TF-IDF Vectorizer
==

In [None]:
from sklearn.model_selection import train_test_split
# Features: the `text` column. Targets: the integer-encoded `labels` column.
X, y = df['text'], df['labels']

# 80/20 split, stratified on the targets, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train.shape, y_train.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


def _tfidf_pipeline(classifier):
    """Return a Pipeline that feeds TF-IDF features into `classifier`.

    Every model shares the same vectorizer settings, so they are defined once
    here instead of being copied per model. The step names 'vectorizer' and
    'classifier' are unchanged.

    Args:
        classifier: An unfitted scikit-learn estimator used as the final step.

    Returns:
        An unfitted Pipeline with steps 'vectorizer' and 'classifier'.
    """
    return Pipeline([
        ('vectorizer', TfidfVectorizer(
            max_features=30000,
            ngram_range=(1,2),
            min_df=3,
            max_df=0.9,
            sublinear_tf=True)),
        ('classifier', classifier)
    ])


# random_state is fixed (42, matching the splits in this notebook) so the
# randomised estimators give the same scores on every run.
rf = _tfidf_pipeline(RandomForestClassifier(random_state=42))

svc = _tfidf_pipeline(LinearSVC(random_state=42))

lr = _tfidf_pipeline(LogisticRegression(max_iter=2000))

In [None]:
# Fit the logistic-regression pipeline on the training split.
lr.fit(X_train,y_train)

In [None]:
# Fit the random-forest pipeline on the training split.
rf.fit(X_train,y_train)

In [None]:
# Fit the LinearSVC pipeline on the training split.
svc.fit(X_train,y_train)

In [None]:
# Predict test-split labels with the fitted random-forest and LinearSVC pipelines.
y_pred_rf = rf.predict(X_test)
y_pred_svc = svc.predict(X_test)

In [None]:
# Predict test-split labels with the fitted logistic-regression pipeline.
y_pred_lr = lr.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Classification report for the random-forest predictions (true labels first).
print(classification_report(y_test,y_pred_rf))

In [None]:
# Classification report for the LinearSVC predictions (true labels first).
print(classification_report(y_test,y_pred_svc))

In [None]:
# Classification report for the logistic-regression predictions (true labels first).
print(classification_report(y_test,y_pred_lr))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrices on the test split: rows are true labels, columns are
# predicted labels.
cm_lr = confusion_matrix(y_test,y_pred_lr)
cm_rf = confusion_matrix(y_test,y_pred_rf)
cm_svc = confusion_matrix(y_test,y_pred_svc)

# Same 2x2 layout and panel order as before (random forest, logistic
# regression, LinearSVC), but each panel is now titled and labelled so the
# three heatmaps can be told apart.
fig, axes = plt.subplots(2, 2, figsize=(15,15))
panels = [
    ("Random Forest", cm_rf),
    ("Logistic Regression", cm_lr),
    ("LinearSVC", cm_svc),
]
for ax, (title, cm) in zip(axes.flat, panels):
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set(title=title, xlabel="Predicted label", ylabel="True label")
# Only three models are plotted; hide the unused fourth panel.
axes[1, 1].axis('off')
plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
# A single 5-split stratified splitter is passed to every cross_val_score call.
skf = StratifiedKFold(n_splits=5)
# Score the pipelines in the order lr, rf, svc on the current X and y.
scores_lr, scores_rf, scores_svc = (
    cross_val_score(pipeline, X, y, cv=skf) for pipeline in (lr, rf, svc)
)

In [None]:
# Mean cross-validation score of each pipeline.
print("Logistic Regression: ",scores_lr.mean(),"\nSupport Vector Machine: ",scores_svc.mean(),"\nRandom Forest: ",scores_rf.mean())

Using gensim
==

In [None]:
import gensim.downloader as api
# Load the "word2vec-google-news-300" vectors through gensim's downloader.
wv = api.load("word2vec-google-news-300")

In [None]:
def vectorize(text):
    """Return a single vector for `text`.

    The text is split on single spaces and the resulting tokens are handed to
    `wv.get_mean_vector`, whose result is returned unchanged.

    Args:
        text: The string to embed.

    Returns:
        The value returned by `wv.get_mean_vector` for the tokens.
    """
    return wv.get_mean_vector(text.split(' '))

# Store one vector per row in a new `vector` column.
df['vector'] = df['text'].apply(vectorize)

In [None]:
# Preview the frame including the new `vector` column.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Features are now the per-row vectors; targets stay the integer-encoded labels.
# This rebinds X, y and the four split variables used by the earlier models.
X, y = df['vector'], df['labels']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
y_train.shape

In [None]:
# Turn each Series of per-row vectors into a single stacked array.
X_train, X_test = np.stack(X_train), np.stack(X_test)

In [None]:
# Random forest on the stacked vectors. random_state is fixed (42, matching the
# splits in this notebook) so the reported metrics are reproducible.
rf_gen = RandomForestClassifier(random_state=42)
rf_gen.fit(X_train,y_train)

In [None]:
# Predict test-split labels with the random forest trained on the vectors.
y_pred_rf_gen = rf_gen.predict(X_test)

In [None]:
# classification_report expects (y_true, y_pred). Passing them the other way
# round swaps precision with recall and reports support of the predictions.
print(classification_report(y_test,y_pred_rf_gen))