In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from nltk.corpus import stopwords

#### EDA and Preprocessing

In [7]:
df = pd.read_csv('Resume.csv')
df.sample(5)

Unnamed: 0,ID,Resume_str,Resume_html,Category
2454,22442947,PASSENGER SERVICE CONCOURSE MANAGER ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION
2464,77626587,FLIGHT MANAGER Summary I am ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION
2065,27257013,PUBLIC RELATIONS & COMMUNICATIONS M...,"<div class=""MPR skn-cbg1 fontsize fontface vma...",PUBLIC-RELATIONS
754,16702198,KEY ACCOUNT MANAGER Summary ...,"<div class=""fontsize fontface vmargins hmargin...",HEALTHCARE
1901,25867805,SENIOR ACCOUNTANT Summary ...,"<div class=""fontsize fontface vmargins hmargin...",ACCOUNTANT


In [11]:
df.shape

(2484, 4)

In [37]:
df["Category"].nunique()

24

In [12]:
df["Category"].value_counts()

Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
FINANCE                   118
ADVOCATE                  118
ACCOUNTANT                118
ENGINEERING               118
CHEF                      118
AVIATION                  117
FITNESS                   117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64

In [19]:
df['Resume_str'][0]

"         HR ADMINISTRATOR/MARKETING ASSOCIATE\n\nHR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Respected builder and leader of customer-focused teams; strives to instill a shared, enthusiastic commitment to customer service.         Highlights         Focused on customer satisfaction  Team management  Marketing savvy  Conflict resolution techniques     Training and development  Skilled multi-tasker  Client relations specialist           Accomplishments      Missouri DOT Supervisor Training Certification  Certified by IHG in Customer Loyalty and Marketing by Segment   Hilton Worldwide General Manager Training Certification  Accomplished Trainer for cross server hospitality systems such as    Hilton OnQ  ,   Micros    Opera PMS   , Fidelio    OPERA    Reservation System (ORS) ,   Holidex    Completed courses and seminars in customer service, sales strategies, inventory control, loss pr

In [21]:
def clean_text(text):
    
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
    
    # Remove urls
    clean_text = url_pattern.sub('', text)
    
    # Remove emails
    clean_text = email_pattern.sub('', clean_text)
    
    # Remove special characters
    clean_text = re.sub(r'[^\w\s]', '', clean_text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    clean_text = ' '.join(word for word in clean_text.split() if word.lower() not in stop_words)
    
    return clean_text

In [25]:
clean_text(df['Resume_str'][0])

'HR ADMINISTRATORMARKETING ASSOCIATE HR ADMINISTRATOR Summary Dedicated Customer Service Manager 15 years experience Hospitality Customer Service Management Respected builder leader customerfocused teams strives instill shared enthusiastic commitment customer service Highlights Focused customer satisfaction Team management Marketing savvy Conflict resolution techniques Training development Skilled multitasker Client relations specialist Accomplishments Missouri DOT Supervisor Training Certification Certified IHG Customer Loyalty Marketing Segment Hilton Worldwide General Manager Training Certification Accomplished Trainer cross server hospitality systems Hilton OnQ Micros Opera PMS Fidelio OPERA Reservation System ORS Holidex Completed courses seminars customer service sales strategies inventory control loss prevention safety time management leadership performance assessment Experience HR AdministratorMarketing Associate HR Administrator Dec 2013 Current Company Name City State Helps d

In [26]:
df["Resume_str"] = df["Resume_str"].apply(lambda x: clean_text(x))

In [30]:
df["Resume_str"][1000]

'SALES Summary 17 years sales operations management experience specialty bigbox retail 4 years sales experience automotive sector Experienced hiring training supervision coaching Proven skills operations human resource management planning negotiating organizing overseeing projects events Committed highest work ethic attainment organizational goals objectives Highlights Excellent interpersonal coaching skills Sales leadership development Performance metrics Detailoriented Procedure development Employee engagement Conflict resolution Onboarding training Recruiting Performance coaching counseling Operations management Human resources management PL management Powerful negotiator Accomplishments Drove store ranking 297 company top 10 less 6 months taking store Increased yearoveryear sales 60 Oversaw multiple stores Southeast worked directly local management drive sales operational success Initiated sales merchandising policies adopted companywide Recipient Best Buys Top Gun Award multiple t

## Preprocessing

In [52]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [32]:
# Lebel Encoding the category
le = LabelEncoder()

df["Category"] = le.fit_transform(df["Category"])

In [36]:
df["Category"].unique()

array([19, 13, 20, 23,  1,  9, 18, 17,  2,  8, 22, 12, 14,  5, 10, 16,  3,
       15,  0, 11, 21,  7,  4,  6])

In [35]:
df["Category"].nunique()

24

## TF-IDF Vectorization

In [39]:
tfidf = TfidfVectorizer()

resume = tfidf.fit_transform(df["Resume_str"])

## Train-Test-Validation Split

In [48]:
X_train, X_test, y_train, y_test = train_test_split(resume, df["Category"], test_size=0.2, random_state=2024)

In [49]:
X_train.shape, y_train.shape

((1987, 53980), (1987,))

In [62]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=1000)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

In [50]:
knn = KNeighborsClassifier()

In [63]:
cv_scores = cross_val_score(knn, X_train_svd, y_train, cv=5, scoring='accuracy')

In [64]:
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean Cross-Validation Accuracy: {cv_scores.mean():.2f}')
print(f'Standard Deviation of Accuracy: {cv_scores.std():.2f}')

Cross-Validation Scores: [0.49748744 0.51256281 0.53148615 0.4836272  0.56423174]
Mean Cross-Validation Accuracy: 0.52
Standard Deviation of Accuracy: 0.03


In [57]:
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the KNN classifier on test set: {accuracy:.2f}')

Accuracy of the KNN classifier on test set: 0.54


In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Define a dictionary to hold models
models = {
    'KNeighborsClassifier': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'SVC': SVC(),
    'MultinomialNB': MultinomialNB()
}

# Dictionary to hold accuracy scores
accuracy_scores = {}

# Train and evaluate each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores[model_name] = accuracy
    
    # Print the accuracy
    print(f'Accuracy of {model_name} on test set: {accuracy:.2f}')

Accuracy of KNeighborsClassifier on test set: 0.54
Accuracy of LogisticRegression on test set: 0.63
Accuracy of RandomForestClassifier on test set: 0.60
Accuracy of SVC on test set: 0.60
Accuracy of MultinomialNB on test set: 0.52
