In [None]:
#Importing libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from pandas.plotting import scatter_matrix
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sbn
from matplotlib.gridspec import GridSpec
import re
import nltk
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import warnings
warnings.filterwarnings('ignore')
import os
# Print the full path of every file found under the Kaggle input directory
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

In [None]:
#Loading the dataset

# Read the resume CSV, then add an empty placeholder column that a later
# cell fills with the cleaned resume text.
resume_csv_path = '/kaggle/input/resumescreener/UpdatedResumeDataSet.csv'
data = pd.read_csv(resume_csv_path, encoding='utf-8')
data['cleaned_resume'] = ''
data.head()

In [None]:
# Show each distinct value of the Category column
print("Displaying the distinct categories of resume - ")
distinct_categories = data['Category'].unique()
print(distinct_categories)

In [None]:
# Show how many records fall into each category (typo "categoires" in the message fixed)
print("Displaying the distinct categories of resume and the number of records belonging to each category - ")
print(data['Category'].value_counts())

In [None]:
#Visualization of the data
#Visualizing the number of categories in the dataset

# Horizontal bar chart: one bar per category, bar length = number of resumes
plt.figure(figsize=(15, 15))
plt.xticks(rotation=90)
sbn.countplot(data=data, y='Category')

In [None]:
#Visualizing the distribution of categories

# value_counts() orders categories by frequency, so the wedge labels must be
# taken from its index. unique() returns categories in first-appearance order
# and would attach the wrong name to each slice.
target_counts = data['Category'].value_counts()
target_labels = target_counts.index.to_numpy()
plt.figure(1, figsize = (25, 25))
the_grid = GridSpec(2, 2)

# Three colours sampled from the colormap; the pie cycles through them
cmap = plt.get_cmap('coolwarm')
colors = [cmap(i) for i in np.linspace(0, 1, 3)]
plt.subplot(the_grid[0, 1], aspect = 1, title = 'Category Distribution')

source_pie = plt.pie(target_counts, labels = target_labels, autopct = '%1.1f%%', shadow = True, colors = colors)
plt.show()

In [None]:
#Data Preprocessing
#Function to remove the URLs, hashtags, special letters, and punctuations

def clean_resume(resume_text):
    """Normalize raw resume text before tokenization and vectorization.

    The steps run in this order: URLs, standalone ``RT``/``cc`` tokens,
    hashtags, mentions, ASCII punctuation and non-ASCII characters are
    removed, then every run of whitespace is collapsed to a single space.

    Args:
        resume_text: The raw resume text as a string.

    Returns:
        The cleaned string. Leading and trailing whitespace is collapsed to
        at most one space each but is not stripped.
    """
    #Removing URLs
    resume_text = re.sub(r'http\S+\s*', ' ', resume_text)
    #Removing RT and cc only as standalone tokens, so words such as "account" stay intact
    resume_text = re.sub(r'\b(?:RT|cc)\b', ' ', resume_text)
    #Removing hashtags
    resume_text = re.sub(r'#\S+', '', resume_text)
    #Removing mentions
    resume_text = re.sub(r'@\S+', ' ', resume_text)
    #Removing punctuations
    resume_text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resume_text)
    #Removing characters which are not in ASCII range (\x00-\x7f)
    resume_text = re.sub(r'[^\x00-\x7f]', ' ', resume_text)
    #Removing the extra whitespace
    resume_text = re.sub(r'\s+', ' ', resume_text)
    return resume_text

# Fill the cleaned_resume column by running clean_resume over every raw resume
data['cleaned_resume'] = data['Resume'].apply(clean_resume)

In [None]:
#Creating a wordcloud

# Tokens to ignore: NLTK's English stop words plus the `` and '' tokens
one_set_of_stop_words = set(stopwords.words('english')+['``', "''"])
Sentences = data['Resume'].values

# Only the first 160 resumes feed the word cloud. Slicing (instead of indexing
# up to a fixed bound) keeps this from raising IndexError on smaller datasets.
cleaned_texts = [clean_resume(sentence) for sentence in Sentences[:160]]

# Join with a space so the last word of one resume is never fused with the
# first word of the next; a single join also avoids repeated string copying.
cleaned_sentences = ' '.join(cleaned_texts)

# Keep every token that is neither a stop word nor punctuation
total_words = [
    word
    for cleaned_text in cleaned_texts
    for word in nltk.word_tokenize(cleaned_text)
    if word not in one_set_of_stop_words and word not in string.punctuation
]

# Report the 50 most frequent remaining tokens
word_freq_dist = nltk.FreqDist(total_words)
most_common = word_freq_dist.most_common(50)
print(most_common)

# Render the word cloud from the concatenated cleaned text
wc = WordCloud().generate(cleaned_sentences)
plt.figure(figsize = (15, 15))
plt.imshow(wc, interpolation = 'bilinear')
plt.axis('off')
plt.show()

In [None]:
#Creating Model
#Converting the words into categorical values

# Replace each listed column's values with the integer codes produced by the
# label encoder; the same encoder instance is refitted for every column.
var_mod = ['Category']
le = LabelEncoder()
for column_name in var_mod:
    encoded_values = le.fit_transform(data[column_name])
    data[column_name] = encoded_values

In [None]:
#Training a model
#Splitting the data into train and test sets

required_text = data['cleaned_resume'].values
required_target = data['Category'].values

# Split the raw text first so the held-out resumes cannot influence the
# vocabulary or IDF weights learned by the vectorizer (avoids test-set leakage).
# The same random_state and test_size are used as before.
text_train, text_test, y_train, y_test = train_test_split(required_text, required_target, random_state = 0, test_size = 0.2)

# Fit TF-IDF on the training text only, then apply that fitted transform to the test text
word_vectorizer = TfidfVectorizer(sublinear_tf = True, stop_words = 'english', max_features = 1500)
x_train = word_vectorizer.fit_transform(text_train)
x_test = word_vectorizer.transform(text_test)

# Features for the whole corpus, kept so any later cell that reads `word_features` still works
word_features = word_vectorizer.transform(required_text)

print("Feature completed...")

print(x_train.shape)
print(x_test.shape)

In [None]:
#Model Building and Testing

# One-vs-rest wrapper around a default k-nearest-neighbours classifier
clf = OneVsRestClassifier(KNeighborsClassifier())
clf.fit(x_train, y_train)
prediction = clf.predict(x_test)

# Score on the data the model was fitted on, then on the held-out split
train_accuracy = clf.score(x_train, y_train)
print("Accuracy of KNeighbors Classifier on training set: {:.2f}".format(train_accuracy))
test_accuracy = clf.score(x_test, y_test)
print("Accuracy of KNeighbors Classifier on test set: {:.2f}".format(test_accuracy))

# Per-class precision / recall / F1 for the test predictions
report_text = metrics.classification_report(y_test, prediction)
print("Classification report for classifier %s:\n%s\n" % (clf, report_text))