In [14]:
import pandas as pd
import numpy as np
import spacy
import unicodedata
import string
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import seaborn as sns
import joblib
import re

In [3]:
# Load the spacy library for text cleaning
nlp = spacy.load('en_core_web_sm')

In [5]:
#Loading the dataset
df_resume = pd.read_csv('data/UpdatedResumeDataSet.csv')

In [6]:
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    all_letters = string.ascii_letters + " .,;'-"
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

# Remove Stop Words
def remove_stopwords(text):
    doc = nlp(text)
    return " ".join([token.text for token in doc if not token.is_stop])

In [7]:
def clean_text(text):
    #print(f'Text before Cleaning: {text}')
    # Text to lowercase
    text = text.lower()
    # Remove URL from text
    text = re.sub(r"http\S+", "", text)
    # Remove Numbers from text
    text = re.sub(r'\d+', '', text)
    # Convert the unicode string to plain ASCII
    text = unicode_to_ascii(text)
    # Remove Punctuations
    text = re.sub(r'[^\w\s]','', text) 
    #text = remove_punct(text)
    # Remove StopWords
    text = remove_stopwords(text)
    # Remove empty spaces
    text = text.strip()
    # \s+ to match all whitespaces
    # replace them using single space " "
    text = re.sub(r"\s+", " ", text)    
    #print(f'Text after Cleaning: {text}')
    return text

In [8]:
def preprocess_text(df):
    all_resume = list()
    print("Datapreprocessing Started!!!!")
    df["clean_resume"] = df.Resume.apply(lambda x: clean_text(x))
    resume = df.clean_resume.values.tolist()
    for res in resume:
    # Lemma the resume
        doc = nlp(res)
        tokens = [token.lemma_ for token in doc]
        # remove remaining tokens that are not alphabetic
        words = [word for word in tokens if word.isalpha()]
        # filter out stop words
        my_stopwords = ['amp','nt', '_', 'utckm']
        stop_words = set(nlp.Defaults.stop_words)
        words = [word for word in words if word not in stop_words and word not in my_stopwords]
        all_resume.append(words)
    print("Datapreprocessing Done!!!")

    return all_resume, df

In [9]:
resume_list, resumeDataSet = preprocess_text(df_resume)

Datapreprocessing Started!!!!
Datapreprocessing Done!!!


In [10]:
resumeDataSet

Unnamed: 0,Category,Resume,clean_resume
0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details uitrgpvdata scientist data s...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas interest deep learning control system de...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...
...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skills proficient ms office word basi...
958,Testing,â Willingness to accept the challenges. â ...,willingness accept challenges positive thinkin...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skills quick learner eagerness learn ...
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skills software knowledge mspower poi...


In [11]:
X = resumeDataSet["clean_resume"]
y = resumeDataSet["Category"]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [22]:
pipe_knn = Pipeline(steps=[('cv', CountVectorizer()), ('knn_clf', KNeighborsClassifier(n_neighbors=3))])
pipe_rf = Pipeline(steps=[('cv', CountVectorizer()), ('rf_clf', RandomForestClassifier(min_samples_leaf=3))])

In [26]:
pipe_rf.fit(X_train, y_train)


Pipeline(steps=[('cv', CountVectorizer()),
                ('rf_clf', RandomForestClassifier(min_samples_leaf=3))])

In [27]:
pipe_knn.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer()),
                ('knn_clf', KNeighborsClassifier(n_neighbors=3))])

In [29]:
pipeline_file = open('knn_clf.pkl', 'wb')
joblib.dump(pipe_knn, pipeline_file)
pipeline_file.close()

In [30]:
pipeline_file = open('rf_clf.pkl', 'wb')
joblib.dump(pipe_rf, pipeline_file)
pipeline_file.close()