# Read the data

In [None]:
# Kaggle input locations of the two CSV datasets read by this notebook.
resumes_path = '/kaggle/input/resume-dataset/Resume/Resume.csv'
jobdescriptions_path = (
    '/kaggle/input/jobs-on-naukricom/home/sdf/'
    'marketing_sample_for_naukri_com-jobs__20190701_20190830__30k_data.csv'
)

In [None]:
import pandas as pd

In [None]:
# Load both CSV files into DataFrames (resumes first, job descriptions second).
resumes_raw, jobdescriptions_raw = map(
    pd.read_csv, (resumes_path, jobdescriptions_path)
)

# Preprocess the resumes

In [None]:
resumes_raw

In [None]:
# Plain Python list of the raw resume strings, in DataFrame row order.
resumes_text = list(resumes_raw['Resume_str'])

In [None]:
resumes_text

In [None]:
# Plain Python list of the resume category labels, in DataFrame row order.
resumes_category = list(resumes_raw['Category'])

In [None]:
resumes_category

In [None]:
import spacy
from tqdm import tqdm

In [None]:
# Tokenise every resume with spaCy and normalise the category labels.
nlp = spacy.load("en_core_web_sm")
resumes_text_tokenized = []

# Only lexical token flags (is_ascii / is_punct / is_space) are read below,
# so the trained pipeline components are disabled for this pass; the
# tokenizer itself always runs.
resume_docs = nlp.pipe(resumes_text, n_process=4, disable=nlp.pipe_names)

# nlp.pipe() yields documents lazily, so tqdm needs an explicit total to be
# able to show a percentage and an ETA.
for resume in tqdm(resume_docs, total=len(resumes_text)):
    # Keep lower-cased ASCII tokens; drop punctuation and whitespace tokens.
    tokens = [
        token.text.lower()
        for token in resume
        if token.is_ascii and not token.is_punct and not token.is_space
    ]
    resumes_text_tokenized.append(tokens)

# Category labels: hyphens become spaces, then everything is lower-cased.
resumes_category = [category.replace('-', ' ').lower() for category in resumes_category]

In [None]:
resumes_text_tokenized

In [None]:
resumes_category

In [None]:
resumes_text_tokenized[0]

In [None]:
resumes_raw['Resume_str'][0]

# Train FastText, Word2Vec, Doc2Vec models on the resumes data

In [None]:
from gensim.models.fasttext import FastText

# Train a skip-gram FastText model on the tokenised resumes, then persist
# both the full model and its keyed vectors.
ft_params = {
    'sg': 1,          # skip-gram
    'workers': 4,
    'window': 10,
    'negative': 15,
    'min_n': 2,       # shortest character n-gram
    'max_n': 10,      # longest character n-gram
    'min_count': 1,   # keep every token, however rare
}
ft_model = FastText(**ft_params)
ft_model.build_vocab(resumes_text_tokenized)

# The corpus statistics come from the build_vocab() call just above.
ft_train_args = {
    'epochs': 10,
    'total_examples': ft_model.corpus_count,
    'total_words': ft_model.corpus_total_words,
}
ft_model.train(resumes_text_tokenized, **ft_train_args)

ft_vectors = ft_model.wv
ft_model.save('ft_model.model')
ft_vectors.save('ft_vectors.kv')

In [None]:
from gensim.models.word2vec import Word2Vec

# Train a skip-gram Word2Vec model on the tokenised resumes, then persist
# both the full model and its keyed vectors.
w2v_model = Word2Vec(
    window=10,
    workers=4,
    sg=1,          # skip-gram
    min_count=1    # keep every token, however rare
)
w2v_model.build_vocab(resumes_text_tokenized)
w2v_model.train(
    resumes_text_tokenized,
    epochs=10,
    # Use this model's own corpus statistics (set by build_vocab above) so the
    # cell does not depend on the FastText model having been built first.
    total_examples=w2v_model.corpus_count,
    total_words=w2v_model.corpus_total_words
)

w2v_vectors = w2v_model.wv
w2v_model.save('w2v_model.model')
w2v_vectors.save('w2v_vectors.kv')

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Train a Doc2Vec model on the tokenised resumes; each resume is tagged with
# its position in resumes_text_tokenized.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(resumes_text_tokenized)]
d2v_model = Doc2Vec(
    window=10,
    workers=4,
    min_count=1,   # keep every token, however rare
)
d2v_model.build_vocab(documents)
d2v_model.train(
    documents,
    epochs=10,
    # Use this model's own corpus statistics (set by build_vocab above) so the
    # cell does not depend on the FastText model having been built first.
    total_examples=d2v_model.corpus_count,
    total_words=d2v_model.corpus_total_words
)

# NOTE(review): .wv holds the word vectors only; the per-document vectors stay
# inside the saved model file. Confirm that exporting word vectors is intended.
d2v_vectors = d2v_model.wv
d2v_model.save('d2v_model.model')
d2v_vectors.save('d2v_vectors.kv')

# Preprocess the job descriptions

In [None]:
jobdescriptions_raw

In [None]:
jobdescriptions_raw.info()

In [None]:
# Drop the columns that are not used further on, then discard every row that
# still has a missing value in any remaining column.
unused_columns = ['Uniq Id', 'Crawl Timestamp', 'Job Salary', 'Location']
jobdescriptions_raw_nonull = (
    jobdescriptions_raw
    .drop(columns=unused_columns)
    .dropna()
)

In [None]:
jobdescriptions_raw_nonull

In [None]:
jobdescriptions_raw_nonull.info()

In [None]:
import numpy as np

In [None]:
# Extract the job-description columns as plain lists of strings, replacing the
# separator characters used in each column with spaces.
_jd = jobdescriptions_raw_nonull

# Translation tables, built once: every listed character maps to a space.
_sep_title = str.maketrans(dict.fromkeys('/-,|', ' '))
_sep_pipe = str.maketrans(dict.fromkeys('|', ' '))
_sep_slash = str.maketrans(dict.fromkeys('/', ' '))
_sep_dash_comma = str.maketrans(dict.fromkeys('-,', ' '))

jd_title = [str(value).translate(_sep_title) for value in _jd['Job Title']]
# Experience ranges are kept exactly as they appear in the data.
jd_experience = list(_jd['Job Experience Required'])
jd_skills = [str(value).translate(_sep_pipe) for value in _jd['Key Skills']]
jd_category = [str(value).translate(_sep_slash) for value in _jd['Role Category']]
jd_area = [str(value).translate(_sep_dash_comma) for value in _jd['Functional Area']]
jd_industry = [str(value).translate(_sep_dash_comma) for value in _jd['Industry']]
jd_role = [str(value).translate(_sep_slash) for value in _jd['Role']]

# -------------------------------------Assign labels to experience ranges--------------------------------------

# for i in range(len(jd_experience)):
#     if 'above' in jd_experience[i]:
#         jd_experience[i] = '30'
#     elif 'yrs' in jd_experience[i]:
#         jd_experience[i] = jd_experience[i][:-4]
#     elif 'Years' in jd_experience[i]:
#         jd_experience[i] = jd_experience[i].strip()
#         jd_experience[i] = jd_experience[i][:-6]

# # Generated by Chat-GPT
# seniority_levels = [
#     ["Apprentice", "Trainee", "Intern", "Entry-level", "Junior", "Graduate"],
#     ["Associate", "Staff", "Assistant", "Specialist", "Analyst", "Coordinator"],
#     ["Officer", "Consultant", "Technician", "Advisor", "Supervisor", "Team Leader"],
#     ["Lead", "Senior", "Expert", "Principal", "Manager", "Senior Manager"],
#     ["Director", "Senior Director", "Head", "Vice President", "Executive"],
#     ["Senior Executive", "Managing Director", "Chief", "Chief Operating Officer"],
#     ["Chief Financial Officer", "Chief Executive Officer", "President"],
#     ["Chairman", "Board Member", "Partner", "Senior Partner", "Managing Partner"]
# ]

# for i in range(len(jd_experience)):
#     if jd_experience[i] == '0':
#         jd_experience[i] = ' '.join(seniority_levels[0])
#     elif jd_experience[i] == '30':
#         jd_experience[i] = ' '.join(seniority_levels[-1])
#         jd_experience[i] += ' ' + ' '.join(seniority_levels[-2])
#     else:
#         _min = int(jd_experience[i].split(' - ')[0])
#         _max = int(jd_experience[i].split(' - ')[1])
        
#         if _min >= 0 and _max < 2:
            
#         if _min >=  and _max < :
            
#         if _min >=  and _max < :
            
#         if _min >=  and _max < :
            
#         if _min >=  and _max < :

# Load the models, and find similarities

In [None]:
from gensim.models import KeyedVectors

In [None]:
# Reload the FastText keyed vectors from 'ft_vectors.kv' (the file written
# after training the FastText model on the resumes).
ft_vectors = KeyedVectors.load('ft_vectors.kv')

In [None]:
# The vectors were trained on lower-cased single tokens, so query with the
# title's lower-cased words instead of the raw mixed-case phrase.
ft_vectors.most_similar(positive=jd_title[0].lower().split())

In [None]:
jd_title[0]

## The neighbours returned for this title are unexpected.
## The resume preprocessing was probably not good enough.

In [None]:
jd_title[-2]

In [None]:
# The vectors were trained on lower-cased single tokens, so query with the
# title's lower-cased words instead of the raw mixed-case phrase.
ft_vectors.most_similar(positive=jd_title[-2].lower().split())

## This result looks more promising.

In [None]:
from pprint import pprint

# Print the nearest FastText neighbours for roughly five job titles spread
# evenly across the dataset.
# The stride is derived from the actual number of titles rather than a fixed
# row count, so the loop cannot index past the end if rows are dropped.
sample_step = max(1, len(jd_title) // 5)

for i in range(0, len(jd_title), sample_step):
    print('>>' + jd_title[i])
    # The vectors were trained on lower-cased single tokens, so query with the
    # title's lower-cased words instead of the raw mixed-case phrase.
    query_tokens = jd_title[i].lower().split()
    if query_tokens:  # most_similar() cannot be called without any query word
        pprint(ft_vectors.most_similar(positive=query_tokens))
    print('------------------------------------------------------------------------------------')

## The results look fairly average.
## They can definitely be improved.