In [1]:
!pip install nltk



In [2]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [75]:
class JobSimilarityAnalyzer:
    def __init__(self):
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.stop_words = set(stopwords.words('english'))

    def tokenize_and_filter(self, text):
        tokens = self.tokenizer.tokenize(text)
        filtered_tokens = [word.lower() for word in tokens if word.lower() not in self.stop_words]

        pos_tags = pos_tag(filtered_tokens)
        filtered_words = [word for word, pos in pos_tags if pos in ['NNP', 'NN']]

        if not len(filtered_words) > 1:
            raise ValueError("The length is wrong")

        return ' '.join(filtered_words)

    def get_cosine_similarity(self, text1, text2):
        vectorizer = CountVectorizer()
        matrix = vectorizer.fit_transform([text1, text2])
        similarity = cosine_similarity(matrix)
        return similarity[0][1]

    def process_job_files(self, file_paths):
        job_descriptions = []

        for file_path in file_paths:
            with open("/content/drive/MyDrive/Colab Notebooks/jobs/" + file_path, 'r') as file:
                text = file.read()
                filtered_tokens = self.tokenize_and_filter(text)
                job_descriptions.append([file_path, filtered_tokens])

        return job_descriptions

    def calculate_similarity_scores(self, job_descriptions, user_text):
        user_text_tokens = self.tokenize_and_filter(user_text)
        similarity_scores = []

        for job in job_descriptions:
            similarity = self.get_cosine_similarity(user_text_tokens, job[1])
            similarity_scores.append((job[0], similarity))

        similarity_scores.sort(key=lambda x: x[1], reverse=True)

        return similarity_scores


In [79]:

test_sentence = """As a highly skilled and motivated data engineering professional, I possess a robust foundation in database design, having successfully implemented and optimized scalable database solutions using technologies such as SQL, NoSQL, and cloud-based data storage systems. My proficiency extends to ETL (Extract, Transform, Load) processes, where I have demonstrated expertise in designing and implementing efficient data pipelines to seamlessly integrate and transform diverse data sources, ensuring data quality and integrity.

I am adept at utilizing programming languages such as Python and Java to develop custom data processing scripts and applications, automating data workflows, and enhancing overall system efficiency. My experience includes working with big data technologies, including Hadoop and Spark, enabling me to process and analyze large datasets with a focus on performance optimization.

In addition, I have a strong understanding of data warehousing concepts and have effectively designed and maintained data warehouses, providing stakeholders with timely and accurate business intelligence. My experience with data modeling and schema design contributes to creating well-organized and accessible data structures that cater to specific business requirements.

Furthermore, I am well-versed in cloud platforms such as AWS, Azure, and Google Cloud, utilizing their services for data storage, processing, and analytics. My expertise extends to version control systems, ensuring proper documentation and collaborative development practices. As a proactive problem solver, I am accustomed to troubleshooting and resolving complex data issues, ensuring the continuous flow and availability of high-quality data for analytical purposes.

With a keen eye for emerging trends in data engineering and a commitment to staying abreast of technological advancements, I am confident in my ability to contribute effectively to any data engineering team, driving innovation and delivering robust solutions to meet the dynamic needs of the organization."""


analyzer = JobSimilarityAnalyzer()

file_paths = [ "data_engineer.txt", "backend_developer.txt", "frontend_developer.txt", "sales_manager.txt"]

# Process job files and calculate similarity scores
job_descriptions = analyzer.process_job_files(file_paths)
similarity_scores = analyzer.calculate_similarity_scores(job_descriptions, test_sentence)

# Print similarity scores
print("You most likely want to do this jobs: ")
for idx1, similarity in similarity_scores:
    print(f"{idx1}: {similarity}")

You most likely want to do this jobs: 
data_engineer.txt: 0.3828478769486491
frontend_developer.txt: 0.27183741892777974
backend_developer.txt: 0.18072289156626506
sales_manager.txt: 0.14065447086831775
