## Import Libraries

In [1]:
!pip install PyPDF2 cohere sentence_transformers

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cohere
  Downloading cohere-5.3.3-py3-none-any.whl (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m151.2/151.2 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence_transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.21.2 (from cohere)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     

In [133]:
import re
import ast
import json
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

import pandas as pd
import numpy as np

import cohere
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

## TextExtractor

In [134]:
class TextExtractor:
    def __init__(self, text):
        self.text = text
        self.name = self.get_name_with_cohere()
        self.skills = self.get_key_skills_with_cohere()

    def get_name_with_cohere(self):
        if self.text:
            prompt = f"У тебя на вход есть текст {self.text}. Нужно выдать json с названием вакансии. Json в формате 'название': '...'"
            co = cohere.Client("8rP3am6NDcGmGieq6x8E80Wps4nU3xXZSoYSucaw")
            completion = co.chat(
                message=prompt,
                model='command-r-plus',
                temperature=0.01,
            )
            result = completion.text
            return json.loads(result)
        else:
            return "No text provided"

    def get_key_skills_with_cohere(self):
        if self.text:
            prompt = f"У тебя на вход есть текст {self.text}. Нужно выдать json cо всеми ключевыми навыками и технологиями в данном описании. Json в формате 'навыки': ['...', ..., '...']"
            co = cohere.Client("8rP3am6NDcGmGieq6x8E80Wps4nU3xXZSoYSucaw")
            completion = co.chat(
                message=prompt,
                model='command-r-plus',
                temperature=0.01,
            )
            result = completion.text
            return json.loads(result)
        else:
            return "No text provided"

    def get_job_info(self):
        return self.name + self.skills


## PDF Parser

In [135]:
class FileParser:
    def __init__(self, file_path):
        self.file_path = file_path
        self.pdf_text = self.read_pdf()
        self.text_extractor = TextExtractor(self.pdf_text)
        self.job_info = self.text_extractor.get_job_info()

    def read_pdf(self, file_path):
        text = ""
        with open(file_path, "rb") as f:
            reader = PdfReader(f)
            num_pages = len(reader.pages)
            for page_num in range(num_pages):
                page = reader.pages[page_num]
                text += page.extract_text()
        return text

    def get_job_info(self):
        return self.job_info


## HH Parser

In [136]:
class HHParser:
    def __init__(self, url: str):
        self.url = url

    def get_vacancy_info(self, vacancy):
        vacancy_id = vacancy.get("id")
        vacancy_title = vacancy.get("name")
        vacancy_url = vacancy.get("alternate_url")
        vacancy_exp = vacancy.get("experience", {}).get("name")
        vacancy_empl = vacancy.get("employment", {}).get("name")
        company_name = vacancy.get("employer", {}).get("name")
        professional_roles = vacancy.get("professional_roles")
        salary = vacancy.get("salary")
        key_skills = vacancy.get("key_skills")
        vacancy_desc = vacancy.get("description")
        return (
            f"ID: {vacancy_id}\nНазвание: {vacancy_title}"
            + f"\nКомпания: {company_name}\nURL: {vacancy_url}"
            + f"\nОпыт: {vacancy_exp}\nЗанятость: {vacancy_empl}"
            + f"\nЗП: {salary}\nРоль:{professional_roles}"
            + f"\nКлючевые навыки: {key_skills}\n"
            + f"\nОписание: {vacancy_desc}\n"
        )

    def get_vacancy_info_from_url(self) -> str:
        vacancy_id = self.url.split("/")[-1]
        url = f"https://api.hh.ru/vacancies/{vacancy_id}"
        response = requests.get(url)
        if response.status_code == 200:
            vacancy_info = response.json()
            return self.get_vacancy_info(vacancy_info)
        else:
            return f"Request failed with status code: {response.status_code}"

    def get_name_from_hh(self, text: str) -> str:
        if "Название:" not in text:
            return ""
        name_start = text.index("Название:") + len("Название:")
        name_end = text.find("\n", name_start)
        if name_end == -1:
            name_end = len(text)
        return text[name_start:name_end].strip()

    def get_key_skils(self, text):
        if "Ключевые навыки" not in text:
            return ""
        match = re.search(r'\[.*?\]', text.split('Ключевые навыки:')[1])
        if match:
            result = match.group()
            skils = [d['name'] for d in ast.literal_eval(result)]
            return skils
        return None

    def get_job_info(self):
        job_text = self.get_vacancy_info_from_url()
        job_name = self.get_name_from_hh(job_text)
        job_key_skils = self.get_key_skils(job_text)
        job_info = [job_name] + job_key_skils
        return job_info


## GeekBrains Parser

In [137]:
class GeekBrainsParser:
    def __init__(self, geek_brains_courses_path=None):
        self.geek_brains_courses_path = geek_brains_courses_path
        if geek_brains_courses_path is not None:
            self.geek_brains_data = pd.read_parquet(self.geek_brains_courses_path)
            self.course_dict = self.geek_brains_data.to_dict()
        else:
            self.course_dict = {}

    def get_course_links(self, url):
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a', href=True)
            links_on_course = [link['href'] for link in links]
            pattern = r'https://gb\.ru/geek_university/.+'
            filtered_links = [link for link in links_on_course if re.match(pattern, link)]
            return filtered_links
        else:
            print("Ошибка при загрузке страницы:", response.status_code)
            return []

    def get_html_from_url(self, url):
        response = requests.get(url)
        return response.text

    def clean_html(self, html):
        cleaned_text = re.sub(r'<[^>]*>', '', html)  # Удаление HTML-тегов
        return cleaned_text

    def extract_text_between_phrases(self, url, start_phrase, end_phrase):
        html = self.get_html_from_url(url)
        cleaned_text = self.clean_html(html)

        start_index = cleaned_text.find(start_phrase)
        end_index = cleaned_text.find(end_phrase, start_index)

        if start_index != -1 and end_index != -1:
            extracted_text = cleaned_text[start_index + len(start_phrase):end_index].strip()
            extracted_list = extracted_text.split('\n')
            extracted_list = [line.strip() for line in extracted_list if line.strip()]
            if len(extracted_list) > 20:
                extracted_list = extracted_list[1:21]
            else:
                extracted_list = extracted_list[1:-1]
            return extracted_list
        else:
            return []

    def extract_text_after_phrase(self, url, start_phrase, limit):
        html = self.get_html_from_url(url)
        cleaned_text = self.clean_html(html)

        start_index = cleaned_text.find(start_phrase)

        if start_index != -1:
            extracted_text = cleaned_text[start_index + len(start_phrase):].strip()
            extracted_list = extracted_text.split('\n')
            extracted_list = [line.strip() for line in extracted_list if line.strip()]
            if len(extracted_list) > limit:
                extracted_list = extracted_list[:limit]
            return extracted_list
        else:
            return []

    def extract_program_track(self, url, start_phrase, end_phrase):
        html = self.get_html_from_url(url)
        cleaned_text = self.clean_html(html)

        start_index = cleaned_text.find(start_phrase)
        end_index = cleaned_text.find(end_phrase, start_index)

        if start_index != -1 and end_index != -1:
            extracted_text = cleaned_text[start_index + len(start_phrase):end_index].strip()
            extracted_list = extracted_text.split('\n')
            extracted_list = [line.strip() for line in extracted_list if line.strip()]
            return extracted_list
        else:
            return []

    def title_and_description(self, url):
        html = self.get_html_from_url(url)
        cleaned_text = self.clean_html(html)
        soup = BeautifulSoup(html, 'html.parser')
        title = ''
        promo_text = ''
        try:
            title = re.search(r'«(.*?)»', str(soup.find('meta', property='og:title'))).group(1)
            promo_description = soup.find('div', class_='gkb-promo__description')
            promo_text = promo_description.find('p', class_='gkb-promo__text').get_text(strip=True)
        except AttributeError:
            pass
        return title, promo_text

    def find_price(self, url, target_char):
        # Получаем все цены со страницы
        html = self.get_html_from_url(url)
        cleaned_text = self.clean_html(html)
        currency_positions = [m.start() for m in re.finditer(target_char, cleaned_text)]
        prices = []

        for pos in currency_positions:
            substr = cleaned_text[pos - 8:pos + 5]
            numbers = re.findall(r'\d+', substr)
            price = ''.join(numbers)
            if price:
                prices.append(price)

        if len(prices) == 2:
            prices = [prices[0]]

        if len(prices) == 4:
            prices = [prices[1]]

        if len(prices) == 5:
            prices = [prices[0]]

        if len(prices) == 6:
            prices = ['4049']
        return prices

    def get_courses_dict(self):
        if not self.course_dict:
            url = "https://gb.ru/courses/all"
            course_links = self.get_course_links(url)
            for link in sorted(course_links):

                self.course_dict[link] = {}
                title, description = self.title_and_description(link)
                self.course_dict[link]["title"] = title
                self.course_dict[link]["description"] = description

                courses1 = self.extract_text_between_phrases(link, "Изучаемые ", "Диплом")
                courses2 = self.extract_text_between_phrases(link, "Технологии и инструменты", "Диплом")
                courses3 = self.extract_text_between_phrases(link,"Научитесь работать с основными инструментами", "Диплом")
                courses4 = self.extract_text_between_phrases(link,"Что вы изуч", "Диплом")
                courses5 = self.extract_text_after_phrase(link,'Получите все', 20)
                courses6 = self.extract_text_after_phrase(link,'Чему вы научитесь', 20)
                courses7 = self.extract_text_after_phrase(link,"Уверенное владение", 20)
                # courses8 = self.extract_text_after_phrase(link,"Результаты после обучения", 10)
                skills = courses1 + courses2 + courses3 + courses4 + courses5 + courses6 + courses7
                self.course_dict[link]["skills"] = skills

                program_track = self.extract_program_track(link, 'Основной бл','Запросить полную')
                self.course_dict[link]["program_track"] = program_track

                price = self.find_price(link,'₽/мес') + self.find_price(link,'₽ /мес.')
                self.course_dict[link]["price"] = price

        return self.course_dict


In [138]:
courses_dict = GeekBrainsParser("/content/geek_brains_courses_data.parquet").get_courses_dict()

In [139]:
courses_df = pd.DataFrame(courses_dict)
courses_df.head()

Unnamed: 0,url,title,description,skills,program_track,price
0,https://gb.ru/geek_university/design,Дизайнер с нуля до Junior,"– Универсальный курс для тех, кто ищет свою пр...","[Figma, Adobe Photoshop, Tilda, Adobe After Ef...","[ок, 3 месяца, Основы дизайна, Компьютерные те...",[3772]
1,https://gb.ru/geek_university/design/3d-artist-gb,3D-художник,"Станьте тем, кто создает 3D-модели для кино и ...","[Autodesk Maya, Blender, ZBrush, Houdini, Subs...","[ок, Основы Photoshop для CG-специалистов, Вве...",[4579]
2,https://gb.ru/geek_university/design/3d-spec,Специалист по 3D-моделированию,— Научитесь 3D-моделированию с нуля— Получите ...,"[3ds Max, Adobe Photoshop, Adobe InDesign]",[],[4525]
3,https://gb.ru/geek_university/design/concept-a...,Концепт-художник,"Научитесь создавать концепции для игр и кино, ...","[Photoshop, Blender, 3D Coat]","[ок, Photoshop для иллюстраторов, Знакомство с...",[4511]
4,https://gb.ru/geek_university/design/digital,Цифровой дизайнер с нуля до Junior,Вы научитесь думать как дизайнер и создавать у...,"[Figma, Tilda, HTML, CSS]","[ок, 2 месяца, Вводное занятие. Основы дизайна...",[3577]


## Recommender

In [146]:
class Recommender:
    def __init__(self, k: int):
        self.k = k
        self.geekbrains_embeddings = {}
        self.reshaped_geekbrains_embeddings = {}
        self.init_gb_parser()
        self.init_bert()

    def init_bert(self):
        print("[INFO]: Загрузка bert-base-nli-mean-tokens...")
        self.bert = SentenceTransformer('bert-base-nli-mean-tokens')
        print("[SUCCESS]: Берт загружен.")

    def init_gb_parser(self):
        print("[INFO]: Парсинг данных с GeekBrains.ru...")
        self.geek_brain_parser = GeekBrainsParser(geek_brains_courses_path="geek_brains_courses_data.parquet")
        courses_dict = self.geek_brain_parser.get_courses_dict()
        result = {}
        for i, row in pd.DataFrame(courses_dict).iterrows():
            result[row["url"]] = [row["title"]] + list(row["skills"])
        self.courses_dict = result
        print("[SUCCESS]: Парсинг данных успешен.")

    def get_job_info_from_text(self, text):
        print("[INFO]: Парсинг данных с текста...")
        self.text_extractor = TextExtractor(text=text)
        job_info = self.text_extractor.get_job_info()
        print("[SUCCESS]: Парсинг данных успешен.")
        return job_info

    def get_job_info_from_pdf(self, file_path):
        print("[INFO]: Парсинг данных с PDF...")
        self.file_parser = FileParser(file_path=file_path)
        job_info = self.file_parser.get_job_info()
        print("[SUCCESS]: Парсинг данных успешен.")
        return job_info

    def get_job_info_from_url(self, url):
        print("[INFO]: Парсинг данных с hh.ru...")
        self.hh_parser = HHParser(url=url)
        job_info = self.hh_parser.get_job_info()
        print("[SUCCESS]: Парсинг данных успешен.")
        return job_info

    def get_embeddings_bert_base_nli_mean_tokens(self, text):
        sen_embeddings = self.bert.encode(text)
        return sen_embeddings

    def calc_geekbrains_embs(self):
        for url, info in self.courses_dict.items():
            self.geekbrains_embeddings[url] = self.get_embeddings_bert_base_nli_mean_tokens(
                self.courses_dict.get(url, None)
            )

    def semantic_similarity_bert_base_nli_mean_tokens(self, job, course):
        score = 0
        sen = job + course
        sen_embeddings = self.bert.encode(sen)
        for i in range(len(job)):
            if job[i] in course:
                score += 1
            else:
                if max(cosine_similarity([sen_embeddings[i]], sen_embeddings[len(job):])[0]) >= 0.4:
                    score += max(cosine_similarity([sen_embeddings[i]], sen_embeddings[len(job):])[0])
        score = score / len(job)
        return round(score, 3)

    def get_coverage_mtx(self, recommendations, job_info):
        courses_dict = self.courses_dict
        competencies = job_info[1:]
        coverage_mtx = pd.DataFrame({"competencies": competencies})
        coverage_mtx = coverage_mtx.set_index("competencies")

        for course_url in recommendations:
            coverage_mtx[course_url] = np.zeros(len(competencies))
            for competence in competencies:
                if len(courses_dict.get(course_url)) < 2:
                    break
                coverage_mtx.loc[competence, course_url] = self.semantic_similarity_bert_base_nli_mean_tokens(
                    [competence], courses_dict.get(course_url)[1:]
                )
        return coverage_mtx

    def recommend(self, input, k: int or None = None):
        if input[-4:] == ".pdf":
            job_info = self.get_job_info_from_pdf(input)
        elif "https" in input:
            job_info = self.get_job_info_from_url(input)
        else:
            job_info = self.get_job_info_from_text(input)

        if k is None:
            k = self.k

        recomendations = {}
        for url, course_info in self.courses_dict.items():
            recomendations[url] = self.semantic_similarity_bert_base_nli_mean_tokens(
                job_info, course_info
            )
        recomendations_df = pd.DataFrame({"url": recomendations.keys(), "sim": recomendations.values()})
        recomendations_df = recomendations_df.sort_values(by="sim", ascending=False)
        recomendations = list(recomendations_df.iloc[:k, 0].values)

        names = [self.courses_dict.get(recomendation_url)[0] for recomendation_url in recomendations]
        print(recomendations)
        print(names)
        print(job_info)
        coverage_mtx = self.get_coverage_mtx(recommendations, job_info)
        return {
            "recommendations": recomendations,
            "job_info": job_info,
            "coverage_mtx": coverage_mtx,
            "names": names,
        }


In [147]:
recommender = Recommender(k=5)

[INFO]: Парсинг данных с GeekBrains.ru...
[SUCCESS]: Парсинг данных успешен.
[INFO]: Загрузка bert-base-nli-mean-tokens...
[SUCCESS]: Берт загружен.


In [149]:
recommendations = recommender.recommend("https://hh.ru/vacancy/97405949", k=3)

[INFO]: Парсинг данных с hh.ru...
[SUCCESS]: Парсинг данных успешен.
['Data Analyst / Data Scientist (Junior/Junior+)', 'Python', 'SQL', 'Математическая статистика', 'Git', 'Data Analysis', 'VBA', 'ML', 'Pandas', 'Numpy']


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-149-5e4e46718edb>", line 1, in <cell line: 1>
    recommendations = recommender.recommend("https://hh.ru/vacancy/97405949", k=3)
  File "<ipython-input-146-fb4c98c8aded>", line 98, in recommend
    recomendations[url] = self.semantic_similarity_bert_base_nli_mean_tokens(
  File "<ipython-input-146-fb4c98c8aded>", line 58, in semantic_similarity_bert_base_nli_mean_tokens
    sen_embeddings = self.bert.encode(sen)
  File "/usr/local/lib/python3.10/dist-packages/sentence_transformers/SentenceTransformer.py", line 371, in encode
    out_features = self.forward(features)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/container.py", line 217, in forward
    input = module(input)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 

TypeError: object of type 'NoneType' has no len()

In [132]:
recommendations

'recommendations'