In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle
import requests
import json
import re
from collections import defaultdict

# Define Advisor class:

In [2]:
class Advisor():
    """ 
        Class to help a client:
        1. To choose a proper professional direction;
        2. To get a stack of necessary skills to have for chosen direction;
        3. To recommend courses to master;
    """
    
    def __init__(self):
        pass
    
    def _parse_skills(self, skills_raw):
        """ List of dicts to the list of skills """
        
        key_skills = list()
        skills_list = re.findall(r'\'.*?\'', skills_raw)
        for skill in skills_list:
            if skill != "'name'":
                key_skills.append(skill[1:-1].lower())
        return key_skills
    
    def _generate_dict(self, a):
        """ Get a dict of all the skills """
        
        d = defaultdict(int)
        for i in range(len(a)):
            for j in a['key_skills'][i]:
                d[j.lower()] += 1
        return d

    def _sort_dict(self, a):
        """ Sort a dictionary by values """
    
        list_d = list(a.items())
        list_d.sort(key=lambda i:i[1])
        list_d = list_d[::-1]
        return dict(list_d)

    def _crop_dict(self, dict_y, threshold=10):
        """ Leave only skills that frequently occur in the data """
        
        res_dict = dict()
        for i in dict_y.items():
            if i[1] > threshold:
                res_dict[i[0]] = i[1]
        return res_dict       
    
    def add_vacancies(self, path, threshold=10):
        """ 
            Add vacancies to the system.
            Input:
                - "path": where is vacancies data file;
                - "threshold": crop those skills that occur less than threshold
                times in the data;
        """
        
        data = pd.read_csv(path)
        
        # Drop duplicates:
        data = data[data.duplicated('idx') == False]
        data.reset_index(inplace=True)
        data.drop('index', axis=1, inplace=True)
        data['key_skills'] = data['key_skills'].apply(lambda x: self._parse_skills(x))
        self.data_vacancies = data
        
        # Get sorted skills dictionary:
        dict_skills = self._generate_dict(data)
        dict_skills = self._sort_dict(dict_skills)
        
        # Get main skills:
        self.dict_skills_main = self._crop_dict(dict_skills, threshold)
        
        self.skills_list = list(self.dict_skills_main)
        
        self.skill_name_to_index = {self.skills_list[i] : i for i in range(len(self.skills_list))}
        
        # Generate a matrix: vacancies x skills
        self.vacancies_mtx = np.zeros(shape=(data.shape[0], len(self.dict_skills_main)), dtype=np.float32)
        for i in range(data.shape[0]):
            for j in data.iloc[i]["key_skills"]:
                try:
                    self.vacancies_mtx [i, self.skill_name_to_index[j]] = dict_skills[j]
                except KeyError as e:
                    continue
                    
    def get_profession(self, client, show_score=False):
        """ 
            Get profession list that fit your profile best.
            Input:
                - "client": a client vector coprised of his skills;
                - "show_score": show how probable a particular
                profession is for you;
            Output:
                - a list of strings representing professional paths;
        """
        
        how_many = 10
        result = self.vacancies_mtx @ client
        self.indices_vacances_sorted = result.argsort()[::-1]
        bad_dict = (self.data_vacancies.iloc\
                    [self.indices_vacances_sorted[:how_many]]['profession']).to_dict()
        self.good_dict = {bad_dict[i] : i for i in bad_dict.keys()}
        
        if show_score:
            idx = list(self.good_dict.values())
            
            prob = result / np.sum(result[idx])
            prob_dict = {i[0]: prob[i[1]] for i in self.good_dict.items()}
            print(prob_dict)
        
        return list(self.good_dict.keys())
        
    def add_courses(self, data_courses):
        """ 
            Add courses to the system.
            Input:
                - "data_courses": Dataframe of courses and skills;
        """
        
        link_courses = dict()
        for i in data_courses.index:
            temp = data_courses.iloc[i]
            link_courses[temp["link"]] = temp['text']
    
        for i in link_courses.items():
            link_courses[i[0]] = []
            for j in self.skills_list:
                if j in i[1].lower():
                    link_courses[i[0]].append(j) 
        
        self.link_courses = link_courses 
        
        full_skills = [i[0] for i in link_courses.items() if i[1] != []]
        self.link_to_skills = pd.Series(link_courses)[full_skills]
        
        # Generate a matrix: vacancies x skills
        self.courses_mtx = np.zeros(
            shape=(self.link_to_skills.size, len(self.dict_skills_main)),
            dtype=np.float32,
        )
        
        for i, item in enumerate(self.link_to_skills.items()):
            for j in self.link_to_skills[item[0]]:
                try:
                    self.courses_mtx[i, self.skill_name_to_index[j]] = 1
                except KeyError as e:
                    continue
                    
    def get_course(self, direction, how_many=1, show_score=False):
        """ 
            Get courses links list that best fit your chosen direction.
            Input:
                - "direction": a string representing what direction a client chose;
                - "how_many": how many courses to show?;
                - "show_score": show how probable a particular
                course is for you;
            Output:
                - a list of strings representing courses links;
        """
        
        try:
            self.diff = ((self.vacancies_mtx[self.good_dict[direction]] - client) > 0).astype(float)
        except KeyError as ke:
            print(f" Do not have that: {ke} - key;")
            return 
        
        result = self.courses_mtx @ self.diff.T
        self.indices_courses_sorted = result.argsort()[::-1]
        if show_score:
            idx = self.indices_courses_sorted[:how_many]
            prob = result[idx] / np.sum(result[idx])
            print(prob)
            
        return list(self.link_to_skills.iloc[self.indices_courses_sorted[:how_many]].keys())
    
    def get_skills(self, direction, know):
        """ 
            Get skills list that you need to master.
            Input:
                - "direction": a string representing what direction a client chose;
                - "know": a list of skills that client pointed out;
            Output:
                - a list of strings representing skills;
        """
        
        set_know = set(know)
        try:
            set_job = set(self.data_vacancies.iloc[list(self.good_dict.values())]\
                          ['key_skills'][self.good_dict[direction]])
        except KeyError as ke:
            print(f" Do not have that: {ke} - key;")
            return    
        
        return list(set_job.difference(set_know))

# Load Vacancies:

In [3]:
advisor = Advisor()

In [4]:
path_data = "Data/"

In [5]:
freq_skills_threshold = 10 # to throw away the rare skills from the system
advisor.add_vacancies(path_data + 'modified_data_hh.csv', freq_skills_threshold)

# Load Courses:

### Netology:

In [6]:
data_courses_netology = pd.read_csv(path_data + 'raw_netology.csv')
data_courses_netology = data_courses_netology.rename(columns={"url_for_user": "link"})
data_courses_netology['text'] = data_courses_netology["blocks"] + data_courses_netology["skills"]
data_courses_netology = data_courses_netology.drop(columns=["blocks", "skills"])
data_courses_netology.head(3)

Unnamed: 0,link,text
0,https://netology.ru/programs/qa-middle,"[]['Кроссбраузерная верстка HTML и CSS', 'Верс..."
1,https://netology.ru/programs/qa,"[]['Кроссбраузерная верстка HTML и CSS', 'Верс..."
2,https://netology.ru/programs/python,"[]['Работа с данными на Python', 'Создание веб..."


### Skillbox:

In [7]:
data_courses_skillbox = pd.read_csv(path_data + 'skillbox.csv')
data_courses_skillbox = data_courses_skillbox[["link", "text"]]
data_courses_skillbox.head(3)

Unnamed: 0,link,text
0,https://skillbox.ru/course/profession-graphdes...,программа вас ждут 6 блоков с разным уровнем с...
1,https://skillbox.ru/course/profession-marketolog/,"программа вас ждут онлайн-лекции и задания,вып..."
2,https://skillbox.ru/course/profession-data-sci...,"программа вас ждут 9 курсов,онлайн-лекции и пр..."


### Stepik:

In [8]:
data_courses_stepik = pd.read_csv(path_data + 'stepik_data.csv')
data_courses_stepik = data_courses_stepik.rename(columns={"url": "link"})
data_courses_stepik = data_courses_stepik[["link", 'text']]
data_courses_stepik.head(3)

Unnamed: 0,link,text
0,https://stepik.org/course/99080/promo,Download any course Open app or continue in a ...
1,https://stepik.org/course/99069/promo,Download any course Open app or continue in a ...
2,https://stepik.org/course/99066/promo,Download any course Open app or continue in a ...


### Coursera:

In [9]:
data_courses_coursera = pd.read_csv(path_data + 'Coursera_data.csv')
data_courses_coursera = data_courses_coursera.rename(
    columns={"link_to_course": "link", "skills": "text"}
)
data_courses_coursera = data_courses_coursera[["link", 'text']]
data_courses_coursera.head(3)

Unnamed: 0,link,text
0,https://ru.coursera.org/specializations/academ...,"['punctuation', 'essay writing', 'academic wri..."
1,https://ru.coursera.org/professional-certifica...,"['efl', 'english language', 'tesol', 'esl', 'e..."
2,https://ru.coursera.org/learn/learning-how-to-...,"['test preparation', 'learning to learn', 'pom..."


# Add courses to Advisor:

In [10]:
data_courses = pd.concat(
    [
        data_courses_netology,
        data_courses_skillbox,
        data_courses_stepik,
        data_courses_coursera
    ],
    ignore_index=True
)

In [11]:
advisor.add_courses(data_courses)

# Model a Client:

#### $\color{red}{\text{Change this -->>}}$

In [12]:
# Look through the skills to put them to "know"
advisor.skills_list[:10]

['sql',
 'английский язык',
 'linux',
 'управление проектами',
 'git',
 'работа в команде',
 'python',
 'javascript',
 'ms sql',
 'atlassian jira']

#### $\color{red}{\text{Change this -->>}}$

In [13]:
# What skills do you have? (6 is enough!)

know = ['unity', 'git', 'it', 'linux', '3d', 'c']

In [14]:
client = np.zeros(len(advisor.dict_skills_main)).astype(np.float32)
know_inx = [advisor.skill_name_to_index[i] for i in know]
client[know_inx] = 1

# Get direction (profession):

#### $\color{green}{\text{Change "show_score": True/False}}$

In [15]:
advisor.get_profession(client, show_score=False)

['Игровое ПО',
 'Аналитик',
 'Начальный уровень, Мало опыта',
 'Тестирование',
 'Web инженер',
 'Инженер',
 'Программирование, Разработка',
 'Сетевые технологии']

#### $\color{red}{\text{Change this -->>}}$

In [16]:
# What direction do you want to follow? (Choose from the list above)

chosen = 'Аналитик'

# Get necessary skills:

In [17]:
# The skills you need to master either for the chosen direction
advisor.get_skills(chosen, know)

['python',
 'ml',
 'sql',
 'machine learning',
 'торговая площадка',
 'nlp',
 'аналитическое мышление']

# Get courses:

#### $\color{green}{\text{Change "show_score": True/False; "how_many": 1->20}}$

In [18]:
# The valid links to courses
advisor.get_course(chosen, how_many=10, show_score=False)

['https://skillbox.ru/course/profession-data-scientist/',
 'https://skillbox.ru/course/profession-machine-learning/',
 'https://skillbox.ru/course/profession-python/',
 'https://skillbox.ru/course/profession-fullstack-python/',
 'https://stepik.org/course/154/promo',
 'https://skillbox.ru/course/profession-cybersecurity/',
 'https://skillbox.ru/course/java-dev/',
 'https://skillbox.ru/course/profession-webdev/',
 'https://ru.coursera.org/specializations/python',
 'https://skillbox.ru/course/profession-test/']