In [7]:
import pandas as pd
import re
from pymystem3 import Mystem

pd.options.mode.chained_assignment = None

## Обработка таксономии

In [8]:
# Load the first six sheets of the labour-market taxonomy workbook and stack
# them into one frame with uniform column names.
column_names = ['direction', 'type', 'name', 'percent']
# Read all sheets first and concatenate once. Growing the frame with
# pd.concat inside the loop re-copies every row on each pass, and seeding it
# with an empty frame can leave the columns as object dtype.
skills_sheets = [
    pd.read_excel('Таксономии на основе анализа рынка труда.xlsx',
                  sheet_name=i, names=column_names, usecols=[0, 1, 2, 3])
    for i in range(0, 6)
]
# The per-sheet row labels are kept as-is (no ignore_index), exactly like the
# incremental concat did.
data_skills = pd.concat(skills_sheets)

In [9]:
def is_percent(percent):
    """Return ``percent`` expressed as a fraction.

    A value greater than 1 is treated as a whole percentage and divided by
    100; any other value is returned unchanged.
    """
    return percent / 100 if percent > 1 else percent
    
# Bring every 'percent' value onto the fraction scale (values above 1 are divided by 100).
data_skills['percent'] = data_skills['percent'].apply(is_percent)

In [10]:
# Keep only rows whose 'percent' is at least 0.1 and renumber them from 0.
data_skills = data_skills.query('percent >= 0.1').reset_index(drop=True)

In [11]:
def remove_multiple_spaces(text):
    """Collapse every run of whitespace in ``text`` into a single space."""
    whitespace_run = re.compile(r'\s+', flags=re.I)
    return whitespace_run.sub(' ', text)

In [12]:
# 'evo_name' is the working copy of the skill name that the following cells
# normalize step by step; 'name' itself is left untouched.
data_skills['evo_name'] = data_skills['name'].str.lower()

In [13]:
# Characters that remove_punctuation() turns into a space. '+' and '#' are
# not in the list, so tokens such as "c++" or "c#" pass through unchanged.
_PUNCTUATION_CHARS = '\\!./^:_;`,-=()$%&@[]~‚’➢•●\'"«»'
_PUNCTUATION_TABLE = str.maketrans(_PUNCTUATION_CHARS,
                                   ' ' * len(_PUNCTUATION_CHARS))


def remove_punctuation(text):
    """Replace each listed punctuation or bullet character with a space.

    The replacement is one-for-one, so the length of the string does not
    change and runs of whitespace are not collapsed.

    Args:
        text: string to clean.

    Returns:
        The cleaned string.
    """
    # One translate() pass instead of thirty chained str.replace() calls.
    return text.translate(_PUNCTUATION_TABLE)

# Blank out punctuation in the lower-cased skill names.
data_skills['evo_name'] = data_skills['evo_name'].apply(remove_punctuation)

In [14]:
# Trim the leading/trailing spaces left behind by punctuation removal.
# The vectorized .str accessor replaces the per-row lambda and passes
# missing values through instead of raising on them.
data_skills['evo_name'] = data_skills['evo_name'].str.strip()

In [15]:
def lemmatize(text):
    """Lemmatize a collection of strings with a single Mystem call.

    The strings are joined with the ``&&`` separator, lemmatized as one
    document and split apart again, so the inputs must not contain ``&&``
    themselves.

    Args:
        text: iterable of strings (for example a pandas Series).

    Returns:
        List of lower-cased lemmatized strings. It is expected to have one
        entry per input string, in the same order, provided Mystem passes
        the separator through unchanged.
    """
    items = list(text)
    if not items:
        # Joining nothing and splitting again would yield [''] rather than [].
        return []
    analyzer = Mystem()
    lemmas = analyzer.lemmatize("&&".join(items))
    # Mystem terminates its output with a "\n" token. Without removing it the
    # last returned string ends with a newline and no longer equals its
    # lemmatized form.
    lemmatized = "".join(lemmas).rstrip('\n')
    return lemmatized.lower().split('&&')

# Replace the skill names with their lemmatized form. The list returned by
# lemmatize() must have exactly one entry per row for this assignment to work.
data_skills['evo_name'] = lemmatize(data_skills['evo_name'])

In [16]:
# Zero-based index of the most recent basic() call (-1 before the first one).
cnt = -1


def basic(text, text_cell):
    """Count whole-token occurrences of a skill inside a text.

    Args:
        text: the (lemmatized) skill, possibly several words.
        text_cell: the (lemmatized) text to search.

    Returns:
        Number of non-overlapping occurrences of ``text`` in ``text_cell``
        that are not glued to a word character on either side.
    """
    global cnt
    cnt += 1
    words = text.split()
    if not words:
        # An empty skill would otherwise match at every word boundary.
        return 0
    # re.escape makes regex metacharacters in the skill literal ("c++").
    # Words are joined with \s+ so that extra spaces or a trailing newline in
    # either string do not prevent a match.
    skill_pattern = r'\s+'.join(re.escape(word) for word in words)
    # Look-arounds instead of consuming \W on both sides: a match at the very
    # start or end of the text counts, and two adjacent occurrences no longer
    # compete for the single separator between them.
    reg_exp = r'(?<!\w)' + skill_pattern + r'(?!\w)'
    return len(re.findall(reg_exp, text_cell))

## Обработка текста ПЦС и расчет метрики по ПЦС

In [17]:
# Load the texts to be scored. Later cells read the 'text', 'id',
# 'title_of_program' and 'provider' columns of this frame.
data_texts = pd.read_csv('file.csv')

In [18]:
# Drop rows without a text (in place); the string helpers applied later
# cannot handle missing values.
data_texts.dropna(subset=['text'], inplace=True)

In [19]:
# texts_sample is an alias of data_texts, not a copy, so the in-place reset
# below renumbers data_texts as well.
texts_sample = data_texts
# For a quick trial run, use a subset instead: data_texts.sample(300, random_state=0)
# Row labels must be 0..n-1 because later cells address texts by position.
texts_sample.reset_index(drop=True, inplace=True)

In [20]:
# Normalize the texts: blank out punctuation first, then collapse whitespace,
# then lower-case. Collapsing whitespace before removing punctuation leaves
# double spaces wherever a punctuation mark sat next to a space
# ("анализ, данных" -> "анализ  данных"), and a multi-word skill written with
# single spaces then fails to match.
cleaned_text = texts_sample['text'].apply(remove_punctuation)
cleaned_text = cleaned_text.apply(remove_multiple_spaces)
texts_sample['text'] = cleaned_text.str.lower()

In [21]:
# Replace each text with its lemmatized form so it is comparable with the
# lemmatized skill names. lemmatize() must return one entry per row.
texts_sample['text'] = lemmatize(texts_sample['text'])

In [22]:
# For every text i, count how often each skill occurs in it and store the
# counts in column 'text_<i>' of data_skills.
occurrence_columns = {}
for i, document in enumerate(texts_sample['text']):
    # Bind the document once per text instead of looking up
    # texts_sample['text'][i] again for every skill.
    occurrence_columns['text_{}'.format(i)] = data_skills['evo_name'].apply(
        lambda skill, document=document: basic(skill, document))
# Attach all count columns with one concat: inserting them one at a time
# fragments the frame once there are many texts. Dropping same-named columns
# first keeps the cell safe to re-run.
data_skills = pd.concat(
    [data_skills.drop(columns=list(occurrence_columns), errors='ignore'),
     pd.DataFrame(occurrence_columns, index=data_skills.index)],
    axis=1)

In [23]:
# Direction name -> lemmatized skill that receives the boosted weight.
primary_skills = {'ML':'машинный обучение', 'AR/VR':'виртуальный реальность', 'аналитик данных':'анализ данный', 
               'Специалист по обласным вычислениям и распределенным системам':'облачный система', 
               'геймдизайнер':'геймдизайн', 'Образовательный дата-инженер':'big data'}

# Skill type (taxonomy column 'type') -> weight of that type.
instr_dict = {'Инструменты': 4, 'Области знаний': 2, 'Софт скилл': 1, 'Хард скилл': 1.5, 'Действия/задачи': 3,
              'Действия/задачи;Хард скилл': 3, 'Функции/задачи': 3}

# Multiplier for rows whose lemmatized name is one of the primary skills.
PRIMARY_SKILL_WEIGHT = 23

# Types that are missing from instr_dict get weight 0, as before. A direct
# map replaces the chained `data_skills[col].loc[mask] = value` form, which
# writes to a temporary object under pandas Copy-on-Write and would silently
# leave the column unchanged.
data_skills['weight_instr'] = data_skills['type'].map(instr_dict).fillna(0)

data_skills['primary_skills'] = 1
is_primary = data_skills['evo_name'].isin(list(primary_skills.values()))
data_skills.loc[is_primary, 'primary_skills'] = PRIMARY_SKILL_WEIGHT

In [24]:
# 'name_y' = 1 / (number of rows sharing this skill name), so a skill listed
# under several directions counts proportionally less. The column keeps the
# name the scoring cell reads. Computing it with map() does not depend on how
# merge()/value_counts() label their result, which changed in pandas 2.0, and
# it leaves the 'name' column and the row order untouched.
name_counts = data_skills['name'].value_counts()
data_skills['name_y'] = 1 / data_skills['name'].map(name_counts)

In [25]:
# Score every text against every direction and store the scores in a column
# of texts_sample named after the direction:
#   score = sum over the direction's skills of
#           (occurrences * percent * weight_instr * primary_skills * name_y)
#           + share of the direction's skills that occur at least once.
text_columns = ['text_{}'.format(i) for i in range(len(texts_sample))]
for key in primary_skills:
    direction_skills = data_skills[data_skills['direction'] == key]
    if direction_skills.empty:
        # The share below divides by the number of skills; fail with a clear
        # message instead of a bare ZeroDivisionError.
        raise ValueError('no skills found for direction {!r}'.format(key))
    hits = direction_skills[text_columns]
    skill_weight = (direction_skills['percent'] *
                    direction_skills['weight_instr'] *
                    direction_skills['primary_skills'] *
                    direction_skills['name_y']).astype(float)
    # Multiply row-wise by position (plain array), then sum per text column.
    weighted_hits = hits.mul(skill_weight.to_numpy(), axis=0).sum()
    recall_w = hits.ne(0).sum() / len(direction_skills)
    # Scores are ordered like text_columns, i.e. by text position.
    texts_sample[key] = (weighted_hits + recall_w).to_numpy()

In [29]:
# For each direction take the best-scoring texts and stack them into one
# table with a 'category' column naming the direction.
TOP_N = 5  # rows kept per direction
top_per_direction = []
for key in primary_skills:
    top_rows = (texts_sample[['id', 'title_of_program', key, 'provider']]
                .sort_values(by=key, ascending=False)
                .head(TOP_N)
                .assign(category=key)   # returns a new frame, no write to a slice
                .drop(columns=key))
    top_per_direction.append(top_rows)
# Concatenate once instead of growing the frame inside the loop.
fin_course = pd.concat(top_per_direction)

In [439]:
# Save the per-direction top list. The frame's index (row labels taken from
# texts_sample) is written as the first CSV column.
fin_course.to_csv('fin_course.csv')

## Обработка текста ВУЗов и расчет метрики по ВУЗам

In [453]:
# Reload the taxonomy from scratch for the university run, so none of the
# per-text columns computed earlier carry over.
column_names = ['direction', 'type', 'name', 'percent']
# Read all six sheets first and concatenate once. Growing the frame with
# pd.concat inside the loop re-copies every row on each pass, and seeding it
# with an empty frame can leave the columns as object dtype.
skills_sheets = [
    pd.read_excel('Таксономии на основе анализа рынка труда.xlsx',
                  sheet_name=i, names=column_names, usecols=[0, 1, 2, 3])
    for i in range(0, 6)
]
data_skills = pd.concat(skills_sheets)

In [454]:
# Same skill preparation as for the course run: put 'percent' on the fraction
# scale, keep rows with percent >= 0.1, then build the normalized 'evo_name'.
data_skills['percent'] = data_skills['percent'].apply(is_percent)
data_skills = data_skills.query('percent >= 0.1').reset_index(drop=True)
normalized_names = (data_skills['name']
                    .str.lower()
                    .apply(remove_punctuation)
                    .apply(lambda x: x.strip()))
data_skills['evo_name'] = lemmatize(normalized_names)

In [455]:
# Load the university programme texts. Later cells read the 'text', 'id',
# 'title_of_program' and 'provider' columns of this frame.
data_univer = pd.read_csv('universities.csv')

In [456]:
# Drop programmes without a text before any string processing, as the course
# run does for data_texts: the regex and str helpers applied next raise on NaN.
data_univer.dropna(subset=['text'], inplace=True)
# texts_sample is an alias of data_univer, not a copy, so the in-place reset
# below renumbers data_univer as well.
texts_sample = data_univer
# Row labels must be 0..n-1 because later cells address texts by position.
texts_sample.reset_index(drop=True, inplace=True)

In [457]:
# Normalize the texts: blank out punctuation first, then collapse whitespace,
# then lower-case. Collapsing whitespace before removing punctuation leaves
# double spaces wherever a punctuation mark sat next to a space, and a
# multi-word skill written with single spaces then fails to match.
cleaned_text = texts_sample['text'].apply(remove_punctuation)
cleaned_text = cleaned_text.apply(remove_multiple_spaces)
texts_sample['text'] = cleaned_text.str.lower()

In [458]:
# Replace each text with its lemmatized form so it is comparable with the
# lemmatized skill names. lemmatize() must return one entry per row.
texts_sample['text'] = lemmatize(texts_sample['text'])

In [459]:
# For every text i, count how often each skill occurs in it and store the
# counts in column 'text_<i>' of data_skills.
occurrence_columns = {}
for i, document in enumerate(texts_sample['text']):
    # Bind the document once per text instead of looking up
    # texts_sample['text'][i] again for every skill.
    occurrence_columns['text_{}'.format(i)] = data_skills['evo_name'].apply(
        lambda skill, document=document: basic(skill, document))
# Attach all count columns with one concat: inserting them one at a time
# fragments the frame once there are many texts. Dropping same-named columns
# first keeps the cell safe to re-run.
data_skills = pd.concat(
    [data_skills.drop(columns=list(occurrence_columns), errors='ignore'),
     pd.DataFrame(occurrence_columns, index=data_skills.index)],
    axis=1)

In [460]:
# Direction name -> lemmatized skill that receives the boosted weight.
primary_skills = {'ML':'машинный обучение', 'AR/VR':'виртуальный реальность', 'аналитик данных':'анализ данный', 
               'Специалист по обласным вычислениям и распределенным системам':'облачный система', 
               'геймдизайнер':'геймдизайн', 'Образовательный дата-инженер':'big data'}

# Skill type (taxonomy column 'type') -> weight of that type.
instr_dict = {'Инструменты': 4, 'Области знаний': 2, 'Софт скилл': 1, 'Хард скилл': 1.5, 'Действия/задачи': 3,
              'Действия/задачи;Хард скилл': 3, 'Функции/задачи': 3}

# Multiplier for rows whose lemmatized name is one of the primary skills.
PRIMARY_SKILL_WEIGHT = 23

# Types that are missing from instr_dict get weight 0, as before. A direct
# map replaces the chained `data_skills[col].loc[mask] = value` form, which
# writes to a temporary object under pandas Copy-on-Write and would silently
# leave the column unchanged.
data_skills['weight_instr'] = data_skills['type'].map(instr_dict).fillna(0)

data_skills['primary_skills'] = 1
is_primary = data_skills['evo_name'].isin(list(primary_skills.values()))
data_skills.loc[is_primary, 'primary_skills'] = PRIMARY_SKILL_WEIGHT

In [461]:
# 'name_y' = 1 / (number of rows sharing this skill name), so a skill listed
# under several directions counts proportionally less. The column keeps the
# name the scoring cell reads. Computing it with map() does not depend on how
# merge()/value_counts() label their result, which changed in pandas 2.0, and
# it leaves the 'name' column and the row order untouched.
name_counts = data_skills['name'].value_counts()
data_skills['name_y'] = 1 / data_skills['name'].map(name_counts)

In [463]:
# Score every text against every direction and store the scores in a column
# of texts_sample named after the direction:
#   score = sum over the direction's skills of
#           (occurrences * percent * weight_instr * primary_skills * name_y)
#           + share of the direction's skills that occur at least once.
text_columns = ['text_{}'.format(i) for i in range(len(texts_sample))]
for key in primary_skills:
    direction_skills = data_skills[data_skills['direction'] == key]
    if direction_skills.empty:
        # The share below divides by the number of skills; fail with a clear
        # message instead of a bare ZeroDivisionError.
        raise ValueError('no skills found for direction {!r}'.format(key))
    hits = direction_skills[text_columns]
    skill_weight = (direction_skills['percent'] *
                    direction_skills['weight_instr'] *
                    direction_skills['primary_skills'] *
                    direction_skills['name_y']).astype(float)
    # Multiply row-wise by position (plain array), then sum per text column.
    weighted_hits = hits.mul(skill_weight.to_numpy(), axis=0).sum()
    recall_w = hits.ne(0).sum() / len(direction_skills)
    # Scores are ordered like text_columns, i.e. by text position.
    texts_sample[key] = (weighted_hits + recall_w).to_numpy()

In [464]:
# For each direction take the best-scoring texts and stack them into one
# table with a 'category' column naming the direction.
TOP_N = 5  # rows kept per direction
top_per_direction = []
for key in primary_skills:
    top_rows = (texts_sample[['id', 'title_of_program', key, 'provider']]
                .sort_values(by=key, ascending=False)
                .head(TOP_N)
                .assign(category=key)   # returns a new frame, no write to a slice
                .drop(columns=key))
    top_per_direction.append(top_rows)
# Concatenate once instead of growing the frame inside the loop.
fin_course = pd.concat(top_per_direction)

In [471]:
# Save the per-direction top list for universities. The frame's index (row
# labels taken from texts_sample) is written as the first CSV column.
fin_course.to_csv('fin_vuz.csv')