## Text classification and information extraction

In [1]:
import pandas as pd
import numpy as np

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span, Doc, DocBin

import re
import os
from IPython.display import clear_output

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

import time

import ipywidgets as widgets
from ipywidgets import IntProgress, Label
from IPython.display import display

# this is import of my module which can help to process text and use NLP-technologies to get the result we needed
import nk_nlp1_5
from nk_nlp1_5 import TextPreprocessing, Categorizator

pd.set_option('display.max_row', 1000)
pd.set_option('display.max_column', 100)
pd.set_option('display.max_colwidth', None)

In [2]:
# initializing pretrained models
nlp = spacy.load('ru_core_news_lg')
nlp_en = spacy.load('en_core_web_lg')

In [3]:
# loading predefined list of categories
main_cats = pd.read_excel('main_cats.xlsx')

In [4]:
main_cats

Unnamed: 0,Categories,Cats_en
0,"Компьютерные игры, онлайн игры, игровые приставки","Computer games, MMORPG, games consoles"
1,Языки программирования,Programming languages
2,"Акции, инвестиционные возможности, вложение денег","Stocks, investment opportunities, investing money"
3,"Информационные технологии, программирование","IT, programming"
4,"Заработок в интернете, онлайн доход",Earning money on the Internet
5,"Акции, биржи, банки, ценные бумаги","Stocks, exchanges, banking, securities"
6,"Азарт, казино, ставки, букмекеры","Excitement, casino, betting, bookmaker"
7,"Эмиграция, релокация, переезд в другую страну, получение визы","Emigration, relocation, moving to another country"
8,"Политика, геополитика, экономика","Politics, geopolitics, economics, economy"
9,Спорт и фитнес,Sports and fitness


In [5]:
# loading source dataset
df = pd.read_excel('text_proc_df.xlsx')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15123 entries, 0 to 15122
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   carrier         2761 non-null   object
 1   company_name    3429 non-null   object
 2   groups          8155 non-null   object
 3   interests_info  9324 non-null   object
 4   guid            15123 non-null  object
dtypes: object(5)
memory usage: 590.9+ KB


In [7]:
# examples of raw career data
df[df['carrier'].notna()]['carrier'].tail(5)

15098    Инженер-программист – АО "Научно-производственный комлекс "Дедал" ГК "Росатом"
15099                                               Senior DevOps Engineer – XIMI group
15101                                      Государственный Университет Управления (ГУУ)
15109                                                     Developer Relations / Tech PR
15113                                                         Sales Manager – Sports.ru
Name: carrier, dtype: object

In [8]:
# examples of raw interest data
df[df['interests_info'].notna()]['interests_info'].tail(5)

15112    ["IT","физтех мфти","база данных","язык программирования","tensorflow","python язык","боты","машинное обучение","посты дпс/гибдд","open source","университеты вузы","sql язык","анализ данных data science","open data science ods","университет информационных технологий механики и оптики итмо","калининград г","искусственный интеллект","нейронные сети","deep learning school dls dlschool.org"]
15113                                                                                                                                  ["одежда","компьютерные игры","кинопаб","покупка билетов","киберспорт cybersport","объявления","chelsea фк челси","украшения","наручные часы","футбол","аренда","аудио-звуковые системы","скидки","виниловые пластинки","чемпионат","обувь","онлайн кинотеатр","фанаты"]
15115                                                                                                                                                                                                   

## Categorization clients by their interests

This part of the notebook is dedicated clients categorization using information about their interests in the certain text column.  
This task is a kind of text classification tasks.

See the pipeline below:  
<img src="./pictures/interest_categorization.png" width=1500>

### Processing Russian text and building a mapping dict

#### Cleaning text

In [9]:
# This class combines several useful methods for text cleaning and preprocessing
# See class description for details

tp = TextPreprocessing(text_col=df['interests_info'].str.lower(), nlp=nlp)

In [10]:
tp.nlp

<spacy.lang.ru.Russian at 0x21aa3118590>

In [11]:
# text preprocessing

## remaining only russian text and the specified chars
tp.extract(r'[А-Яа-яЁё \-/\\]', sep_for_tokens=',')

## replacing the specified chars with the space
tp.replace(r'[/\\]|\s+-\s*|\s*-\s+', repl=' ', sep_for_tokens=',');
#

In [12]:
# special cleaning
## deleting entire patterns, containing certain string
tp.replace(r'\A(.*\s|)(жк|фк|г|мкр|рп)(\s.*|)\Z', sep_for_tokens=',')

## deleting certain string from patterns
tp.replace(r'(?:\A|(?<=\s))(?:уже более года|частенько)(?:(?=\s)|\Z)', sep_for_tokens=',')

## deleting patterns, containing only 1 or 2 chars
tp.replace(r'\A\s*.{1,2}\s*\Z', sep_for_tokens=',')

## deleting extra spaces
tp.replace(r'\A\s+|\s+\Z|(?<=\s)(\s+)', sep_for_tokens=',');

In [13]:
tp.textcol_mod[tp.textcol_mod.notna()].tail()

15112    физтех мфти,база данных,язык программирования,язык,боты,машинное обучение,посты дпс гибдд,университеты вузы,язык,анализ данных,университет информационных технологий механики и оптики итмо,искусственный интеллект,нейронные сети
15113                                одежда,компьютерные игры,кинопаб,покупка билетов,киберспорт,объявления,украшения,наручные часы,футбол,аренда,аудио-звуковые системы,скидки,виниловые пластинки,чемпионат,обувь,онлайн кинотеатр,фанаты
15115                                                                                                                         московский политех,навигационные приложения,москва,боты,книги,университеты вузы,мобильные приложения,студенты
15117                                                                                                                                                                      объявления,трейдинг,вакансии,албания,книги,боты,фрилансеры,китай
15121                                                   

#### Extracting unique patterns list

In [14]:
# getting unique patterns from source texts

tp.get_uniquetokens();

Final list length: 2873


In [15]:
# deleting patterns that connected with geographical names and locations using the special method
tp.clear_from_label(labels=['LOC']);

HBox(children=(IntProgress(value=0, description='Progress: ', max=2873), Label(value='0')))

Unique tokens: 2873 => 2595


In [16]:
# checking the result pattern list
tp.unique_tokens

0                                      получение визы
1                                          грин карта
2                                                бали
3                                          математика
4                                             протест
                            ...                      
2590                                       штукатурка
2591    компьютерная графика около года моделирование
2592                                   моржи плавание
2593                                         кирово с
2594                                  домик на дереве
Name: result, Length: 2595, dtype: object

In [17]:
# vectorization. Tokens without vectors are deleting.
tp.vect();

HBox(children=(IntProgress(value=0, description='Progress:', max=2595), Label(value='0')))

Unique tokens: 2595 => 2221


#### Create mapping dict

In [18]:
%%time
# This class searches for bindings between patterns we obtained and the list of categories we defined before using similarity (cosine distance).
# It also calculates a number of mentions in the source texts for every pattern to understand popularity of this patterns
# See class description for details

cat_tor = Categorizator(nlp, cat_list=main_cats.iloc[:, 0], pattern_list=tp.unique_tokens, quoting=tp.textcol_mod, only_w_vector=True)

Starting NLP-processing for cat_list


HBox(children=(IntProgress(value=0, description='Progress: ', max=66), Label(value='0')))

cat_list processed

Categories without vectors: Series([], Name: 0, dtype: object)
Starting NLP-processing for pattern_list


HBox(children=(IntProgress(value=0, description='Progress: ', max=2221), Label(value='0')))

pattern_list processed

Starting quotes counting...


HBox(children=(IntProgress(value=0, description='Progress:', max=2221), Label(value='0')))

Successfully.
CPU times: total: 1min 17s
Wall time: 1min 16s


In [19]:
cat_tor.cat_list.head()

0    Компьютерные игры, онлайн игры, игровые приставки
1                               Языки программирования
2    Акции, инвестиционные возможности, вложение денег
3          Информационные технологии, программирование
4                  Заработок в интернете, онлайн доход
Name: 0, dtype: object

In [20]:
cat_tor.pattern_list.head()

0    получение визы
1        грин карта
2              бали
3        математика
4           протест
Name: 0, dtype: object

In [21]:
cat_tor.quoting_data.head()

Unnamed: 0_level_0,number of quotes,quotes ratio
0,Unnamed: 1_level_1,Unnamed: 2_level_1
получение визы,308.0,0.033033
грин карта,30.0,0.003218
бали,251.0,0.02692
математика,59.0,0.006328
протест,416.0,0.044616


In [22]:
# starting bindings search
ru_dict = cat_tor.patternsim_all(threshold=0.05, count_thres=3)

Using preprocessed cat_list
Using preprocessed pattern_list
Calculating similarity...


HBox(children=(IntProgress(value=0, description='Progress:', max=2221), Label(value='0')))

Successfully.
Starting quotes counting...
Using preprocessed quoting data


In [23]:
# the result is a table with data of the similarity between patterns and categories. By default data is sorted by number of quotes (mentions in the text).
# This way we can pay attention first of all to the most popular patterns.

ru_dict

Unnamed: 0,patterns,categories,similarity,number of quotes
53,объявления,"Поиск работы, вакансии",0.319074,2948.0
5,объявления,"Акции, биржи, банки, ценные бумаги",0.307715,2948.0
35,объявления,События и новости мира,0.305004,2948.0
51,трейдинг,"Онлайн торговля, маркетплейсы, электронная коммерция",0.276952,2518.0
64,трейдинг,"Вэб трансляции, стриминг, блогеры, ютуб каналы",0.255523,2518.0
...,...,...,...,...
64,янино пгт ло,"Вэб трансляции, стриминг, блогеры, ютуб каналы",0.097309,1.0
0,янино пгт ло,"Компьютерные игры, онлайн игры, игровые приставки",0.078020,1.0
65,ясновидение,"Йога, медитации, энергетические практики",0.263276,1.0
30,ясновидение,Психология и саморазвитие,0.249220,1.0


In [25]:
# saving result dict (the file has already saved)
#ru_dict.to_excel('ru_dict.xlsx')

### Processing English text and building a mapping dict

Doing all those stages for English words

#### Cleaning text

In [24]:
tp_en = TextPreprocessing(text_col=df['interests_info'].str.lower(), nlp=nlp_en)

In [25]:
tp_en.nlp

<spacy.lang.en.English at 0x21aaa7dc610>

In [26]:
# text preprocessing

## replacing emails
tp_en.replace(r'\S*@\S+', sep_for_tokens=',')

## replacing urls

tp_en.replace(r'\S+[.]\S+', sep_for_tokens=',')

## remaining only english text and the specified chars
tp_en.extract(r'[A-Za-z \-/\\]', sep_for_tokens=',')

## replacing the specified chars with the space
tp_en.replace(r'[/\\]|\s+-\s*|\s*-\s+', repl=' ', sep_for_tokens=',');
#

In [27]:
# special cleaning

## deleting patterns, containing only 1 chars
tp_en.replace(r'\A\s*.\s*\Z', sep_for_tokens=',')

## deleting patterns, containing '--'
tp_en.replace(r'\A\s*--\s*\Z', sep_for_tokens=',')

## deleting extra spaces
tp_en.replace(r'\A\s+|\s+\Z|(?<=\s)(\s+)', sep_for_tokens=',');

In [28]:
tp_en.textcol_mod[tp_en.textcol_mod.notna()].head()

1                                                                                                                                                                                 green card
7                                                                                                                                                                                         it
10    tarantool,mongodb nosql,clouds,linux,hyip,amazon web services aws,javascript js,frontend,scylladb,os,react native,cassandra,devops,react,marketpalce,nosql,clickhouse,amazon,ubuntu,it
11                                                                                                                                                                       nft,surfing,allunic
12                                                                                                                                                                                          
Name: interests_info, dtype: object

#### Extracting unique patterns list

In [29]:
tp_en.get_uniquetokens();

Final list length: 1078


In [30]:
tp_en.vect();

HBox(children=(IntProgress(value=0, description='Progress:', max=1078), Label(value='0')))

Unique tokens: 1078 => 747


#### Create mapping dict

In [31]:
%%time
cat_tor_en = Categorizator(nlp_en, cat_list=main_cats.iloc[:, 1], pattern_list=tp_en.unique_tokens, quoting=tp_en.textcol_mod, only_w_vector=True)

Starting NLP-processing for cat_list


HBox(children=(IntProgress(value=0, description='Progress: ', max=66), Label(value='0')))

cat_list processed

Categories without vectors: Series([], Name: 0, dtype: object)
Starting NLP-processing for pattern_list


HBox(children=(IntProgress(value=0, description='Progress: ', max=747), Label(value='0')))

pattern_list processed

Starting quotes counting...


HBox(children=(IntProgress(value=0, description='Progress:', max=747), Label(value='0')))

Successfully.
CPU times: total: 10.3 s
Wall time: 9.95 s


In [32]:
cat_tor_en.cat_list.head()

0               Computer games, MMORPG, games consoles
1                                Programming languages
2    Stocks, investment opportunities, investing money
3                                      IT, programming
4                        Earning money on the Internet
Name: 0, dtype: object

In [33]:
cat_tor_en.pattern_list.head()

0       green card
1               it
3    mongodb nosql
4           clouds
5            linux
Name: 0, dtype: object

In [34]:
cat_tor_en.quoting_data.head()

Unnamed: 0_level_0,number of quotes,quotes ratio
0,Unnamed: 1_level_1,Unnamed: 2_level_1
green card,30.0,0.003218
it,2072.0,0.222222
mongodb nosql,8.0,0.000858
clouds,49.0,0.005255
linux,151.0,0.016195


In [35]:
en_dict = cat_tor_en.patternsim_all(count_thres=3)

Using preprocessed cat_list
Using preprocessed pattern_list
Calculating similarity...


HBox(children=(IntProgress(value=0, description='Progress:', max=747), Label(value='0')))

Successfully.
Starting quotes counting...
Using preprocessed quoting data


In [38]:
#en_dict.to_excel('en_dict.xlsx')

### Mapping

In [36]:
# loading the final verified dict from Excel file
total_dict_fin = pd.read_excel('total_dict_fin.xlsx')

In [37]:
total_dict_fin

Unnamed: 0,patterns,categories,similarity,number_of_quotes
0,объявления,"Job search, hh, hr,",0.319074,2948
1,объявления,"Stocks, exchanges, banking, securities",0.307715,2948
2,объявления,Earning money on the Internet,0.261601,2948
3,объявления,"Travel companions, group trips",0.261384,2948
4,трейдинг,"Stocks, exchanges, banking, securities",0.276952,2518
5,it,"IT, programming",0.794361,2072
6,фондовый рынок,Finance and investments,0.437667,1961
7,фондовый рынок,"Stocks, investment opportunities, investing money",0.430579,1961
8,фондовый рынок,"Stocks, exchanges, banking, securities",0.427375,1961
9,книги,Literature and books,0.817532,1079


In [38]:
# concatenating preprocessed text columns for Russian and English

tp.textcol_mod = tp.textcol_mod.str.cat(tp_en.textcol_mod, sep=',')

In [39]:
tp.textcol_mod[tp.textcol_mod.notna()].head()

1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   египет,получение визы,сша,грин карта,индонезия,грузия,бали,шарм-эль-шейх,green card
7                                                                                                                                                                                                                                                                                                                                                                                                                                               

#### Getting results

Here we use a special method `.map_all` to map preprocessed text rows we've seen above with patterns from the dictionary we've built before.  
The result depends on the parameter `mode`. There are three variants of this parameter:  
1. `binary` - puts True/False to the categories columns in front of the corresponding values of the column "interest_info"
2. `quantity` - puts the quantity of found patterns in the same cells
3. `patterns` - puts the pattern lists in the same cases.

#### Getting the result as bool

In [40]:
interests_bin = tp.map_all(dict_df=total_dict_fin, mode='binary')

HBox(children=(IntProgress(value=0, description='Progress:', max=63), Label(value='0')))

In [41]:
interests_bin.insert(0, 'interests_info', df['interests_info'])

In [42]:
interests_bin[(interests_bin==True).any(axis=1)].head(10)

Unnamed: 0,interests_info,"Job search, hh, hr,","Stocks, exchanges, banking, securities",Earning money on the Internet,"Travel companions, group trips","IT, programming",Finance and investments,"Stocks, investment opportunities, investing money",Literature and books,"Politics, geopolitics, economics, economy","Marketplace, online sales, e-commerce",Marketing and advertising,"Streaming, vlog, blog, youtube channels",Languages and linguistics,"Real estate rental, property rental","Computer games, MMORPG, games consoles",Games and gaming,"Military affairs, contract service",Events and news of the world,"Purchases, discount, sales","Fashion, style, beauty",Fashion and shopping,Business and entrepreneurship,Family and children,Sex and relationships,"Charity, donations, fundraising",Design and architecture,Innovative startups and projects,Technology and innovation,"Computers, laptops, parts","Management, administration, control","IT courses, programming courses",Law and legislation,"Education, educational platforms, high school, trainings",Gadgets and electronics,Sports and fitness,Sports competitions and championships,Active recreation and extreme sports,"Emigration, relocation, moving to another country","Humor, entertainment, clubs, parties",Photography and videography,Psychology and psychotherapy,Psychology and self-development,"Excitement, casino, betting, bookmaker",Science and Research,"Analytics, data analysis, data research",Religion and spirituality,Philosophy and Ethics,Travel and tourism,"Creativity, handmade, craft",Renovation and construction,Celebrities,Programming languages,Pets and pets,"Yoga, meditation, energy practices",Social networks and communications,History and archaeology,Health and medicine,Ecology and nature conservation,Cooking and recipes,Movies and TV series,Cars and motorcycles,The film industry and cinematography,Music and concerts
1,"[""египет"",""тбилиси г"",""получение визы"",""сша"",""green card грин карта"",""индонезия"",""грузия"",""бали"",""шарм-эль-шейх""]",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7,"[""математика"",""протест"",""книги"",""белоруссия"",""оппозиция"",""программирование"",""физика"",""IT"",""канада"",""белорусы"",""гомель г"",""выборы""]",False,False,False,False,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
10,"[""tarantool"",""mongodb nosql"",""облачные решение (clouds)"",""работа с данными"",""linux линукс"",""школота"",""hyip рисковые проекты"",""amazon web services aws"",""javascript js язык"",""фреймворк"",""фронтэнд frontend"",""вакансии"",""компьютерные игры"",""стокгольм"",""scylladb"",""операционные системы os"",""react native"",""cassandra"",""подготовка к тестам"",""devops"",""хранение данных"",""приставки"",""react"",""машинное обучение"",""маркетплейс платформа marketpalce"",""швеция"",""nosql"",""консоли"",""магазин"",""clickhouse"",""университеты вузы"",""amazon"",""база данных"",""node.js"",""криптовалюта"",""задачи"",""нижний новгород"",""финляндия"",""студенты"",""большие данные"",""ubuntu"",""IT"",""язык программирования""]",True,False,True,False,True,False,True,False,False,True,True,False,True,False,True,True,False,False,False,False,False,True,False,False,False,False,True,True,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
11,"[""nft токены"",""фитнес"",""убуд г"",""сёрфинг surfing серфинг"",""стокгольм"",""школа свободных наук"",""веб трансляции стриминг"",""украина"",""шведский язык"",""бали"",""платформа на блокчейн"",""швеция"",""попутчики"",""аренда"",""треш стримы"",""криптовалюта"",""allunic / x100invest.com"",""настольные игры"",""объявления"",""купи-продай"",""пирамида финансовая"",""индонезия"",""калининград г""]",True,True,True,True,False,True,True,False,True,True,True,True,True,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
12,"[""нижний новгород"",""паб бар кафе"",""обменник валюты фиат"",""санкт-петербург""]",False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
126,"[""продавцы"",""маркетплейс платформа marketpalce"",""электронная коммерция""]",False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
128,"[""голодание"",""индуизм"",""армения"",""северный кипр"",""кришнаизм"",""ереван г"",""эвакуация сограждан"",""альтернативная медицина"",""внж"",""бали"",""тантра"",""госуслуги мфц"",""кипр"",""фондовый рынок"",""медитации"",""индия"",""получение визы"",""авиабилеты"",""трейдинг"",""гоа"",""индонезия""]",False,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
130,"[""data engineering"",""драки единоборства"",""сбор средств"",""поддержка армии рф"",""мтс"",""рязань"",""clickhouse"",""работа с данными"",""большие данные"",""IT"",""сотовая связь"",""компьютерные игры""]",False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
131,"[""IT""]",False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
132,"[""магазин"",""тверь""]",False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


#### Getting the result as quantity of found patterns

In [43]:
interests_quant = tp.map_all(dict_df=total_dict_fin, mode='quantity')

HBox(children=(IntProgress(value=0, description='Progress:', max=63), Label(value='0')))

In [44]:
interests_quant.insert(0, 'interests_info', df['interests_info'])

In [45]:
interests_quant[(interests_quant.iloc[:, 1:]>0).any(axis=1)].head()

Unnamed: 0,interests_info,"Job search, hh, hr,","Stocks, exchanges, banking, securities",Earning money on the Internet,"Travel companions, group trips","IT, programming",Finance and investments,"Stocks, investment opportunities, investing money",Literature and books,"Politics, geopolitics, economics, economy","Marketplace, online sales, e-commerce",Marketing and advertising,"Streaming, vlog, blog, youtube channels",Languages and linguistics,"Real estate rental, property rental","Computer games, MMORPG, games consoles",Games and gaming,"Military affairs, contract service",Events and news of the world,"Purchases, discount, sales","Fashion, style, beauty",Fashion and shopping,Business and entrepreneurship,Family and children,Sex and relationships,"Charity, donations, fundraising",Design and architecture,Innovative startups and projects,Technology and innovation,"Computers, laptops, parts","Management, administration, control","IT courses, programming courses",Law and legislation,"Education, educational platforms, high school, trainings",Gadgets and electronics,Sports and fitness,Sports competitions and championships,Active recreation and extreme sports,"Emigration, relocation, moving to another country","Humor, entertainment, clubs, parties",Photography and videography,Psychology and psychotherapy,Psychology and self-development,"Excitement, casino, betting, bookmaker",Science and Research,"Analytics, data analysis, data research",Religion and spirituality,Philosophy and Ethics,Travel and tourism,"Creativity, handmade, craft",Renovation and construction,Celebrities,Programming languages,Pets and pets,"Yoga, meditation, energy practices",Social networks and communications,History and archaeology,Health and medicine,Ecology and nature conservation,Cooking and recipes,Movies and TV series,Cars and motorcycles,The film industry and cinematography,Music and concerts
1,"[""египет"",""тбилиси г"",""получение визы"",""сша"",""green card грин карта"",""индонезия"",""грузия"",""бали"",""шарм-эль-шейх""]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,"[""математика"",""протест"",""книги"",""белоруссия"",""оппозиция"",""программирование"",""физика"",""IT"",""канада"",""белорусы"",""гомель г"",""выборы""]",0,0,0,0,2,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
10,"[""tarantool"",""mongodb nosql"",""облачные решение (clouds)"",""работа с данными"",""linux линукс"",""школота"",""hyip рисковые проекты"",""amazon web services aws"",""javascript js язык"",""фреймворк"",""фронтэнд frontend"",""вакансии"",""компьютерные игры"",""стокгольм"",""scylladb"",""операционные системы os"",""react native"",""cassandra"",""подготовка к тестам"",""devops"",""хранение данных"",""приставки"",""react"",""машинное обучение"",""маркетплейс платформа marketpalce"",""швеция"",""nosql"",""консоли"",""магазин"",""clickhouse"",""университеты вузы"",""amazon"",""база данных"",""node.js"",""криптовалюта"",""задачи"",""нижний новгород"",""финляндия"",""студенты"",""большие данные"",""ubuntu"",""IT"",""язык программирования""]",1,0,2,0,12,0,2,0,0,2,1,0,1,0,3,1,0,0,0,0,0,1,0,0,0,0,2,1,1,0,1,0,2,0,0,0,0,0,0,0,0,0,0,2,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0
11,"[""nft токены"",""фитнес"",""убуд г"",""сёрфинг surfing серфинг"",""стокгольм"",""школа свободных наук"",""веб трансляции стриминг"",""украина"",""шведский язык"",""бали"",""платформа на блокчейн"",""швеция"",""попутчики"",""аренда"",""треш стримы"",""криптовалюта"",""allunic / x100invest.com"",""настольные игры"",""объявления"",""купи-продай"",""пирамида финансовая"",""индонезия"",""калининград г""]",1,1,2,2,0,1,1,0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12,"[""нижний новгород"",""паб бар кафе"",""обменник валюты фиат"",""санкт-петербург""]",0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Getting the result as found patterns

In [46]:
interests_pats = tp.map_all(dict_df=total_dict_fin, mode='patterns')

HBox(children=(IntProgress(value=0, description='Progress:', max=63), Label(value='0')))

In [47]:
interests_pats.insert(0, 'interests_info', df['interests_info'])

In [48]:
interests_pats[(interests_pats.iloc[:, 1:]!='').any(axis=1)]

Unnamed: 0,interests_info,"Job search, hh, hr,","Stocks, exchanges, banking, securities",Earning money on the Internet,"Travel companions, group trips","IT, programming",Finance and investments,"Stocks, investment opportunities, investing money",Literature and books,"Politics, geopolitics, economics, economy","Marketplace, online sales, e-commerce",Marketing and advertising,"Streaming, vlog, blog, youtube channels",Languages and linguistics,"Real estate rental, property rental","Computer games, MMORPG, games consoles",Games and gaming,"Military affairs, contract service",Events and news of the world,"Purchases, discount, sales","Fashion, style, beauty",Fashion and shopping,Business and entrepreneurship,Family and children,Sex and relationships,"Charity, donations, fundraising",Design and architecture,Innovative startups and projects,Technology and innovation,"Computers, laptops, parts","Management, administration, control","IT courses, programming courses",Law and legislation,"Education, educational platforms, high school, trainings",Gadgets and electronics,Sports and fitness,Sports competitions and championships,Active recreation and extreme sports,"Emigration, relocation, moving to another country","Humor, entertainment, clubs, parties",Photography and videography,Psychology and psychotherapy,Psychology and self-development,"Excitement, casino, betting, bookmaker",Science and Research,"Analytics, data analysis, data research",Religion and spirituality,Philosophy and Ethics,Travel and tourism,"Creativity, handmade, craft",Renovation and construction,Celebrities,Programming languages,Pets and pets,"Yoga, meditation, energy practices",Social networks and communications,History and archaeology,Health and medicine,Ecology and nature conservation,Cooking and recipes,Movies and TV series,Cars and motorcycles,The film industry and cinematography,Music and concerts
1,"[""египет"",""тбилиси г"",""получение визы"",""сша"",""green card грин карта"",""индонезия"",""грузия"",""бали"",""шарм-эль-шейх""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,получение визы,,,,,,,,,,бали,,,,,,,,,,,,,,,
7,"[""математика"",""протест"",""книги"",""белоруссия"",""оппозиция"",""программирование"",""физика"",""IT"",""канада"",""белорусы"",""гомель г"",""выборы""]",,,,,"it,программирование",,,книги,оппозиция,,,,,,,,,,,,,,,протест,,,,,,,программирование,"протест,выборы",,,,,,,,,,,,,,,,,,,,программирование,,,,,,,,,,,
10,"[""tarantool"",""mongodb nosql"",""облачные решение (clouds)"",""работа с данными"",""linux линукс"",""школота"",""hyip рисковые проекты"",""amazon web services aws"",""javascript js язык"",""фреймворк"",""фронтэнд frontend"",""вакансии"",""компьютерные игры"",""стокгольм"",""scylladb"",""операционные системы os"",""react native"",""cassandra"",""подготовка к тестам"",""devops"",""хранение данных"",""приставки"",""react"",""машинное обучение"",""маркетплейс платформа marketpalce"",""швеция"",""nosql"",""консоли"",""магазин"",""clickhouse"",""университеты вузы"",""amazon"",""база данных"",""node.js"",""криптовалюта"",""задачи"",""нижний новгород"",""финляндия"",""студенты"",""большие данные"",""ubuntu"",""IT"",""язык программирования""]",вакансии,,"магазин,hyip",,"it,операционные системы,os,язык программирования,frontend,devops,машинное обучение,linux,javascript js,react,clouds,react native",,"проекты,рисковые проекты",,,"маркетплейс платформа,магазин",hyip,,язык,,"компьютерные игры,приставки,линукс",компьютерные игры,,,,,,магазин,,,,,"проекты,рисковые проекты",проекты,операционные системы,,язык программирования,,"университеты вузы,студенты",,,,,,,,,,,"работа с данными,большие данные","работа с данными,машинное обучение,большие данные",,,,,,,"язык программирования,frontend,javascript js",,,,,,,,,,,
11,"[""nft токены"",""фитнес"",""убуд г"",""сёрфинг surfing серфинг"",""стокгольм"",""школа свободных наук"",""веб трансляции стриминг"",""украина"",""шведский язык"",""бали"",""платформа на блокчейн"",""швеция"",""попутчики"",""аренда"",""треш стримы"",""криптовалюта"",""allunic / x100invest.com"",""настольные игры"",""объявления"",""купи-продай"",""пирамида финансовая"",""индонезия"",""калининград г""]",объявления,объявления,"объявления,веб трансляции стриминг","объявления,попутчики",,пирамида финансовая,платформа на блокчейн,,пирамида финансовая,пирамида финансовая,пирамида финансовая,веб трансляции стриминг,язык,аренда,,настольные игры,,,,,,,,,,,,,,,,,,,"фитнес,surfing",,,попутчики,,,,,,,,,,"попутчики,бали",,,,,,,,,,,,,,,
12,"[""нижний новгород"",""паб бар кафе"",""обменник валюты фиат"",""санкт-петербург""]",,обменник валюты фиат,,,,,обменник валюты фиат,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15112,"[""IT"",""физтех мфти"",""база данных"",""язык программирования"",""tensorflow"",""python язык"",""боты"",""машинное обучение"",""посты дпс/гибдд"",""open source"",""университеты вузы"",""sql язык"",""анализ данных data science"",""open data science ods"",""университет информационных технологий механики и оптики итмо"",""калининград г"",""искусственный интеллект"",""нейронные сети"",""deep learning school dls dlschool.org""]",,,,,"it,боты,язык программирования,python,нейронные сети,машинное обучение,анализ данных,искусственный интеллект",,,,,,,,язык,,,,,,,,,,,,,,,,,,язык программирования,,университеты вузы,,,,,,,,,,,"data science,анализ данных,искусственный интеллект","машинное обучение,data science,анализ данных,искусственный интеллект,sql,open data science ods",,,,,,,"язык программирования,python,sql",,,,,,,,,,,
15113,"[""одежда"",""компьютерные игры"",""кинопаб"",""покупка билетов"",""киберспорт cybersport"",""объявления"",""chelsea фк челси"",""украшения"",""наручные часы"",""футбол"",""аренда"",""аудио-звуковые системы"",""скидки"",""виниловые пластинки"",""чемпионат"",""обувь"",""онлайн кинотеатр"",""фанаты""]",объявления,объявления,объявления,объявления,,,,,,онлайн кинотеатр,,,,аренда,компьютерные игры,компьютерные игры,,,скидки,"одежда,обувь","одежда,обувь",,,,,,,,,,,,,,футбол,"футбол,фанаты",,,,,,,,,,,,,,,,,,,,,,,,,,онлайн кинотеатр,
15115,"[""московский политех"",""навигационные приложения"",""москва"",""боты"",""книги"",""университеты вузы"",""flibusta"",""мобильные приложения"",""студенты""]",,,,,"боты,мобильные приложения",,,книги,,,,,,,,,,,,,,,,,,,,,,,,,"университеты вузы,студенты",мобильные приложения,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
15117,"[""дуррес г"",""объявления"",""трейдинг"",""вакансии"",""албания"",""книги"",""боты"",""flibusta"",""фрилансеры"",""wechat"",""китай""]","объявления,вакансии","объявления,трейдинг",объявления,объявления,боты,,,книги,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Bulding training data and model finetuning

#### RU

In [60]:
# loading the dictionary for Russian language we built before

dict_ru = pd.read_excel('map_dict_ru_fin.xlsx')

In [61]:
dict_ru.head()

Unnamed: 0,patterns,categories,similarity,number_of_quotes
0,объявления,"Поиск работы, вакансии",0.319074,2948
1,объявления,"Акции, биржи, банки, ценные бумаги",0.307715,2948
2,объявления,"Заработок в интернете, онлайн доход",0.261601,2948
3,объявления,"Групповые поездки, поиск попутчиков",0.261384,2948
4,трейдинг,"Акции, биржи, банки, ценные бумаги",0.276952,2518


In [62]:
# classifying preprocessed source texts

interests_ru = tp.map_all(dict_df=dict_ru, mode='binary')

HBox(children=(IntProgress(value=0, description='Progress:', max=63), Label(value='0')))

In [112]:
interests_ru.head()

Unnamed: 0,"Поиск работы, вакансии","Акции, биржи, банки, ценные бумаги","Заработок в интернете, онлайн доход","Групповые поездки, поиск попутчиков",Финансы и инвестиции,"Акции, инвестиционные возможности, вложение денег",Литература и книги,"Политика, геополитика, экономика","Онлайн торговля, маркетплейсы, электронная коммерция",Маркетинг и реклама,"Информационные технологии, программирование","Вэб трансляции, стриминг, блогеры, ютуб каналы",Языки и лингвистика,"Аренда недвижимости, аренда имущества","Компьютерные игры, онлайн игры, игровые приставки",Игры и гейминг,"Военное дело, армия, контрактная служба",События и новости мира,"Закупки, скидки, дисконт, распродажа","Мода, стиль, красота",Мода и шоппинг,Бизнес и предпринимательство,Семья и дети,Секс и отношения,"Благотворительность, сбор средств, пожертования",Дизайн и архитектура,Инновационные стартапы и проекты,Технологии и инновации,"Компьютеры, ноутбуки, комплектующие","Компьютерные курсы, обучение программированию","Образование, образовательные платформы, ВУЗы, трейнинги","Менеджмент, управление, руководство",Право и законодательство,Гаджеты и электроника,Спорт и фитнес,Спортивные соревнования и чемпионаты,Активный отдых и экстремальные виды спорта,"Эмиграция, релокация, переезд в другую страну, получение визы","Развлечения, клубы, тусовки, вечеринки",Фотография и видеосъемка,Психология и психотерапия,Психология и саморазвитие,"Азарт, казино, ставки, букмекеры",Наука и исследования,"Аналитика, анализ данных, исследования",Религия и духовность,Философия и этика,"Путешествия, туризм, походы","Творчество, рукоделие, сделай сам, авторская работа",Ремонт и строительство,"Знаменитости, популярные персоны",Языки программирования,Домашние животные и питомцы,"Йога, медитации, энергетические практики",Социальные сети и коммуникации,История и археология,Здоровье и медицина,Экология и охрана природы,Кулинария и рецепты,Автомобили и мотоциклы,Киноиндустрия и кинематограф,Фильмы и сериалы,Музыка и концерты
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [69]:
%%time
# using the special method `get_train_data` to get data for model fine-tuning
# training data is automatically being split to TRAIN and TEST in parts specified in parameter `split` 
# and being saved to path specified in parameter `to_disk`

train = tp.get_train_data(label_data=interests_ru.astype('int').to_dict('records'), pattern_list=None, to_disk='./corpus/ru/cats/', split=0.2, label=None)

Using label_data (list or Series with special dict


HBox(children=(IntProgress(value=0, description='Progress:', max=15123), Label(value='0')))

Splitting data: TRAIN - 80.0%,  TEST - 20.0%
Training data locates:
./corpus/ru/cats/train.spacy
./corpus/ru/cats/dev.spacy
CPU times: total: 1min 48s
Wall time: 1min 48s


In [164]:
%%time
# we use pretrained Spacy model to fine-tune on training data we obtained

spacy.cli.train.train("./CONFIG/config.cfg", "./TRAINED_MODEL/", overrides={"paths.train": "./corpus/train.spacy", "paths.dev": "./corpus/dev.spacy"})

[38;5;4mℹ Saving to output directory: TRAINED_MODEL[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'morphologizer', 'parser', 'attribute_ruler',
'lemmatizer', 'ner', 'textcat_multilabel'][0m
[38;5;4mℹ Frozen components: ['tok2vec', 'morphologizer', 'parser', 'senter',
'attribute_ruler', 'lemmatizer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  POS_ACC  MORPH_ACC  DEP_UAS  DEP_LAS  SENTS_P  SENTS_R  SENTS_F  LEMMA_ACC  ENTS_F  ENTS_P  ENTS_R  CATS_SCORE  SCORE 
---  ------  -------------  -------  ---------  -------  -------  -------  -------  -------  ---------  ------  ------  ------  ----------  ------
  0       0           0.30   100.00     100.00   100.00   100.00   100.00   100.00   100.00     100.00  100.00  100.00  100.00       51.09    0.92
  0    1000          78.21   100.00     100.00   100.00   100.00   100.00   100.00   100.00     100.00  100.00  100.00  100.00       92.92   

In [93]:
# loading our fine-tuned model and test it on unknown texts

ru_cat = spacy.load('../TRAINED_MODEL/ru_cat2')

test_data = ['ставки на спорт',
            'машинное обучение',
             'удаленное управление дронами',
            'нагрузочное тестирование',
            'курсы по торговле на электронных площадках',
            'разработка игр с открытым миром']

for data in test_data:
    print('             ', data)
    print('Is in patterns:', data in tp_en.unique_tokens.values)
    print('------------------------------------------------------------------')
    print('Found categories:')
    result = pd.Series(ru_cat(data).cats).round(2)
    print(result[result > 0.4])
    print('------------------------------------------------------------------')

              ставки на спорт
Is in patterns: False
------------------------------------------------------------------
Found categories:
Азарт, казино, ставки, букмекеры    0.97
dtype: float64
------------------------------------------------------------------
              машинное обучение
Is in patterns: False
------------------------------------------------------------------
Found categories:
Информационные технологии, программирование    1.0
Аналитика, анализ данных, исследования         1.0
dtype: float64
------------------------------------------------------------------
              удаленное управление дронами
Is in patterns: False
------------------------------------------------------------------
Found categories:
Менеджмент, управление, руководство    1.0
Право и законодательство               1.0
dtype: float64
------------------------------------------------------------------
              нагрузочное тестирование
Is in patterns: False
--------------------------------------

Undoubtedly, the generalization of our model is strictly connected with training data we have prepared  
and the model determines better those categories which have a lot of examples in the training data. 

#### EN


Getting train data and fine-tune the model the same way for English language

In [61]:
dict_en = pd.read_excel('map_dict_en_fin.xlsx')

In [62]:
dict_en.head()

Unnamed: 0,patterns,categories,similarity,number_of_quotes
0,it,"IT, programming",0.794361,2072
1,mlm,"Marketplace, online sales, e-commerce",0.244447,590
2,mlm,Marketing and advertising,0.200961,590
3,os,"IT, programming",0.195469,503
4,hyip,Earning money on the Internet,0.146453,445


In [63]:
interests_en = tp_en.map_all(dict_df=dict_en, mode='binary')

HBox(children=(IntProgress(value=0, description='Progress:', max=33), Label(value='0')))

In [64]:
interests_en.head()

Unnamed: 0,"IT, programming","Marketplace, online sales, e-commerce",Marketing and advertising,Earning money on the Internet,Renovation and construction,"Stocks, investment opportunities, investing money",Finance and investments,Games and gaming,"Computers, laptops, parts",Gadgets and electronics,Innovative startups and projects,Programming languages,Movies and TV series,"Streaming, vlog, blog, youtube channels","Analytics, data analysis, data research",Science and Research,Social networks and communications,"IT courses, programming courses","Education, educational platforms, high school, trainings",Business and entrepreneurship,Technology and innovation,"Purchases, discount, sales","Stocks, exchanges, banking, securities",Events and news of the world,Sports and fitness,"Humor, entertainment, clubs, parties",Sports competitions and championships,Travel and tourism,Design and architecture,Health and medicine,Photography and videography,"Computer games, MMORPG, games consoles","Management, administration, control"
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [158]:
%%time
#train = tp_en.get_train_data(label_data=interests_en.astype('int').to_dict('records'), pattern_list=None, to_disk='./corpus/en/cats/', split=0.2, label=None)

Using label_data (list or Series with special dict


HBox(children=(IntProgress(value=0, description='Progress:', max=15123), Label(value='0')))

Splitting data: TRAIN - 80.0%,  TEST - 20.0%
Training data locates:
./corpus/en/cats/train.spacy
./corpus/en/cats/dev.spacy
CPU times: total: 30.4 s
Wall time: 30.4 s


In [159]:
%%time
#

#spacy.cli.train.train("./CONFIG/config.cfg", "./TRAINED_MODEL/", overrides={"paths.train": "./corpus/train.spacy", "paths.dev": "./corpus/dev.spacy"})

[38;5;4mℹ Saving to output directory: TRAINED_MODEL[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler',
'lemmatizer', 'ner', 'textcat_multilabel'][0m
[38;5;4mℹ Frozen components: ['tok2vec', 'tagger', 'parser', 'senter',
'attribute_ruler', 'lemmatizer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  TAG_ACC  DEP_UAS  DEP_LAS  SENTS_P  SENTS_R  SENTS_F  LEMMA_ACC  ENTS_F  ENTS_P  ENTS_R  CATS_SCORE  SCORE 
---  ------  -------------  -------  -------  -------  -------  -------  -------  ---------  ------  ------  ------  ----------  ------
  0       0           0.32   100.00   100.00   100.00   100.00   100.00   100.00     100.00  100.00  100.00  100.00       50.19    0.93
  4    1000          19.39   100.00   100.00   100.00   100.00   100.00   100.00     100.00  100.00  100.00  100.00       98.81    1.01
 15    2000           1.57   100.00   100.00   100.0

In [94]:
en_cat = spacy.load('C:/Users/kadilnikov/Documents/TASKS/positive_tech/TRAINED_MODEL/en_cat')

test_data = ['free bets',
             'ski, snowboard, cycling',            
             'IT school',
            'mining farm',
            'counter strike',
            'java script']

for data in test_data:
    print('             ', data)
    #print('Is in patterns:', data in tp_en.unique_tokens.values)
    print('------------------------------------------------------------------')
    print('Found categories:')
    result = pd.Series(en_cat(data).cats).round(2)
    print(result[result > 0.4])
    print('------------------------------------------------------------------')
   

              free bets
------------------------------------------------------------------
Found categories:
Social networks and communications    0.77
dtype: float64
------------------------------------------------------------------
              ski, snowboard, cycling
------------------------------------------------------------------
Found categories:
Sports and fitness    0.99
dtype: float64
------------------------------------------------------------------
              IT school
------------------------------------------------------------------
Found categories:
IT, programming    1.0
dtype: float64
------------------------------------------------------------------
              mining farm
------------------------------------------------------------------
Found categories:
Stocks, investment opportunities, investing money    1.0
Finance and investments                              1.0
dtype: float64
------------------------------------------------------------------
             

# Extracting customers professions from text data

The second part of the notebook describes the obtaining information about clients' career and extracting it from the column with a set of words.s.

This task is a kind of word classification and labelling.

This pipeline is different from the previous one:  
<img src="./pictures/career_labelling.png" width=1500>

### RU

#### Cleaning text

In [49]:
tp2 = TextPreprocessing(text_col=df['carrier'].str.lower(), nlp=nlp)

In [50]:
tp2.extract(r'[А-Яа-яЁё -]+')
tp2.replace(r'\s+-+\s*|\s*-+\s+|\A\s*-+\s*\Z', repl=' ')
tp2.replace(r'\A\s+|\s+\Z|(?<=\s)(\s+)')
# replacing the acronyms
tp2.replace(r'(\A|(?<=\s))ип((?=\s)|\Z)', repl='предприниматель');

In [51]:
# result control
tp2.textcol_mod[(tp2.textcol_mod.notna())&(tp2.textcol_mod!='')].head()

513                                 учитель мбоу сош
528                  региональный центр недвижимости
561                                     адвокат апао
600    начальник отдела оценки ооо кф центр аналитик
605            генеральный директор зао бизнес центр
Name: carrier, dtype: object

#### Extracting career patterns 

In [52]:
# getting patterns
tp2.get_uniquetokens(sep=' ')

Final list length: 2009


0                       учитель
1                          мбоу
2                           сош
4                  региональный
5                         центр
                 ...           
2005        педагог-организатор
2006                    держава
2007    научно-производственный
2008                    комлекс
2009                      дедал
Length: 2009, dtype: object

In [53]:
# using the method below we can filter only the patterns on the topic we needed. This method uses similarity to obtain bindings.

career_patterns_ru = cat_tor.cat_sim(cat='профессия, специальность, предприниматель, работа, учащийся, руководитель, врач, начальник',
                                     pattern_list=tp2.unique_tokens, quoting=tp2.textcol_mod)

Starting NLP-processing for pattern_list


HBox(children=(IntProgress(value=0, description='Progress: ', max=2009), Label(value='0')))

pattern_list processed

Starting quotes counting...


HBox(children=(IntProgress(value=0, description='Progress:', max=1493), Label(value='0')))

Successfully.
Mapping with quoting data...


In [55]:
career_patterns_ru.head()

Unnamed: 0,patterns,"профессия, специальность, предприниматель, работа, учащийся, руководитель, врач, начальник",number of quotes
135,руководитель,0.670854,88
722,инженер-программист,0.669125,11
24,инженер,0.643571,104
391,инженер-сметчик,0.643571,1
289,инженер-прграммист,0.643571,1


In [None]:
# save results to file to check and correct
#career_patterns_ru.to_excel('career_patterns_ru.xlsx')

In [57]:
career_patterns_ru = pd.read_excel('career_patterns_ru_checked.xlsx')['patterns']

In [58]:
career_patterns_ru.head()

0           руководитель
1    инженер-программист
2                инженер
3        инженер-сметчик
4     инженер-прграммист
Name: patterns, dtype: object

#### Bulding training data

In [None]:
train = tp2.get_train_data(pattern_list=career_patterns_ru, to_disk='./corpus/ru/ORTH/', label='CAREER')

In [84]:
help(tp2.get_train_data)

Help on method get_train_data in module nk_nlp1_5:

get_train_data(pattern_list=None, label=None, label_data=None, patterns_convert='ORTH', text_col=None, filtered=True, split=0.1, to_disk='./corpus/', aliquot=10, stratify=None) method of nk_nlp1_5.TextPreprocessing instance



In [None]:
train

#### Model fine-tuning

In [None]:
%%time

spacy.cli.train.train("./CONFIG/config.cfg", "./TRAINED_MODEL/", overrides={"paths.train": "./corpus/train.spacy", "paths.dev": "./corpus/dev.spacy"})

#### Checking the trained model with unknown examples

In [59]:
# loading fine-tuned model

trained_model = spacy.load('C:/Users/kadilnikov/Documents/TASKS/positive_tech/TRAINED_MODEL/career_ru')

In [63]:
# test the model on unknown texts. We 

unknown_careers = ['отоларинголог частной клиники "большие уши"',
                   '10 больница сельского поселения Петушки столяр, краснодеревщик',
                   'космонавт 7 галактического флота некроморфов',
                   'гитарист группы биттлз как всегда был на высоте',
                   'рядовой роты саперов методично проверял свою аммуницию']
for example in unknown_careers:
    print('Source text:')
    print(example)
    print()
    print('Labeling result:')
    [print(token.text, token.ent_type_) for token in trained_model(example)]
    print('-----------------------------------------------------------------------------')

Source text:
отоларинголог частной клиники "большие уши"

Labeling result:
отоларинголог CAREER
частной 
клиники 
" 
большие 
уши 
" 
-----------------------------------------------------------------------------
Source text:
10 больница сельского поселения Петушки столяр, краснодеревщик

Labeling result:
10 
больница 
сельского 
поселения 
Петушки 
столяр CAREER
, 
краснодеревщик CAREER
-----------------------------------------------------------------------------
Source text:
космонавт 7 галактического флота некроморфов

Labeling result:
космонавт CAREER
7 
галактического 
флота 
некроморфов 
-----------------------------------------------------------------------------
Source text:
гитарист группы биттлз как всегда был на высоте

Labeling result:
гитарист CAREER
группы 
биттлз 
как 
всегда 
был 
на 
высоте 
-----------------------------------------------------------------------------
Source text:
рядовой роты саперов методично проверял свою аммуницию

Labeling result:
рядовой CAREER
ро

#### Using trained model for extracting career info

In [65]:
# fine-tuned model already knows how to detect the career-related words and label it automatically when we put data into model

car_ru = TextPreprocessing(nlp=trained_model, text_col=tp2.textcol_mod)

In [66]:
# it remains only to extract this data from the model

df['career_labels_ru'] = car_ru.extract_ents(labels=['CAREER'], aliquot=10, filtered=True, rest=False)

HBox(children=(IntProgress(value=0, description='Progress', max=15123), Label(value='0')))

In [67]:
# checking the results

df[df['career_labels_ru'].notna()]['career_labels_ru'].head()

513               учитель
561               адвокат
600    начальник,аналитик
605              директор
606              менеджер
Name: career_labels_ru, dtype: object

In [None]:
# saving results
#df[['guid', 'carrier', 'career_labels_ru']].to_excel('career_model_results_ru.xlsx')

### EN

Do the same for English language

#### Cleaning text

In [68]:
tp2_en = TextPreprocessing(text_col=df['carrier'].str.lower(), nlp=nlp)

In [69]:
tp2_en.extract(r'[A-Za-z -]+')
tp2_en.replace(r'\s+-+\s*|\s*-+\s+|\A\s*-+\s*\Z', repl=' ')
tp2_en.replace(r'\A\s+|\s+\Z|(?<=\s)(\s+)');

In [70]:
# result control
tp2_en.textcol_mod[(tp2_en.textcol_mod.notna())&(tp2_en.textcol_mod!='')].head()

544                                     chief officer na
562                    student at hgskolen i sr-trndelag
589    middleware development engineer intel corporation
617                                                  h j
621                                                    -
Name: carrier, dtype: object

#### Extracting career patterns 

In [71]:
tp2_en.get_uniquetokens(sep=' ');

Final list length: 1632


In [72]:
career_patterns_en = cat_tor_en.cat_sim(cat='career, chief, manager, developer, designer, devops, programmer, head, CEO, lead, owner, student, doctor',
                                     pattern_list=tp2_en.unique_tokens, quoting=tp2_en.textcol_mod)

Starting NLP-processing for pattern_list


HBox(children=(IntProgress(value=0, description='Progress: ', max=1632), Label(value='0')))

pattern_list processed

Starting quotes counting...


HBox(children=(IntProgress(value=0, description='Progress:', max=1115), Label(value='0')))

Successfully.
Mapping with quoting data...


In [None]:
#career_patterns_en.to_excel('career_patterns_en.xlsx')

In [73]:
career_patterns_en = pd.read_excel('career_patterns_en_checked.xlsx')['patterns']

In [74]:
career_patterns_en.head()

0        specialist
1         associate
2    representative
3       constructor
4        consultant
Name: patterns, dtype: object

#### Bulding training data

In [None]:
tp2_en.get_train_data(pattern_list=career_patterns_en, to_disk='./corpus/', label='CAREER')

#### Model additional training

In [None]:
%%time
spacy.cli.train.train("./CONFIG/config.cfg", "./TRAINED_MODEL/", overrides={"paths.train": "./corpus/train.spacy", "paths.dev": "./corpus/dev.spacy"})

#### Checking the trained model with unknown examples

In [75]:
trained_model_en = spacy.load('C:/Users/kadilnikov/Documents/TASKS/positive_tech/TRAINED_MODEL/career_en')

In [76]:
unknown_careers = ['worker of furniture plant',
                   'sportsman club dinamo',
                   'sailor US navy',
                   'waiter hospital of holly diana',
                   'driver wallmart']
for example in unknown_careers:
    print(example)
    print()
    [print(token.text, token.ent_type_) for token in trained_model_en(example)]
    print('-----------------------------------------------------------------------------')

worker of furniture plant
Is in patterns: False

worker CAREER
of 
furniture 
plant 
-----------------------------------------------------------------------------
sportsman club dinamo
Is in patterns: False

sportsman CAREER
club 
dinamo 
-----------------------------------------------------------------------------
sailor US navy
Is in patterns: False

sailor CAREER
US 
navy 
-----------------------------------------------------------------------------
waiter hospital of holly diana
Is in patterns: False

waiter CAREER
hospital 
of 
holly 
diana 
-----------------------------------------------------------------------------
driver wallmart
Is in patterns: False

driver CAREER
wallmart 
-----------------------------------------------------------------------------


#### Using trained model for extracting career info

In [77]:
car_en = TextPreprocessing(nlp=trained_model_en, text_col=tp2_en.textcol_mod)

In [78]:
df['career_labels_en'] = car_en.extract_ents(labels=['CAREER'], aliquot=10, filtered=True, rest=False)

HBox(children=(IntProgress(value=0, description='Progress', max=15123), Label(value='0')))

In [79]:
df[df['career_labels_en'].notna()]['career_labels_en'].head()

544              chief,officer
562                    student
589                   engineer
657    chief,engineer,engineer
801                    manager
Name: career_labels_en, dtype: object

In [None]:
# saving results
#df[['guid', 'carrier', 'career_labels_en']].to_excel('career_model_results_en.xlsx')

### Results merging

In [80]:
df['career_labels_total'] = df['career_labels_ru'].str.cat(others=df['career_labels_en'], sep=',', na_rep='', join='outer')

In [81]:
df['career_labels_total'] = df['career_labels_total'].str.replace(pat=r'\A,|,\Z', repl='', regex=True)

In [83]:
# checking final result
df[df['career_labels_total'] != '']['career_labels_total'].head()

513          учитель
544    chief,officer
561          адвокат
562          student
589         engineer
Name: career_labels_total, dtype: object

In [55]:
df.to_excel('career_total_result_fin.xlsx')