## Creating csv file

In [1]:
import codecs
from bs4 import BeautifulSoup
import csv
import os

In [3]:
csv_file = open('dou_vacancies.csv', 'w', encoding='utf-8')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['headline', 'description'])

22

In [4]:
directory = 'dou/rezult'
for filename in os.listdir(directory):    
    file = codecs.open(directory + '/' + filename, "r", 'utf-8')
    soup = BeautifulSoup(file)
    headline = soup.find('h1', class_="g-h2").get_text()
    vacancy = soup.find('div', class_="l-vacancy")
    descriptions = vacancy.find_all('div', class_="text b-typo vacancy-section")
    text = ''
    for description in descriptions:
        text += description.get_text()
    csv_writer.writerow([headline, text])

In [5]:
import pandas as pd

In [6]:
df = pd.read_csv('dou_vacancies.csv')

In [7]:
df.head()

Unnamed: 0,headline,description
0,"Junior ML engineer (Computer Vision, 3d)",\r\n— Python— хорошая математическая база для ...
1,Senior Java/BigData Engineer,\r\nExperience in Java development (5 years+);...
2,Java Engineer for HighLoad/BigData project | SM,\r\n— Professional Java experience 3+ years— O...
3,NLP Engineer,"\r\n— 2+ years experience with Python, includi..."
4,Data Engineer,\r\n— 2+ years of hands-on experience as a Dat...


## Text processing

In [16]:
df['is_data_engineer'] = df['headline'].apply(lambda x: 'data' in x.lower())
df.head()

Unnamed: 0,headline,description,is_data_engineer
0,"Junior ML engineer (Computer Vision, 3d)",\r\n— Python— хорошая математическая база для ...,False
1,Senior Java/BigData Engineer,\r\nExperience in Java development (5 years+);...,True
2,Java Engineer for HighLoad/BigData project | SM,\r\n— Professional Java experience 3+ years— O...,True
3,NLP Engineer,"\r\n— 2+ years experience with Python, includi...",False
4,Data Engineer,\r\n— 2+ years of hands-on experience as a Dat...,True


In [17]:
df_engineers = df.query('is_data_engineer == True').copy()
df_engineers.drop(['is_data_engineer'], axis=1, inplace=True)
df_engineers.head()

Unnamed: 0,headline,description
1,Senior Java/BigData Engineer,\r\nExperience in Java development (5 years+);...
2,Java Engineer for HighLoad/BigData project | SM,\r\n— Professional Java experience 3+ years— O...
4,Data Engineer,\r\n— 2+ years of hands-on experience as a Dat...
5,Middle Data Engineer (#14074599),\r\nOur Client`s mission to make it simple for...
6,Data engineer,\r\n— 2+ years of experience in Scala server-s...


In [18]:
df['is_data_engineer'].value_counts()

True     118
False     38
Name: is_data_engineer, dtype: int64

In [11]:
from nltk.tokenize import word_tokenize

In [19]:
def leave_relevant_technologies(lst_input):
    techno_list = ['Airflow', 'Apache', 'Asure', 'Bash', 'BigQuery', 'Databricks', 'Data Bricks', 'Data Factory', 
            'Delta Lake', 'Docker', 'ETL', 'Extract Load Transform', 'Fiddler', 'Flink', 'GCP', 'Google Cloud', 
            'Jenkins', 'Jira', 'Impala', 'InfluxDB',  'Kinesis', 'Linux', 'Mesos',  'Postman', 'PowerBI', 'PySpark', 
                   'R-Studio',   'snowflake', 'Spring',
            'SSIS', 'SSAS', 'QA', 'Tableau', 'Tensorflow']
    message_brokers = ['Kafka', 'RabbitMQ', 'AMQP', 'Redis']
    databases = ['SQL', 'NoSQL', 'SQL Server', 'Cassandra', 'Cosmos DB', 'MongoDB', 
                 'MySQL', 'RabGTD', 'Rabbit', 'Oracle', 'PostgreSQL']
    big_data = ['Hadoop', 'Hive', 'Hbase', 'MapReduce', 'Map Reduce', 'Pig', 'Spark']
    amazon = ['AWS', 'Redshift']
    formats = ['JSON', 'XML']
    prog_lang = ['C', 'C++', 'R', 'Kotlin', 'Scala', 'Java', 'JS', 'JavaScript', 'Python', 'ReactJS']
    soft_skills = ['Scrum', 'Agile']
    final_list = amazon + techno_list + formats+ message_brokers + databases + big_data + prog_lang + soft_skills
    lst_input = [w.lower() for w in lst_input]
    lst = [word for word in final_list if word.lower() in lst_input]
    return list(set(lst))

In [20]:
df_engineers['description'] = df_engineers['description'].apply(word_tokenize)
df_engineers['description'] = df_engineers['description'].apply(leave_relevant_technologies)

In [21]:
df_engineers.head(10)

Unnamed: 0,headline,description
1,Senior Java/BigData Engineer,"[Linux, Jenkins, Java, SQL, Apache, Hadoop, Sp..."
2,Java Engineer for HighLoad/BigData project | SM,"[AWS, Linux, MongoDB, Java]"
4,Data Engineer,"[Airflow, BigQuery, snowflake, Python, ETL, Ki..."
5,Middle Data Engineer (#14074599),"[Python, AWS, SQL]"
6,Data engineer,"[Kafka, Agile, Apache, Spark, Scala, Python, AWS]"
7,Middle Data Warehouse Engineer,"[PostgreSQL, Tableau, SQL, ETL, AWS, Redshift,..."
8,Senior Scala/Java Engineer for Exabeam (Data L...,"[Linux, Kafka, Java, Hadoop, Scala, Python, Ba..."
9,Middle Data Engineer,"[Scrum, SSIS, SSAS]"
10,Data Engineer (Scala/Spark),"[PostgreSQL, Hadoop, Jira, Redshift, SQL, Redi..."
13,Lead Big Data Engineer,"[ETL, Hadoop, AWS, SQL]"


In [22]:
result = df_engineers['description'].tolist()
result = [item for sublist in result for item in sublist]

In [23]:
from collections import Counter
cnt = Counter(result).most_common()

In [25]:
total_number = df_engineers.shape[0]
final_result = []
for pair in cnt:
    pair = list(pair)
    pair[1] = round(pair[1] / total_number * 100, 2)
    final_result.append(tuple(pair))

In [26]:
final_result

[('Python', 61.02),
 ('SQL', 58.47),
 ('AWS', 50.85),
 ('Spark', 44.07),
 ('ETL', 31.36),
 ('Kafka', 30.51),
 ('Java', 27.12),
 ('Hadoop', 22.03),
 ('PostgreSQL', 22.03),
 ('Airflow', 21.19),
 ('Apache', 19.49),
 ('Redshift', 18.64),
 ('Agile', 16.1),
 ('NoSQL', 16.1),
 ('Scala', 15.25),
 ('MySQL', 14.41),
 ('Docker', 14.41),
 ('BigQuery', 13.56),
 ('GCP', 13.56),
 ('MongoDB', 11.86),
 ('Hive', 11.86),
 ('Linux', 9.32),
 ('Cassandra', 9.32),
 ('snowflake', 8.47),
 ('R', 8.47),
 ('Jenkins', 7.63),
 ('QA', 7.63),
 ('Tableau', 6.78),
 ('MapReduce', 6.78),
 ('Hbase', 6.78),
 ('Scrum', 5.93),
 ('Redis', 5.93),
 ('PySpark', 5.93),
 ('Jira', 5.08),
 ('Oracle', 5.08),
 ('Kinesis', 4.24),
 ('RabbitMQ', 4.24),
 ('Bash', 3.39),
 ('SSIS', 3.39),
 ('JavaScript', 3.39),
 ('C', 3.39),
 ('Pig', 3.39),
 ('PowerBI', 3.39),
 ('Flink', 2.54),
 ('Tensorflow', 2.54),
 ('C++', 2.54),
 ('Impala', 2.54),
 ('Spring', 2.54),
 ('Databricks', 2.54),
 ('JSON', 2.54),
 ('Fiddler', 1.69),
 ('Kotlin', 1.69),
 ('Postma