In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [3]:
# Load the parsed vacancies and build a labelling template: every distinct,
# non-null section title paired with a placeholder class of 0.
df = pd.read_csv('vacancies_parsed.csv')
df_to_mark_up = (
    df[['section_title']]
    .dropna()
    .drop_duplicates()
    .assign(target=0)
    .reset_index(drop=True)
)
# The fresh 0..n-1 index is written out as the first CSV column.
df_to_mark_up.to_csv('train_dataset_unmarked.csv')
df_to_mark_up.head()

Unnamed: 0,section_title,target
0,предстоять заниматься,0
1,ждать,0
2,знание система класс решение,0
3,предлагать,0
4,обязанность,0


Проанализировав первые 300 элементов датасета, я выделил следующие классы:
target section_title<br>
0 чем предстоит заниматься/ключевые задачи<br>
1 требования/мы ждем от вас<br>
2 желательно<br>
3 что мы предлагаем/почему мы/общее описание команды/плюшки от компании<br>
4 условия работы<br>
5 текущий стек<br>

In [7]:
# Load the training set; the first CSV column is used as the row index,
# leaving `section_title` and `target` as the data columns.
df_train = pd.read_csv('train_dataset_201.csv', index_col=0)
df_train.head(n=5)

Unnamed: 0,section_title,target
127,системный администратор предстоять,0
240,ключевой навык,1
145,владеть работать,1
103,наш условие,4
185,ждеть,1


In [8]:
# Select the two columns by name before converting to an array, so the
# positional slices below do not depend on the column order in the CSV.
df_train_numpy = df_train[['section_title', 'target']].to_numpy()
x = df_train_numpy[:, 0]                # section title text
y = df_train_numpy[:, 1].astype('int')  # integer class id

# Majority-class baseline: the accuracy of always predicting the most frequent class.
counts = np.bincount(y)
majority_count = np.max(counts)
total_count = np.sum(counts)
baseline_pct = round(majority_count / total_count * 100, 2)
print('Most common class has {} of {} total elements. So baseline is {}%'.format(majority_count, total_count, baseline_pct))

# Hold out 20% of the labelled rows for evaluation; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2020)

Most common class has 77 of 201 total elements. So baseline is 38.31%


In [10]:
# Text classification pipeline: token counts -> TF-IDF weighting -> k-nearest neighbours
# (6 neighbours, votes weighted by distance, algorithm='brute').
knn_classifier = KNeighborsClassifier(n_neighbors=6, weights='distance', algorithm='brute')
pipe = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', knn_classifier),
])

# Fit on the training split and score on the held-out split.
model_knn = pipe.fit(x_train, y_train)
prediction = model_knn.predict(x_test)
knn_accuracy_pct = round(accuracy_score(y_test, prediction) * 100, 2)
print("KNN accuracy: {}%".format(knn_accuracy_pct))

KNN accuracy: 75.61%


In [11]:
# 5-fold cross-validation of the KNN pipeline on the training split.
# `cross_val_score` is imported with the other dependencies in the first cell.
scores = cross_val_score(model_knn, x_train, y_train, cv=5)
# Report the mean fold accuracy and a spread of two standard deviations.
print("KNN Cross-Validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

NKK Cross-Validation accuracy: 0.78 (+/- 0.09)




In [13]:
# Keep only the rows of the full dataset that have no missing value in any column.
df_full_filled = df.dropna().copy()
df_full_filled_numpy = df_full_filled.to_numpy()

# NOTE(review): column index 2 is presumably `section_title` (the text the model
# was trained on) — confirm against the layout of vacancies_parsed.csv.
section_texts = df_full_filled_numpy[:, 2]
full_prediction = model_knn.predict(section_texts)
df_full_filled['target'] = full_prediction

rows_kept, rows_total = len(df_full_filled), len(df)
print('{}/{} rows remained in dataset, others were deleted because of NaNs'.format(rows_kept, rows_total))

5289/5926 rows remained in dataset, others were deleted because of NaNs


In [14]:
# Persist the rows together with their predicted class in `target`;
# the DataFrame index is written as the first CSV column.
df_full_filled.to_csv(path_or_buf='vacancies_parsed_sections_filled.csv')