In [1]:
import pandas as pd
import urllib3
import time
import numpy as np
import json 
import csv
import collections
from IPython.display import display
import math
import re
import spacy
from scipy import sparse
from math import sqrt

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDRegressor, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import sklearn

In [None]:
# Read the pre-computed feature tables for the train and test vacancies.
df, df2 = map(
    pd.read_csv,
    ('data/vacs_train_features.csv', 'data/vacs_test_features.csv'),
)

In [None]:
# Stack the test rows underneath the train rows so both go through the same
# preprocessing; they are separated again later by position.
df = pd.concat(objs=[df, df2], axis=0)
# Release the reference to the standalone test frame.
df2 = None

In [None]:
from deeppavlov.models.tokenizers.ru_tokenizer import RussianTokenizer

# Shared tokenizer instance configured for lowercased, lemmatised 1- and 2-grams.
# (The original call repeated the keyword as `ngram_range=ngram_range=[1,2]`,
# which is a SyntaxError.)
tokenizer = RussianTokenizer(ngram_range=[1, 2], lemmas=True, lowercase=True)


def my_tokenize(txt):
    """Tokenize a single text with the shared ``tokenizer``.

    The tokenizer is called on a one-element batch ``[txt]`` and the first
    (only) element of its result is returned.

    Args:
        txt: The text to tokenize.

    Returns:
        The tokenizer output for ``txt`` — presumably a list of token/n-gram
        strings (TODO confirm against the DeepPavlov version in use).
    """
    ans = tokenizer([txt])
    return ans[0]

In [None]:
# Columns that are removed from the feature matrices before modelling.
# The list includes the target column 'log_salary_from'.
exclude_columns = [
    'Unnamed: 0', 'index', 'id',
    'name', 'name.lemm',
    'area.name', 'city',
    'company', 'company_link',
    'publication_date',
    'salary_from', 'salary_currency',
    'employment', 'employment.name',
    'schedule', 'schedule.name',
    'experience', 'experience.name',
    'key_skills',
    'specializations', 'specializations.names',
    'description.lemm',
    'type',
    'log_salary_from',
    'name0',
    'log_salary_normalized', 'log_salary_normalized_year',
    'median_salary_year', 'median_salary_year_month',
    'salary_normalized', 'salary_normalized_year',
]

# Regression target: the 'log_salary_from' column of the combined frame.
y = df.loc[:, 'log_salary_from']


In [None]:
# Hold out the last 50000 rows as the test set. shuffle=False keeps the row
# order, so the split is purely positional.
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=50000, shuffle=False)

# Keep only training rows whose salary lies in [20000, 100000] and whose
# 'cyrillic_percentage' is at least 0.75. The same mask is applied to the
# features and the target so they stay aligned; this replaces the previous
# approach of writing a scratch 'y' column into X_train (which assigns into a
# frame returned by train_test_split and can trigger SettingWithCopyWarning).
# The mask is converted to a plain boolean array so the selection is
# positional and does not depend on index labels being unique.
train_keep = (
    X_train['salary_from'].between(20000, 100000)
    & (X_train['cyrillic_percentage'] >= 0.75)
).to_numpy()
X_train = X_train[train_keep]
y_train = y_train[train_keep]

# 'extracted_salary' is set aside for the test rows before it is dropped from
# the feature matrices together with the excluded columns.
non_feature_columns = exclude_columns + ['extracted_salary']
X_train = X_train.drop(columns=non_feature_columns)
extracted_salary = X_test['extracted_salary']
X_test = X_test.drop(columns=non_feature_columns)


In [None]:
# TF-IDF over the vacancy descriptions, using the custom tokenizer.
# The vocabulary is fitted on the training descriptions only and then
# applied unchanged to the test descriptions.
vec = TfidfVectorizer(
    tokenizer=my_tokenize,
    max_df=0.8,
    min_df=100,
    max_features=2000,
    use_idf=True,
    sublinear_tf=False,
)
train_descriptions = X_train['description']
X_train_vec = vec.fit_transform(train_descriptions)
test_descriptions = X_test['description']
X_test_vec = vec.transform(test_descriptions)

In [None]:
# Min-max scale every remaining column except the raw 'description' text.
# The scaler is fitted on the training rows only and re-used for the test rows.
# 'description' is dropped into new frames instead of in place, so X_train and
# X_test are not mutated and this cell can be re-run without a KeyError.
scaler = MinMaxScaler()
X_train_numeric = X_train.drop(columns=['description'])
X_test_numeric = X_test.drop(columns=['description'])
X_train_scaled = scaler.fit_transform(X_train_numeric)
X_test_scaled = scaler.transform(X_test_numeric)

In [None]:
def _assemble_features(scaled_block, tfidf_block):
    """Put the scaled columns and the densified TF-IDF columns side by side.

    Both inputs are wrapped in fresh DataFrames (default integer index and
    column labels) and concatenated column-wise.
    """
    scaled_frame = pd.DataFrame(scaled_block)
    tfidf_frame = pd.DataFrame(tfidf_block.toarray())
    return pd.concat([scaled_frame, tfidf_frame], axis=1)


X_train = _assemble_features(X_train_scaled, X_train_vec)
X_test = _assemble_features(X_test_scaled, X_test_vec)

In [None]:
# Display (n_rows, n_columns) of the assembled training matrix as a sanity check.
X_train.shape

In [None]:
# Gradient-boosted regression trees on the combined feature matrix.
# max_features=None considers all features at every split; it replaces the
# deprecated (and later removed) max_features='auto', which meant the same
# thing for this regressor.
# random_state is fixed so repeated runs give the same model.
cls = GradientBoostingRegressor(
    verbose=1,
    n_estimators=10,
    max_features=None,
    max_depth=3,
    random_state=42,
)
cls.fit(X_train, y_train)

In [None]:
# Training error, measured on the same scale as the target ('log_salary_from').
sklearn_train_pred = cls.predict(X_train)
train_rmse = sqrt(mean_squared_error(y_train, sklearn_train_pred))
print('RMSE train', train_rmse)

# Predictions for the held-out rows; post-processed in the following cells.
sklearn_test_pred = cls.predict(X_test)


In [None]:
# Where a positive salary was extracted for a test row, trust it over the
# model: record the absolute gap between the extracted value and the model's
# (exponentiated) prediction in `dif`, then overwrite the prediction with the
# log of the extracted value.
# Iterating the Series values with enumerate gives the row position directly
# and avoids Series.iteritems(), which was removed in pandas 2.0.
dif = []
for pos, s in enumerate(extracted_salary):
    if not math.isnan(s) and s > 0:
        dif.append(abs(s - math.exp(sklearn_test_pred[pos])))
        sklearn_test_pred[pos] = math.log(s)

In [None]:
def _to_rounded_salary(log_salary):
    """Exponentiate a log-scale prediction and round it to the nearest 1000."""
    return round(math.exp(log_salary) / 1000) * 1000


salary_pred = list(map(_to_rounded_salary, sklearn_test_pred))

In [None]:
# Write the rounded salary predictions (one per test row) to disk.
predictions_frame = pd.DataFrame(salary_pred)
predictions_frame.to_csv('test_preds.csv')