In [1]:
import xgboost
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import gensim
import nltk
import pymorphy2
import numpy as np
import pandas as pd
import pickle
from scipy import spatial

## Train W2V from other.csv

In [2]:
other_e_file = r'other.csv' 
stopwords = nltk.corpus.stopwords.words('russian')
morph = pymorphy2.MorphAnalyzer()

In [None]:
# clean text and make a list of lists
all_words = []
with open(other_e_file, 'r', encoding='UTF-8') as other_e:
    for line in other_e.readlines()[1:]:
        soup = BeautifulSoup(line, "html5lib")
        tokenized = simple_preprocess(soup.text)
        for word in tokenized:
            if word in stopwords:
                tokenized.remove(word)
        for n, word in enumerate(tokenized):
            tokenized[n] = morph.normal_forms(word)[0]
        all_words.append(tokenized)

In [None]:
model = Word2Vec(all_words)

In [None]:
# train word2vec model from preprocessed other.csv
model.train(all_words, total_examples=len(all_words), epochs=10)

In [167]:
# save w2v model
model.save('model.vec')

## Read the training set

In [3]:
# read the w2v model previously saved
model = gensim.models.word2vec.Word2VecKeyedVectors.load('model.vec')

In [4]:
train_e_file = r'train.csv' 

In [414]:
# clean text and make a list of lists
all_lists = []
with open(train_e_file, 'r', encoding='UTF-8') as train_e:
    for line in train_e.readlines()[1:]:
        soup = BeautifulSoup(line, "html5lib")
        all_lists.append(soup.text.split('\t'))

In [415]:
# make a pandas dataframe from train data
train = pd.DataFrame(all_lists, columns=['num', 'name', 'descr', 'target']).drop('num', axis=1)

In [417]:
train.dropna(inplace=True)

In [7]:
# first-stage preprocessing func
def preproc(df):
    # tokenize words and delete words which are in stopword-list
    def tokenize_words(x):
        x = simple_preprocess(x)
        for word in x:
            if word in stopwords:
                x.remove(word)
        for n, word in enumerate(x):
            x[n] = morph.normal_forms(word)[0]
        return x
    df['name'] = df['name'].apply(tokenize_words)
    df['descr'] = df['descr'].apply(tokenize_words)
    
    # process 'target': delete '\n' symbol and transform to int
    if 'target' in df.columns:
        df['target'] = df['target'].str.strip('\n').astype('int')
    
    # create average vetors
    def text_to_vec(dct, model, size):
        text_vec = np.zeros((size,), dtype="float32")
        n_words = 0

        index2word_set = set(model.index2word)
        for word in dct:
            if word in index2word_set:
                n_words = n_words + 1
                text_vec = np.add(text_vec, model[word]) 

        if n_words != 0:
            text_vec /= n_words
        return text_vec
    def tovec(x):
        return text_to_vec(x, model.wv, 100)
    df['namevec'] = df['name'].apply(tovec)
    df['descrvec'] = df['descr'].apply(tovec)
    
    # drop superfluous
    df.drop(['name', 'descr'], axis=1, inplace=True)
    
    return df

In [419]:
df = preproc(train)

In [417]:
# dump the train with bounding vectors
with open('df.dump', 'wb') as df_dump:
    pickle.dump(df, df_dump)

In [47]:
# load the train with bounding vectors
with open('df.dump', 'rb') as df_dump:
    df = pickle.load(df_dump)

In [48]:
df.head()

Unnamed: 0,target,namevec,descrvec
0,1,"[-1.1923714, -1.037942, 0.81618166, 1.0052104,...","[-0.11000495, 0.143799, 0.77136016, 1.4077154,..."
1,0,"[-1.4009495, -1.752891, -1.6736608, 1.9459628,...","[-1.4900427, -0.7262686, -0.5407763, -1.015697..."
2,0,"[1.3747917, -0.9137062, 1.2883497, 1.4640479, ...","[0.53748846, 1.0444499, -0.20906413, -0.682313..."
3,0,"[-1.5560954, -5.5392475, -2.4810383, -0.101318...","[0.28356838, -0.6393874, -0.88733375, -1.92325..."
4,1,"[-0.7036357, -0.3635234, -0.6468989, 3.3174365...","[0.9476606, -0.10418407, 0.40457508, 0.4247652..."


## Cosinus distance

In [49]:
all_vecs_name = df[df['target']==1]['namevec'] # all vectors of df where target is 1
all_vecs_name_length = len(all_vecs_name) # length of all vectors of df where target is 1

all_vecs_descr = df[df['target']==1]['descrvec'] # all vectors of df where target is 1
all_vecs_descr_length = len(all_vecs_descr) # length of all vectors of df where target is 1

In [50]:
# count the average vector of target
avg_vec_name = np.zeros(100)
for vec in all_vecs_name:
    avg_vec_name = np.add(avg_vec_name, vec)
avg_vec_name /= all_vecs_name_length

avg_vec_descr = np.zeros(100)
for vec in all_vecs_descr:
    avg_vec_descr = np.add(avg_vec_descr, vec)
avg_vec_descr /= all_vecs_descr_length

In [51]:
# count the cos distance between average-target-vector and vector of every object in df
# second-stage of preprocessing
def addcos(df):
    def cos_dist_name(x):
        return 1-spatial.distance.cosine(avg_vec_name, np.array(x))
    def cos_dist_descr(x):
        return 1-spatial.distance.cosine(avg_vec_descr, np.array(x))
    df['cos_dist_name'] = df['namevec'].apply(cos_dist_name)
    df['cos_dist_descr'] = df['descrvec'].apply(cos_dist_descr)
    df.drop(['namevec', 'descrvec'], axis=1, inplace=True)
    return df

In [52]:
train_clean = addcos(df)

In [53]:
# what we'll put to the train
train_clean.head()

Unnamed: 0,target,cos_dist_name,cos_dist_descr
0,1,0.538825,0.307647
1,0,-0.0282,-0.082858
2,0,0.251001,0.172624
3,0,0.173686,-0.055483
4,1,0.810259,0.404339


In [54]:
train_clean.dropna(inplace=True)

## Train LR

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [57]:
X = train_clean.drop('target', axis=1)
# X = train_clean.drop('target', axis=1).iloc[:, 100:]
# X = train_clean.drop('target', axis=1).iloc[:, :100]
y = train_clean.target

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [440]:
lr = LogisticRegression(C=0.05, 
                        penalty='l1',
                        n_jobs=-1)

In [441]:
lr.fit(X_train, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=0.05, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l1', random_state=None, solver='warn', tol=0.0001,
          verbose=0, warm_start=False)

In [442]:
predicted = lr.predict_proba(X_test)

In [443]:
roc_auc_score(y_test, predicted[:,1])

0.9011118632883041

## Train XGBoost

In [66]:
from xgboost import XGBClassifier

In [361]:
xgb = XGBClassifier(
                    max_depth=19,
                    min_child_weight=1,
                    subsample=0.85,
                    colsample_bytree=0.85,
                    learning_rate=0.2,
                    random_state=72,
                    n_estimators=50,
                    reg_lambda=1,
                    n_jobs=-1
                    )

xgb.fit(X_train, y_train)
xgb_predicted = xgb.predict_proba(X_test)
roc_auc_score(y_test, xgb_predicted[:,1])

0.9730520291012713

## Train KNN

In [397]:
from sklearn.neighbors import KNeighborsClassifier

In [398]:
knn = KNeighborsClassifier(n_neighbors=100, 
                           weights='distance',
                           p=1,
                          n_jobs=-1)

In [399]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=100, p=1,
           weights='distance')

In [400]:
knn_predicted = knn.predict_proba(X_test)

In [401]:
roc_auc_score(y_test, knn_predicted[:,1])

0.9618008809692662

## Train RandomForest

In [409]:
from sklearn.ensemble import RandomForestClassifier

In [410]:
rf = RandomForestClassifier(n_estimators=800,
                           max_depth=25,
                            random_state=72,
                            n_jobs=-1,
                            min_samples_split=5,
                            class_weight='balanced'
                           )

In [411]:
%%time
rf.fit(X_train, y_train)

CPU times: user 5min 25s, sys: 7.73 s, total: 5min 33s
Wall time: 16 s


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=25, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            n_estimators=800, n_jobs=-1, oob_score=False, random_state=72,
            verbose=0, warm_start=False)

In [412]:
rf_predicted = rf.predict_proba(X_test)

In [413]:
roc_auc_score(y_test, rf_predicted[:,1])

0.9681211131353059

## Averaging

In [414]:
knn = KNeighborsClassifier(n_neighbors=100, 
                           weights='distance',
                           p=1,
                          n_jobs=-1)
knn.fit(X_train, y_train)
knn_predicted = knn.predict_proba(X_test)[:, 1]

xgb = XGBClassifier(
                    max_depth=19,
                    min_child_weight=1,
                    subsample=0.85,
                    colsample_bytree=0.85,
                    learning_rate=0.2,
                    random_state=72,
                    n_estimators=50,
                    reg_lambda=1,
                    n_jobs=-1
                    )

xgb.fit(X_train, y_train)
xgb_predicted = xgb.predict_proba(X_test)[:, 1]

rf = RandomForestClassifier(n_estimators=800,
                           max_depth=25,
                            n_jobs=-1,
                            min_samples_split=5,
                            class_weight='balanced'
                           )
rf.fit(X_train, y_train)
rf_predicted = rf.predict_proba(X_test)[:, 1]

df_avg = pd.DataFrame({'knn':knn_predicted, 
                       'xgb':xgb_predicted, 
                       'rf':rf_predicted
                      })  

df_avg['avg'] = (df_avg.knn + df_avg.xgb + df_avg.rf) / 3

df_avg.drop(['knn', 'xgb', 'rf'], axis=1, inplace=True)

In [450]:
roc_auc_score(y_test, df_avg)

0.9668606388767934

## Predict holdout

In [22]:
holdout_file = r'test.csv'

In [443]:
# repeat preproc procedures for test data
all_lists = []
with open(holdout_file, 'r', encoding='UTF-8') as holdout:
    for line in holdout.readlines()[1:]:
        soup = BeautifulSoup(line, "html5lib")
        all_lists.append(soup.text.split('\t'))

holdout = pd.DataFrame(all_lists, columns=['num', 'name', 'descr']).drop('num', axis=1)

In [445]:
holdout.head()

Unnamed: 0,name,descr
0,Дизайнер-консультант мебели,"Обязанности: Работа с клиентом в салоне,выезд..."
1,Продавец-консультант (ТЦ на Пушкина),Обязанности: ∙ консультирование покупателей по...
2,Менеджер по продажам,Торговый Дом «Форт» это ведущая компания Петер...
3,Продавец-консультант в магазин одежды (ТЦ Волн...,Требуются продавцы консультанты в магазин женс...
4,Специалист по охране труда,Обязанности: осуществление контроля по соблю...


In [446]:
holdout.dropna(inplace=True)
df_holdout = preproc(holdout)

In [448]:
with open('holdout_preproc.dump', 'wb') as holdout_preproc_dump:
    pickle.dump(df_holdout, holdout_preproc_dump)

In [476]:
with open('holdout_preproc.dump', 'rb') as holdout_preproc_dump:
    df_holdout = pickle.load(holdout_preproc_dump)

In [477]:
df_holdout.head()

Unnamed: 0,namevec,descrvec
0,"[-0.6419412, -1.829832, 2.7466059, 0.8133362, ...","[0.24646637, 0.3046802, -0.17986065, 0.2290453..."
1,"[-1.3852514, 0.28667063, -0.65046287, 0.525823...","[0.82405066, 1.1942462, 0.76160324, -0.872466,..."
2,"[2.74303, 2.4228055, -1.0153161, 0.69004047, -...","[0.628331, -0.070105955, 0.5001549, -0.5405085..."
3,"[0.20255001, 0.08419422, 0.7809986, 0.26706052...","[0.8405272, 0.5327271, 0.4663898, -0.80236864,..."
4,"[-0.63922423, -4.741979, -3.1055737, 0.7819421...","[-0.044651005, -0.4690158, -1.1494944, -0.2467..."


In [478]:
df_holdout = addcos(df_holdout)

In [479]:
df_holdout.head()

Unnamed: 0,cos_dist_name,cos_dist_descr
0,0.634055,0.639985
1,0.675732,0.744681
2,0.851968,0.650315
3,0.592465,0.636986
4,0.079021,-0.140517


In [480]:
# df_holdout.dropna(inplace=True)

In [481]:
# df_holdout.cos_dist_name.fillna(df_holdout.cos_dist_descr, inplace=True)
# df_holdout.cos_dist_descr.fillna(df_holdout.cos_dist_name, inplace=True)

In [482]:
xgb = XGBClassifier(
                    max_depth=19,
                    min_child_weight=1,
                    subsample=0.85,
                    colsample_bytree=0.85,
                    learning_rate=0.2,
                    random_state=72,
                    n_estimators=50,
                    reg_lambda=1,
                    n_jobs=-1
                    )

xgb.fit(X, y)
xgb_predicted = xgb.predict_proba(df_holdout)[:, 1]

In [483]:
df_holdout = df_holdout.fillna(method='ffill', axis=0)

In [484]:
rf = RandomForestClassifier(n_estimators=800,
                           max_depth=25,
                            n_jobs=-1,
                            min_samples_split=5,
                            class_weight='balanced'
                           )
rf.fit(X, y)
rf_predicted = rf.predict_proba(df_holdout)[:, 1]

In [485]:
knn = KNeighborsClassifier(n_neighbors=100, 
                           weights='distance',
                           p=1,
                          n_jobs=-1)
knn.fit(X, y)
knn_predicted = knn.predict_proba(df_holdout)[:, 1]



df_avg = pd.DataFrame({'knn':knn_predicted, 
                       'xgb':xgb_predicted, 
                       'rf':rf_predicted
                      })  

In [486]:
df_avg['avg'] = (df_avg.knn + df_avg.xgb + df_avg.rf) / 3

df_avg.drop(['knn', 'xgb', 'rf'], axis=1, inplace=True)

In [502]:
holdout_predict = pd.DataFrame({'id':list(range(200000,200000+len(df_avg))), 
                                'target':df_avg['avg'].tolist()})

In [503]:
holdout_predict.to_csv('res_predict_3_avg.csv', index=False)