In [1]:
%load_ext autoreload
%autoreload 2

import os
import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, TimeSeriesSplit
from sklearn.metrics import r2_score, mean_absolute_error

import catboost as ctb
from catboost import CatBoostRegressor, cv, Pool
from catboost.utils import select_threshold, get_fpr_curve, get_roc_curve

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import download
from nltk.corpus import stopwords
from pymystem3 import Mystem
import re
import gensim
from fse import IndexedList
from fse.models import Average
from gensim.models import KeyedVectors

from Levenshtein import distance as levenshtein_distance
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV
from nltk.stem import SnowballStemmer

from gensim.models import Word2Vec
import gensim.downloader as api
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

bad_symbols_re = re.compile('[,.«»!"#$%&\'()*+/:;<=>?@[\\]^_`{|}~]')
stopwords = stopwords.words(['russian', 'english'])
mystem = Mystem()

import sys
sys.path.append("/data1/vovan/shared_code/")
import shared_utils
import utils

%matplotlib inline
pd.set_option('max_column', None)

**data**

In [2]:
# load data
train_df = pd.read_csv("./data/train_issues.csv")
train_comment_df = pd.read_csv("./data/train_comments.csv")
test_df = pd.read_csv("./data/test_issues.csv")
test_comment_df = pd.read_csv("./data/test_comments.csv")
emp_df = pd.read_csv("./data/employees.csv")
solution_df = pd.read_csv("./data/sample_solution.csv")
summary_translated_df = pd.read_csv('./data/summary_translated.csv')

# add comments texts
train_comment_df['text_padded'] = train_comment_df['text'] + '. '
test_comment_df['text_padded'] = test_comment_df['text'] + '. '

train_df = train_df.merge(train_comment_df.groupby(['issue_id'], as_index = False)['text_padded'].sum()\
    .rename(columns = {'text_padded':'comments', 'issue_id':'id'}), on = ['id'], how = 'left', validate = '1:1')

test_df = test_df.merge(test_comment_df.groupby(['issue_id'], as_index = False)['text_padded'].sum()\
    .rename(columns = {'text_padded':'comments', 'issue_id':'id'}), on = ['id'], how = 'left', validate = '1:1')

# combine train / test
data_df = train_df.append(test_df, ignore_index = True)[['id', 'summary', 'created', 'project_id', 'comments', 'overall_worklogs']]
data_df['comments'].fillna('', inplace = True)
data_df['comments'] = data_df['comments'].map(lambda x: x.replace('\n', ' '))
data_df['comments'] = data_df['comments'].map(lambda x: x.replace('\t', ' '))
data_df['comments'] = data_df['comments'].map(lambda x: x.replace('\r', ' '))
data_df['summary_plus_comments'] = data_df['summary'] + '. ' + data_df['comments']

# add translation
data_df = data_df.merge(summary_translated_df, on = ['id'], how = 'left', validate = '1:1')

data_df.head(3)

Unnamed: 0,id,summary,created,project_id,comments,overall_worklogs,summary_plus_comments,summary_lang,summary_lang_fix,summary_translated
0,819952,"UI тесты по заказу ""Добро КейДжи""",2019-10-01 05:57:18.000,5,,1800.0,"UI тесты по заказу ""Добро КейДжи"".",ru,ru,UI tests commissioned by Dobro KG
1,819949,"UI тесты раздела ""Профиль""",2019-10-01 05:59:45.000,5,Приверила и приняла MR .,7200.0,"UI тесты раздела ""Профиль"". Приверила и принял...",ru,ru,"UI tests of the ""Profile"" section"
2,819947,"UI тесты раздела ""Личный счет""",2019-10-01 06:00:38.000,5,,14400.0,"UI тесты раздела ""Личный счет"".",ru,ru,"UI tests of the section ""Personal account"""


**generate preprocessed corpuses**

In [7]:
for text_col in ['summary', 'summary_translated', 'summary_plus_comments']:
    data_group_df = data_df[['id', text_col]].copy()
    
    for preprocess_tech in ['lemma', 'stem']:
        filename = text_col
        
        # get preprocessed data
        if preprocess_tech == 'lemma':
            data_group_df[f'text_preprocessed'] = data_group_df[text_col]\
                .map(lambda x: utils._text_preprocess(x))
            filename += '_lemma'
            
        elif preprocess_tech == 'stem':
            if 'translated' not in text_col:
                # double stemmer
                stemmer = SnowballStemmer(language='russian')
                data_group_df[f'text_preprocessed'] = data_group_df[text_col]\
                    .map(lambda x: utils._text_preprocess_stem(x, stemmer))
                stemmer = SnowballStemmer(language='english')
                data_group_df[f'text_preprocessed'] = data_group_df[f'text_preprocessed']\
                    .map(lambda x: utils._text_preprocess_stem(x, stemmer))
            else:
                # only english stemmer
                stemmer = SnowballStemmer(language='english')
                data_group_df[f'text_preprocessed'] = data_group_df[text_col]\
                    .map(lambda x: utils._text_preprocess_stem(x, stemmer))
            filename += '_stem'
        
        
        data_group_df[['id', f'text_preprocessed']]\
            .to_csv('./data/text_preprocess/' + filename + '.csv', index = False)    

**linreg val function**

In [3]:
def linreg_cv(merge_train_df):

    kfold_split = KFold(n_splits=10, shuffle=True, random_state=42)
    kfold_res = []
    pred_ids = []
    pred_result = []
    for train_index, test_index in kfold_split.split(merge_train_df):

        X_train, y_train = merge_train_df.iloc[train_index].drop(['id', 'overall_worklogs'], 1),  merge_train_df.iloc[train_index]['overall_worklogs']
        X_test, y_test = merge_train_df.iloc[test_index].drop(['id', 'overall_worklogs'], 1),  merge_train_df.iloc[test_index]['overall_worklogs']

        linreg = ElasticNet(alpha=0.1, l1_ratio=0.9) # LinearRegression()
        linreg.fit(X_train, y_train)
        preds = linreg.predict(X_test)
        val_res = r2_score(y_test, preds)
        kfold_res.append(val_res)

    return np.mean(kfold_res)

**generate tf-idfs**

In [12]:
text_dir = './data/text_preprocess/'
text_files = os.listdir(text_dir)

preprocess_result = []
for file in text_files:
    if '.csv' not in file:
        continue
        
    data_group_df = pd.read_csv(text_dir + file)
    
    for max_features in [25, 50, 100, 150, 275, 500, 1000]:
    
        vectorizer = TfidfVectorizer(max_features=max_features)#, ngram_range=(1, 3))
        X = vectorizer.fit_transform(data_group_df['text_preprocessed'])
        tfidf_features = ['tf_idf_' + x for x in vectorizer.get_feature_names()]
        summary_tfidf_df = pd.DataFrame(X.todense().tolist(), columns=tfidf_features)
        merge_df = pd.concat([data_group_df[['id']], summary_tfidf_df], axis = 1)
        
        # check features using linreg
        merge_df = merge_df.merge(data_df[['id', 'overall_worklogs']], on = ['id'], how = 'left', validate = '1:1')
        merge_train_df = merge_df[pd.notnull(merge_df['overall_worklogs'])].copy()
        cv_result = linreg_cv(merge_train_df)
        
        # save 
        file_save = f'./data/tf-idf/{file.split(".")[0]}_tfidf_{max_features}.csv'
        preprocess_result.append([file_save, cv_result])
        
#         merge_df.to_csv(file_save, index = False)

preprocess_result_df = pd.DataFrame(preprocess_result, columns = ['filename', 'cv_result'])
preprocess_result_df.sort_values(['cv_result'], ascending = False).head(5)

Unnamed: 0,filename,cv_result
18,./data/tf-idf/summary_translated_stem_tfidf_27...,0.053534
19,./data/tf-idf/summary_translated_stem_tfidf_50...,0.05262
20,./data/tf-idf/summary_translated_stem_tfidf_10...,0.050185
12,./data/tf-idf/summary_translated_lemma_tfidf_5...,0.04347
13,./data/tf-idf/summary_translated_lemma_tfidf_1...,0.042735


**generate bow**

In [119]:
text_dir = './data/text_preprocess/'
text_files = os.listdir(text_dir)

# preprocess_result = []
for file in text_files:
    if '.csv' not in file:
        continue
        
    data_group_df = pd.read_csv(text_dir + file)
    
    for max_features in [25, 50, 100, 150, 250, 500, 1000]:
    
        vectorizer = CountVectorizer(max_features=max_features)
        X = vectorizer.fit_transform(data_group_df['text_preprocessed'])
        tfidf_features = ['bow_' + x for x in vectorizer.get_feature_names()]
        summary_tfidf_df = pd.DataFrame(X.todense().tolist(), columns=tfidf_features)
        merge_df = pd.concat([data_group_df[['id']], summary_tfidf_df], axis = 1)
        
        # check features using linreg
        merge_df = merge_df.merge(data_df[['id', 'overall_worklogs']], on = ['id'], how = 'left', validate = '1:1')
        merge_train_df = merge_df[pd.notnull(merge_df['overall_worklogs'])].copy()
        cv_result = linreg_cv(merge_train_df)
        
        # save 
        file_save = f'./data/bow/{file.split(".")[0]}_bow_{max_features}.csv'
        preprocess_result.append([file_save, cv_result])
        
#         merge_df.to_csv(file_save, index = False)

preprocess_result_df = pd.DataFrame(preprocess_result, columns = ['filename', 'cv_result'])
preprocess_result_df.sort_values(['cv_result'], ascending = False).head(5)

**similarity w2v features**

In [32]:
text_dir = './data/text_preprocess/'
text_files = os.listdir(text_dir)

preprocess_result = []
for file in tqdm.tqdm(text_files, total = len(text_files)):
    if '.csv' not in file:
        continue
        
    data_group_df = pd.read_csv(text_dir + file)
    data_group_df = data_group_df.merge(data_df[['id', 'created', 'project_id', 'overall_worklogs']], how = 'left', on = ['id'], validate = '1:1')
    
    # train w2v model
    summary_data = dict(zip(data_group_df['id'], data_group_df['text_preprocessed'])) 
    summary_model = utils._train_w2v_model(summary_data)

    # collect similar issues time & similarity
    result = []
    for issue_id, group in data_group_df.groupby(['id']):

        issue_created = group['created'].values[0]
        issue_project = group['project_id'].values[0]

        possible_items_dict = utils._get_possible_products_pairs_v2(issue_id, summary_model, 25)
        possible_items_group = pd.DataFrame(possible_items_dict.keys(), columns = ['id'])\
            .merge(data_group_df, on = 'id', how = 'left')
        possible_items_group['similarity'] = possible_items_dict.values()

        # keep only created before main issue and from train
        possible_items_group = possible_items_group[(possible_items_group['created'] < issue_created)
                                                   &(pd.notnull(possible_items_group['overall_worklogs']))
                                                   &(possible_items_group['similarity'] > 0)]
        possible_items_group['time_since_issue'] = (pd.to_datetime(issue_created) - pd.to_datetime(possible_items_group['created'])).dt.total_seconds()
        if possible_items_group.shape[0] > 0:
            result_local = [issue_id]
            for keep_n in [1, 3, 5, 25, possible_items_group.shape[0]]:
                similar_time = np.median(possible_items_group['overall_worklogs'].values[:keep_n])
                similar_similarity = np.median(possible_items_group['similarity'].values[:keep_n])
                similar_time_since = np.median(possible_items_group['time_since_issue'].values[:keep_n])
                result_local.extend([similar_time, similar_similarity, similar_time_since])
            result.append(result_local)

    # merge info
    merge_df = pd.DataFrame(result, columns = ['id', 'sim_time_1', 'sim_sim_1', 'sim_since_1',
                                               'sim_time_3', 'sim_sim_3', 'sim_since_3', 
                                               'sim_time_5', 'sim_sim_5', 'sim_since_5', 
                                               'sim_time_25', 'sim_sim_25', 'sim_since_25', 
                                               'sim_time_all', 'sim_sim_all', 'sim_since_all'])
    
    file_save = f'./data/similarity_w2v/{file.split(".")[0]}_basic.csv'
    merge_df.to_csv(file_save, index = False)

**similarity intersect features**

In [None]:
text_dir = './data/text_preprocess/'
text_files = [x for x in os.listdir(text_dir) if 'comments' not in x]

preprocess_result = []
for file in tqdm.tqdm(text_files, total = len(text_files)):
    if '.csv' not in file:
        continue
        
    data_group_df = pd.read_csv(text_dir + file)
    data_group_df = data_group_df.merge(data_df[['id', 'created', 'project_id', 'overall_worklogs']], how = 'left', on = ['id'], validate = '1:1')
    
    # mask by create date
    df1 = data_group_df[['id', 'text_preprocessed', 'created']].copy()
    df1['key'] = 0
    df2 = df1.copy()
    df_merge = df1.merge(df2, on='key', how='outer')
    df_merge= df_merge[(df_merge.created_x > df_merge.created_y)].copy()

    # cross join
    df_merge['summary_intersect'] = df_merge.apply(
            lambda x: len(set(x['text_preprocessed_x'].split(' '))\
                  .intersection(x['text_preprocessed_y'].split(' '))), axis=1)

    # add data
    df_merge = df_merge.merge(data_group_df[['id', 'overall_worklogs', 'created']].rename(columns = {'id':'id_y'}), 
                              on = ['id_y'], how = 'left')
    df_merge = df_merge[pd.notnull(df_merge['overall_worklogs'])]
    df_merge = df_merge.sort_values(['id_x', 'summary_intersect'], ascending = False)
    df_merge['text_preprocessed_x_len'] = df_merge['text_preprocessed_x'].map(lambda x: len(x.split(' ')))
    df_merge['summary_intersect_rel'] = df_merge['summary_intersect'] / df_merge['text_preprocessed_x_len']
    df_merge['time_since_issue'] = (pd.to_datetime(df_merge['created_x']) - pd.to_datetime(df_merge['created_y'])).dt.total_seconds()

    # keep top-т
    df_merge_top= df_merge[df_merge['summary_intersect'] > 0].groupby(['id_x']).head(25)
    df_merge_top = df_merge_top[['id_x', 'overall_worklogs', 'summary_intersect_rel', 'time_since_issue']].rename(columns = {'id_x':'id'})

    # get stats 
    result = []
    for issue_id, possible_items_group in df_merge_top.groupby(['id']):
        result_local = [issue_id]
        for keep_n in [1, 3, 5, 25, possible_items_group.shape[0]]:
            similar_time = np.median(possible_items_group['overall_worklogs'].values[:keep_n])
            similar_similarity = np.median(possible_items_group['summary_intersect_rel'].values[:keep_n])
            similar_time_since = np.median(possible_items_group['time_since_issue'].values[:keep_n])
            result_local.extend([similar_time, similar_similarity, similar_time_since])
        result.append(result_local)

    # save result
    merge_df = pd.DataFrame(result, columns = ['id', 'sim_time_1', 'sim_sim_1', 'sim_since_1',
                                               'sim_time_3', 'sim_sim_3', 'sim_since_3', 
                                               'sim_time_5', 'sim_sim_5', 'sim_since_5', 
                                               'sim_time_25', 'sim_sim_25', 'sim_since_25', 
                                               'sim_time_all', 'sim_sim_all', 'sim_since_all'])

    file_save = f'./data/similarity_w2v/{file.split(".")[0]}_intersect_lev.csv'
    merge_df.to_csv(file_save, index = False)

 40%|████      | 2/5 [25:49<38:14, 764.69s/it]

**similarity pre-trained w2v**

In [180]:
# load glove vectors
# glove_vectors = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

# get data
text_dir = './data/text_preprocess/'
text_files = os.listdir(text_dir)

preprocess_result = []
for file in tqdm.tqdm(text_files, total = len(text_files)):
    if '.csv' not in file:
        continue
        
    data_group_df = pd.read_csv(text_dir + file)
    data_group_df = data_group_df.merge(data_df[['id', 'created', 'project_id', 'overall_worklogs']], how = 'left', on = ['id'], validate = '1:1')
    
    # train w2v model
    summary_data = dict(zip(data_group_df['id'], data_group_df['text_preprocessed']))
    words = list(map(lambda x: x.split(' '), summary_data.values()))
    words = [[x for x in sent if x!= ''] for sent in words]
    
    # build a word2vec model on your dataset
    base_model = gensim.models.Word2Vec(
        vector_size=300, window=5, workers=1,
        min_count=5, epochs=25, seed=42, sg=1, negative=5,)
    base_model.build_vocab(words)
    total_examples = base_model.corpus_count
    
    # add GloVe's vocabulary & weights
    base_model.build_vocab([list(glove_vectors.key_to_index.keys())], update=True)

    # get avg model
    base_model = Average(base_model)
    base_model.train(IndexedList(words))
    base_model.sv.vocab = dict(zip(summary_data.keys(), base_model.sv.vectors))
    vector_length = 300
    summary_model = KeyedVectors(vector_length)
    key_list = list(base_model.sv.vocab.keys())
    vector_list = list(base_model.sv.vocab.values())
    summary_model.add_vectors(key_list, vector_list)
    
    # collect similar issues time & similarity
    result = []
    for issue_id, group in data_group_df.groupby(['id']):

        issue_created = group['created'].values[0]
        issue_project = group['project_id'].values[0]

        possible_items_dict = utils._get_possible_products_pairs_v2(issue_id, summary_model, 25)
        possible_items_group = pd.DataFrame(possible_items_dict.keys(), columns = ['id'])\
            .merge(data_group_df, on = 'id', how = 'left')
        possible_items_group['similarity'] = possible_items_dict.values()

        # keep only created before main issue and from train
        possible_items_group = possible_items_group[(possible_items_group['created'] < issue_created)
                                                   &(pd.notnull(possible_items_group['overall_worklogs']))
                                                   &(possible_items_group['similarity'] > 0)]
        possible_items_group['time_since_issue'] = (pd.to_datetime(issue_created) - pd.to_datetime(possible_items_group['created'])).dt.total_seconds()
        if possible_items_group.shape[0] > 0:
            result_local = [issue_id]
            for keep_n in [1, 3, 5, 25, possible_items_group.shape[0]]:
                similar_time = np.median(possible_items_group['overall_worklogs'].values[:keep_n])
                similar_similarity = np.median(possible_items_group['similarity'].values[:keep_n])
                similar_time_since = np.median(possible_items_group['time_since_issue'].values[:keep_n])
                result_local.extend([similar_time, similar_similarity, similar_time_since])
            result.append(result_local)

    # merge info
    merge_df = pd.DataFrame(result, columns = ['id', 'sim_time_1', 'sim_sim_1', 'sim_since_1',
                                               'sim_time_3', 'sim_sim_3', 'sim_since_3', 
                                               'sim_time_5', 'sim_sim_5', 'sim_since_5', 
                                               'sim_time_25', 'sim_sim_25', 'sim_since_25', 
                                               'sim_time_all', 'sim_sim_all', 'sim_since_all'])
    
    file_save = f'./data/similarity_w2v/{file.split(".")[0]}_w2v_pretrained.csv'
    merge_df.to_csv(file_save, index = False)

100%|██████████| 7/7 [11:59<00:00, 102.79s/it]


**choose best similarity type by corr with target**

In [182]:
sim_dir = './data/similarity_w2v/'
sim_files = os.listdir(sim_dir)

preprocess_result = []
for file in sim_files:
    if file == '.ipynb_checkpoints':
        continue
        
    sim_df = pd.read_csv(sim_dir + file)
    sim_df = sim_df.merge(data_df[['id', 'overall_worklogs']], on =['id'], how = 'left', validate= '1:1')
    
    sim_train_df = sim_df[pd.notnull(sim_df['overall_worklogs'])].copy()
    sim_test_df = sim_df[pd.isnull(sim_df['overall_worklogs'])].copy()
    
    for keep_n in ['1', '3', '5', '25', 'all']:
        
        sim_time_train = sim_train_df[f'sim_time_{keep_n}']
        sim_time_test = sim_test_df[f'sim_time_{keep_n}']
#         sim_sim_train = sim_train_df[f'sim_time_{keep_n}']
#         sim_since_train = sim_train_df[f'sim_since_{keep_n}']
        
        coverage_train = np.round(sim_time_train.dropna().shape[0] / train_df.shape[0] * 100, 1)
        coverage_test = np.round(sim_time_test.dropna().shape[0] / test_df.shape[0] * 100, 1)
        
        r2_train = np.round(r2_score(sim_train_df['overall_worklogs'], sim_time_train), 3)
        corr_train = np.round(sim_train_df[['overall_worklogs', f'sim_time_{keep_n}']].corr().values[0][1], 3)
        
        preprocess_result.append([file, keep_n, r2_train])

preprocess_result_df = pd.DataFrame(preprocess_result, columns = ['filename', 'keep_n', 'r2_train'])
preprocess_result_df.sort_values(['r2_train'], ascending = False).head(5)

Unnamed: 0,filename,keep_n,r2_train
26,summary_plus_comments_lemma_w2v_pretrained.csv,3,0.296
1,summary_translated_stem_intersect_lev.csv,3,0.293
16,summary_stem_intersect_lev.csv,3,0.289
41,summary_translated_lemma_w2v_pretrained.csv,3,0.288
66,summary_lemma_w2v_pretrained.csv,3,0.287


**improving linreg funciton**

In [117]:
# data_group_df = pd.read_csv('./data/bow/summary_translated_lemma_bow_100.csv')
data_group_df = pd.read_csv('./data/tf-idf/summary_translated_stem_tfidf_250.csv')
data_group_df = data_group_df.merge(data_df[['id', 'overall_worklogs']], on = ['id'], how = 'left', validate = '1:1')
data_group_train_df = data_group_df[pd.notnull(data_group_df['overall_worklogs'])].copy()

kfold_split = KFold(n_splits=10, shuffle=True, random_state=42)
# kfold_split = TimeSeriesSplit(n_splits=10)
kfold_res = []
pred_ids = []
pred_result = []
for train_index, test_index in kfold_split.split(data_group_train_df):

    X_train, y_train = data_group_train_df.iloc[train_index].drop(['id', 'overall_worklogs'], 1),  data_group_train_df.iloc[train_index]['overall_worklogs']
    X_test, y_test = data_group_train_df.iloc[test_index].drop(['id', 'overall_worklogs'], 1),  data_group_train_df.iloc[test_index]['overall_worklogs']

    linreg = ElasticNet(alpha=0.1, l1_ratio=0.9)
    linreg.fit(X_train, y_train)
    preds = linreg.predict(X_test)
    val_res = r2_score(y_test, preds)
    kfold_res.append(val_res)

np.mean(kfold_res)

0.053109398189843816

**predict with linreg**

- file = './data/tf-idf/summary_translated_stem_tfidf_275.csv
- cv = 0.053109398189843816
- public = -0.106485

In [109]:
data_group_df = pd.read_csv('./data/tf-idf/summary_translated_stem_tfidf_275.csv')
data_group_train_df = data_group_df[pd.notnull(data_group_df['overall_worklogs'])].copy()
data_group_test_df = data_group_df[pd.isnull(data_group_df['overall_worklogs'])].copy()

# fit
linreg = ElasticNet(alpha=0.1, l1_ratio=0.9)
linreg.fit(data_group_train_df.drop(['id', 'overall_worklogs'], 1), data_group_train_df['overall_worklogs'])

# predict
data_group_test_df['overall_worklogs'] = linreg.predict(data_group_test_df.drop(['id', 'overall_worklogs'], 1))

# save
save_df = solution_df.drop(['overall_worklogs'], 1)\
    .merge(data_group_test_df[['id', 'overall_worklogs']], on = ['id'], how = 'inner', validate= '1:1')
assert save_df.shape[0] == solution_df.shape[0]
assert all(save_df['id'] == solution_df['id'])
save_df['overall_worklogs'] = save_df['overall_worklogs'].apply(lambda x: max(60, x))
save_df.to_csv('./result/simple_submission.csv', index = False)
save_df.head(3)