In [19]:
%load_ext autoreload
%autoreload 2

import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, TimeSeriesSplit
from sklearn.metrics import r2_score, mean_absolute_error

import catboost as ctb
from catboost import CatBoostRegressor, cv, Pool
from catboost.utils import select_threshold, get_fpr_curve, get_roc_curve

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import download
from nltk.corpus import stopwords
from pymystem3 import Mystem
import re
import gensim
from fse import IndexedList
from fse.models import Average
from gensim.models import KeyedVectors

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV

from translate import Translator
from nltk.stem import SnowballStemmer
from langdetect import detect
from deep_translator import GoogleTranslator

# Character class matching the punctuation / special symbols treated as "bad"; compiled once and reused.
bad_symbols_re = re.compile('[,.«»!"#$%&\'()*+/:;<=>?@[\\]^_`{|}~]')
# NOTE: this rebinds the name `stopwords` from the imported nltk corpus reader to the value
# returned by .words() for the 'russian' and 'english' lists; the reader itself is no longer
# reachable under this name after this line.
stopwords = stopwords.words(['russian', 'english'])
# Single shared Mystem instance (pymystem3).
mystem = Mystem()

import sys
# NOTE(review): absolute, machine-specific directory appended to the module search path;
# the two imports below presumably resolve from it — confirm before running elsewhere.
sys.path.append("/data1/vovan/shared_code/")
import shared_utils
import utils

%matplotlib inline
# Display every column of a frame. The fully qualified key is used because the abbreviated
# pattern 'max_column' matches more than one option on pandas versions that also define
# 'styler.render.max_columns', which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [48]:
# load the raw tables: issues with their comment threads (train, then test),
# followed by the employee table and the sample submission
train_df, train_comment_df = (
    pd.read_csv("./data/train_issues.csv"),
    pd.read_csv("./data/train_comments.csv"),
)
test_df, test_comment_df = (
    pd.read_csv("./data/test_issues.csv"),
    pd.read_csv("./data/test_comments.csv"),
)
emp_df, solution_df = (
    pd.read_csv("./data/employees.csv"),
    pd.read_csv("./data/sample_solution.csv"),
)

# add comments texts
def _comments_per_issue(comment_df):
    """Return one row per issue: `id` plus `comments`, the summed (concatenated) padded texts."""
    per_issue = comment_df.groupby(['issue_id'], as_index = False)['text_padded'].sum()
    return per_issue.rename(columns = {'text_padded':'comments', 'issue_id':'id'})

# every comment gets a '. ' suffix so that the concatenated texts stay separated
for comment_df in (train_comment_df, test_comment_df):
    comment_df['text_padded'] = comment_df['text'] + '. '

# left merge: issues that have no rows in the comment table end up with a missing `comments` value
train_df = train_df.merge(_comments_per_issue(train_comment_df), on = ['id'], how = 'left', validate = '1:1')
test_df = test_df.merge(_comments_per_issue(test_comment_df), on = ['id'], how = 'left', validate = '1:1')

# combine train / test
# pd.concat replaces DataFrame.append, which was deprecated and later removed from pandas.
data_df = pd.concat([train_df, test_df], ignore_index = True)[['id', 'summary', 'comments']]
# Assign the filled column back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form does not update the frame under copy-on-write.
data_df['comments'] = data_df['comments'].fillna('')
data_df.head(3)

Unnamed: 0,id,summary,comments
0,819952,"UI тесты по заказу ""Добро КейДжи""",
1,819949,"UI тесты раздела ""Профиль""",Приверила и приняла MR\n\n .
2,819947,"UI тесты раздела ""Личный счет""",


In [54]:
# detect language
def fix_lang(x):
    """Collapse a detected language code into one of 'ru', 'vi' or 'en'.

    'ru', 'bg', 'mk' and 'uk' are all reported as 'ru' (presumably these are
    misdetections of Russian text), 'vi' is kept as is, and every other value
    falls back to 'en'.
    """
    russian_like = ('ru', 'bg', 'mk', 'uk')
    kept_as_is = ('vi',)

    if x in russian_like:
        return 'ru'
    return 'vi' if x in kept_as_is else 'en'

# langdetect is non-deterministic unless its factory is seeded; pin the seed before
# detecting so the language codes are reproducible across kernel restarts.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# raw detected code per summary, then the code collapsed by fix_lang
data_df['summary_lang'] = data_df['summary'].map(detect)
data_df['summary_lang_fix'] = data_df['summary_lang'].map(fix_lang)

# Disabled: the same detection applied to the comments raised errors when it was tried.
# NOTE(review): presumably detect() fails on comment texts without usable letters —
# confirm the cause before re-enabling.
# data_df['comments_lang'] = data_df['comments'].map(lambda x: detect(x) if x.strip() != '' else None)
# data_df['comments_lang_fix'] = data_df['comments_lang'].map(lambda x: fix_lang(x) if x is not None else None)

In [57]:
# translate summary
# The translator is configured once and reused for every request instead of being
# rebuilt on each iteration.
translator = GoogleTranslator(source='auto', target='en')

# only the summaries whose collapsed language code is not 'en' are sent for translation
todo_df = data_df[data_df['summary_lang_fix'] != 'en']

# [issue id, translated summary] pairs, later turned into a frame keyed by `id`
result = []
# iterate over the two needed columns directly rather than building a Series per row with iterrows()
for issue_id, summary_text in tqdm.tqdm(zip(todo_df['id'], todo_df['summary']), total = todo_df.shape[0]):
    result.append([issue_id, translator.translate(summary_text)])

100%|██████████| 1625/1625 [16:49<00:00,  1.61it/s]


In [58]:
# merge
result_df = pd.DataFrame(result, columns = ['id', 'summary_translated'])
# Drop a 'summary_translated' column left by an earlier run of this cell, so that running
# the cell again does not produce suffixed duplicates (_x / _y) and a KeyError below.
data_df = data_df.drop(columns = ['summary_translated'], errors = 'ignore')\
    .merge(result_df, on = ['id'], how = 'left', validate = '1:1')
# Rows without a translation (not sent to the translator) keep their original summary.
# Assigned back rather than filled in place: the chained in-place form does not update
# the frame under copy-on-write.
data_df['summary_translated'] = data_df['summary_translated'].fillna(data_df['summary'])

In [67]:
# save the issue id together with the detected language codes and the translated summary
export_columns = ['id', 'summary_lang', 'summary_lang_fix', 'summary_translated']
export_df = data_df[export_columns]
export_df.to_csv('./data/summary_translated.csv', index = False)