In [32]:
import pandas as pd
import numpy as np

import re
import string

import requests

import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.model_selection import train_test_split

from joblib import dump, load

# pip install transliterate
from transliterate import translit, get_available_language_codes


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Read the tab-separated file of labelled name pairs and preview its first rows.
train_path = 'data/train_data.tsv'
train = pd.read_csv(train_path, sep='\t')
train.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name,answer
0,0,"Общество с ограниченной ответственностью ""ЕВА""","""Langford Travel""",False
1,1,"Общество с ограниченной ответственностью ""ФОРВ...",AMARA,False
2,2,"ООО ""Меллита""","""MAX-TRANS"" Limited liability company",False
3,3,"ЗАКРЫТОЕ АКЦИОНЕРНОЕ ОБЩЕСТВО ""ПЕТОЙЛ"" (открыт...","""Computational Technologies"" Ltd",False
4,4,"Общество с ограниченной ответственностью ""Конц...","""Ad.Arty""",False


In [15]:
# Lowercase both name columns
for column in ('ru_name', 'eng_name'):
  train[column] = train[column].str.lower()

train.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name,answer
0,0,"общество с ограниченной ответственностью ""ева""","""langford travel""",False
1,1,"общество с ограниченной ответственностью ""форв...",amara,False
2,2,"ооо ""меллита""","""max-trans"" limited liability company",False
3,3,"закрытое акционерное общество ""петойл"" (открыт...","""computational technologies"" ltd",False
4,4,"общество с ограниченной ответственностью ""конц...","""ad.arty""",False


In [16]:
# Replace every character listed in string.punctuation with a single space

punctuation_class = '[%s]' % re.escape(string.punctuation)
regex = re.compile(punctuation_class)

def _blank_out_punctuation(text):
  """Return text with each character matched by regex replaced by one space."""
  return regex.sub(' ', text)

for column in ('ru_name', 'eng_name'):
  train[column] = train[column].apply(_blank_out_punctuation)

train.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name,answer
0,0,общество с ограниченной ответственностью ева,langford travel,False
1,1,общество с ограниченной ответственностью форв...,amara,False
2,2,ооо меллита,max trans limited liability company,False
3,3,закрытое акционерное общество петойл открыт...,computational technologies ltd,False
4,4,общество с ограниченной ответственностью конц...,ad arty,False


In [17]:
# Tokenize both name columns into lists of tokens.
# A single tokenizer instance is shared by every row instead of being rebuilt
# for each one, and the callback no longer shadows the built-in `str`.
tokenizer = WordPunctTokenizer()

for column in ('ru_name', 'eng_name'):
  train[column] = train[column].apply(tokenizer.tokenize)

train.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name,answer
0,0,"[общество, с, ограниченной, ответственностью, ...","[langford, travel]",False
1,1,"[общество, с, ограниченной, ответственностью, ...",[amara],False
2,2,"[ооо, меллита]","[max, trans, limited, liability, company]",False
3,3,"[закрытое, акционерное, общество, петойл, откр...","[computational, technologies, ltd]",False
4,4,"[общество, с, ограниченной, ответственностью, ...","[ad, arty]",False


In [18]:
# Stop-word lists used to strip legal-form and other boilerplate tokens from the names.
# Each hand-written list below is extended with NLTK's stop words for that language.

# Tokens removed from ru_name. Entries that are empty or one character long are
# redundant with the len(word) > 1 filter in remove_element; some entries are
# listed more than once, which does not affect membership tests.
common_rus_words = ['', 'общество', 'c', 'c', 'ограниченной', 'ответственностью', 'ооо', 'акционерное', 'зао', 'закрытое', 'компания', 'комиссия', 'ликвидационная',
                    'групп', 'сервис', 'о', 'организация', 'дом', 'оао', 'м', 'открытое', 'сформирована', 'ликвидатора', 'ответственностью',
                    'назначение', 'ао', 'предприятие', 'а', 'некоммерческая', 'некоммерческое', 'нп', 'общественная', 'ликвидации',
                    'г', 'ано', 'автономная', 'ассоциация', 'тд', 'торговый', 'лтд', 'ликвидационный', 'ликвидационной', 'ликвидационном', 'р', 'оо', 'ликвидационного']
                

# Tokens removed from eng_name. The list also contains a few Cyrillic entries and
# several misspelled variants ('liabiliti', 'liabilily', 'iiability', 'kompany', 'stok');
# presumably these occur in the raw eng_name values -- TODO confirm against the data.
common_eng_words = ['', 'company', 'limited', 'ltd', 'liability', 'llc', 'co', 'of', 'stock', 'joint', 'group', 'the', 'closed', 'open', 'ooo', 'oo', 'jsc', 'cjsc', 'house', 'ограниченной',
                    'ответственностью', 'liabiliti', 'public', 'liabilily', 'centre', 'industrial', 'scientific', 'budgetary', 'federal', 'state', 'educational', 'ооо', 'center', 'iiability', 'corporation', 'st', 
                    'kompany', 'global', 'innovations', 'international', 'stok']


# Append NLTK's general-purpose stop words for each language to the custom lists.
common_rus_words = common_rus_words + list(stopwords.words('russian'))
common_eng_words = common_eng_words + list(stopwords.words('english'))

def remove_element(arr, common_words):
  """Return the tokens of arr that are not stop words and are longer than one character.

  Args:
    arr: iterable of string tokens. A non-iterable value (for example the
      NaN/None of a missing name) yields an empty list.
    common_words: collection of tokens to drop. A set or frozenset is used
      as is; any other collection is converted to a set once per call.

  Returns:
    A new list with the surviving tokens in their original order, or [] when
    arr cannot be processed.
  """
  # Hash lookup instead of scanning the whole stop list for every token.
  if isinstance(common_words, (set, frozenset)):
    lookup = common_words
  else:
    lookup = set(common_words)

  try:
    return [word for word in arr if word not in lookup and len(word) > 1]
  except TypeError:
    # arr is not iterable, or holds a token without a length / hash.
    # Only this expected failure is absorbed; anything else propagates
    # instead of being hidden by a bare except.
    return []

# Filter each token list against the stop list of its own language.
for column, stop_words in (('ru_name', common_rus_words), ('eng_name', common_eng_words)):
  train[column] = train[column].apply(lambda tokens: remove_element(tokens, stop_words))

train.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name,answer
0,0,[ева],"[langford, travel]",False
1,1,[форвард],[amara],False
2,2,[меллита],"[max, trans]",False
3,3,"[петойл, открыто, конкурсное, производство]","[computational, technologies]",False
4,4,"[концепт, трейд]","[ad, arty]",False


In [19]:
# Join the token lists back into strings, then transliterate in both directions

def _cyrillic_to_latin(text):
  """Transliterate text with the 'ru' language pack in reversed mode."""
  return translit(text, 'ru', reversed=True)

def _latin_to_cyrillic(text):
  """Transliterate text with the 'ru' language pack in forward mode."""
  return translit(text, 'ru')

for column in ('ru_name', 'eng_name'):
  train[column] = train[column].apply(' '.join)

train['ru_en_transliteration'] = train['ru_name'].apply(_cyrillic_to_latin)
train['en_ru_transliteration'] = train['eng_name'].apply(_latin_to_cyrillic)

train.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name,answer,ru_en_transliteration,en_ru_transliteration
0,0,ева,langford travel,False,eva,лангфорд травел
1,1,форвард,amara,False,forvard,амара
2,2,меллита,max trans,False,mellita,маx транс
3,3,петойл открыто конкурсное производство,computational technologies,False,petojl otkryto konkursnoe proizvodstvo,цомпутатионал течнологиес
4,4,концепт трейд,ad arty,False,kontsept trejd,ад арты


In [20]:
# Split every text column back into a list of whitespace-separated tokens
text_columns = ('ru_name', 'eng_name', 'ru_en_transliteration', 'en_ru_transliteration')
for column in text_columns:
  train[column] = train[column].apply(lambda text: text.split())

train.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name,answer,ru_en_transliteration,en_ru_transliteration
0,0,[ева],"[langford, travel]",False,[eva],"[лангфорд, травел]"
1,1,[форвард],[amara],False,[forvard],[амара]
2,2,[меллита],"[max, trans]",False,[mellita],"[маx, транс]"
3,3,"[петойл, открыто, конкурсное, производство]","[computational, technologies]",False,"[petojl, otkryto, konkursnoe, proizvodstvo]","[цомпутатионал, течнологиес]"
4,4,"[концепт, трейд]","[ad, arty]",False,"[kontsept, trejd]","[ад, арты]"


## Skip feature generation and load the precomputed features instead

In [0]:
# Read the seven precomputed feature tables, one CSV per feature family.
feature_files = [
  'data/LD_train.csv',
  'data/RO_train.csv',
  'data/JW_train.csv',
  'data/DL_train.csv',
  'data/BG_train.csv',
  'data/TG_train.csv',
  'data/CS_train.csv',
]
LD, RO, JW, DL, BG, TG, CS = map(pd.read_csv, feature_files)

In [0]:
# Gather the feature columns and the label into one dataframe.
# LD carries named columns; the other tables expose theirs as '0' and '1'.
data = dict(
  LD_rus=LD['LD_rus'],
  LD_eng=LD['LD_eng'],
  JW_rus=JW['0'],
  JW_eng=JW['1'],
  RO_rus=RO['0'],
  RO_eng=RO['1'],
  DL_rus=DL['0'],
  DL_eng=DL['1'],
  TG_rus=TG['0'],
  TG_eng=TG['1'],
  BG_rus=BG['0'],
  BG_eng=BG['1'],
  CS_eng=CS['0'],
  answer=train['answer'],
)
all_train = pd.DataFrame(data)

In [0]:
## Post-processing: patch the value 100 in the LD columns, min-max scale them, encode the label

def _replace_100_with_18(value):
  """Return 18 when value equals 100, otherwise value unchanged."""
  # NOTE(review): 100 looks like a placeholder and 18 like its substitute -- confirm with the feature generation code.
  return 18 if value == 100 else value

for column in ('LD_rus', 'LD_eng'):
  all_train[column] = all_train[column].apply(_replace_100_with_18)

  # Min-max scaling, using the bounds computed after the replacement above.
  minn = all_train[column].min()
  maxx = all_train[column].max()
  all_train[column] = all_train[column].apply(lambda x: (x - minn)/(maxx - minn))

# Encode the label as 1 when it equals True and 0 otherwise.
all_train['answer'] = all_train['answer'].apply(lambda flag: int(flag == True))

In [24]:
# Preview the first rows of the assembled feature table.
all_train.head()

Unnamed: 0,LD_rus,LD_eng,JW_rus,JW_eng,RO_rus,RO_eng,DL_rus,DL_eng,TG_rus,TG_eng,BG_rus,BG_eng,CS_eng,answer
0,0.135135,0.135135,0.5,0.5,0.222222,0.222222,1.0,1.0,0.0,0.0,0.0,0.0,0.439093,0
1,0.135135,0.135135,0.395238,0.395238,0.333333,0.333333,0.714286,0.714286,0.0,0.0,0.076923,0.076923,0.33017,0
2,0.162162,0.162162,0.492063,0.492063,0.4,0.4,1.0,1.0,0.076923,0.076923,0.090909,0.090909,0.389286,0
3,0.243243,0.243243,0.626374,0.626374,0.421053,0.4,0.923077,1.0,0.045455,0.045455,0.05,0.05,0.62216,0
4,0.108108,0.108108,0.483333,0.483333,0.285714,0.285714,1.0,1.0,0.1,0.1,0.125,0.125,0.196591,0


In [0]:
 # Hold out 20% of the rows for evaluation, with a fixed seed.
 # NOTE(review): CS_eng exists in all_train but is not among the selected features -- confirm this is intentional.
 feature_columns = ['LD_rus', 'LD_eng', 'JW_rus', 'JW_eng', 'RO_rus', 'RO_eng',
                    'DL_rus', 'DL_eng', 'TG_rus', 'TG_eng', 'BG_rus', 'BG_eng']
 X_train, X_test, y_train, y_test = train_test_split(
     all_train[feature_columns], all_train['answer'], test_size=0.2, random_state=42)

In [0]:
def _safe_ratio(numerator, denominator):
  """Return numerator / denominator, or 0.0 when the denominator is zero."""
  if denominator == 0:
    return 0.0
  return numerator / denominator

def precision(preds, trues):
  """Share of predicted positives that are true positives.

  Args:
    preds: sequence of 0/1 predictions.
    trues: sequence of 0/1 ground-truth labels, same length as preds.

  Returns:
    true positives / predicted positives, or 0.0 when nothing was predicted
    positive (instead of NaN plus a divide-by-zero warning).
  """
  preds = np.array(preds)
  trues = np.array(trues)
  # Summing the predictions at the truly-positive positions counts true positives.
  return _safe_ratio(np.sum(preds[trues == 1]), np.sum(preds))

def recall(preds, trues):
  """Share of actual positives that were predicted positive.

  Args:
    preds: sequence of 0/1 predictions.
    trues: sequence of 0/1 ground-truth labels, same length as preds.

  Returns:
    true positives / actual positives, or 0.0 when there are no actual positives.
  """
  preds = np.array(preds)
  trues = np.array(trues)
  return _safe_ratio(np.sum(preds[trues == 1]), np.sum(trues))

def f_score(preds, trues):
  """Harmonic mean of precision and recall (F1).

  Returns:
    2 * p * r / (p + r), or 0.0 when both precision and recall are zero.
  """
  p = precision(preds, trues)
  r = recall(preds, trues)
  return _safe_ratio(2 * p * r, p + r)

## Model with best score

In [30]:
#fit predict

from sklearn.ensemble import RandomForestClassifier
# random_state fixes the forest's internal randomness so the metrics printed
# below and the saved model are the same on every Restart & Run All
# (the train/test split is seeded with 42).
random_forest = RandomForestClassifier(n_estimators = 40, random_state = 42).fit(X_train,y_train)
pred = random_forest.predict(X_test)

# Score the held-out split with the metric helpers defined in this notebook.
print('Precision: ',precision(pred, y_test))
print('Recall: ',recall(pred, y_test))
print('F1 score: ', f_score(pred, y_test))

Precision:  0.9254595276948182
Recall:  0.8257465454245763
F1 score:  0.8727642276422765


In [34]:
# Persist the fitted forest with joblib
model_path = 'rf_kontur.joblib'
dump(random_forest, model_path)

['rf_kontur.joblib']