In [1]:
import pandas as pd
import numpy as np

import re
import string

import requests

import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.model_selection import train_test_split

from joblib import dump, load

#!pip install transliterate
from transliterate import translit, get_available_language_codes


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nariman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Load the tab-separated test pairs and preview the first rows.
test_path = 'data/test_data.tsv'
test = pd.read_csv(test_path, delimiter='\t')
test.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name
0,0,"ООО ""Технология-СМ""",TRANSMOROZ GMBH
1,1,Общество с ограниченной ответственностью Научн...,"""OBS"" Limited Liability Company"
2,2,"Общество с ограниченной ответственностью ""Торг...",Synthesis Engineering LTD
3,3,"ООО ""ТД ""Грумант""","""RMP Forwarder"" Company Limited"
4,4,"ООО ""Отис""","""Russian blinis"" limited liability company"


In [6]:
# Lowercase both name columns.
for name_column in ('ru_name', 'eng_name'):
    test[name_column] = test[name_column].str.lower()

test.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name
0,0,"ооо ""технология-см""",transmoroz gmbh
1,1,общество с ограниченной ответственностью научн...,"""obs"" limited liability company"
2,2,"общество с ограниченной ответственностью ""торг...",synthesis engineering ltd
3,3,"ооо ""тд ""грумант""","""rmp forwarder"" company limited"
4,4,"ооо ""отис""","""russian blinis"" limited liability company"


In [7]:
#remove some punctuation

# Character class matching every character in string.punctuation (ASCII only).
regex = re.compile('[%s]' % re.escape(string.punctuation))

# Each punctuation character is replaced by a space rather than deleted.
# The vectorised .str.replace leaves missing names as NaN instead of raising
# TypeError, which regex.sub inside apply() did on a NaN value.
for name_column in ('ru_name', 'eng_name'):
    test[name_column] = test[name_column].str.replace(regex, ' ', regex=True)

test.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name
0,0,ооо технология см,transmoroz gmbh
1,1,общество с ограниченной ответственностью научн...,obs limited liability company
2,2,общество с ограниченной ответственностью торг...,synthesis engineering ltd
3,3,ооо тд грумант,rmp forwarder company limited
4,4,ооо отис,russian blinis limited liability company


In [8]:
#tokenize
# One tokenizer instance is shared by both columns instead of constructing a
# new WordPunctTokenizer for every row, and its bound method is passed directly
# so no lambda parameter shadows the builtin `str`.
tokenizer = WordPunctTokenizer()
test['ru_name'] = test['ru_name'].apply(tokenizer.tokenize)
test['eng_name'] = test['eng_name'].apply(tokenizer.tokenize)

test.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name
0,0,"[ооо, технология, см]","[transmoroz, gmbh]"
1,1,"[общество, с, ограниченной, ответственностью, ...","[obs, limited, liability, company]"
2,2,"[общество, с, ограниченной, ответственностью, ...","[synthesis, engineering, ltd]"
3,3,"[ооо, тд, грумант]","[rmp, forwarder, company, limited]"
4,4,"[ооо, отис]","[russian, blinis, limited, liability, company]"


In [9]:
#remove stopwords

# Hand-picked words to drop from the Russian names (duplicates are harmless).
common_rus_words = [
    '', 'общество', 'c', 'c', 'ограниченной', 'ответственностью',
    'ооо', 'акционерное', 'зао', 'закрытое', 'компания', 'комиссия',
    'ликвидационная', 'групп', 'сервис', 'о', 'организация', 'дом',
    'оао', 'м', 'открытое', 'сформирована', 'ликвидатора', 'ответственностью',
    'назначение', 'ао', 'предприятие', 'а', 'некоммерческая', 'некоммерческое',
    'нп', 'общественная', 'ликвидации', 'г', 'ано', 'автономная',
    'ассоциация', 'тд', 'торговый', 'лтд', 'ликвидационный', 'ликвидационной',
    'ликвидационном', 'р', 'оо', 'ликвидационного',
]

# Hand-picked words to drop from the English names. The list also holds a few
# Cyrillic entries and misspelled variants ('liabiliti', 'iiability', ...).
common_eng_words = [
    '', 'company', 'limited', 'ltd', 'liability', 'llc', 'co',
    'of', 'stock', 'joint', 'group', 'the', 'closed', 'open',
    'ooo', 'oo', 'jsc', 'cjsc', 'house', 'ограниченной', 'ответственностью',
    'liabiliti', 'public', 'liabilily', 'centre', 'industrial', 'scientific', 'budgetary',
    'federal', 'state', 'educational', 'ооо', 'center', 'iiability', 'corporation',
    'st', 'kompany', 'global', 'innovations', 'international', 'stok',
]

# Append NLTK's stop-word list for each language to the hand-picked words.
common_rus_words.extend(stopwords.words('russian'))
common_eng_words.extend(stopwords.words('english'))

def remove_element(arr, common_words):
    """Return the tokens of ``arr`` that survive stop-word filtering.

    A token is kept only when it is not in ``common_words`` and is longer
    than one character.

    Args:
        arr: Iterable of tokens. A value that cannot be iterated (for example
            ``None`` or a float NaN) is treated as having no tokens.
        common_words: Collection of words to drop.

    Returns:
        A new list with the kept tokens in their original order, or an empty
        list when ``arr`` or one of its items cannot be processed as text.
    """
    try:
        return [word for word in arr if word not in common_words and len(word) > 1]
    except TypeError:
        # Only the "not a token sequence" case is tolerated; unrelated errors
        # (and KeyboardInterrupt) now propagate instead of being swallowed.
        return []

# Filter every token list against the word list of its own language.
words_by_column = (
    ('ru_name', common_rus_words),
    ('eng_name', common_eng_words),
)
for name_column, unwanted_words in words_by_column:
    test[name_column] = test[name_column].apply(remove_element, args=(unwanted_words,))

test.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name
0,0,"[технология, см]","[transmoroz, gmbh]"
1,1,"[научно, производственное, радиационный, контр...",[obs]
2,2,[спт],"[synthesis, engineering]"
3,3,[грумант],"[rmp, forwarder]"
4,4,[отис],"[russian, blinis]"


In [10]:
#transliterate text

# translit() takes a plain string, so each token list is joined back with spaces.
for name_column in ('ru_name', 'eng_name'):
    test[name_column] = test[name_column].apply(' '.join)


def _cyrillic_to_latin(text):
    """Transliterate ``text`` with the 'ru' language pack in reversed mode."""
    return translit(text, 'ru', reversed=True)


def _latin_to_cyrillic(text):
    """Transliterate ``text`` with the 'ru' language pack in its default mode."""
    return translit(text, 'ru')


test['ru_en_transliteration'] = test['ru_name'].apply(_cyrillic_to_latin)
test['en_ru_transliteration'] = test['eng_name'].apply(_latin_to_cyrillic)

test.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name,ru_en_transliteration,en_ru_transliteration
0,0,технология см,transmoroz gmbh,tehnologija sm,трансмороз гмбх
1,1,научно производственное радиационный контроль ...,obs,nauchno proizvodstvennoe radiatsionnyj kontrol...,обс
2,2,спт,synthesis engineering,spt,сынтхесис енгинееринг
3,3,грумант,rmp forwarder,grumant,рмп форwардер
4,4,отис,russian blinis,otis,руссиан блинис


In [11]:
#split text
# Split the four text columns back into whitespace-separated token lists.
# str.split is passed directly, so no lambda parameter shadows the builtin
# `str`, and one loop replaces four copies of the same statement.
text_columns = ('ru_name', 'eng_name', 'ru_en_transliteration', 'en_ru_transliteration')
for text_column in text_columns:
    test[text_column] = test[text_column].apply(str.split)

test.head()

Unnamed: 0.1,Unnamed: 0,ru_name,eng_name,ru_en_transliteration,en_ru_transliteration
0,0,"[технология, см]","[transmoroz, gmbh]","[tehnologija, sm]","[трансмороз, гмбх]"
1,1,"[научно, производственное, радиационный, контр...",[obs],"[nauchno, proizvodstvennoe, radiatsionnyj, kon...",[обс]
2,2,[спт],"[synthesis, engineering]",[spt],"[сынтхесис, енгинееринг]"
3,3,[грумант],"[rmp, forwarder]",[grumant],"[рмп, форwардер]"
4,4,[отис],"[russian, blinis]",[otis],"[руссиан, блинис]"


## Skip feature generation and load the precomputed features

In [2]:
# Read the precomputed feature tables, one CSV per metric.
# (The metric abbreviations are not spelled out in this notebook.)
LD, RO, JW, DL, BG, TG, CS = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/LD_test.csv',
        'data/RO_test.csv',
        'data/JW_test.csv',
        'data/DL_test.csv',
        'data/BG_test.csv',
        'data/TG_test.csv',
        'data/CS_test.csv',
    )
)

In [3]:
#wrap up to single dataframe
# Column '0' of each table becomes the *_rus feature and column '1' the *_eng
# feature; CS contributes only its column '0'. The order below is the column
# order of all_test.
feature_columns = [
    ('LD_rus', LD['0']), ('LD_eng', LD['1']),
    ('JW_rus', JW['0']), ('JW_eng', JW['1']),
    ('RO_rus', RO['0']), ('RO_eng', RO['1']),
    ('DL_rus', DL['0']), ('DL_eng', DL['1']),
    ('TG_rus', TG['0']), ('TG_eng', TG['1']),
    ('BG_rus', BG['0']), ('BG_eng', BG['1']),
    ('CS_eng', CS['0']),
]
data = dict(feature_columns)

all_test = pd.DataFrame(data)

In [4]:
## little post processing and normalization
# NOTE(review): 100 looks like a placeholder value in the LD features and 18
# its substitute; neither is explained in this notebook. Confirm against the
# feature-generation code.
LD_PLACEHOLDER = 100
LD_REPLACEMENT = 18


def _replace_and_rescale(column):
    """Swap LD_PLACEHOLDER for LD_REPLACEMENT, then min-max scale the column.

    The minimum and maximum are taken from ``column`` itself after the
    replacement. If every value is identical the denominator is zero and the
    result is NaN, as it was before.
    """
    cleaned = column.apply(lambda value: LD_REPLACEMENT if value == LD_PLACEHOLDER else value)
    lowest, highest = cleaned.min(), cleaned.max()
    # Vectorised arithmetic replaces the per-element lambda.
    return (cleaned - lowest) / (highest - lowest)


for ld_column in ('LD_rus', 'LD_eng'):
    all_test[ld_column] = _replace_and_rescale(all_test[ld_column])

In [5]:
# Preview the first rows of the assembled feature table.
all_test.head()

Unnamed: 0,LD_rus,LD_eng,JW_rus,JW_eng,RO_rus,RO_eng,DL_rus,DL_eng,TG_rus,TG_eng,BG_rus,BG_eng,CS_eng
0,0.054545,0.048387,0.733333,0.733333,0.4,0.333333,1.0,0.909091,0.043478,0.041667,0.076923,0.076923,0.490041
1,0.090909,0.080645,0.597222,0.597222,0.222222,0.222222,1.0,1.0,0.0,0.0,0.0,0.0,0.550131
2,0.127273,0.112903,0.62963,0.62963,0.333333,0.333333,1.0,1.0,0.066667,0.066667,0.076923,0.076923,0.486147
3,0.090909,0.080645,0.650794,0.650794,0.4,0.4,0.888889,0.888889,0.0,0.0,0.0,0.0,0.426105
4,0.072727,0.064516,0.611111,0.611111,0.4,0.4,0.857143,0.857143,0.166667,0.166667,0.2,0.2,0.648827


In [8]:
# Load the saved model object from disk with joblib.
# NOTE(security): joblib.load unpickles the file and can therefore execute
# arbitrary code; only load model files that come from a trusted source.
clf = load('rf_kontur.joblib')

In [9]:
# Run the loaded model on the feature table.
# NOTE(review): presumably the columns of all_test must match, in name and
# order, the features the model was trained on; confirm against training code.
pred = clf.predict(all_test)

In [10]:
# Turn each predicted label into a boolean: True only where the label equals 1.
is_match = [bool(label == 1) for label in pred]
result = pd.DataFrame(is_match, columns=['answer'])
result.head()

Unnamed: 0,answer
0,False
1,False
2,False
3,False
4,False


In [12]:
# Write the answers as a tab-separated file without the index column.
output_path = 'result.tsv'
result.to_csv(output_path, index=False, sep='\t')