In [59]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from scipy.sparse import  hstack
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import re
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
df = pd.read_csv('../data/train.csv',index_col = 0)

In [13]:
df 

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
3,"Rishichem Distributors Pvt., Ltd.",Dsa,0
4,Powermax Rubber Factory,Co. One,0
5,Tress A/S,Longyou Industries Park Zhejiang,0
...,...,...,...
497815,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0
497816,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0
497817,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0
497818,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0


## Чистка Датасета

In [14]:
df['name_1'] = df['name_1'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x)).apply(
    lambda x: re.sub(r' +', ' ', x).lower())
df['name_2'] = df['name_2'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x)).apply(
    lambda x: re.sub(r' +', ' ', x).lower())

In [15]:
df

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,iko industries ltd,enormous industrial trade pvt ltd,0
2,apcotex industries ltd,technocraft industries india ltd,0
3,rishichem distributors pvt ltd,dsa,0
4,powermax rubber factory,co one,0
5,tress a s,longyou industries park zhejiang,0
...,...,...,...
497815,bit mat products,the goodyear tire and rubber company,0
497816,bnd trading co ltd,zhong shan yue liang economy trade imp exp co ...,0
497817,xeikon industrial co ltd of dongguan city,yi cheng trading co ltd of dongguan city,0
497818,shanghai kechuan trading co ltd,shanghai m g stationery inc,0


In [17]:
df.isna().sum(axis = 0)

name_1          0
name_2          0
is_duplicate    0
dtype: int64

## First try

In [18]:
X_train, X_test, y_train, y_test = train_test_split(df[['name_1','name_2']], df['is_duplicate'], test_size=0.25, 
                                                    random_state=42, stratify = df['is_duplicate'])

In [19]:
text_transformer = TfidfVectorizer()

In [20]:
X_train_text = text_transformer.fit_transform([*X_train['name_1'].to_list(),*X_train['name_1'].to_list()])
X_test_text = text_transformer.transform([*X_test['name_1'].to_list(),*X_test['name_1'].to_list()])

In [21]:
X_train_text.shape, X_test_text.shape

((746728, 16076), (248910, 16076))

In [22]:
X_train.shape, y_test.shape

((373364, 2), (124455,))

In [23]:
X_train_s = hstack([X_train_text[0:373364],X_train_text[373364:]])
X_test_s = hstack([X_test_text[0:124455],X_test_text[124455:]])

In [None]:
logit = LogisticRegression(C=5e1, solver='lbfgs', multi_class='ovr', random_state=42, n_jobs=4)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_val_score(logit, X_train_s, y_train, cv=skf, scoring='f1_micro')

In [25]:
cv_results

array([0.99377285, 0.99334431, 0.99326396, 0.9930363 , 0.99351832])

In [None]:
logit.fit(X_train_s, y_train)
test_preds = logit.predict(X_test_s)
test_preds_proba = logit.predict_proba(X_test_s)

In [27]:
f1_score(y_test, test_preds, average = 'micro')

0.9936924992969346

In [28]:
f1_score(y_test, test_preds, average = 'macro')

0.6990249967091267

In [29]:
sklearn.metrics.roc_auc_score(y_test, test_preds)

0.6431775598001046

## Конкатенация

In [30]:
df['glu'] = df['name_1'] + ' ' + df['name_2']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(df[['glu']], df['is_duplicate'], test_size=0.25, 
                                                    random_state=42, stratify = df['is_duplicate'])
# text_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True, max_features=150000)
text_transformer = TfidfVectorizer()
X_train_text = text_transformer.fit_transform(X_train['glu'])
X_test_text = text_transformer.transform(X_test['glu'])

In [None]:
logit = LogisticRegression(C=5e1, solver='lbfgs', multi_class='ovr', random_state=42, n_jobs=4)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_val_score(logit, X_train_s, y_train, cv=skf, scoring='f1_macro')

In [33]:
cv_results, cv_results.mean()

(array([0.70150014, 0.68095822, 0.67462636, 0.67646743, 0.68489966]),
 0.6836903605685175)

In [None]:
logit.fit(X_train_s, y_train)
test_preds = logit.predict(X_test_s)
test_preds_proba = logit.predict_proba(X_test_s)

In [35]:
f1_score(y_test, test_preds, average = 'macro')

0.6990249967091267

In [36]:
sklearn.metrics.roc_auc_score(y_test, test_preds)

0.6431775598001046

## Расстояния

In [37]:
df

Unnamed: 0_level_0,name_1,name_2,is_duplicate,glu
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,iko industries ltd,enormous industrial trade pvt ltd,0,iko industries ltd enormous industrial trade ...
2,apcotex industries ltd,technocraft industries india ltd,0,apcotex industries ltd technocraft industries...
3,rishichem distributors pvt ltd,dsa,0,rishichem distributors pvt ltd dsa
4,powermax rubber factory,co one,0,powermax rubber factory co one
5,tress a s,longyou industries park zhejiang,0,tress a s longyou industries park zhejiang
...,...,...,...,...
497815,bit mat products,the goodyear tire and rubber company,0,bit mat products the goodyear tire and rubber ...
497816,bnd trading co ltd,zhong shan yue liang economy trade imp exp co ...,0,bnd trading co ltd zhong shan yue liang econo...
497817,xeikon industrial co ltd of dongguan city,yi cheng trading co ltd of dongguan city,0,xeikon industrial co ltd of dongguan city yi c...
497818,shanghai kechuan trading co ltd,shanghai m g stationery inc,0,shanghai kechuan trading co ltd shanghai m g ...


In [40]:
text_transformer = TfidfVectorizer()

In [41]:
X_text = text_transformer.fit_transform([*df['name_1'].to_list(),*df['name_1'].to_list()])
y = df['is_duplicate']

In [44]:
X_text.shape, df.shape

((995638, 16180), (497819, 4))

In [45]:
name_1_tf_idf = X_text[0:497819]
name_2_tf_idf = X_text[497819:]

In [115]:
df['indexes'] = df.apply(lambda x: x.name, axis = 1)

In [117]:
df

Unnamed: 0_level_0,name_1,name_2,is_duplicate,glu,indexes
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,iko industries ltd,enormous industrial trade pvt ltd,0,iko industries ltd enormous industrial trade ...,1
2,apcotex industries ltd,technocraft industries india ltd,0,apcotex industries ltd technocraft industries...,2
3,rishichem distributors pvt ltd,dsa,0,rishichem distributors pvt ltd dsa,3
4,powermax rubber factory,co one,0,powermax rubber factory co one,4
5,tress a s,longyou industries park zhejiang,0,tress a s longyou industries park zhejiang,5
...,...,...,...,...,...
497815,bit mat products,the goodyear tire and rubber company,0,bit mat products the goodyear tire and rubber ...,497815
497816,bnd trading co ltd,zhong shan yue liang economy trade imp exp co ...,0,bnd trading co ltd zhong shan yue liang econo...,497816
497817,xeikon industrial co ltd of dongguan city,yi cheng trading co ltd of dongguan city,0,xeikon industrial co ltd of dongguan city yi c...,497817
497818,shanghai kechuan trading co ltd,shanghai m g stationery inc,0,shanghai kechuan trading co ltd shanghai m g ...,497818


In [122]:
df['dist'] = df.apply(lambda x: cosine_similarity(name_1_tf_idf.getrow(x.indexes - 1),
                                                  name_2_tf_idf.getrow(x.indexes - 1)), axis = 1)

In [133]:
df['dist'] = df['dist'].apply(lambda x: float(x))

In [135]:
df['dist'].describe()

count    497819.000000
mean          0.990967
std           0.094614
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: dist, dtype: float64

In [138]:
df[['name_1','name_1','is_duplicate','dist']]

Unnamed: 0_level_0,name_1,name_1,is_duplicate,dist
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,iko industries ltd,iko industries ltd,0,1.0
2,apcotex industries ltd,apcotex industries ltd,0,1.0
3,rishichem distributors pvt ltd,rishichem distributors pvt ltd,0,1.0
4,powermax rubber factory,powermax rubber factory,0,1.0
5,tress a s,tress a s,0,1.0
...,...,...,...,...
497815,bit mat products,bit mat products,0,1.0
497816,bnd trading co ltd,bnd trading co ltd,0,1.0
497817,xeikon industrial co ltd of dongguan city,xeikon industrial co ltd of dongguan city,0,1.0
497818,shanghai kechuan trading co ltd,shanghai kechuan trading co ltd,0,1.0
