In [255]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from scipy.sparse import  hstack
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import re
from sklearn.metrics.pairwise import cosine_similarity

In [256]:
df = pd.read_csv('../data/train.csv',index_col = 0)

In [257]:
df 

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
3,"Rishichem Distributors Pvt., Ltd.",Dsa,0
4,Powermax Rubber Factory,Co. One,0
5,Tress A/S,Longyou Industries Park Zhejiang,0
...,...,...,...
497815,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0
497816,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0
497817,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0
497818,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0


## Чистка Датасета

In [258]:
df['name_1'] = df['name_1'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x)).apply(
    lambda x: re.sub(r' +', ' ', x).lower())
df['name_2'] = df['name_2'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x)).apply(
    lambda x: re.sub(r' +', ' ', x).lower())

In [259]:
df

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,iko industries ltd,enormous industrial trade pvt ltd,0
2,apcotex industries ltd,technocraft industries india ltd,0
3,rishichem distributors pvt ltd,dsa,0
4,powermax rubber factory,co one,0
5,tress a s,longyou industries park zhejiang,0
...,...,...,...
497815,bit mat products,the goodyear tire and rubber company,0
497816,bnd trading co ltd,zhong shan yue liang economy trade imp exp co ...,0
497817,xeikon industrial co ltd of dongguan city,yi cheng trading co ltd of dongguan city,0
497818,shanghai kechuan trading co ltd,shanghai m g stationery inc,0


In [260]:
df.isna().sum(axis = 0)

name_1          0
name_2          0
is_duplicate    0
dtype: int64

## First try

In [261]:
X_train, X_test, y_train, y_test = train_test_split(df[['name_1','name_2']], df['is_duplicate'], test_size=0.25, 
                                                    random_state=42, stratify = df['is_duplicate'])

In [262]:
text_transformer = TfidfVectorizer()

In [263]:
X_train_text = text_transformer.fit_transform([*X_train['name_1'].to_list(),*X_train['name_1'].to_list()])
X_test_text = text_transformer.transform([*X_test['name_1'].to_list(),*X_test['name_1'].to_list()])

In [264]:
X_train_text.shape, X_test_text.shape

((746728, 16076), (248910, 16076))

In [265]:
X_train.shape, y_test.shape

((373364, 2), (124455,))

In [266]:
X_train_s = hstack([X_train_text[0:373364],X_train_text[373364:]])
X_test_s = hstack([X_test_text[0:124455],X_test_text[124455:]])

In [None]:
logit = LogisticRegression(C=5e1, solver='lbfgs', multi_class='ovr', random_state=42, n_jobs=4)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_val_score(logit, X_train_s, y_train, cv=skf, scoring='f1')

In [268]:
cv_results,cv_results.mean()

(array([0.40613027, 0.36526181, 0.35263835, 0.35643564, 0.37305699]),
 0.3707046145516542)

In [None]:
logit.fit(X_train_s, y_train)
test_preds = logit.predict(X_test_s)
test_preds_proba = logit.predict_proba(X_test_s)

In [279]:
def get_metrics(predict, proba, target):
    print(f"f1: {f1_score(target, predict)}")
    print(f"f1 macro: {f1_score(target, predict,average = 'macro')}")
    print(f"recall: {sklearn.metrics.recall_score(target, predict)}")
    print(f"roc auc :{sklearn.metrics.roc_auc_score(target, proba[:,1])}")

In [280]:
get_metrics(test_preds,test_preds_proba,y_test)

f1: 0.4012204424103737
f1 macro: 0.6990249967091267
recall: 0.28743169398907104
roc auc :0.9648987164618259


## Конкатенация

In [272]:
df['glu'] = df['name_1'] + ' ' + df['name_2']

In [273]:
X_train, X_test, y_train, y_test = train_test_split(df[['glu']], df['is_duplicate'], test_size=0.25, 
                                                    random_state=42, stratify = df['is_duplicate'])
# text_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True, max_features=150000)
text_transformer = TfidfVectorizer()
X_train_text = text_transformer.fit_transform(X_train['glu'])
X_test_text = text_transformer.transform(X_test['glu'])

In [None]:
logit = LogisticRegression(C=5e1, solver='lbfgs', multi_class='ovr', random_state=42, n_jobs=4)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_val_score(logit, X_train_text, y_train, cv=skf, scoring='f1')

In [275]:
cv_results, cv_results.mean()

(array([0.76120959, 0.77372263, 0.77522478, 0.72502575, 0.76131687]),
 0.7592999230738603)

In [None]:
logit.fit(X_train_text, y_train)
test_preds = logit.predict(X_test_text)
test_preds_proba = logit.predict_proba(X_test_text)

In [282]:
get_metrics(test_preds,test_preds_proba,y_test)

f1: 0.7607891491985203
f1 macro: 0.8796100642307829
recall: 0.6743169398907104
roc auc :0.9780870645643853


## Очищенный скриптом датасет

In [283]:
df = pd.read_csv('../data/result.csv',index_col = 0)

In [284]:
df

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,iko industries,enormous industrial trade,0
2,apcotex industries,technocraft industries,0
3,rishichem distributors,dsa,0
4,powermax rubber factory,one,0
5,tress a s,longyou industries park zhejiang,0
...,...,...,...
497815,bit mat products,the goodyear tire and rubber company,0
497816,bnd trading,zhong shan yue liang economy trade imp exp,0
497817,xeikon industrial of dongguan city,yi cheng trading of dongguan city,0
497818,kechuan trading,m g stationery,0


In [285]:
df['glu'] = df['name_1'] + ' ' + df['name_2']

In [286]:
df = df.dropna()

In [287]:
X_train, X_test, y_train, y_test = train_test_split(df[['glu']], df['is_duplicate'], test_size=0.25, 
                                                    random_state=42, stratify = df['is_duplicate'])
# text_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True, max_features=150000)
text_transformer = TfidfVectorizer()
X_train_text = text_transformer.fit_transform(X_train['glu'])
X_test_text = text_transformer.transform(X_test['glu'])

In [None]:
logit = LogisticRegression(C=5e1, solver='lbfgs', multi_class='ovr', random_state=42, n_jobs=4)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_val_score(logit, X_train_text, y_train, cv=skf, scoring='f1')

In [289]:
cv_results, cv_results.mean()

(array([0.70967742, 0.73312565, 0.71635611, 0.73394495, 0.71958763]),
 0.7225383518046427)

In [None]:
logit.fit(X_train_text, y_train)
test_preds = logit.predict(X_test_text)
test_preds_proba = logit.predict_proba(X_test_text)

In [291]:
get_metrics(test_preds,test_preds_proba,y_test)

f1: 0.7382716049382715
f1 macro: 0.8682748328223769
recall: 0.6542669584245077
roc auc :0.9686186455787411


## Расстояния

In [181]:
text_transformer = TfidfVectorizer()

In [182]:
X_text = text_transformer.fit_transform([*df['name_1'].to_list(),*df['name_1'].to_list()])
y = df['is_duplicate']

In [183]:
X_text.shape, df.shape

((995638, 16180), (497819, 4))

In [184]:
name_1_tf_idf = X_text[0:497819]
name_2_tf_idf = X_text[497819:]

In [185]:
df['indexes'] = df.apply(lambda x: x.name, axis = 1)

In [186]:
df['dist'] = df.apply(lambda x: cosine_similarity(name_1_tf_idf.getrow(x.indexes - 1),
                                                  name_2_tf_idf.getrow(x.indexes - 1)), axis = 1)

In [187]:
df['dist'] = df['dist'].apply(lambda x: float(x))

In [188]:
df['dist'].describe()

count    497819.000000
mean          0.990967
std           0.094614
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: dist, dtype: float64

In [189]:
df[['name_1','name_1','is_duplicate','dist']]

Unnamed: 0_level_0,name_1,name_1,is_duplicate,dist
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,iko industries ltd,iko industries ltd,0,1.0
2,apcotex industries ltd,apcotex industries ltd,0,1.0
3,rishichem distributors pvt ltd,rishichem distributors pvt ltd,0,1.0
4,powermax rubber factory,powermax rubber factory,0,1.0
5,tress a s,tress a s,0,1.0
...,...,...,...,...
497815,bit mat products,bit mat products,0,1.0
497816,bnd trading co ltd,bnd trading co ltd,0,1.0
497817,xeikon industrial co ltd of dongguan city,xeikon industrial co ltd of dongguan city,0,1.0
497818,shanghai kechuan trading co ltd,shanghai kechuan trading co ltd,0,1.0


## Калибровка