In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import re
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import Word2Vec

In [None]:
# Fetch the NLTK 'punkt' resource; the word_tokenize calls later in this
# notebook rely on NLTK tokenizer data being installed.
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead — confirm
# which resource the installed NLTK version expects.
nltk.download('punkt')

## Минимальная очистка

In [4]:
# Load the raw training pairs; the first CSV column is used as the index.
df = pd.read_csv('../data/train.csv', index_col=0)

In [6]:
# Apply the same normalisation to both name columns: every character that is
# neither a word character nor whitespace becomes a space, runs of spaces are
# collapsed to a single space, and the result is lowercased.
for column in ('name_1', 'name_2'):
    without_punctuation = df[column].apply(
        lambda text: re.sub(r'[^\w\s]', ' ', text))
    df[column] = without_punctuation.apply(
        lambda text: re.sub(r' +', ' ', text).lower())

In [7]:
# Show the frame after the cleaning step above (the notebook renders a
# truncated view of the rows).
df

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,iko industries ltd,enormous industrial trade pvt ltd,0
2,apcotex industries ltd,technocraft industries india ltd,0
3,rishichem distributors pvt ltd,dsa,0
4,powermax rubber factory,co one,0
5,tress a s,longyou industries park zhejiang,0
...,...,...,...
497815,bit mat products,the goodyear tire and rubber company,0
497816,bnd trading co ltd,zhong shan yue liang economy trade imp exp co ...,0
497817,xeikon industrial co ltd of dongguan city,yi cheng trading co ltd of dongguan city,0
497818,shanghai kechuan trading co ltd,shanghai m g stationery inc,0


In [10]:
def get_tokenize_sentence(sent):
    """Tokenize ``sent`` with ``word_tokenize`` and lowercase every token.

    Returns the lowercased tokens as a list, in the order the tokenizer
    produced them.
    """
    return [token.lower() for token in word_tokenize(sent)]

In [18]:
# One token list per row of the first name column.
name_1_sentences = df['name_1'].apply(get_tokenize_sentence)

In [22]:
# One token list per row of the second name column.
name_2_sentences = df['name_2'].apply(get_tokenize_sentence)

In [34]:
# Train a single Word2Vec model on the token lists of both name columns, so
# names from either side are embedded with the same vocabulary and vectors.
# min_count=1 keeps every token, including ones that occur only once.
# NOTE(review): no worker count is set here, so training may not be
# bit-for-bit repeatable between runs — confirm whether that matters.
word_2_vec_model = Word2Vec(
    sentences=name_1_sentences.to_list() + name_2_sentences.to_list(),
    vector_size=150,
    window=5,
    min_count=1,
    epochs=10,
)

In [48]:
# Copy of the trained model's vocabulary keys as a plain list.
vocab_w2v = [*word_2_vec_model.wv.index_to_key]

In [67]:
def sent_vector(sent):
    """Return the mean Word2Vec vector of the tokens in ``sent``.

    Parameters
    ----------
    sent : iterable of str
        Tokens of a single name.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``word_2_vec_model.wv.vector_size``: the
        element-wise mean of the vectors of all tokens known to the model,
        or an all-zero vector when none of the tokens is known (including
        an empty ``sent``).
    """
    wv = word_2_vec_model.wv
    # key_to_index is a dict, so each membership test is a hash lookup
    # instead of a linear scan over a list of the whole vocabulary.
    known_tokens = [word for word in sent if word in wv.key_to_index]
    if not known_tokens:
        # Nothing to average: return a fixed-size zero vector so the result
        # can still be stacked with the vectors of the other names.
        return np.zeros(wv.vector_size, dtype=wv.vectors.dtype)
    return np.mean(wv[known_tokens], axis=0)

In [68]:
# Sentence vector for every tokenised first name, in row order.
X_1 = list(map(sent_vector, name_1_sentences.to_list()))

In [69]:
# Sentence vector for every tokenised second name, in row order.
X_2 = list(map(sent_vector, name_2_sentences.to_list()))

In [83]:
# Feature matrix: the two per-name vector arrays joined horizontally.
# Assumes every sentence vector has the same length, so both arrays are 2-D.
X = np.hstack((np.asarray(X_1), np.asarray(X_2)))
# Target labels taken from the is_duplicate column.
y = df['is_duplicate'].to_numpy()

In [84]:
# Hold out 25% of the pairs for evaluation; stratifying on the labels keeps
# the duplicate / non-duplicate ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [87]:
# Baseline classifier with a fixed random_state.
logit = LogisticRegression(n_jobs=4, random_state=42)

In [None]:
# Fit on the training split, then produce both class probabilities and hard
# class predictions for the held-out split.
logit.fit(X_train, y_train)
test_preds_proba = logit.predict_proba(X_test)
test_preds = logit.predict(X_test)

In [99]:
def get_metrics(predict, proba, target):
    """Print F1, macro-averaged F1, recall and ROC AUC for a set of predictions.

    Parameters
    ----------
    predict : array-like
        Predicted class labels.
    proba : array-like
        Predicted probabilities; column 1 is used as the score for ROC AUC.
    target : array-like
        True class labels.

    Returns
    -------
    None
        The metrics are only printed, one per line.
    """
    binary_f1 = f1_score(target, predict)
    print(f"f1: {binary_f1}")

    macro_f1 = f1_score(target, predict, average='macro')
    print(f"f1 macro: {macro_f1}")

    recall = sklearn.metrics.recall_score(target, predict)
    print(f"recall: {recall}")

    roc_auc = sklearn.metrics.roc_auc_score(target, proba[:, 1])
    print(f"roc auc :{roc_auc}")

In [100]:
# Report the held-out metrics for the minimally cleaned data.
get_metrics(predict=test_preds, proba=test_preds_proba, target=y_test)

f1: 0.42719999999999997
f1 macro: 0.7121544698376806
recall: 0.29180327868852457
roc auc :0.9397153993618137


## Очищенный датасет

In [90]:
# Load the pre-cleaned pairs; the first CSV column is used as the index.
df = pd.read_csv('../data/result.csv', index_col=0)

In [155]:
# Show the pre-cleaned frame (the notebook renders a truncated view of the rows).
df

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,iko industries,enormous industrial trade,0
2,apcotex industries,technocraft industries,0
3,rishichem distributors,dsa,0
4,powermax rubber factory,one,0
5,tress a s,longyou industries park zhejiang,0
...,...,...,...
497815,bit mat products,the goodyear tire and rubber company,0
497816,bnd trading,zhong shan yue liang economy trade imp exp,0
497817,xeikon industrial of dongguan city,yi cheng trading of dongguan city,0
497818,kechuan trading,m g stationery,0


In [12]:
# Drop every row that has a missing value in any column.
# NOTE(review): presumably these are names that became empty during the
# external cleaning step — confirm against how result.csv was produced.
df = df.dropna(axis=0, how='any')

In [13]:
def get_tokenize_sentence(sent):
    """Tokenize ``sent`` with ``word_tokenize`` and lowercase every token.

    Returns the lowercased tokens as a list, in the order the tokenizer
    produced them.
    """
    return [token.lower() for token in word_tokenize(sent)]

In [14]:
# One token list per row of the first name column.
name_1_sentences = df['name_1'].apply(get_tokenize_sentence)

In [15]:
# One token list per row of the second name column.
name_2_sentences = df['name_2'].apply(get_tokenize_sentence)

In [16]:
# Train a single Word2Vec model on the token lists of both name columns, so
# names from either side are embedded with the same vocabulary and vectors.
# min_count=1 keeps every token, including ones that occur only once.
# NOTE(review): no worker count is set here, so training may not be
# bit-for-bit repeatable between runs — confirm whether that matters.
word_2_vec_model = Word2Vec(
    sentences=name_1_sentences.to_list() + name_2_sentences.to_list(),
    vector_size=150,
    window=5,
    min_count=1,
    epochs=10,
)

In [17]:
# Copy of the trained model's vocabulary keys as a plain list.
vocab_w2v = [*word_2_vec_model.wv.index_to_key]

In [18]:
def sent_vector(sent):
    """Return the mean Word2Vec vector of the tokens in ``sent``.

    Parameters
    ----------
    sent : iterable of str
        Tokens of a single name.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``word_2_vec_model.wv.vector_size``: the
        element-wise mean of the vectors of all tokens known to the model,
        or an all-zero vector when none of the tokens is known (including
        an empty ``sent``).
    """
    wv = word_2_vec_model.wv
    # key_to_index is a dict, so each membership test is a hash lookup
    # instead of a linear scan over a list of the whole vocabulary.
    known_tokens = [word for word in sent if word in wv.key_to_index]
    if not known_tokens:
        # Nothing to average: return a fixed-size zero vector so the result
        # can still be stacked with the vectors of the other names.
        return np.zeros(wv.vector_size, dtype=wv.vectors.dtype)
    return np.mean(wv[known_tokens], axis=0)

In [19]:
# Sentence vectors for every tokenised name, one list per column, in row order.
X_1 = list(map(sent_vector, name_1_sentences.to_list()))
X_2 = list(map(sent_vector, name_2_sentences.to_list()))

In [20]:
# Feature matrix: the two per-name vector arrays joined horizontally.
# Assumes every sentence vector has the same length, so both arrays are 2-D.
X = np.hstack((np.asarray(X_1), np.asarray(X_2)))
# Target labels taken from the is_duplicate column.
y = df['is_duplicate'].to_numpy()

In [21]:
# Hold out 25% of the pairs for evaluation; stratifying on the labels keeps
# the duplicate / non-duplicate ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [None]:
# Baseline classifier with a fixed random_state, fitted on the training
# split; the held-out split then gets class probabilities and hard labels.
logit = LogisticRegression(n_jobs=4, random_state=42)
logit.fit(X_train, y_train)
test_preds_proba = logit.predict_proba(X_test)
test_preds = logit.predict(X_test)

In [23]:
def get_metrics(predict, proba, target):
    """Print F1, macro-averaged F1, recall and ROC AUC for a set of predictions.

    Parameters
    ----------
    predict : array-like
        Predicted class labels.
    proba : array-like
        Predicted probabilities; column 1 is used as the score for ROC AUC.
    target : array-like
        True class labels.

    Returns
    -------
    None
        The metrics are only printed, one per line.
    """
    binary_f1 = f1_score(target, predict)
    print(f"f1: {binary_f1}")

    macro_f1 = f1_score(target, predict, average='macro')
    print(f"f1 macro: {macro_f1}")

    recall = sklearn.metrics.recall_score(target, predict)
    print(f"recall: {recall}")

    roc_auc = sklearn.metrics.roc_auc_score(target, proba[:, 1])
    print(f"roc auc :{roc_auc}")

In [24]:
# Report the held-out metrics for the pre-cleaned data.
get_metrics(predict=test_preds, proba=test_preds_proba, target=y_test)

f1: 0.5227272727272727
f1 macro: 0.7600859221335504
recall: 0.3774617067833698
roc auc :0.9348916058950985
