In [69]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

pd.set_option('display.max_colwidth', -1)
df = pd.read_csv('CSV/TransfixLeads2.0.csv',  dtype=str)
print(df.shape)
df.head(2)

(8130, 2)


  


Unnamed: 0,Id,Email
0,00Q1J00000dhNbeUAE,miistrategic@gmail.com
1,00Q1J00000gX93YUAS,kagtrucking@yahoo.com


In [70]:
# Clean the data
df.dropna()
# df['Email'] = df['Email'].str.replace('[^a-zA-Z]', '')
# df['Email'] = df['Email'].str.replace(r'[^\w\s]+', '')

lead_emails = df['Email']

In [71]:
def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]


In [72]:
# get Tf-IDF Matrix
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(lead_emails.apply(lambda x: np.str_(x)))


In [73]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [74]:
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.7)


In [75]:
def get_matches_df(sparse_matrix, email_vector,email_ids, top=5):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    left_name_Ids = np.empty([nr_matches], dtype=object)
    right_name_Ids = np.empty([nr_matches], dtype=object)
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(nr_matches):        
        left_name_Ids[index] = email_ids[sparserows[index]]
        left_side[index] = email_vector[sparserows[index]]

        right_name_Ids[index] = email_ids[sparsecols[index]]
        right_side[index] = email_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({
                        'SFDC_ID':  left_name_Ids,
                        'left_side': left_side,
                        'right_SFDC_ID':right_name_Ids,
                          'right_side': right_side,
                           'similairity': similairity})

In [76]:
name_Ids = df['Id']
matches_df = get_matches_df(matches, lead_emails,name_Ids, top=8130)






In [77]:
matches_df = matches_df[matches_df['similairity'] > 0.9] 




In [78]:
matches_df.sort_values('SFDC_ID' )



Unnamed: 0,SFDC_ID,left_side,right_SFDC_ID,right_side,similairity
4095,00Q1J00000byQY9UAM,daniel.garcia@asd-tech.mx,00Q1J00000byQY9UAM,daniel.garcia@asd-tech.mx,1.0
28,00Q1J00000ciPNiUAM,z.a@yahoo.com,00Q1J00000ciPNiUAM,z.a@yahoo.com,1.0
27,00Q1J00000ciPUNUA2,edmondg20@yahoo.com,00Q1J00000ciPUNUA2,edmondg20@yahoo.com,1.0
25,00Q1J00000ciPdvUAE,plugo@hopeglobal.com,00Q1J00000ciPdvUAE,plugo@hopeglobal.com,1.0
31,00Q1J00000ciPuwUAE,naveengadi2017@gmail.com,00Q1J00000ciPuwUAE,naveengadi2017@gmail.com,1.0
...,...,...,...,...,...
60,00Q1J00000gZ5tJUAS,ltaylor@ultimatetruckingsolutions.com,00Q1J00000gZ5tJUAS,ltaylor@ultimatetruckingsolutions.com,1.0
4087,00Q1J00000gZ5voUAC,irojas62@icloud.com,00Q1J00000gZ5voUAC,irojas62@icloud.com,1.0
61,00Q1J00000gZ5xHUAS,jstruckingcorp@yahoo.com,00Q1J00000gZ5xHUAS,jstruckingcorp@yahoo.com,1.0
339,00Q1J00000gZ6EEUA0,gaccarriersinc@gmail.com,00Q1J00000gZ6EEUA0,gaccarriersinc@gmail.com,1.0


In [79]:
mask = matches_df['SFDC_ID'] == matches_df['right_SFDC_ID'] 

#if mask is true item in df['SFDC_ID'] is selected else item in df['right_SFDC_ID'] is selected

matches_df[mask]



Unnamed: 0,SFDC_ID,left_side,right_SFDC_ID,right_side,similairity
0,00Q1J00000dhNbeUAE,miistrategic@gmail.com,00Q1J00000dhNbeUAE,miistrategic@gmail.com,1.0
1,00Q1J00000gX93YUAS,kagtrucking@yahoo.com,00Q1J00000gX93YUAS,kagtrucking@yahoo.com,1.0
3,00Q1J00000gX9XmUAK,mastertrans.dip@gmail.com,00Q1J00000gX9XmUAK,mastertrans.dip@gmail.com,1.0
4,00Q1J00000gXAzXUAW,eminent.retha@gmail.com,00Q1J00000gXAzXUAW,eminent.retha@gmail.com,1.0
5,00Q1J00000exKUOUA2,dispatch@caintransport.com,00Q1J00000exKUOUA2,dispatch@caintransport.com,1.0
...,...,...,...,...,...
8124,00Q1J00000exwfAUAQ,admin@dykemantrans.com,00Q1J00000exwfAUAQ,admin@dykemantrans.com,1.0
8125,00Q1J00000exuleUAA,jbolek@hmsmfg.com,00Q1J00000exuleUAA,jbolek@hmsmfg.com,1.0
8126,00Q1J00000eyCKtUAM,ajaypalsingh.bedi@adityabirla.com,00Q1J00000eyCKtUAM,ajaypalsingh.bedi@adityabirla.com,1.0
8127,00Q1J00000eyCKuUAM,mgraceffa@chartenex.com,00Q1J00000eyCKuUAM,mgraceffa@chartenex.com,1.0


In [80]:
# Get names of indexes for which column mask == true
indexNames = matches_df[ mask ].index
# Delete these row indexes from dataFrame
matches_df.drop(indexNames , inplace=True)



In [81]:
matches_df.sort_values('similairity',ascending=True)



Unnamed: 0,SFDC_ID,left_side,right_SFDC_ID,right_side,similairity
8128,00Q1J00000eyCKuUAM,mgraceffa@chartenex.com,00Q1J00000ewKyRUAU,mgraceffa@charternex.com,0.901131
1072,00Q1J00000ewKyRUAU,mgraceffa@charternex.com,00Q1J00000eyCKuUAM,mgraceffa@chartenex.com,0.901131
4647,00Q1J00000exxArUAI,schelseatransportation@outloock.com,00Q1J00000eyB1AUAU,schelseatransportation@outlook.com,0.906159
7191,00Q1J00000eyB1AUAU,schelseatransportation@outlook.com,00Q1J00000exxArUAI,schelseatransportation@outloock.com,0.906159
3740,00Q1J00000exEhEUAU,jeff@shipmfx.com,00Q1J00000exjmdUAA,jeff@shipmfx.com,1.000000
...,...,...,...,...,...
6238,00Q1J00000exFXKUA2,dispatch@kingdomtrucklines.com,00Q1J00000exjmcUAA,dispatch@kingdomtrucklines.com,1.000000
3895,00Q1J00000exGW2UAM,backhauls@krupptrucking.com,00Q1J00000exF4cUAE,backhauls@krupptrucking.com,1.000000
2542,00Q1J00000exEnPUAU,dispatch@kingdomtrucklines.com,00Q1J00000exjmcUAA,dispatch@kingdomtrucklines.com,1.000000
2543,00Q1J00000exEnPUAU,dispatch@kingdomtrucklines.com,00Q1J00000exFXKUA2,dispatch@kingdomtrucklines.com,1.000000


In [82]:
matches_df.to_csv ('./Results/LeadDups.csv',index = False, header=True)

In [83]:
mask = matches_df['SFDC_ID'] < matches_df['right_SFDC_ID'] 

# creates a new column checking True vs False, 

#if mask is true item in df['SFDC_ID'] is selected else item in df['right_SFDC_ID'] is selected

matches_df['col1'] = matches_df['SFDC_ID'].where(mask, matches_df['right_SFDC_ID'])

#same as above but a column for df['right_SFDC_ID']
matches_df['col2'] = matches_df['right_SFDC_ID'].where(mask, matches_df['SFDC_ID'])

# checks for duplicates in `col1` and `col2` and removes last duplicate
matches_df = matches_df.drop_duplicates(subset=['col1'])



In [84]:
matches_df.sort_values('similairity',ascending=True)

matches_df['similairity']= matches_df['similairity'].astype(str)


In [85]:
matches_df.to_csv ('./Results/LeadDups2.csv',index = False, header=True)



In [86]:
test_df = matches_df.groupby(['SFDC_ID', 'left_side' ], as_index=False)[['right_SFDC_ID', 'right_side', 'similairity']].agg(lambda x: ','.join(x))

test_df.head(500)


Unnamed: 0,SFDC_ID,left_side,right_SFDC_ID,right_side,similairity
0,00Q1J00000ewKyRUAU,mgraceffa@charternex.com,00Q1J00000eyCKuUAM,mgraceffa@chartenex.com,0.9011308461791817
1,00Q1J00000exEMfUAM,hktruckingllc@gmail.com,00Q1J00000exFXJUA2,hktruckingllc@gmail.com,1.0000000000000004
2,00Q1J00000exEZ1UAM,deltalinescorp@gmail.com,00Q1J00000exjjDUAQ,deltalinescorp@gmail.com,1.0000000000000002
3,00Q1J00000exEaMUAU,dispatch@rogerstransport.org,00Q1J00000exJFkUAM,dispatch@rogerstransport.org,0.9999999999999998
4,00Q1J00000exEb2UAE,bekexpressinc2017@gmail.com,00Q1J00000exFhyUAE,bekexpressinc2017@gmail.com,1.0000000000000002
...,...,...,...,...,...
94,00Q1J00000eyFauUAE,bcandmissd@gmail.com,00Q1J00000eyNWsUAM,bcandmissd@gmail.com,0.9999999999999998
95,00Q1J00000eyNAbUAM,bcandmissd@gmail.com,00Q1J00000eyNWsUAM,bcandmissd@gmail.com,0.9999999999999998
96,00Q1J00000eyNWiUAM,bcandmissd@gmail.com,"00Q1J00000exngBUAQ,00Q1J00000ey8WlUAI,00Q1J00000exm5rUAA,00Q1J00000exGKLUA2,00Q1J00000ey8cfUAA,00Q1J00000exx8QUAQ,00Q1J00000exGjAUAU,00Q1J00000exiLXUAY,00Q1J00000eyNWsUAM,00Q1J00000exGJrUAM","bcandmissd@gmail.com,bcandmissd@gmail.com,bcandmissd@gmail.com,bcandmissd@gmail.com,bcandmissd@gmail.com,bcandmissd@gmail.com,bcandmissd@gmail.com,bcandmissd@gmail.com,bcandmissd@gmail.com,bcandmissd@gmail.com","0.9999999999999998,0.9999999999999998,0.9999999999999998,0.9999999999999998,0.9999999999999998,0.9999999999999998,0.9999999999999998,0.9999999999999998,0.9999999999999998,0.9999999999999998"
97,00Q1J00000gWzZ1UAK,bcandmissd@gmail.com,00Q1J00000eyNWsUAM,bcandmissd@gmail.com,0.9999999999999998


In [87]:
test_df.to_csv ('./Results/LeadDups3.csv',index = False, header=True)

