# V-Lookup

In [67]:
# We import the Python libraries that we are going to use.

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
from sys import exit

## Using the Email parameter

### Cleansing of SF contacts

In [68]:
# We load the SF contacts file and discard every row that has no Contact ID.

df1 = (
    pd.read_csv('input/sf_contacts.csv', encoding='utf-8')
    .dropna(subset=['Contact ID'])
)

In [69]:
# We change the format of the Last Modified Date column from %d/%m/%Y to %Y-%m-%d.
# The input is day-first, so dayfirst=True is passed explicitly: without it an
# ambiguous value such as 03/04/2021 would be read as March 4th instead of April 3rd.
# The result is stored as text (not datetime), which still sorts chronologically.

df1['Last Modified Date'] = (
    pd.to_datetime(df1['Last Modified Date'], dayfirst=True)
    .dt.strftime('%Y-%m-%d')
)

In [70]:
# E-mail lookup table built from the SF contacts:
#   1. keep only the rows that have a value in the Email column;
#   2. list the values that do not contain '@' (bad_email) and remove those rows;
#   3. sort by Email (ascending) and then by Last Modified Date (descending);
#   4. drop duplicates by Email, keeping the first row of each group.

df1_email = df1.dropna(subset=['Email'])
has_at_sign = df1_email['Email'].str.lower().str.contains('@')
bad_email = df1_email.loc[~has_at_sign, 'Email'].tolist()
df1_email = df1_email[~df1_email['Email'].isin(bad_email)].reset_index(drop=True)
df1_email = (
    df1_email
    .sort_values(by=['Email', 'Last Modified Date'], ascending=[True, False])
    .drop_duplicates(subset='Email', keep='first')
)

### Cleansing of external contacts.

In [71]:
# We load the file with the external contacts.

df3 = pd.read_csv(
    'input/ext_data.csv',
    encoding='utf-8',
)

In [72]:
# We clean the LinkedIn URL column if the external file has it: every value that
# contains 'linkedin.com' becomes 'www.linkedin.com' followed by whatever comes
# after the last 'linkedin.com' in the original value. All other values
# (including NaN) are left untouched.
# The rows are selected with a boolean mask and written back through .loc;
# the chained form df3['LinkedIn URL'][i] = ... may assign to a temporary copy.

if 'LinkedIn URL' in df3.columns:
    url_text = df3['LinkedIn URL'].astype(str)
    has_domain = url_text.str.contains('linkedin.com', regex=False)
    if has_domain.any():
        # rsplit treats the separator literally; n=1 keeps only the last piece.
        url_tail = url_text[has_domain].str.rsplit('linkedin.com', n=1).str[-1]
        df3.loc[has_domain, 'LinkedIn URL'] = 'www.linkedin.com' + url_tail

In [73]:
# External e-mail table: rows with NaN in the Email column are removed, then
# only the first row of each Email value is kept.

df3_email = (
    df3
    .dropna(subset=['Email'])
    .drop_duplicates(subset='Email', keep='first')
)

In [74]:
# We merge the two DataFrames by Email with an inner join, so only the Emails
# present in both the SF and the external tables remain.
# No merge indicator is requested: on an inner join it is 'both' for every row.

inner_email = pd.merge(df1_email, df3_email, on='Email', how='inner')

In [75]:
# Emails that appear in both sources, as a plain Python list.

inner_email_list = inner_email.loc[:, 'Email'].to_list()

In [76]:
# From the matched rows we keep only the join key (Email) and the Contact ID.
inner_email_0 = inner_email.loc[:, ['Email', 'Contact ID']]

In [77]:
# 'old' holds the external rows whose Email matched an SF contact, together
# with the Contact ID of that contact.
# No merge indicator is requested: on an inner join it is 'both' for every row.
old = pd.merge(inner_email_0, df3_email, on='Email', how='inner')

In [78]:
# 'new' holds the external rows whose Email did not match any SF contact.
# It is filtered from df3 (not df3_email), so rows with a NaN Email and rows
# with a repeated Email are kept. The matching 'old' DataFrame is the result
# of the Email merge above.

email_is_matched = df3['Email'].isin(inner_email_list)
new = df3[~email_is_matched].reset_index(drop=True)

In [79]:
# We write both Email-based results to the output folder, without the index
# column and encoded as UTF-8 with a BOM.

for frame, csv_path in (
    (old, 'output/old_contacts.csv'),
    (new, 'output/new_contacts.csv'),
):
    frame.to_csv(csv_path, index=False, encoding='utf-8-sig')

In [80]:
# Everything below matches contacts by LinkedIn URL, so we stop here when the
# external file does not have that column.
if 'LinkedIn URL' not in df3.columns:
    exit()

## Using the LinkedIn URL parameter

### Cleansing the old-new DataFrames.

In [81]:
# LinkedIn lookup table built from the SF contacts:
#   1. keep only the rows that have a value in the LinkedIn URL column;
#   2. list the values that do not contain 'linkedin.com' in any letter case
#      (bad_ln) and remove those rows;
#   3. rewrite each remaining URL as 'www.linkedin.com' + the text that follows
#      the last 'linkedin.com', then strip surrounding whitespace;
#   4. sort by LinkedIn URL (ascending) and Last Modified Date (descending);
#   5. drop duplicates by LinkedIn URL, keeping the first row of each group.
# Step 3 matches the domain case-insensitively and with an escaped dot, the
# same way step 2 accepts it. Splitting on '.linkedin.com' left values such as
# 'linkedin.com/in/x' or 'WWW.LinkedIn.com/in/x' unsplit, so the prefix was
# glued in front of the whole original string.

df1_ln = df1.dropna(subset=['LinkedIn URL'])
is_linkedin = df1_ln['LinkedIn URL'].str.lower().str.contains('linkedin.com', regex=False)
bad_ln = df1_ln.loc[~is_linkedin, 'LinkedIn URL'].tolist()
df1_ln = df1_ln[is_linkedin].reset_index(drop=True)
df1_ln['LinkedIn URL'] = (
    df1_ln['LinkedIn URL']
    # Greedy '.*' consumes everything up to and including the last domain match.
    .str.replace(r'^.*linkedin\.com', 'www.linkedin.com', case=False, regex=True)
    .str.strip()
)
df1_ln = df1_ln.sort_values(by=['LinkedIn URL', 'Last Modified Date'], ascending=[True, False])
df1_ln = df1_ln.drop_duplicates(subset='LinkedIn URL', keep='first')

In [82]:
# LinkedIn view of the 'old' DataFrame:
#   - rows with NaN in the LinkedIn URL column are removed;
#   - values that do not contain 'linkedin.com' are listed (bad_ln_old) and
#     their rows are removed;
#   - duplicates by LinkedIn URL are dropped, keeping the first one.

old_ln = old.dropna(subset=['LinkedIn URL'])
old_has_domain = old_ln['LinkedIn URL'].str.lower().str.contains('linkedin.com')
bad_ln_old = old_ln.loc[~old_has_domain, 'LinkedIn URL'].tolist()
old_ln = (
    old_ln[~old_ln['LinkedIn URL'].isin(bad_ln_old)]
    .reset_index(drop=True)
    .drop_duplicates(subset='LinkedIn URL', keep='first')
)

In [83]:
# LinkedIn view of the 'new' DataFrame:
#   - rows with NaN in the LinkedIn URL column are removed;
#   - values that do not contain 'linkedin.com' are listed (bad_ln_new) and
#     their rows are removed;
#   - duplicates by LinkedIn URL are dropped, keeping the first one.

new_ln = new.dropna(subset=['LinkedIn URL'])
new_has_domain = new_ln['LinkedIn URL'].str.lower().str.contains('linkedin.com')
bad_ln_new = new_ln.loc[~new_has_domain, 'LinkedIn URL'].tolist()
new_ln = (
    new_ln[~new_ln['LinkedIn URL'].isin(bad_ln_new)]
    .reset_index(drop=True)
    .drop_duplicates(subset='LinkedIn URL', keep='first')
)

In [84]:
# We merge df1_ln and old_ln by 'LinkedIn URL' (inner join): the result has one
# row per URL found in both tables.
# No merge indicator is requested: on an inner join it is 'both' for every row.

inner_ln_old = pd.merge(df1_ln, old_ln, on='LinkedIn URL', how='inner')

In [85]:
# We merge df1_ln and new_ln by 'LinkedIn URL' (inner join): the result has one
# row per URL found in both tables.
# No merge indicator is requested: on an inner join it is 'both' for every row.

inner_ln_new = pd.merge(df1_ln, new_ln, on='LinkedIn URL', how='inner')

In [86]:
# LinkedIn URLs shared with the SF contacts, one plain list per external subset.

inner_ln_old_list = inner_ln_old.loc[:, 'LinkedIn URL'].to_list()
inner_ln_new_list = inner_ln_new.loc[:, 'LinkedIn URL'].to_list()

In [87]:
# We split 'old' and 'new' by whether their LinkedIn URL was matched.
# Names read <Email>_<LinkedIn>: old_new means old Email and new LinkedIn.

old_ln_matched = old['LinkedIn URL'].isin(inner_ln_old_list)
new_ln_matched = new['LinkedIn URL'].isin(inner_ln_new_list)

old_new = old[~old_ln_matched].reset_index(drop=True)
new_new = new[~new_ln_matched].reset_index(drop=True)
old_old = old[old_ln_matched].reset_index(drop=True)
new_old = new[new_ln_matched].reset_index(drop=True)

In [88]:
# SF rows (from df1_ln) whose LinkedIn URL matched a contact of the 'new' subset.
sf_ln_matched = df1_ln['LinkedIn URL'].isin(inner_ln_new_list)
new_old2 = df1_ln.loc[sf_ln_matched].reset_index(drop=True)


In [89]:
# We remove the SF-side Email and Last Modified Date columns from new_old2
# (presumably so the external Email is the one kept in the merge that follows).
new_old2 = new_old2.drop(columns=['Email', 'Last Modified Date'])

In [90]:
# We attach the remaining new_old2 columns to the external rows of new_old,
# matching on 'LinkedIn URL' (inner join).
# No merge indicator is requested: on an inner join it is 'both' for every row.
inner_ln_new2 = pd.merge(new_old, new_old2, on='LinkedIn URL', how='inner')

In [91]:
# The DataFrames to be stacked, in this order.
frames = [
    old_new,
    old_old,
    inner_ln_new2,
]

In [92]:
# We stack the frames row-wise; each frame keeps its own index values.
result = pd.concat(objs=frames)

In [93]:
# One row per Contact ID (the first occurrence is kept), renumbered from 0.
old_old2 = (
    result
    .drop_duplicates(subset='Contact ID', keep='first')
    .reset_index(drop=True)
)

In [94]:
# We save two files:
#   - new_new: contacts with a new Email and a new LinkedIn URL;
#   - old_old2: the de-duplicated contacts that carry a Contact ID.
# Both use the same options as the Email-based exports (no index column,
# 'utf-8-sig' encoding), so the files do not get an extra unnamed column
# holding the 0..n-1 row numbers.

new_new.to_csv('output/new_new_contacts.csv', index=False, encoding='utf-8-sig')
old_old2.to_csv('output/old_old_contacts.csv', index=False, encoding='utf-8-sig')