In [61]:
import sys, os
sys.path.append(os.environ['minotaur'])

import datetime as dt
import re
import sys

import yaml
from urllib.parse import urlparse


%matplotlib inline
from matplotlib import pyplot as plt
plt.style.use('seaborn-notebook')


import pandas as pd
pd.set_option('display.max_columns', 30)
import numpy as np
import patsy
import statsmodels.api as sm

from pylab import rcParams
rcParams['figure.figsize'] = 14,3
rcParams['font.family'] = 'Open Sans'

from dbs import redshift
redshift.connect()

from __future__ import division

In [62]:
b = redshift.execute( """
select * from bearden_exports.organizations where in_business != 'closed' and organization_type IN ('gallery','dealer')
""")

In [63]:
a = redshift.execute("""
select id,website,billing_city as "city",billing_country as "country",billing_street as "street" from segment_salesforce.accounts where not is_deleted and website is not null""")
lweb = redshift.execute("""
select id as "lead_id",website,email,city,country,street from segment_salesforce.leads where not is_deleted and converted_date is NULL and website is not null""")
p = redshift.execute("""
select email_c as "email",lead_c as "lead_id",contact_c as "contact_id",account_c as "account_id" from segment_salesforce.patrons where not is_deleted""")

In [64]:
a = a.fillna(np.nan)
lweb = lweb.fillna(np.nan)
p = p.fillna(np.nan)
b = b.fillna(np.nan)

In [65]:
a.website = a.website.astype(str)
lweb.website = lweb.website.astype(str)

In [66]:
def strip(website):
    website = website.replace("http://","")
    website = website.replace("http:","")    
    website = website.replace("https://","") 
    website = website.replace("http:","")    
    website = re.sub(r'(www.)(?!com)',r'',website)
    urls = website.split('/')[0]
#     urls = urls.replace("]","")
#     urls = urls.replace("[","")    
    return urls
a.website = a.website.apply(strip)
b.website = b.website.apply(strip)
lweb.website = lweb.website.apply(strip)

In [67]:
a1 = a.set_index(['website'])
b1 = b.set_index(['website'])
lweb1 = lweb.set_index(['website'])
print(a1.shape)
print(b1.shape)
print(lweb1.shape)

(17308, 4)
(32318, 13)
(63197, 5)


In [68]:
a2 = a1[~a1.index.duplicated(keep='first')]
b2 = b1[~b1.index.duplicated(keep='first')]
lweb2 = lweb1[~lweb1.index.duplicated(keep='first')]

In [69]:
print(a1.index.size)
print(a2.index.size)
print(b1.index.size)
print(b2.index.size)


17308
16710
32318
29279


In [70]:
ab = a2.combine_first(b2)
print(ab.index.size)
ab.head()

37079


Unnamed: 0_level_0,bearden_id,city,country,email,id,in_business,latitude,location,longitude,organization_name,organization_type,phone_number,sources,street,tag_names
website,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,,,,,001C000001dBi7qIAC,,,,,,,,,,
+32 (0)475 34 01 11,,,,,001C000001bchgTIAQ,,,,,,,,,,
+33 6 09 02 05 35,,,,,001C000001bcymXIAQ,,,,,,,,,,
-,,Monte Carlo,Monaco,,001C000001QmZpGIAV,,,,,,,,,,
.arnes.si,21104.0,,,,,unknown,0.0,"Stari trg 21 Ljubljana, Slovenia",0.0,Galerija Škuc,gallery,,"[""Gallery Locator""]",,


In [71]:
ab = ab[~ab.index.duplicated(keep=False)]
ab[ab.bearden_id.notnull()].count()

bearden_id           29279
city                 15131
country              15750
email                19125
id                    8910
in_business          29279
latitude             26193
location             26259
longitude            26193
organization_name    27079
organization_type    29279
phone_number         18604
sources              29279
street                6671
tag_names            14688
dtype: int64

In [72]:
not_matched = ab[ab.id.isnull()]
not_matched.count()

bearden_id           20369
city                  6901
country               7447
email                12027
id                       0
in_business          20369
latitude             17672
location             17682
longitude            17672
organization_name    18409
organization_type    20369
phone_number         11752
sources              20369
street                   0
tag_names             8909
dtype: int64

In [73]:
lead_matched = lweb2.combine_first(not_matched)
lead_matched = lead_matched[~lead_matched.index.duplicated(keep=False)]
not_matched = lead_matched[lead_matched.lead_id.isnull()]
not_matched_without_email = not_matched[not_matched.email.isnull()]
not_matched_with_email = not_matched[not_matched.email.notnull()]
print(not_matched.count())

bearden_id           11889
city                  4022
country               4357
email                 5924
id                       0
in_business          11889
latitude             10375
lead_id                  0
location             10381
longitude            10375
organization_name    10780
organization_type    11889
phone_number          7198
sources              11889
street                   0
tag_names             5493
dtype: int64


In [74]:
print(not_matched_without_email.count())

bearden_id           5965
city                 2054
country              2270
email                   0
id                      0
in_business          5965
latitude             4622
lead_id                 0
location             4625
longitude            4622
organization_name    4897
organization_type    5965
phone_number         2261
sources              5965
street                  0
tag_names            1627
dtype: int64


In [75]:
print(not_matched_with_email.count())

bearden_id           5924
city                 1968
country              2087
email                5924
id                      0
in_business          5924
latitude             5753
lead_id                 0
location             5756
longitude            5753
organization_name    5883
organization_type    5924
phone_number         4937
sources              5924
street                  0
tag_names            3866
dtype: int64


In [76]:
not_matched_with_email['url'] = not_matched_with_email.index
not_matched_with_email = not_matched_with_email.set_index(['email'])
p = p.set_index(['email'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [77]:
not_matched_with_email = not_matched_with_email[~not_matched_with_email.index.duplicated(keep=False)]
p = p[~p.index.duplicated(keep=False)]
email_matched = p.combine_first(not_matched_with_email)
email_matched = email_matched[~email_matched.index.duplicated(keep=False)]

In [78]:
not_matched = email_matched[(email_matched.bearden_id.notnull()) & email_matched.lead_id.isnull() & email_matched.contact_id.isnull()]
email_matched = email_matched[(email_matched.bearden_id.notnull()) & (email_matched.lead_id.notnull() | email_matched.contact_id.notnull())]

In [79]:
not_matched['website'] = not_matched.url
not_matched['email'] = not_matched.index
del not_matched['url']
not_matched = not_matched.set_index('website')

In [80]:
not_matched = pd.concat([not_matched,not_matched_without_email])
not_matched = not_matched[~not_matched.index.duplicated(keep=False)]

In [81]:
not_matched.count()

account_id               0
bearden_id           10283
city                  3358
contact_id               0
country               3675
email                 4318
id                       0
in_business          10283
latitude              8831
lead_id                  0
location              8834
longitude             8831
organization_name     9182
organization_type    10283
phone_number          5970
sources              10283
street                   0
tag_names             4534
dtype: int64

In [83]:
not_matched.to_csv('new_leads.csv')