In [2]:
import sys, os
sys.path.append(os.environ['minotaur'])

import datetime

import yaml

%matplotlib inline
from matplotlib import pyplot as plt

import pandas as pd
import numpy as np
import patsy
import statsmodels.api as sm

from pylab import rcParams
rcParams['figure.figsize'] = 14,3
rcParams['font.family'] = 'Open Sans'

from dbs import redshift
redshift.connect()

from __future__ import division

  from pandas.core import datetools


In [3]:
tasks = redshift.execute("""
SELECT t.activity_date,
       COALESCE(c.email,l.email) AS email,
       COALESCE(c.account_id,l.id) AS company_id,
       'sales' as source_table,
       t.type_c as activity_type,
       t.call_c as call_type
FROM segment_salesforce.tasks t
  LEFT JOIN segment_salesforce.contacts c
         ON t.who_id = c.id
        AND NOT c.is_deleted
  LEFT JOIN segment_salesforce.leads l
         ON t.who_id = l.id
        AND NOT l.is_deleted
WHERE NOT t.is_deleted AND (t.type_c = 'Email' OR t.type_c = 'Call')
""")
emails = redshift.execute("""
SELECT e.activity_date,
       p.email,
       p.company_id,
       'marketing' as source_table,
       'marketing_email' as activity_type,
       NULL as call_type
FROM marketo.email_delivers e
  INNER JOIN analytics.salesforce_patrons p ON e.lead_id = p.marketo_id
""")
opps = redshift.execute("""
SELECT id,
       account_id,
       stage_name,
       amount,
       created_date,
       close_date
       FROM segment_salesforce.opportunities
WHERE NOT is_deleted
AND   type = 'Activation'
AND (stage_name = 'Closed Won' OR stage_name = 'Closed Lost')
""")
site_landings = redshift.execute("""
SELECT sl.sent_at as activity_date,
       meam.email,
       sp.company_id,
       'site' as source_table,
       'site_visit' as activity_type,
       NULL as call_type
FROM analytics.site_landings sl
  INNER JOIN analytics.marketo_email_alias_mapping meam ON sl.looker_visitor_id = meam.looker_visitor_id
  INNER JOIN analytics.salesforce_patrons sp on meam.email = sp.email""")

In [4]:
t = tasks
e = emails
sl = site_landings
o = opps

In [5]:
con = pd.concat([t,e,sl],axis=0)

In [6]:
con.activity_type = con.activity_type.astype(str)
con.activity_type = con['activity_type'].apply(str.lower)
con.activity_type.unique()

array(['email', 'call', 'marketing_email', 'site_visit'], dtype=object)

In [7]:
con.call_type.unique()

array([None, 'No Answer', 'Connected', 'Left Voicemail', 'Not in Service',
       'Rejected', 'Wrong Number', 'Busy'], dtype=object)

will work with only 1 input

for activity_type, call_type in con:
    if call_type == 'Connected':
        con.activity_type = con.activity_type.replace('call', 'call_connect')
    elif call_type == 'Rejected':
        con.activity_type = con.activity_type.replace('call', 'call_reject')
        continue
con.activity_type.unique()        

In [8]:
con.loc[con['call_type'] == 'Connected', 'activity_type'] = 'call_connect'
con.activity_type.unique()        

array(['email', 'call', 'call_connect', 'marketing_email', 'site_visit'], dtype=object)

In [9]:
# trying to predict amount of time between activity and close. 
# trying to minimize time between activity and opportunity that is successful
# create dataset that combines site visits, emails, tasks with email address for each row
# add in opportunity
# sum up type by company_id and email


In [10]:
opps = pd.merge(con,o,left_on='company_id',right_on='account_id',how='left')

In [11]:
opps.email.size

2515334

In [12]:
opps['activity_date'] = pd.to_datetime(opps['activity_date'])
opps['created_date'] = pd.to_datetime(opps['created_date'])
opps['close_date'] = pd.to_datetime(opps['close_date'])

In [13]:
opps['days'] = (opps.created_date - opps.activity_date).astype('timedelta64[D]')
opps['has_opp'] = ((opps.days <= 30) & (opps.days >= 0))
opps.head()

Unnamed: 0,activity_date,email,company_id,source_table,activity_type,call_type,id,account_id,stage_name,amount,created_date,close_date,days,has_opp
0,2017-03-31,berlinblueart@mail.de,001C000001ZdVpGIAV,sales,email,,006C0000015ebgjIAA,001C000001ZdVpGIAV,Closed Won,6840.0,2017-03-08 10:40:41,2017-03-31,-23.0,False
1,2017-03-31,info@rolandoanselmi.com,001C000001QmZtsIAF,sales,email,,006C0000012K9cKIAS,001C000001QmZtsIAF,Closed Lost,,2016-02-25 19:36:18,2016-03-31,-400.0,False
2,2017-03-31,info@rolandoanselmi.com,001C000001QmZtsIAF,sales,email,,006C0000015ecvgIAA,001C000001QmZtsIAF,Closed Lost,,2017-03-08 14:27:16,2017-04-22,-23.0,False
3,2017-03-31,danielle@daniellearnaud.com,001C000001T2HzFIAV,sales,email,,006C0000011hwlFIAQ,001C000001T2HzFIAV,Closed Lost,,2016-01-22 10:21:54,2016-02-29,-434.0,False
4,2017-03-31,danielle@daniellearnaud.com,001C000001T2HzFIAV,sales,email,,006C000001255qFIAQ,001C000001T2HzFIAV,Closed Lost,,2016-02-12 10:33:46,2016-03-31,-413.0,False


In [14]:
hops = opps[opps.has_opp == True].copy()
hops.head()

Unnamed: 0,activity_date,email,company_id,source_table,activity_type,call_type,id,account_id,stage_name,amount,created_date,close_date,days,has_opp
9,2017-03-31,galeria@lbcontemporaryart.com,001C000001Y4R5vIAF,sales,email,,006C0000015gPCBIA2,001C000001Y4R5vIAF,Closed Lost,,2017-03-31 14:25:36,2017-05-15,0.0,True
27,2017-03-31,tracy@arkitip.com,001C000001WBBzZIAX,sales,email,,006C0000015gRHWIA2,001C000001WBBzZIAX,Closed Lost,,2017-03-31 19:32:17,2017-05-12,0.0,True
33,2017-03-31,azabawa@oneriverschool.com,001C000001aSZTjIAO,sales,email,,006C0000015gRdLIAU,001C000001aSZTjIAO,Closed Lost,,2017-03-31 20:54:03,2017-05-15,0.0,True
39,2017-04-01,roxana@galateca.ro,001C000001Une4SIAR,sales,call,No Answer,006C0000015gUc7IAE,001C000001Une4SIAR,Closed Lost,,2017-04-03 08:55:02,2017-05-18,2.0,True
43,2017-04-01,roxana@galateca.ro,001C000001Une4SIAR,sales,call_connect,Connected,006C0000015gUc7IAE,001C000001Une4SIAR,Closed Lost,,2017-04-03 08:55:02,2017-05-18,2.0,True


In [15]:
hops['company_opp'] = hops.company_id.map(str) + hops.id.map(str)
hops.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,activity_date,email,company_id,source_table,activity_type,call_type,id,account_id,stage_name,amount,created_date,close_date,days,has_opp,company_opp
9,2017-03-31,galeria@lbcontemporaryart.com,001C000001Y4R5vIAF,sales,email,,006C0000015gPCBIA2,001C000001Y4R5vIAF,Closed Lost,,2017-03-31 14:25:36,2017-05-15,0.0,True,001C000001Y4R5vIAF006C0000015gPCBIA2
27,2017-03-31,tracy@arkitip.com,001C000001WBBzZIAX,sales,email,,006C0000015gRHWIA2,001C000001WBBzZIAX,Closed Lost,,2017-03-31 19:32:17,2017-05-12,0.0,True,001C000001WBBzZIAX006C0000015gRHWIA2
33,2017-03-31,azabawa@oneriverschool.com,001C000001aSZTjIAO,sales,email,,006C0000015gRdLIAU,001C000001aSZTjIAO,Closed Lost,,2017-03-31 20:54:03,2017-05-15,0.0,True,001C000001aSZTjIAO006C0000015gRdLIAU
39,2017-04-01,roxana@galateca.ro,001C000001Une4SIAR,sales,call,No Answer,006C0000015gUc7IAE,001C000001Une4SIAR,Closed Lost,,2017-04-03 08:55:02,2017-05-18,2.0,True,001C000001Une4SIAR006C0000015gUc7IAE
43,2017-04-01,roxana@galateca.ro,001C000001Une4SIAR,sales,call_connect,Connected,006C0000015gUc7IAE,001C000001Une4SIAR,Closed Lost,,2017-04-03 08:55:02,2017-05-18,2.0,True,001C000001Une4SIAR006C0000015gUc7IAE


In [16]:
company_opps = hops.company_opp.unique()

In [17]:
df = []
for company_opp in company_opps:
    d = dict(hops[hops.company_opp == company_opp].activity_type.value_counts())
    d['id'] = company_opp
    df += [d]
df = pd.DataFrame(df)


In [18]:
df = df.fillna(0)
df2 = df.set_index(['id'],drop=True)
df2.head()

Unnamed: 0_level_0,call,call_connect,email,marketing_email,site_visit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
001C000001Y4R5vIAF006C0000015gPCBIA2,0.0,1.0,4.0,0.0,0.0
001C000001WBBzZIAX006C0000015gRHWIA2,0.0,1.0,4.0,0.0,0.0
001C000001aSZTjIAO006C0000015gRdLIAU,1.0,1.0,6.0,1.0,0.0
001C000001Une4SIAR006C0000015gUc7IAE,4.0,2.0,2.0,0.0,0.0
001C000001aTieQIAS006C0000015gxsRIAQ,4.0,0.0,10.0,2.0,9.0


In [19]:
won = hops.set_index('company_opp',drop=True)
won['is_won'] = won.stage_name == 'Closed Won'
won.head()

Unnamed: 0_level_0,activity_date,email,company_id,source_table,activity_type,call_type,id,account_id,stage_name,amount,created_date,close_date,days,has_opp,is_won
company_opp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
001C000001Y4R5vIAF006C0000015gPCBIA2,2017-03-31,galeria@lbcontemporaryart.com,001C000001Y4R5vIAF,sales,email,,006C0000015gPCBIA2,001C000001Y4R5vIAF,Closed Lost,,2017-03-31 14:25:36,2017-05-15,0.0,True,False
001C000001WBBzZIAX006C0000015gRHWIA2,2017-03-31,tracy@arkitip.com,001C000001WBBzZIAX,sales,email,,006C0000015gRHWIA2,001C000001WBBzZIAX,Closed Lost,,2017-03-31 19:32:17,2017-05-12,0.0,True,False
001C000001aSZTjIAO006C0000015gRdLIAU,2017-03-31,azabawa@oneriverschool.com,001C000001aSZTjIAO,sales,email,,006C0000015gRdLIAU,001C000001aSZTjIAO,Closed Lost,,2017-03-31 20:54:03,2017-05-15,0.0,True,False
001C000001Une4SIAR006C0000015gUc7IAE,2017-04-01,roxana@galateca.ro,001C000001Une4SIAR,sales,call,No Answer,006C0000015gUc7IAE,001C000001Une4SIAR,Closed Lost,,2017-04-03 08:55:02,2017-05-18,2.0,True,False
001C000001Une4SIAR006C0000015gUc7IAE,2017-04-01,roxana@galateca.ro,001C000001Une4SIAR,sales,call_connect,Connected,006C0000015gUc7IAE,001C000001Une4SIAR,Closed Lost,,2017-04-03 08:55:02,2017-05-18,2.0,True,False


In [20]:
won = won[['is_won']].copy()
won = won[~won.index.duplicated(keep='last')]
won.index.size

6667

In [21]:
final = df2.join(won)
final.shape
final.to_csv('final.csv')

(6667, 6)

In [22]:
final.head(10)

Unnamed: 0_level_0,call,call_connect,email,marketing_email,site_visit,is_won
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
001C000001Y4R5vIAF006C0000015gPCBIA2,0.0,1.0,4.0,0.0,0.0,False
001C000001WBBzZIAX006C0000015gRHWIA2,0.0,1.0,4.0,0.0,0.0,False
001C000001aSZTjIAO006C0000015gRdLIAU,1.0,1.0,6.0,1.0,0.0,False
001C000001Une4SIAR006C0000015gUc7IAE,4.0,2.0,2.0,0.0,0.0,False
001C000001aTieQIAS006C0000015gxsRIAQ,4.0,0.0,10.0,2.0,9.0,True
001C000001aTiknIAC006C0000015hNtaIAE,2.0,0.0,9.0,1.0,16.0,True
001C000001aTjjpIAC006C0000015gwjGIAQ,0.0,2.0,3.0,0.0,0.0,False
001C000001Nz0wAIAR006C0000015gZzDIAU,0.0,1.0,1.0,6.0,0.0,False
001C000001QmZnkIAF006C0000015gd3iIAA,0.0,3.0,1.0,2.0,0.0,False
001C000001WBXZtIAP006C0000015gfx4IAA,0.0,2.0,1.0,0.0,6.0,True


In [23]:
final.corr()

Unnamed: 0,call,call_connect,email,marketing_email,site_visit,is_won
call,1.0,0.173449,0.166288,0.079285,0.039403,-0.006366
call_connect,0.173449,1.0,0.060847,0.108858,0.048779,-0.010833
email,0.166288,0.060847,1.0,0.002574,0.120178,0.15716
marketing_email,0.079285,0.108858,0.002574,1.0,0.158325,-0.033227
site_visit,0.039403,0.048779,0.120178,0.158325,1.0,0.128645
is_won,-0.006366,-0.010833,0.15716,-0.033227,0.128645,1.0


In [24]:
X = final.copy()
del X['is_won']

In [25]:
y = final['is_won'].copy()

In [28]:
from sklearn.linearmodels import SGDRegressor

ModuleNotFoundError: No module named 'sklearn.linearmodels'