In [131]:
import pandas as pd
import numpy as np

In [3]:
account = pd.read_csv('data/account.csv')
contact = pd.read_csv('data/contact.csv')
opportunity = pd.read_csv('data/opportunity.csv')
task = pd.read_csv('data/task.csv')

In [132]:
# Let's create some features using task.
task.head()

Unnamed: 0,ID,WHOID,CALLDISPOSITION,CALLTYPE,CALLDURATIONINSECONDS,ISCLOSED,ACCOUNTID,ISHIGHPRIORITY,PRIORITY,STATUS,WHATID,TASKSUBTYPE
0,00Tak000001SknKEAS,003ak000002nqxxAAA,Interested - Follow Up Needed,Internal,68,False,001ak00000IruOBAAZ,False,Normal,Not Started,006ak000002sEVcAAM,Task
1,00Tak000001SknLEAS,003ak000002nqxxAAA,Not Interested,Outbound,45,False,001ak00000IruOBAAZ,False,Low,Not Started,006ak000002sEVcAAM,Call
2,00Tak000001SknMEAS,003ak000002nqxxAAA,Not Interested,Inbound,39,False,001ak00000IruOBAAZ,True,High,Not Started,006ak000002sEVcAAM,Email
3,00Tak000001SknNEAS,003ak000002nqxxAAA,Call Successful,Inbound,348,True,001ak00000IruOBAAZ,True,High,Completed,006ak000002sEVcAAM,Email
4,00Tak000001SknOEAS,003ak000002nqxyAAA,Scheduled Call Back,Outbound,38,False,001ak00000IruOCAAZ,False,Low,Not Started,006ak000002sEVdAAM,Email


In [7]:
task.CALLDISPOSITION.value_counts()

Interested - Follow Up Needed    467
Call Successful                  465
Not Interested                   305
Call Unsuccessful                149
Scheduled Call Back               71
No Answer                         67
Name: CALLDISPOSITION, dtype: int64

In [9]:
task.CALLTYPE.value_counts()

Outbound    518
Internal    505
Inbound     501
Name: CALLTYPE, dtype: int64

In [11]:
task.ISCLOSED.value_counts()

False    1059
True      465
Name: ISCLOSED, dtype: int64

In [12]:
task.ISHIGHPRIORITY.value_counts()

False    904
True     620
Name: ISHIGHPRIORITY, dtype: int64

In [16]:
task.PRIORITY.value_counts()

High      620
Normal    619
Low       285
Name: PRIORITY, dtype: int64

In [17]:
task.STATUS.value_counts()

Not Started    910
Completed      465
In Progress    149
Name: STATUS, dtype: int64

In [18]:
task.TASKSUBTYPE.value_counts()

Email    703
Call     494
Task     327
Name: TASKSUBTYPE, dtype: int64

In [30]:
# CALLDISPOSITION
calldisposition = task.groupby(['WHATID', 'CALLDISPOSITION'])[['ID']].count().reset_index()
calldisposition.columns = ['WHATID', 'CALLDISPOSITION', 'CALLDISPOSITION_COUNT']
calldisposition

Unnamed: 0,WHATID,CALLDISPOSITION,CALLDISPOSITION_COUNT
0,006ak000002sAjJAAU,Call Successful,3
1,006ak000002sAjJAAU,Interested - Follow Up Needed,2
2,006ak000002sAjJAAU,No Answer,1
3,006ak000002sAjKAAU,Call Successful,1
4,006ak000002sAjKAAU,Interested - Follow Up Needed,1
...,...,...,...
1126,006ak000002sEZSAA2,Not Interested,2
1127,006ak000002sEZTAA2,Interested - Follow Up Needed,1
1128,006ak000002sEZUAA2,Call Successful,1
1129,006ak000002sEZUAA2,Interested - Follow Up Needed,1


In [31]:
print('Interested - Follow Up Needed: ', calldisposition[calldisposition.CALLDISPOSITION == 'Interested - Follow Up Needed']['CALLDISPOSITION_COUNT'].mean())
print('Call Successful: ', calldisposition[calldisposition.CALLDISPOSITION == 'Call Successful']['CALLDISPOSITION_COUNT'].mean())
print('Not Interested: ', calldisposition[calldisposition.CALLDISPOSITION == 'Not Interested']['CALLDISPOSITION_COUNT'].mean())
print('Call Unsuccessful: ', calldisposition[calldisposition.CALLDISPOSITION == 'Call Unsuccessful']['CALLDISPOSITION_COUNT'].mean())
print('Scheduled Call Back: ', calldisposition[calldisposition.CALLDISPOSITION == 'Scheduled Call Back']['CALLDISPOSITION_COUNT'].mean())
print('No Answer: ', calldisposition[calldisposition.CALLDISPOSITION == 'No Answer']['CALLDISPOSITION_COUNT'].mean())

Interested - Follow Up Needed:  1.4778481012658229
Call Successful:  1.4622641509433962
Not Interested:  1.3203463203463204
Call Unsuccessful:  1.1119402985074627
Scheduled Call Back:  1.0441176470588236
No Answer:  1.046875


In [88]:
# CALLTYPE
calltype = task.groupby(['WHATID', 'CALLTYPE'])[['ID']].count().reset_index()
calltype = calltype.pivot(index='WHATID', columns='CALLTYPE', values='ID').reset_index()
calltype.columns.name = None
calltype.columns = [col.upper() for col in calltype.columns]
calltype.INBOUND = calltype.INBOUND.fillna(0.0)
calltype.INTERNAL = calltype.INTERNAL.fillna(0.0)
calltype.OUTBOUND = calltype.OUTBOUND.fillna(0.0)
calltype

Unnamed: 0,WHATID,INBOUND,INTERNAL,OUTBOUND
0,006ak000002sAjJAAU,2.0,2.0,2.0
1,006ak000002sAjKAAU,0.0,2.0,3.0
2,006ak000002sAjLAAU,0.0,0.0,3.0
3,006ak000002sAjMAAU,2.0,1.0,1.0
4,006ak000002sAjNAAU,1.0,1.0,0.0
...,...,...,...,...
495,006ak000002sEZQAA2,1.0,2.0,1.0
496,006ak000002sEZRAA2,1.0,2.0,0.0
497,006ak000002sEZSAA2,2.0,1.0,1.0
498,006ak000002sEZTAA2,1.0,0.0,0.0


In [37]:
print('Inbound: ', calltype[calltype.CALLTYPE == 'Inbound']['CALLTYPE_COUNT'].mean())
print('Internal: ', calltype[calltype.CALLTYPE == 'Internal']['CALLTYPE_COUNT'].mean())
print('Outbound: ', calltype[calltype.CALLTYPE == 'Outbound']['CALLTYPE_COUNT'].mean())

Inbound:  1.5
Internal:  1.507462686567164
Outbound:  1.593846153846154


In [106]:
# CALLDURATIONINSECONDS
calldurationinseconds = task.groupby(['WHATID', 'TASKSUBTYPE'])[['CALLDURATIONINSECONDS']].sum().reset_index()
calldurationinseconds.columns = ['WHATID', 'TASKSUBTYPE', 'TOTAL_CALLDURATIONINSECONDS']
calldurationinseconds = calldurationinseconds[calldurationinseconds['TASKSUBTYPE'] == 'Call'][['WHATID', 'TOTAL_CALLDURATIONINSECONDS']]
calldurationinseconds.TOTAL_CALLDURATIONINSECONDS = calldurationinseconds.TOTAL_CALLDURATIONINSECONDS.fillna(0.0)
calldurationinseconds

Unnamed: 0,WHATID,TOTAL_CALLDURATIONINSECONDS
0,006ak000002sAjJAAU,515
2,006ak000002sAjKAAU,621
4,006ak000002sAjLAAU,30
6,006ak000002sAjMAAU,564
13,006ak000002sAjQAAU,90
...,...,...
886,006ak000002sEZGAA2,256
888,006ak000002sEZHAA2,350
891,006ak000002sEZIAA2,143
903,006ak000002sEZQAA2,84


In [80]:
# ISHIGHPRIORITY
priority_task = task.groupby(['WHATID', 'ISHIGHPRIORITY'])['STATUS'].count().reset_index()
priority_task.ISHIGHPRIORITY = priority_task.ISHIGHPRIORITY.replace({False: 'OTHER_PRIORITY', True: 'HIGH_PRIORITY'})
priority_task = priority_task.pivot(index='WHATID', columns='ISHIGHPRIORITY', values='STATUS').reset_index()
priority_task.columns.name = None
priority_task.HIGH_PRIORITY = priority_task.HIGH_PRIORITY.fillna(0.0)
priority_task.OTHER_PRIORITY = priority_task.OTHER_PRIORITY.fillna(0.0)
priority_task

Unnamed: 0,WHATID,HIGH_PRIORITY,OTHER_PRIORITY
0,006ak000002sAjJAAU,3.0,3.0
1,006ak000002sAjKAAU,2.0,3.0
2,006ak000002sAjLAAU,1.0,2.0
3,006ak000002sAjMAAU,0.0,4.0
4,006ak000002sAjNAAU,1.0,1.0
...,...,...,...
495,006ak000002sEZQAA2,2.0,2.0
496,006ak000002sEZRAA2,1.0,2.0
497,006ak000002sEZSAA2,1.0,3.0
498,006ak000002sEZTAA2,1.0,0.0


In [81]:
# PRIORITY
priority = task.groupby(['WHATID', 'PRIORITY'])['STATUS'].count().reset_index()
priority = priority.pivot(index='WHATID', columns='PRIORITY', values='STATUS').reset_index()
priority.columns.name = None
priority.High = priority.High.fillna(0.0)
priority.Normal = priority.Normal.fillna(0.0)
priority.Low = priority.Low.fillna(0.0)
priority.columns = [col.upper() for col in priority.columns]
priority

Unnamed: 0,WHATID,HIGH,LOW,NORMAL
0,006ak000002sAjJAAU,3.0,1.0,2.0
1,006ak000002sAjKAAU,2.0,0.0,3.0
2,006ak000002sAjLAAU,1.0,1.0,1.0
3,006ak000002sAjMAAU,0.0,1.0,3.0
4,006ak000002sAjNAAU,1.0,0.0,1.0
...,...,...,...,...
495,006ak000002sEZQAA2,2.0,2.0,0.0
496,006ak000002sEZRAA2,1.0,0.0,2.0
497,006ak000002sEZSAA2,1.0,3.0,0.0
498,006ak000002sEZTAA2,1.0,0.0,0.0


In [86]:
# TASKSUBTYPE
tasksubtype = task.groupby(['WHATID', 'TASKSUBTYPE'])['STATUS'].count().reset_index()
tasksubtype = tasksubtype.pivot(index='WHATID', columns='TASKSUBTYPE', values='STATUS').reset_index()
tasksubtype.columns.name = None
tasksubtype.columns = [col.upper() for col in tasksubtype.columns]
tasksubtype.CALL = tasksubtype.CALL.fillna(0.0)
tasksubtype.EMAIL = tasksubtype.EMAIL.fillna(0.0)
tasksubtype.TASK = tasksubtype.TASK.fillna(0.0)
tasksubtype

Unnamed: 0,WHATID,CALL,EMAIL,TASK
0,006ak000002sAjJAAU,3.0,0.0,3.0
1,006ak000002sAjKAAU,4.0,1.0,0.0
2,006ak000002sAjLAAU,1.0,2.0,0.0
3,006ak000002sAjMAAU,1.0,3.0,0.0
4,006ak000002sAjNAAU,0.0,1.0,1.0
...,...,...,...,...
495,006ak000002sEZQAA2,1.0,3.0,0.0
496,006ak000002sEZRAA2,0.0,1.0,2.0
497,006ak000002sEZSAA2,2.0,2.0,0.0
498,006ak000002sEZTAA2,0.0,0.0,1.0


In [90]:
stage = opportunity[['ID', 'STAGENAME']].copy()
stage = stage[stage['STAGENAME'].isin(['Closed Lost', 'Closed Won'])].copy()
stage

Unnamed: 0,ID,STAGENAME
0,006ak000002sAjJAAU,Closed Won
1,006ak000002sAjKAAU,Closed Won
2,006ak000002sAjLAAU,Closed Lost
3,006ak000002sAjMAAU,Closed Lost
4,006ak000002sAjNAAU,Closed Lost
...,...,...
492,006ak000002sEZNAA2,Closed Lost
493,006ak000002sEZOAA2,Closed Lost
496,006ak000002sEZRAA2,Closed Lost
498,006ak000002sEZTAA2,Closed Lost


In [91]:
stage.STAGENAME.value_counts()

Closed Lost    287
Closed Won      66
Name: STAGENAME, dtype: int64

In [96]:
stage_calltype = stage.merge(calltype, left_on='ID', right_on='WHATID', how='left').drop(['ID', 'WHATID'], axis=1)

In [97]:
stage_calltype

Unnamed: 0,STAGENAME,INBOUND,INTERNAL,OUTBOUND
0,Closed Won,2.0,2.0,2.0
1,Closed Won,0.0,2.0,3.0
2,Closed Lost,0.0,0.0,3.0
3,Closed Lost,2.0,1.0,1.0
4,Closed Lost,1.0,1.0,0.0
...,...,...,...,...
348,Closed Lost,0.0,1.0,0.0
349,Closed Lost,1.0,0.0,1.0
350,Closed Lost,1.0,2.0,0.0
351,Closed Lost,1.0,0.0,0.0


In [102]:
# INBOUND
pd.pivot_table(stage_calltype, index="STAGENAME", values="INBOUND", aggfunc='mean').reset_index()

Unnamed: 0,STAGENAME,INBOUND
0,Closed Lost,0.808362
1,Closed Won,1.590909


In [103]:
# INTERNAL
pd.pivot_table(stage_calltype, index="STAGENAME", values="INTERNAL", aggfunc='mean').reset_index()

Unnamed: 0,STAGENAME,INTERNAL
0,Closed Lost,0.804878
1,Closed Won,1.378788


In [104]:
# OUTBOUND
pd.pivot_table(stage_calltype, index="STAGENAME", values="OUTBOUND", aggfunc='mean').reset_index()

Unnamed: 0,STAGENAME,OUTBOUND
0,Closed Lost,0.794425
1,Closed Won,1.712121


In [108]:
stage_calldurationinseconds = stage.merge(calldurationinseconds, left_on='ID', right_on='WHATID', how='left').drop(['ID', 'WHATID'], axis=1)
stage_calldurationinseconds.TOTAL_CALLDURATIONINSECONDS = stage_calldurationinseconds.TOTAL_CALLDURATIONINSECONDS.fillna(0.0)
stage_calldurationinseconds

Unnamed: 0,STAGENAME,TOTAL_CALLDURATIONINSECONDS
0,Closed Won,515.0
1,Closed Won,621.0
2,Closed Lost,30.0
3,Closed Lost,564.0
4,Closed Lost,0.0
...,...,...
348,Closed Lost,0.0
349,Closed Lost,0.0
350,Closed Lost,0.0
351,Closed Lost,0.0


In [110]:
# TOTAL_CALLDURATIONINSECONDS
pd.pivot_table(stage_calldurationinseconds, index="STAGENAME", values="TOTAL_CALLDURATIONINSECONDS", aggfunc='mean').reset_index()

Unnamed: 0,STAGENAME,TOTAL_CALLDURATIONINSECONDS
0,Closed Lost,52.066202
1,Closed Won,533.090909


In [112]:
stage_priority = stage.merge(priority, left_on='ID', right_on='WHATID', how='left').drop(['ID', 'WHATID'], axis=1)
stage_priority

Unnamed: 0,STAGENAME,HIGH,LOW,NORMAL
0,Closed Won,3.0,1.0,2.0
1,Closed Won,2.0,0.0,3.0
2,Closed Lost,1.0,1.0,1.0
3,Closed Lost,0.0,1.0,3.0
4,Closed Lost,1.0,0.0,1.0
...,...,...,...,...
348,Closed Lost,1.0,0.0,0.0
349,Closed Lost,0.0,0.0,2.0
350,Closed Lost,1.0,0.0,2.0
351,Closed Lost,1.0,0.0,0.0


In [113]:
# HIGH PRIORITY
pd.pivot_table(stage_priority, index="STAGENAME", values="HIGH", aggfunc='mean').reset_index()

Unnamed: 0,STAGENAME,HIGH
0,Closed Lost,0.923345
1,Closed Won,1.924242


In [114]:
# NORMAL PRIORITY
pd.pivot_table(stage_priority, index="STAGENAME", values="NORMAL", aggfunc='mean').reset_index()

Unnamed: 0,STAGENAME,NORMAL
0,Closed Lost,1.010453
1,Closed Won,1.954545


In [115]:
# LOW PRIORITY
pd.pivot_table(stage_priority, index="STAGENAME", values="LOW", aggfunc='mean').reset_index()

Unnamed: 0,STAGENAME,LOW
0,Closed Lost,0.473868
1,Closed Won,0.80303


In [116]:

stage_tasksubtype = stage.merge(tasksubtype, left_on='ID', right_on='WHATID', how='left').drop(['ID', 'WHATID'], axis=1)
stage_tasksubtype

Unnamed: 0,STAGENAME,CALL,EMAIL,TASK
0,Closed Won,3.0,0.0,3.0
1,Closed Won,4.0,1.0,0.0
2,Closed Lost,1.0,2.0,0.0
3,Closed Lost,1.0,3.0,0.0
4,Closed Lost,0.0,1.0,1.0
...,...,...,...,...
348,Closed Lost,0.0,1.0,0.0
349,Closed Lost,0.0,1.0,1.0
350,Closed Lost,0.0,1.0,2.0
351,Closed Lost,0.0,0.0,1.0


In [117]:
# CALL
pd.pivot_table(stage_tasksubtype, index="STAGENAME", values="CALL", aggfunc='mean').reset_index()

Unnamed: 0,STAGENAME,CALL
0,Closed Lost,0.28223
1,Closed Won,3.348485


In [118]:
# EMAIL
pd.pivot_table(stage_tasksubtype, index="STAGENAME", values="EMAIL", aggfunc='mean').reset_index()

Unnamed: 0,STAGENAME,EMAIL
0,Closed Lost,1.58885
1,Closed Won,0.439394


In [119]:
# TASK
pd.pivot_table(stage_tasksubtype, index="STAGENAME", values="TASK", aggfunc='mean').reset_index()

Unnamed: 0,STAGENAME,TASK
0,Closed Lost,0.536585
1,Closed Won,0.893939


In [121]:
calltype
calldurationinseconds
priority
tasksubtype

Unnamed: 0,WHATID,TOTAL_CALLDURATIONINSECONDS
0,006ak000002sAjJAAU,515
2,006ak000002sAjKAAU,621
4,006ak000002sAjLAAU,30
6,006ak000002sAjMAAU,564
13,006ak000002sAjQAAU,90
...,...,...
886,006ak000002sEZGAA2,256
888,006ak000002sEZHAA2,350
891,006ak000002sEZIAA2,143
903,006ak000002sEZQAA2,84


In [129]:
task_features = calltype.merge(calldurationinseconds, on='WHATID', how='outer').merge(priority, on='WHATID', how='outer').merge(tasksubtype, on='WHATID', how='outer')
task_features.columns = ['WHATID', 'NUM_INBOUND', 'NUM_INTERNAL', 'NUM_OUTBOUND', 'TOTAL_CALLDURATIONINSECONDS', 'NUM_PRIORITY_HIGH',
                         'NUM_PRIORITY_LOW', 'NUM_PRIORITY_NORMAL', 'NUM_CALL', 'NUM_EMAIL', 'NUM_TASK']
task_features.head()

Unnamed: 0,WHATID,NUM_INBOUND,NUM_INTERNAL,NUM_OUTBOUND,TOTAL_CALLDURATIONINSECONDS,NUM_PRIORITY_HIGH,NUM_PRIORITY_LOW,NUM_PRIORITY_NORMAL,NUM_CALL,NUM_EMAIL,NUM_TASK
0,006ak000002sAjJAAU,2.0,2.0,2.0,515.0,3.0,1.0,2.0,3.0,0.0,3.0
1,006ak000002sAjKAAU,0.0,2.0,3.0,621.0,2.0,0.0,3.0,4.0,1.0,0.0
2,006ak000002sAjLAAU,0.0,0.0,3.0,30.0,1.0,1.0,1.0,1.0,2.0,0.0
3,006ak000002sAjMAAU,2.0,1.0,1.0,564.0,0.0,1.0,3.0,1.0,3.0,0.0
4,006ak000002sAjNAAU,1.0,1.0,0.0,,1.0,0.0,1.0,0.0,1.0,1.0


In [130]:
task_features.to_csv('feature/task_features.csv', index=False)

In [134]:
# Let's see now account
account.head()

Unnamed: 0,HQ_LOCATION__C,GROWTH_RATE__C,TOTAL_FUNDING_TO_DATE__C,YEARSTARTED,ACCOUNTSOURCE,ANNUALREVENUE,RATING,NUMBEROFEMPLOYEES,OWNERSHIP,INDUSTRY,TYPE,NAME,OWNER_INTENT_TO_SELL__C,ID
0,Florida,28.0,12618042.5,2024,Incubator/Accelerator Programs,9210250.0,Hot,47,Private,Media,Startup,Flores-Smith Enterprises,True,001ak00000IqvqPAAR
1,South Dakota,240.0,36166195.73,2018,Incubator/Accelerator Programs,27607783.0,Cold,139,Public,Technology,Growth Stage,"Case, Holden and Shepard Holdings",False,001ak00000IqvqQAAR
2,Alaska,181.0,18847734.44,2016,Direct Referrals,22985042.0,Cold,115,Public,Recreation,Growth Stage,"Griffin, Rivers and Higgins Enterprises",False,001ak00000IqvqRAAR
3,Indiana,82.0,5496170.0,2022,Networking Events,5496170.0,Warm,28,Subsidiary,Healthcare,Startup,Meza Inc Corporation,False,001ak00000IqvqSAAR
4,New Hampshire,32.0,28215066.73,2015,Business Brokers,47822147.0,Warm,240,Private,Finance,Growth Stage,Hansen Inc Holdings,True,001ak00000IqvqTAAR


In [135]:
account['ACCOUNTSOURCE'].value_counts()

Business Brokers                  100
Direct Referrals                   92
Industry Conferences               87
Incubator/Accelerator Programs     76
Networking Events                  76
Investment Banks                   69
Name: ACCOUNTSOURCE, dtype: int64

In [136]:
account.RATING.value_counts()

Warm    173
Cold    169
Hot     158
Name: RATING, dtype: int64

In [138]:
account.OWNERSHIP.value_counts()

Public        169
Subsidiary    166
Private       165
Name: OWNERSHIP, dtype: int64

In [139]:
account.INDUSTRY.value_counts()

Media                 25
Utilities             25
Finance               24
Not For Profit        22
Transportation        20
Banking               20
Food & Beverage       19
Environmental         19
Telecommunications    17
Consulting            17
Shipping              17
Biotechnology         16
Recreation            15
Electronics           15
Education             15
Engineering           15
Retail                14
Technology            14
Hospitality           14
Other                 14
Entertainment         13
Healthcare            13
Government            13
Insurance             12
Manufacturing         12
Communications        12
Machinery             12
Apparel               12
Construction          11
Agriculture           11
Energy                11
Chemicals             11
Name: INDUSTRY, dtype: int64

In [140]:
account.TYPE.value_counts()

Established     271
Startup         161
Growth Stage     68
Name: TYPE, dtype: int64

In [141]:
account.OWNER_INTENT_TO_SELL__C.value_counts()

True     261
False    239
Name: OWNER_INTENT_TO_SELL__C, dtype: int64

In [154]:
stage_account = opportunity[['ACCOUNTID', 'STAGENAME']].copy()
stage_account = stage_account[stage_account['STAGENAME'].isin(['Closed Lost', 'Closed Won'])].copy()
stage_account

Unnamed: 0,ACCOUNTID,STAGENAME
0,001ak00000IrTIfAAN,Closed Won
1,001ak00000IrtTVAAZ,Closed Won
2,001ak00000IrtTWAAZ,Closed Lost
3,001ak00000IrtTXAAZ,Closed Lost
4,001ak00000IrtTYAAZ,Closed Lost
...,...,...
492,001ak00000IruJtAAJ,Closed Lost
493,001ak00000IruJuAAJ,Closed Lost
496,001ak00000IruJxAAJ,Closed Lost
498,001ak00000IruJzAAJ,Closed Lost


In [164]:
owner_intent_to_sell = account[['ID', 'OWNER_INTENT_TO_SELL__C']].copy()
owner_intent_to_sell = owner_intent_to_sell.merge(stage_account, left_on='ID', right_on='ACCOUNTID', how='right')
oits_stage = owner_intent_to_sell.drop(columns=['ID', 'ACCOUNTID'])
oits_stage

Unnamed: 0,OWNER_INTENT_TO_SELL__C,STAGENAME
0,False,Closed Won
1,True,Closed Won
2,False,Closed Lost
3,True,Closed Lost
4,False,Closed Lost
...,...,...
348,True,Closed Lost
349,True,Closed Lost
350,True,Closed Lost
351,True,Closed Lost


In [169]:
# Owner intent to sell is False
oits_stage[oits_stage['OWNER_INTENT_TO_SELL__C'] == False].STAGENAME.value_counts()

Closed Lost    134
Closed Won      28
Name: STAGENAME, dtype: int64

In [170]:
# Owner intent to sell is True
oits_stage[oits_stage['OWNER_INTENT_TO_SELL__C'] == True].STAGENAME.value_counts()

Closed Lost    153
Closed Won      38
Name: STAGENAME, dtype: int64

In [171]:
# Let's see together with the opportunity name.
opportunity.head()

Unnamed: 0,ID,TRACKINGNUMBER__C,FORECASTCATEGORYNAME,FORECASTCATEGORY,ISWON,ISCLOSED,LEADSOURCE,TYPE,EXPECTEDREVENUE,PROBABILITY,AMOUNT,STAGENAME,NAME,ACCOUNTID
0,006ak000002sAjJAAU,True,Closed,Closed,True,True,Investment Banks,Existing Business,9803283.0,100.0,9803283.0,Closed Won,Investment,001ak00000IrTIfAAN
1,006ak000002sAjKAAU,True,Closed,Closed,True,True,Direct Referrals,Existing Business,769708300.0,100.0,769708300.0,Closed Won,Acquisition,001ak00000IrtTVAAZ
2,006ak000002sAjLAAU,True,Omitted,Omitted,False,True,Business Brokers,New Business,0.0,0.0,15345610.0,Closed Lost,Acquisition,001ak00000IrtTWAAZ
3,006ak000002sAjMAAU,True,Omitted,Omitted,False,True,Industry Conferences,New Business,0.0,0.0,14571650.0,Closed Lost,Acquisition,001ak00000IrtTXAAZ
4,006ak000002sAjNAAU,True,Omitted,Omitted,False,True,Direct Referrals,Existing Business,0.0,0.0,53095440.0,Closed Lost,Investment,001ak00000IrtTYAAZ


In [174]:
stage_account_name = opportunity[['ACCOUNTID', 'NAME']].copy()
stage_account_name

Unnamed: 0,ACCOUNTID,NAME
0,001ak00000IrTIfAAN,Investment
1,001ak00000IrtTVAAZ,Acquisition
2,001ak00000IrtTWAAZ,Acquisition
3,001ak00000IrtTXAAZ,Acquisition
4,001ak00000IrtTYAAZ,Investment
...,...,...
495,001ak00000IruJwAAJ,Investment
496,001ak00000IruJxAAJ,Acquisition
497,001ak00000IruJyAAJ,Acquisition
498,001ak00000IruJzAAJ,Acquisition


In [175]:
owner_intent_to_sell = account[['ID', 'OWNER_INTENT_TO_SELL__C']].copy()
owner_intent_to_sell = owner_intent_to_sell.merge(stage_account_name, left_on='ID', right_on='ACCOUNTID', how='right')
oits_stage = owner_intent_to_sell.drop(columns=['ID', 'ACCOUNTID'])
oits_stage

Unnamed: 0,OWNER_INTENT_TO_SELL__C,NAME
0,False,Investment
1,True,Acquisition
2,False,Acquisition
3,True,Acquisition
4,False,Investment
...,...,...
495,False,Investment
496,True,Acquisition
497,False,Acquisition
498,True,Acquisition


In [176]:
# Owner intent to sell is False
oits_stage[oits_stage['OWNER_INTENT_TO_SELL__C'] == False].NAME.value_counts()

Investment     183
Acquisition     56
Name: NAME, dtype: int64

In [177]:
# Owner intent to sell is True
oits_stage[oits_stage['OWNER_INTENT_TO_SELL__C'] == True].NAME.value_counts()

Acquisition    184
Investment      77
Name: NAME, dtype: int64

In [178]:
# Now let's crossed together
stage_account = opportunity[['ACCOUNTID', 'STAGENAME', 'NAME']].copy()
stage_account = stage_account[stage_account['STAGENAME'].isin(['Closed Lost', 'Closed Won'])].copy()
stage_account

Unnamed: 0,ACCOUNTID,STAGENAME,NAME
0,001ak00000IrTIfAAN,Closed Won,Investment
1,001ak00000IrtTVAAZ,Closed Won,Acquisition
2,001ak00000IrtTWAAZ,Closed Lost,Acquisition
3,001ak00000IrtTXAAZ,Closed Lost,Acquisition
4,001ak00000IrtTYAAZ,Closed Lost,Investment
...,...,...,...
492,001ak00000IruJtAAJ,Closed Lost,Acquisition
493,001ak00000IruJuAAJ,Closed Lost,Investment
496,001ak00000IruJxAAJ,Closed Lost,Acquisition
498,001ak00000IruJzAAJ,Closed Lost,Acquisition


In [179]:
owner_intent_to_sell = account[['ID', 'OWNER_INTENT_TO_SELL__C']].copy()
owner_intent_to_sell = owner_intent_to_sell.merge(stage_account, left_on='ID', right_on='ACCOUNTID', how='right')
oits_stage = owner_intent_to_sell.drop(columns=['ID', 'ACCOUNTID'])
oits_stage

Unnamed: 0,OWNER_INTENT_TO_SELL__C,STAGENAME,NAME
0,False,Closed Won,Investment
1,True,Closed Won,Acquisition
2,False,Closed Lost,Acquisition
3,True,Closed Lost,Acquisition
4,False,Closed Lost,Investment
...,...,...,...
348,True,Closed Lost,Acquisition
349,True,Closed Lost,Investment
350,True,Closed Lost,Acquisition
351,True,Closed Lost,Acquisition


In [180]:
# Owner intent to sell is False and Acquisition
oits_stage[(oits_stage['OWNER_INTENT_TO_SELL__C'] == False) & (oits_stage['NAME'] == 'Acquisition')].STAGENAME.value_counts()

Closed Lost    37
Closed Won      3
Name: STAGENAME, dtype: int64

In [181]:
# Owner intent to sell is True and Acquisition
oits_stage[(oits_stage['OWNER_INTENT_TO_SELL__C'] == True) & (oits_stage['NAME'] == 'Acquisition')].STAGENAME.value_counts()

Closed Lost    111
Closed Won      28
Name: STAGENAME, dtype: int64

In [187]:
# Owner intent to sell is False and Investment
oits_stage[(oits_stage['OWNER_INTENT_TO_SELL__C'] == False) & (oits_stage['NAME'] == 'Investment')].STAGENAME.value_counts()

Closed Lost    97
Closed Won     25
Name: STAGENAME, dtype: int64

In [188]:
# Owner intent to sell is True and Investment
oits_stage[(oits_stage['OWNER_INTENT_TO_SELL__C'] == True) & (oits_stage['NAME'] == 'Investment')].STAGENAME.value_counts()

Closed Lost    42
Closed Won     10
Name: STAGENAME, dtype: int64

In [195]:
account.RATING.value_counts()

Warm    173
Cold    169
Hot     158
Name: RATING, dtype: int64

In [202]:
stage_rating = opportunity[['ACCOUNTID', 'STAGENAME']].copy()
stage_rating = stage_rating[stage_rating['STAGENAME'].isin(['Closed Lost', 'Closed Won'])].copy()
account_rating = account[['ID', 'RATING']].copy()
account_rating = account_rating.merge(stage_rating, left_on='ID', right_on='ACCOUNTID', how='right')
account_rating = account_rating.drop(columns=['ID', 'ACCOUNTID'])
account_rating

Unnamed: 0,RATING,STAGENAME
0,Warm,Closed Won
1,Cold,Closed Won
2,Warm,Closed Lost
3,Hot,Closed Lost
4,Cold,Closed Lost
...,...,...
348,Warm,Closed Lost
349,Warm,Closed Lost
350,Warm,Closed Lost
351,Hot,Closed Lost


In [204]:
# Warm Rating
account_rating[account_rating['RATING'] == 'Warm'].STAGENAME.value_counts()

Closed Lost    96
Closed Won     29
Name: STAGENAME, dtype: int64

In [205]:
# Cold Rating
account_rating[account_rating['RATING'] == 'Cold'].STAGENAME.value_counts()

Closed Lost    99
Closed Won     16
Name: STAGENAME, dtype: int64

In [206]:
# Hot Rating
account_rating[account_rating['RATING'] == 'Hot'].STAGENAME.value_counts()

Closed Lost    92
Closed Won     21
Name: STAGENAME, dtype: int64

In [208]:
account.head()

Unnamed: 0,HQ_LOCATION__C,GROWTH_RATE__C,TOTAL_FUNDING_TO_DATE__C,YEARSTARTED,ACCOUNTSOURCE,ANNUALREVENUE,RATING,NUMBEROFEMPLOYEES,OWNERSHIP,INDUSTRY,TYPE,NAME,OWNER_INTENT_TO_SELL__C,ID
0,Florida,28.0,12618042.5,2024,Incubator/Accelerator Programs,9210250.0,Hot,47,Private,Media,Startup,Flores-Smith Enterprises,True,001ak00000IqvqPAAR
1,South Dakota,240.0,36166195.73,2018,Incubator/Accelerator Programs,27607783.0,Cold,139,Public,Technology,Growth Stage,"Case, Holden and Shepard Holdings",False,001ak00000IqvqQAAR
2,Alaska,181.0,18847734.44,2016,Direct Referrals,22985042.0,Cold,115,Public,Recreation,Growth Stage,"Griffin, Rivers and Higgins Enterprises",False,001ak00000IqvqRAAR
3,Indiana,82.0,5496170.0,2022,Networking Events,5496170.0,Warm,28,Subsidiary,Healthcare,Startup,Meza Inc Corporation,False,001ak00000IqvqSAAR
4,New Hampshire,32.0,28215066.73,2015,Business Brokers,47822147.0,Warm,240,Private,Finance,Growth Stage,Hansen Inc Holdings,True,001ak00000IqvqTAAR


In [210]:
contact.head()

Unnamed: 0,MAILINGSTATE,DEPARTMENT,ID,TITLE,NAME,ACCOUNTID
0,,Finance,003ak000002hSkgAAE,CFO,Sean Forbes,001ak00000IADT3AAP
1,,Finance,003ak000002hSkwAAE,CFO,Avi Green,001ak00000IADT8AAP
2,West Virginia,Other,003ak000002nqKoAAI,VP,Kristy Huffman,001ak00000IrTIfAAN
3,Tennessee,Finance,003ak000002nqKpAAI,Finance Manager,Bethany Frye,001ak00000IrtTVAAZ
4,Louisiana,Finance,003ak000002nqKqAAI,Finance Manager,Lawrence Powers,001ak00000IrtTWAAZ


In [211]:
contact.DEPARTMENT.value_counts()

Finance            138
Chief Executive    127
Legal              120
Other              117
Name: DEPARTMENT, dtype: int64

In [212]:
contact.TITLE.value_counts()

COO                    75
CFO                    67
Executive              63
CEO                    62
Finance Manager        60
Director or Manager    60
CTO                    59
VP                     56
Name: TITLE, dtype: int64

In [213]:
opportunity.head()

Unnamed: 0,ID,TRACKINGNUMBER__C,FORECASTCATEGORYNAME,FORECASTCATEGORY,ISWON,ISCLOSED,LEADSOURCE,TYPE,EXPECTEDREVENUE,PROBABILITY,AMOUNT,STAGENAME,NAME,ACCOUNTID
0,006ak000002sAjJAAU,True,Closed,Closed,True,True,Investment Banks,Existing Business,9803283.0,100.0,9803283.0,Closed Won,Investment,001ak00000IrTIfAAN
1,006ak000002sAjKAAU,True,Closed,Closed,True,True,Direct Referrals,Existing Business,769708300.0,100.0,769708300.0,Closed Won,Acquisition,001ak00000IrtTVAAZ
2,006ak000002sAjLAAU,True,Omitted,Omitted,False,True,Business Brokers,New Business,0.0,0.0,15345610.0,Closed Lost,Acquisition,001ak00000IrtTWAAZ
3,006ak000002sAjMAAU,True,Omitted,Omitted,False,True,Industry Conferences,New Business,0.0,0.0,14571650.0,Closed Lost,Acquisition,001ak00000IrtTXAAZ
4,006ak000002sAjNAAU,True,Omitted,Omitted,False,True,Direct Referrals,Existing Business,0.0,0.0,53095440.0,Closed Lost,Investment,001ak00000IrtTYAAZ


In [215]:
opportunity.columns

Index(['ID', 'TRACKINGNUMBER__C', 'FORECASTCATEGORYNAME', 'FORECASTCATEGORY',
       'ISWON', 'ISCLOSED', 'LEADSOURCE', 'TYPE', 'EXPECTEDREVENUE',
       'PROBABILITY', 'AMOUNT', 'STAGENAME', 'NAME', 'ACCOUNTID'],
      dtype='object')