In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
account = pd.read_csv('data/account.csv')
contact = pd.read_csv('data/contact.csv')
opportunity = pd.read_csv('data/opportunity.csv')
task = pd.read_csv('data/task.csv')

In [None]:
opportunity.merge(task, left_on='ID', right_on='ACCOUNTID', how='left', suffixes=('_ACC', '_OPP'))

In [4]:
task.head()

Unnamed: 0,ID,WHOID,CALLDISPOSITION,CALLTYPE,CALLDURATIONINSECONDS,ISCLOSED,ACCOUNTID,ISHIGHPRIORITY,PRIORITY,STATUS,WHATID,TASKSUBTYPE
0,00Tak000001UtkrEAC,003ak000002s2rhAAA,Call Successful,Inbound,331,True,001ak00000JD8JNAA1,True,High,Completed,006ak000002wGT4AAM,Email
1,00Tak000001UtksEAC,003ak000002s2rhAAA,Call Successful,Inbound,430,True,001ak00000JD8JNAA1,False,Normal,Completed,006ak000002wGT4AAM,Email
2,00Tak000001UtktEAC,003ak000002s2riAAA,Interested - Follow Up Needed,Internal,110,False,001ak00000JD8JWAA1,True,High,Not Started,006ak000002wGT5AAM,Email
3,00Tak000001UtkuEAC,003ak000002s2riAAA,Call Successful,Internal,545,True,001ak00000JD8JWAA1,True,High,Completed,006ak000002wGT5AAM,Call
4,00Tak000001UtkvEAC,003ak000002s2riAAA,Interested - Follow Up Needed,Outbound,66,False,001ak00000JD8JWAA1,True,High,Not Started,006ak000002wGT5AAM,Call


In [5]:
calltype = task.groupby(['WHATID', 'CALLTYPE'])[['ID']].count().reset_index()
calltype = calltype.pivot(index='WHATID', columns='CALLTYPE', values='ID').reset_index()
calltype.columns.name = None
calltype.columns = [col.upper() for col in calltype.columns]
calltype.INBOUND = calltype.INBOUND.fillna(0.0)
calltype.INTERNAL = calltype.INTERNAL.fillna(0.0)
calltype.OUTBOUND = calltype.OUTBOUND.fillna(0.0)
calltype.head()

Unnamed: 0,WHATID,INBOUND,INTERNAL,OUTBOUND
0,006ak000002wDgUAAU,1.0,1.0,1.0
1,006ak000002wDgVAAU,1.0,0.0,3.0
2,006ak000002wDgWAAU,2.0,1.0,1.0
3,006ak000002wDgXAAU,1.0,1.0,0.0
4,006ak000002wDgYAAU,1.0,0.0,3.0


In [6]:
calldurationinseconds = task.groupby(['WHATID', 'TASKSUBTYPE'])[['CALLDURATIONINSECONDS']].sum().reset_index()
calldurationinseconds.columns = ['WHATID', 'TASKSUBTYPE', 'TOTAL_CALLDURATIONINSECONDS']
calldurationinseconds = calldurationinseconds[calldurationinseconds['TASKSUBTYPE'] == 'Call'][['WHATID', 'TOTAL_CALLDURATIONINSECONDS']]
calldurationinseconds.TOTAL_CALLDURATIONINSECONDS = calldurationinseconds.TOTAL_CALLDURATIONINSECONDS.fillna(0.0)
calldurationinseconds.head()

Unnamed: 0,WHATID,TOTAL_CALLDURATIONINSECONDS
2,006ak000002wDgVAAU,77
4,006ak000002wDgWAAU,201
8,006ak000002wDgYAAU,721
15,006ak000002wDgdAAE,104
18,006ak000002wDgeAAE,410


In [7]:
priority_task = task.groupby(['WHATID', 'ISHIGHPRIORITY'])['STATUS'].count().reset_index()
priority_task.ISHIGHPRIORITY = priority_task.ISHIGHPRIORITY.replace({False: 'OTHER_PRIORITY', True: 'HIGH_PRIORITY'})
priority_task = priority_task.pivot(index='WHATID', columns='ISHIGHPRIORITY', values='STATUS').reset_index()
priority_task.columns.name = None
priority_task.HIGH_PRIORITY = priority_task.HIGH_PRIORITY.fillna(0.0)
priority_task.OTHER_PRIORITY = priority_task.OTHER_PRIORITY.fillna(0.0)
priority_task

Unnamed: 0,WHATID,HIGH_PRIORITY,OTHER_PRIORITY
0,006ak000002wDgUAAU,1.0,2.0
1,006ak000002wDgVAAU,1.0,3.0
2,006ak000002wDgWAAU,2.0,2.0
3,006ak000002wDgXAAU,1.0,1.0
4,006ak000002wDgYAAU,1.0,3.0
...,...,...,...
445,006ak000002wIASAA2,2.0,1.0
446,006ak000002wIATAA2,2.0,0.0
447,006ak000002wIAUAA2,1.0,1.0
448,006ak000002wIAVAA2,2.0,0.0


In [8]:
priority = task.groupby(['WHATID', 'PRIORITY'])['STATUS'].count().reset_index()
priority = priority.pivot(index='WHATID', columns='PRIORITY', values='STATUS').reset_index()
priority.columns.name = None
priority.High = priority.High.fillna(0.0)
priority.Normal = priority.Normal.fillna(0.0)
priority.Low = priority.Low.fillna(0.0)
priority.columns = [col.upper() for col in priority.columns]
priority.head()

Unnamed: 0,WHATID,HIGH,LOW,NORMAL
0,006ak000002wDgUAAU,1.0,0.0,2.0
1,006ak000002wDgVAAU,1.0,1.0,2.0
2,006ak000002wDgWAAU,2.0,1.0,1.0
3,006ak000002wDgXAAU,1.0,0.0,1.0
4,006ak000002wDgYAAU,1.0,2.0,1.0


In [9]:
tasksubtype = task.groupby(['WHATID', 'TASKSUBTYPE'])['STATUS'].count().reset_index()
tasksubtype = tasksubtype.pivot(index='WHATID', columns='TASKSUBTYPE', values='STATUS').reset_index()
tasksubtype.columns.name = None
tasksubtype.columns = [col.upper() for col in tasksubtype.columns]
tasksubtype.CALL = tasksubtype.CALL.fillna(0.0)
tasksubtype.EMAIL = tasksubtype.EMAIL.fillna(0.0)
tasksubtype.TASK = tasksubtype.TASK.fillna(0.0)
tasksubtype.head()

Unnamed: 0,WHATID,CALL,EMAIL,TASK
0,006ak000002wDgUAAU,0.0,1.0,2.0
1,006ak000002wDgVAAU,2.0,0.0,2.0
2,006ak000002wDgWAAU,3.0,0.0,1.0
3,006ak000002wDgXAAU,0.0,1.0,1.0
4,006ak000002wDgYAAU,3.0,0.0,1.0


In [10]:
len(calltype), len(calldurationinseconds), len(priority), len(tasksubtype)

(450, 294, 450, 450)

In [12]:
features = calltype.merge(calldurationinseconds, on='WHATID', how='left')
features = features.merge(priority, on='WHATID', how='outer')
features = features.merge(tasksubtype, on='WHATID', how='outer')
features.TOTAL_CALLDURATIONINSECONDS = features.TOTAL_CALLDURATIONINSECONDS.fillna(0.0)
features.columns = ['WHATID', 'NUM_INBOUND', 'NUM_INTERNAL', 'NUM_OUTBOUND', 'TOTAL_CALLDURATIONINSECONDS', 'NUM_PRIORITY_HIGH',
                         'NUM_PRIORITY_LOW', 'NUM_PRIORITY_NORMAL', 'NUM_CALL', 'NUM_EMAIL', 'NUM_TASK']
features.head()

Unnamed: 0,WHATID,NUM_INBOUND,NUM_INTERNAL,NUM_OUTBOUND,TOTAL_CALLDURATIONINSECONDS,NUM_PRIORITY_HIGH,NUM_PRIORITY_LOW,NUM_PRIORITY_NORMAL,NUM_CALL,NUM_EMAIL,NUM_TASK
0,006ak000002wDgUAAU,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0,2.0
1,006ak000002wDgVAAU,1.0,0.0,3.0,77.0,1.0,1.0,2.0,2.0,0.0,2.0
2,006ak000002wDgWAAU,2.0,1.0,1.0,201.0,2.0,1.0,1.0,3.0,0.0,1.0
3,006ak000002wDgXAAU,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
4,006ak000002wDgYAAU,1.0,0.0,3.0,721.0,1.0,2.0,1.0,3.0,0.0,1.0


In [18]:
dataset = features.merge(opportunity[['ID', 'STAGENAME']], left_on='WHATID', right_on='ID', how='left')
dataset.drop(columns=['ID'], inplace=True)
dataset.head()

Unnamed: 0,WHATID,NUM_INBOUND,NUM_INTERNAL,NUM_OUTBOUND,TOTAL_CALLDURATIONINSECONDS,NUM_PRIORITY_HIGH,NUM_PRIORITY_LOW,NUM_PRIORITY_NORMAL,NUM_CALL,NUM_EMAIL,NUM_TASK,STAGENAME
0,006ak000002wDgUAAU,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0,2.0,Closed Won
1,006ak000002wDgVAAU,1.0,0.0,3.0,77.0,1.0,1.0,2.0,2.0,0.0,2.0,Closed Won
2,006ak000002wDgWAAU,2.0,1.0,1.0,201.0,2.0,1.0,1.0,3.0,0.0,1.0,Closed Won
3,006ak000002wDgXAAU,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,Closed Won
4,006ak000002wDgYAAU,1.0,0.0,3.0,721.0,1.0,2.0,1.0,3.0,0.0,1.0,Closed Won


In [19]:
def encode_categorical_columns(data, categorical_columns):
    label_encoders = {}
    
    for column in categorical_columns:
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column])
        label_encoders[column] = le
    
    return data, label_encoders

In [20]:
dataset, label_encoders = encode_categorical_columns(dataset, categorical_columns=['STAGENAME'])

In [22]:
dataset.head()

Unnamed: 0,WHATID,NUM_INBOUND,NUM_INTERNAL,NUM_OUTBOUND,TOTAL_CALLDURATIONINSECONDS,NUM_PRIORITY_HIGH,NUM_PRIORITY_LOW,NUM_PRIORITY_NORMAL,NUM_CALL,NUM_EMAIL,NUM_TASK,STAGENAME
0,006ak000002wDgUAAU,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0,2.0,1
1,006ak000002wDgVAAU,1.0,0.0,3.0,77.0,1.0,1.0,2.0,2.0,0.0,2.0,1
2,006ak000002wDgWAAU,2.0,1.0,1.0,201.0,2.0,1.0,1.0,3.0,0.0,1.0,1
3,006ak000002wDgXAAU,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1
4,006ak000002wDgYAAU,1.0,0.0,3.0,721.0,1.0,2.0,1.0,3.0,0.0,1.0,1


In [36]:
ftrs = dataset.drop(columns=['WHATID', 'STAGENAME'])
trgt = dataset['STAGENAME']
ids = dataset['WHATID']

In [37]:
scaler = StandardScaler()
ftrs[ftrs.columns] = scaler.fit_transform(ftrs[ftrs.columns])

In [38]:
X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(ftrs, trgt, ids, test_size=0.3, random_state=42)

In [39]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [40]:
y_pred = model.predict(X_test)

In [41]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9037037037037037
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.91      0.87        47
           1       0.95      0.90      0.92        88

    accuracy                           0.90       135
   macro avg       0.89      0.91      0.90       135
weighted avg       0.91      0.90      0.90       135

Confusion Matrix:
 [[43  4]
 [ 9 79]]


In [42]:
coefficients = model.coef_[0]
feature_names = ftrs.columns

In [43]:
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
}).sort_values(by='Coefficient', ascending=False)

In [54]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
7,NUM_CALL,2.603132
4,NUM_PRIORITY_HIGH,1.457133
6,NUM_PRIORITY_NORMAL,1.238875
0,NUM_INBOUND,1.201453
2,NUM_OUTBOUND,1.162677
1,NUM_INTERNAL,1.133823
5,NUM_PRIORITY_LOW,0.737492
9,NUM_TASK,0.317993
3,TOTAL_CALLDURATIONINSECONDS,0.16057
8,NUM_EMAIL,-0.885882


In [45]:
recommender_dictionary = {
    "NUM_CALL": {
        "Pos": "Higher number of calls tend to be associated with a Closed Won outcome",
        "Neg": "Lower number of calls tend to be associated with a Closed Won outcome"
    },
    "NUM_PRIORITY_HIGH": {
        "Pos": "Higher number of task with priority high tend to be associated with a Closed Won outcome",
        "Neg": "Lower number of task with priority high tend to be associated with a Closed Won outcome"
    },
    "NUM_PRIORITY_NORMAL": {
        "Pos": "Higher number of task with priority normal tend to be associated with a Closed Won outcome",
        "Neg": "Lower number of task with priority normal tend to be associated with a Closed Won outcome"
    },
    "NUM_INBOUND": {
        "Pos": "Higher number of inbound call type tend to be associated with a Closed Won outcome",
        "Neg": "Lower number of inbound call type tend to be associated with a Closed Won outcome"
    },
    "NUM_OUTBOUND": {
        "Pos": "Higher number of outbound call type tend to be associated with a Closed Won outcome",
        "Neg": "Lower number of outbound call type tend to be associated with a Closed Won outcome"
    },
    "NUM_INTERNAL": {
        "Pos": "Higher number of internal call type tend to be associated with a Closed Won outcome",
        "Neg": "Lower number of internal call type tend to be associated with a Closed Won outcome"
    },
    "NUM_PRIORITY_LOW": {
        "Pos": "Higher number of task with priority low tend to be associated with a Closed Won outcome",
        "Neg": "Lower number of task with priority low tend to be associated with a Closed Won outcome"
    },
    "NUM_TASK": {
        "Pos": "Higher number of task tend to be associated with a Closed Won outcome",
        "Neg": "Lower number of task tend to be associated with a Closed Won outcome"
    },
    "TOTAL_CALLDURATIONINSECONDS": {
        "Pos": "Higher seconds in call durations tend to be associated with a Closed Won outcome",
        "Neg": "Lower seconds in call durations tend to be associated with a Closed Won outcome"
    },
    "NUM_EMAIL": {
        "Pos": "Higher number of emails tend to be associated with a Closed Won outcome",
        "Neg": "Lower number of emails tend to be associated with a Closed Won outcome"
    }
}

In [46]:
def print_feature_meaning(row):
    if row['Coefficient'] > 0:
        print(recommender_dictionary[row['Feature']]['Pos'])
    else:
        print(recommender_dictionary[row['Feature']]['Neg'])

In [47]:
feature_importance_df.apply(print_feature_meaning, axis=1)
None

Higher number of calls tend to be associated with a Closed Won outcome
Higher number of task with priority high tend to be associated with a Closed Won outcome
Higher number of task with priority normal tend to be associated with a Closed Won outcome
Higher number of inbound call type tend to be associated with a Closed Won outcome
Higher number of outbound call type tend to be associated with a Closed Won outcome
Higher number of internal call type tend to be associated with a Closed Won outcome
Higher number of task with priority low tend to be associated with a Closed Won outcome
Higher number of task tend to be associated with a Closed Won outcome
Higher seconds in call durations tend to be associated with a Closed Won outcome
Lower number of emails tend to be associated with a Closed Won outcome


In [50]:
pd.concat([feature_importance_df.head(2), feature_importance_df.tail(1)], axis=0).apply(print_feature_meaning, axis=1)
None

Higher number of calls tend to be associated with a Closed Won outcome
Higher number of task with priority high tend to be associated with a Closed Won outcome
Lower number of emails tend to be associated with a Closed Won outcome


In [52]:
features.to_csv("feature/features.csv", index=False)

In [70]:
filter_list = [['NUM_CALL'], ['NUM_PRIORITY_HIGH', 'NUM_PRIORITY_NORMAL', 'NUM_PRIORITY_LOW'],
               ['NUM_INBOUND', 'NUM_OUTBOUND', 'NUM_INTERNAL'], ['NUM_TASK'], ['NUM_EMAIL'],
               ['TOTAL_CALLDURATIONINSECONDS']]
keys = []
values = []
for filters in filter_list:
    feature_name = None
    feature_coeff = 0
    for filter in filters:
        filter_df = feature_importance_df[feature_importance_df.Feature == filter]
        if filter_df.Coefficient.values[0] >= feature_coeff:
            feature_name = filter_df.Feature.values[0]
            feature_coeff = filter_df.Coefficient.values[0]
    keys.append(feature_name)
    values.append(feature_coeff)

In [71]:
keys

['NUM_CALL',
 'NUM_PRIORITY_HIGH',
 'NUM_INBOUND',
 'NUM_TASK',
 None,
 'TOTAL_CALLDURATIONINSECONDS']

In [72]:
values

[2.603131940566835,
 1.457132644552604,
 1.2014525927348325,
 0.3179930429329765,
 0,
 0.16057024125577382]