In [None]:
!pip install snowflake-connector-python pandas
!pip install --upgrade snowflake-connector-python

Collecting snowflake-connector-python
  Using cached snowflake_connector_python-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
Collecting asn1crypto<2.0.0,>0.24.0
  Using cached asn1crypto-1.5.1-py2.py3-none-any.whl (105 kB)
Collecting pyOpenSSL<25.0.0,>=16.2.0
  Using cached pyOpenSSL-24.1.0-py3-none-any.whl (56 kB)
Collecting sortedcontainers>=2.4.0
  Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
Collecting tomlkit
  Using cached tomlkit-0.12.5-py3-none-any.whl (37 kB)
Collecting platformdirs<5.0.0,>=2.6.0
  Using cached platformdirs-4.2.2-py3-none-any.whl (18 kB)
Collecting cryptography<43.0.0,>=3.1.0
  Using cached cryptography-42.0.7-cp39-abi3-manylinux_2_28_x86_64.whl (3.8 MB)
Installing collected packages: sortedcontainers, asn1crypto, tomlkit, platformdirs, cryptography, pyOpenSSL, snowflake-connector-python
  Attempting uninstall: platformdirs
    Found existing installation: platformdirs 2.5.2
    Not uninstallin

In [None]:
import os

import numpy as np
import pandas as pd
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

In [None]:
# Snowflake connection settings, keyed by the connect() argument they feed.
# Each value is read from a SNOWFLAKE_<KEY> environment variable (for example
# SNOWFLAKE_PASSWORD) so real credentials never have to be saved in the
# notebook; the masked placeholder remains the fallback when a variable is unset.
snowflake_options = {
    key: os.environ.get(f"SNOWFLAKE_{key.upper()}", "XXXXXXXXXXXXXXXX")
    for key in ("account", "user", "password", "database", "schema", "warehouse")
}

In [None]:
# Open the Snowflake session used to read the source tables; only the six
# settings below are forwarded to connect().
ctx = snowflake.connector.connect(
    **{
        option: snowflake_options[option]
        for option in ('user', 'password', 'account', 'warehouse', 'database', 'schema')
    }
)

# Fully qualified queries for the two tables this notebook reads.
opportunity_query = "select * from salesforce_db.pre_data.opportunity"
task_query = "select * from salesforce_db.pre_data.task"

In [None]:
def get_table(ctx, query):
    """Execute ``query`` on ``ctx`` and return what ``fetch_pandas_all()`` yields.

    Args:
        ctx: Open connection object exposing ``cursor()``.
        query: SQL text to execute.

    Returns:
        The result of the cursor's ``fetch_pandas_all()`` call.

    The cursor is closed whether or not the query succeeds.
    """
    cursor = ctx.cursor()
    try:
        cursor.execute(query)
        return cursor.fetch_pandas_all()
    finally:
        # Runs on both the success and the error path.
        cursor.close()

In [None]:
# Load both source tables over the open connection (opportunity first, then task).
opportunity, task = [
    get_table(ctx, sql) for sql in (opportunity_query, task_query)
]

In [None]:
# Release the Snowflake session now that both tables have been fetched into pandas.
ctx.close()

In [None]:
# Count task IDs per (WHATID, CALLTYPE) and spread the call types into columns,
# giving one row per WHATID.
calltype = (
    task.groupby(['WHATID', 'CALLTYPE'])[['ID']]
    .count()
    .reset_index()
    .pivot(index='WHATID', columns='CALLTYPE', values='ID')
    .reset_index()
)
calltype.columns.name = None
calltype.columns = calltype.columns.str.upper()
# WHATID/call-type pairs that never occur come out of the pivot as NaN; count them as 0.
for call_kind in ('INBOUND', 'INTERNAL', 'OUTBOUND'):
    calltype[call_kind] = calltype[call_kind].fillna(0.0)

In [None]:
# Total call duration per WHATID, restricted to tasks whose TASKSUBTYPE is 'Call'.
# Built as one chain ending in .assign() so the fillna never writes into a
# filtered slice (the original pattern can raise SettingWithCopyWarning).
calldurationinseconds = (
    task.groupby(['WHATID', 'TASKSUBTYPE'])[['CALLDURATIONINSECONDS']]
    .sum()
    .reset_index()
    .rename(columns={'CALLDURATIONINSECONDS': 'TOTAL_CALLDURATIONINSECONDS'})
    .loc[
        lambda totals: totals['TASKSUBTYPE'] == 'Call',
        ['WHATID', 'TOTAL_CALLDURATIONINSECONDS'],
    ]
    .assign(
        TOTAL_CALLDURATIONINSECONDS=lambda totals: totals['TOTAL_CALLDURATIONINSECONDS'].fillna(0.0)
    )
)

In [None]:
# Count STATUS values per (WHATID, ISHIGHPRIORITY), relabel the boolean flag,
# and pivot so each WHATID has a HIGH_PRIORITY and an OTHER_PRIORITY count.
priority_task = (
    task.groupby(['WHATID', 'ISHIGHPRIORITY'])['STATUS']
    .count()
    .reset_index()
    .assign(
        ISHIGHPRIORITY=lambda counts: counts['ISHIGHPRIORITY'].replace(
            {False: 'OTHER_PRIORITY', True: 'HIGH_PRIORITY'}
        )
    )
    .pivot(index='WHATID', columns='ISHIGHPRIORITY', values='STATUS')
    .reset_index()
)
priority_task.columns.name = None
# Missing combinations are NaN after the pivot; count them as 0.
for flag_column in ('HIGH_PRIORITY', 'OTHER_PRIORITY'):
    priority_task[flag_column] = priority_task[flag_column].fillna(0.0)

In [None]:
# Count STATUS values per (WHATID, PRIORITY) and pivot the priority levels into columns.
priority = (
    task.groupby(['WHATID', 'PRIORITY'])['STATUS']
    .count()
    .reset_index()
    .pivot(index='WHATID', columns='PRIORITY', values='STATUS')
    .reset_index()
)
priority.columns.name = None
# Missing combinations are NaN after the pivot; count them as 0.
for level in ('High', 'Normal', 'Low'):
    priority[level] = priority[level].fillna(0.0)
# Upper-case the headers last, after the fills that use the original labels.
priority.columns = priority.columns.str.upper()

In [None]:
# Count STATUS values per (WHATID, TASKSUBTYPE) and pivot the subtypes into columns.
tasksubtype = (
    task.groupby(['WHATID', 'TASKSUBTYPE'])['STATUS']
    .count()
    .reset_index()
    .pivot(index='WHATID', columns='TASKSUBTYPE', values='STATUS')
    .reset_index()
)
tasksubtype.columns.name = None
tasksubtype.columns = tasksubtype.columns.str.upper()
# Missing combinations are NaN after the pivot; count them as 0.
for subtype in ('CALL', 'EMAIL', 'TASK'):
    tasksubtype[subtype] = tasksubtype[subtype].fillna(0.0)

In [None]:
# Combine the per-WHATID aggregates into one feature table.
# Columns are renamed by name rather than by position, so a change in the
# column order of an upstream pivot cannot silently mislabel a feature.
features = (
    calltype.merge(calldurationinseconds, on='WHATID', how='left')
    .merge(priority, on='WHATID', how='outer')
    .merge(tasksubtype, on='WHATID', how='outer')
    .rename(columns={
        'INBOUND': 'NUM_INBOUND',
        'INTERNAL': 'NUM_INTERNAL',
        'OUTBOUND': 'NUM_OUTBOUND',
        'HIGH': 'NUM_PRIORITY_HIGH',
        'LOW': 'NUM_PRIORITY_LOW',
        'NORMAL': 'NUM_PRIORITY_NORMAL',
        'CALL': 'NUM_CALL',
        'EMAIL': 'NUM_EMAIL',
        'TASK': 'NUM_TASK',
    })
)

expected_feature_columns = [
    'WHATID', 'NUM_INBOUND', 'NUM_INTERNAL', 'NUM_OUTBOUND', 'TOTAL_CALLDURATIONINSECONDS',
    'NUM_PRIORITY_HIGH', 'NUM_PRIORITY_LOW', 'NUM_PRIORITY_NORMAL', 'NUM_CALL', 'NUM_EMAIL', 'NUM_TASK',
]
# Fail loudly if the upstream pivots produced a different set or order of columns.
if list(features.columns) != expected_feature_columns:
    raise ValueError(
        f"Unexpected feature columns {list(features.columns)}; expected {expected_feature_columns}"
    )

# The outer merges add rows for WHATIDs missing from one of the inputs, leaving
# NaN in that input's columns. An absent aggregate means zero activity, so fill
# every numeric feature, not just the duration column.
numeric_feature_columns = expected_feature_columns[1:]
features[numeric_feature_columns] = features[numeric_feature_columns].fillna(0.0)

In [None]:
# Attach the target (STAGENAME) to each feature row by matching WHATID to the
# opportunity ID. The left merge keeps feature rows that have no matching
# opportunity (or whose STAGENAME is null); those rows carry no label and would
# otherwise reach the label encoder as a NaN target, so they are dropped.
dataset = (
    features.merge(opportunity[['ID', 'STAGENAME']], left_on='WHATID', right_on='ID', how='left')
    .drop(columns=['ID'])
    .dropna(subset=['STAGENAME'])
    .reset_index(drop=True)
)

In [None]:
def encode_categorical_columns(data, categorical_columns):
    """Label-encode the given columns and return the encoded frame with its encoders.

    Args:
        data: DataFrame containing the columns to encode. It is left unmodified;
            the encoding is applied to a copy.
        categorical_columns: Iterable of column names to replace with the codes
            produced by ``LabelEncoder.fit_transform``.

    Returns:
        A tuple ``(encoded, label_encoders)`` where ``encoded`` is the copy of
        ``data`` with the listed columns encoded, and ``label_encoders`` maps
        each column name to the ``LabelEncoder`` fitted on it.
    """
    # Work on a copy so re-running the calling cell never re-encodes data that
    # was already encoded in place.
    encoded = data.copy()
    label_encoders = {}

    for column in categorical_columns:
        encoder = LabelEncoder()
        encoded[column] = encoder.fit_transform(encoded[column])
        label_encoders[column] = encoder

    return encoded, label_encoders

In [None]:
# Encode the target column, then separate identifiers, target and model inputs.
dataset, label_encoders = encode_categorical_columns(dataset, categorical_columns=['STAGENAME'])
ids, trgt = dataset['WHATID'], dataset['STAGENAME']
ftrs = dataset.drop(columns=['WHATID', 'STAGENAME'])

In [None]:
# Split first, then standardise. The scaler is fitted on the training rows only
# and merely applied to the test rows, so no statistics from the held-out data
# leak into preprocessing. `ftrs` itself is left unscaled.
X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(
    ftrs, trgt, ids, test_size=0.3, random_state=42
)

scaler = StandardScaler()
# Wrap the scaled arrays back into DataFrames to keep column names and row index.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# Fit a logistic-regression classifier on the training split.
# max_iter=1000 sets the solver's iteration cap.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print each metric under its heading, in the same order as they were computed.
for heading, metric in (
    ("Accuracy:", accuracy),
    ("Classification Report:\n", report),
    ("Confusion Matrix:\n", conf_matrix),
):
    print(heading, metric)

Accuracy: 0.9037037037037037
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.91      0.87        47
           1       0.95      0.90      0.92        88

    accuracy                           0.90       135
   macro avg       0.89      0.91      0.90       135
weighted avg       0.91      0.90      0.90       135

Confusion Matrix:
 [[43  4]
 [ 9 79]]


In [None]:
# Take the first row of the fitted model's coefficient matrix and the matching
# feature names (the columns of the feature frame).
# NOTE(review): only row 0 is used; the report above shows two classes, so this is
# presumably the single coefficient row. Revisit if more classes appear.
coefficients = model.coef_[0]
feature_names = ftrs.columns

In [None]:
# One row per feature with its coefficient, largest coefficient first.
# The headers are upper-cased to FEATURE / COEFFICIENT after sorting.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    .sort_values(by='Coefficient', ascending=False)
    .rename(columns=str.upper)
)

In [None]:
# DDL that (re)creates the activities_features table with a FEATURE string
# column and a COEFFICIENT float column, matching feature_importance_df's columns.
feature_importance_query = """
    CREATE OR REPLACE TABLE activities_features (
        FEATURE STRING,
        COEFFICIENT FLOAT
    )
"""

In [None]:
# Switch the shared options to the 'feature_data' schema, then open a new
# connection with the same six settings as before.
snowflake_options['schema'] = 'feature_data'
ctx = snowflake.connector.connect(
    **{
        option: snowflake_options[option]
        for option in ('user', 'password', 'account', 'warehouse', 'database', 'schema')
    }
)

In [None]:
# Run the CREATE OR REPLACE TABLE statement held in feature_importance_query.
cursor = ctx.cursor()
cursor.execute(feature_importance_query)

<snowflake.connector.cursor.SnowflakeCursor at 0x7f11687bc130>

In [None]:
# Close the cursor used for the DDL statement, then its connection.
cursor.close()
ctx.close()

In [None]:
# Open a fresh connection (same settings, current schema option) for the upload step.
ctx = snowflake.connector.connect(
    **{
        option: snowflake_options[option]
        for option in ('user', 'password', 'account', 'warehouse', 'database', 'schema')
    }
)

In [None]:
def upload_to_snowflake(df, table_name, conn=None):
    """Write ``df`` to ``table_name`` with ``write_pandas`` and report the outcome.

    Args:
        df: DataFrame to upload.
        table_name: Name of the destination table, passed to ``write_pandas`` as-is.
        conn: Connection to upload through. Defaults to the notebook-level
            ``ctx`` so existing two-argument calls keep working; pass a
            connection explicitly to avoid depending on that global.

    Returns:
        Tuple ``(success, nchunks, nrows)`` taken from the first three items
        of the ``write_pandas`` result.
    """
    # Look up the global only when no connection is supplied.
    connection = ctx if conn is None else conn
    success, nchunks, nrows, _ = write_pandas(connection, df, table_name)
    return success, nchunks, nrows

In [None]:
# Replace the post-sort index with a fresh 0..n-1 index before uploading.
feature_importance_df = feature_importance_df.reset_index(drop=True)

In [None]:
# Upload the coefficient table to ACTIVITIES_FEATURES; the cell's value is the
# (success, nchunks, nrows) tuple returned by upload_to_snowflake.
upload_to_snowflake(feature_importance_df, 'ACTIVITIES_FEATURES')

(True, 1, 10)

In [None]:
# Close the connection that was opened for the upload.
ctx.close()