# Import Libraries

In [None]:
# pip install -q phonenumbers

In [None]:
# Snowpark for Python
from snowflake.snowpark import Session
from snowflake.snowpark.version import VERSION
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import DecimalType

# Snowpark ML
import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.metrics.correlation import correlation

from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.registry import model_registry
from snowflake.ml._internal.utils import identifier
from xgboost import XGBClassifier

# Data Science Libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from snowflake.ml.modeling.metrics import mean_absolute_percentage_error

# Misc
import json
import joblib
import cachetools

# Warning suppression
import warnings; warnings.simplefilter('ignore')

import phonenumbers

In [None]:
# Make a Snowpark connection.
# The connection parameters are read from 'connection.json'; the file is opened
# in a `with` block so the handle is closed as soon as the JSON has been parsed.
with open('connection.json') as connection_file:
    connection_parameters = json.load(connection_file)
session = Session.builder.configs(connection_parameters).create()
session.sql_simplifier_enabled = True

# Single round trip to fetch the user and server version for the summary below.
snowflake_environment = session.sql('SELECT current_user(), current_version()').collect()
snowpark_version = VERSION

# Current Environment Details
print('\nConnection Established with the following parameters:')
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))

In [None]:
# Name of the Snowflake table that holds the converted-only leads.
# Change this only if the table was given a different name in the data ingest notebook.
CONVERTEDONLY_LEADS_TABLE = 'CONVERTEDONLY'

# Fully qualified name: <current database>.<current schema>.<table>
input_tbl = "{}.{}.{}".format(
    session.get_current_database(),
    session.get_current_schema(),
    CONVERTEDONLY_LEADS_TABLE,
)

In [None]:
# First, we read in the data from a Snowflake table into a Snowpark DataFrame.
# `input_tbl` is the fully qualified table name built from the current
# database, schema and CONVERTEDONLY_LEADS_TABLE.
leads_df = session.table(input_tbl)

In [None]:
# Columns removed from the leads frame before feature engineering.
# 'CONVERTEDCONTACTID' used to be listed twice; each column now appears once.
columns_to_drop = [
    'SALUTATION',
    'TITLE',
    'FIRSTNAME',
    'LASTNAME',
    'WEBSITE',
    'CONVERTEDCONTACTID',
    'CONVERTEDDATE',
    'CONVERTEDORGANIZATIONID',
    'UPDATEDDATE',
    'EMPLOYEECOUNT',
    'LEADRATING',
    'FAX',
    'INDUSTRY',
    'OWNERUSERID',
    'RESPONSIBLEUSERID',
    'ADDRESSSTREET',
    'ADDRESSCITY',
    'ADDRESSSTATE',
    'ADDRESSPOSTCODE',
    'LASTACTIVITYDATE',
    'NEXTACTIVITYDATE',
    'VISIBLETO',
    'VISIBLETEAMID',
    'ORGANIZATIONNAME',
    'CREATEDUSERID',
    'IMAGEURL',
    'TAGS',
    'O_ID',
    'CONVERTED',
    'CONVERTEDOPPORTUNITYID',
]
leads_df = leads_df.drop(*columns_to_drop)


In [None]:
# Replace empty-string EMAIL values with a placeholder address.
# Only '' is replaced here; NULL values in EMAIL are left untouched.
email_placeholders = {'': "unknown@unknown.nl"}
leads_df = leads_df.na.replace(email_placeholders, subset=['EMAIL'])
# leads_df.write.mode('overwrite').save_as_table('tmp_leadsonly')
# NOTE(review): this preview prints raw lead rows, including e-mail addresses.
leads_df.show()

In [None]:
# List of identified private (consumer) email domains, all in lower case.
# NOTE(review): empty e-mails are replaced earlier with "unknown@unknown.nl",
# whose domain is not in this list ('unknown.com' is), so those rows end up
# classified as 'Business' -- confirm whether that is intended.
private_domains = [
    'gmail.com', 'hotmail.com', 'orange.fr', 'wanadoo.fr', 'hotmail.fr',
    'yahoo.com', 'outlook.com', 'hotmail.co.uk', 'me.com', 'gmx.de',
    'icloud.com', 'yahoo.it', 'libero.it', 'web.de', 'kpnmail.nl',
    'yahoo.de', 'yahoo.fr', 'free.fr', 'telenet.be', 'live.fr',
    'otenet.gr', 'mac.com', 'yahoo.co.uk', 'laposte.net', 'uol.com.br',
    'casema.nl', 'aol.com', 't-online.de', 'unknown.com', 'yahoo.in',
    'gmx.fr', 'mail.com', 'mail.ru', 'live.it', 'msn.com',
    'yahoo.com.sg', 'hotmail.it', 'googlemail.com', 'hotmail.nl', 'ziggo.nl'
]

# Extract the domain (everything after the '@') from the 'Email' column.
# Domain names are case-insensitive, so the extracted value is lower-cased;
# otherwise 'User@Gmail.com' would not match 'gmail.com' in the list above.
leads_df = leads_df.withColumn(
    "Email_Domain",
    F.lower(F.regexp_extract(F.col("Email"), '@([a-zA-Z0-9.-]+)$', 1))
)

# leads_df.write.mode('overwrite').save_as_table('tmp_leadsonly')
# leads_df.show()


In [None]:
# Label every lead's email as 'Private' when its domain is in
# `private_domains`, and as 'Business' otherwise. The helper domain column
# is dropped once the label has been derived.
is_private_domain = F.col("Email_Domain").isin(private_domains)
email_type = F.when(is_private_domain, "Private").otherwise("Business")

leads_df = leads_df.withColumn("Email_Type", email_type)
leads_df = leads_df.drop('EMAIL_DOMAIN')
# leads_df.write.mode('overwrite').save_as_table('tmp_leadsonly')

In [None]:
# Replace the free-text 'LEADDESCRIPTION' column with its character length,
# stored in 'LENGTH_LEADDESCRIPTION'.
description_length = F.length(F.col("LEADDESCRIPTION"))
leads_df = (
    leads_df
    .withColumn("LENGTH_LEADDESCRIPTION", description_length)
    .drop('LEADDESCRIPTION')
)
# leads_df.write.mode('overwrite').save_as_table('tmp_leadsonly')


In [None]:
# Min-max scale LENGTH_LEADDESCRIPTION into the new LEN_LEADDESC_NORM column.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split that happens later in the notebook.
snowml_mms = snowml.MinMaxScaler(
    input_cols=["LENGTH_LEADDESCRIPTION"],
    output_cols=["LEN_LEADDESC_NORM"],
)
snowml_mms.fit(leads_df)
leads_df = snowml_mms.transform(leads_df)

# Reduce the number of decimals by casting to DECIMAL(7, 6)
new_col = leads_df["LEN_LEADDESC_NORM"].cast(DecimalType(7, 6))
leads_df = leads_df.with_column("LEN_LEADDESC_NORM", new_col)

# The raw length is no longer needed once the scaled column exists
leads_df = leads_df.drop('LENGTH_LEADDESCRIPTION')
# leads_df.show()

In [None]:
# Build 'PHONENUMBER': take 'MOBILE' when it is filled in, otherwise fall
# back to 'PHONE' (MOBILE counts as missing when it is NULL or '').
mobile_is_missing = F.col("MOBILE").isNull() | (F.col("MOBILE") == '')
preferred_number = F.when(mobile_is_missing, F.col("PHONE")).otherwise(F.col("MOBILE"))

leads_df = leads_df.withColumn("PHONENUMBER", preferred_number)
# leads_df.write.mode('overwrite').save_as_table('TMP_LEADSONLY')

In [None]:
# Creating a new column 'COUNTRYCODE' based on 'PHONENUMBER'

def get_country_code(phone_number, default_region=None):
    """Return the international dialling prefix of a phone number.

    Args:
        phone_number: Phone number as a string. Without ``default_region`` it
            must be in international format (leading '+') to be parseable.
        default_region: Optional region code (for example 'NL') passed to
            ``phonenumbers.parse`` for numbers written in national format.
            Defaults to ``None``, which keeps the original behaviour.

    Returns:
        '+<country code>' for a number that parses and validates, or the
        placeholder '+00' for empty, non-string, unparseable or invalid input.
    """
    unknown_code = '+00'

    # Nothing to parse: bail out before touching the parser at all.
    if not isinstance(phone_number, str) or not phone_number.strip():
        return unknown_code

    try:
        parsed_number = phonenumbers.parse(phone_number, default_region)
    except phonenumbers.NumberParseException:
        # Best effort: an unparseable number maps to the placeholder. Only the
        # parser's own error is caught, so interrupts and programming errors
        # are no longer silently swallowed by a bare `except`.
        return unknown_code

    if phonenumbers.is_valid_number(parsed_number):
        return '+' + str(parsed_number.country_code)
    return unknown_code

# Pull the Snowpark DataFrame into pandas so the Python-only `phonenumbers`
# lookup can be applied row by row.
leads_df_pandas = leads_df.to_pandas()

# Missing phone numbers get the same '+00' placeholder that
# get_country_code returns for numbers it cannot resolve.
leads_df_pandas['COUNTRYCODE'] = leads_df_pandas['PHONENUMBER'].apply(
    lambda x: get_country_code(str(x)) if pd.notna(x) else '+00'
)

# overwrite=True makes this cell safe to re-run: by default write_pandas
# appends to an existing table, which would duplicate every lead each time
# the cell is executed.
session.write_pandas(
    leads_df_pandas,
    "TEMP_LEADS_TABLE",
    auto_create_table=True,
    overwrite=True,
)

# Read the data back into a Snowflake DataFrame
leads_df_snowflake = session.table("TEMP_LEADS_TABLE")

# Write the Snowflake DataFrame to a permanent table
# leads_df_snowflake.write.mode("overwrite").save_as_table("TMP_LEADSONLY")


In [None]:
# Treat the placeholder values '' and '-' in ADDRESSCOUNTRY as "Netherlands".
# NULL values in ADDRESSCOUNTRY are not affected by this replacement.
country_placeholders = ['', '-']
leads_df_snowflake = leads_df_snowflake.na.replace(
    country_placeholders,
    "Netherlands",
    subset=['ADDRESSCOUNTRY'],
)
# leads_df_snowflake.write.mode('overwrite').save_as_table('TMP_LEADSONLY')

In [None]:
# Split 'CreatedDate' into year, month, week, day-of-week and hour columns.

# Convert 'CreatedDate' to a timestamp using the specified text format
leads_df_snowflake = leads_df_snowflake.withColumn('CreatedDate', F.to_timestamp('CreatedDate', 'MM/DD/YYYY HH12:MI:SS AM'))
leads_df_snowflake = leads_df_snowflake.withColumn('CreatedYear', F.year('CreatedDate'))
leads_df_snowflake = leads_df_snowflake.withColumn('CreatedMonth', F.month('CreatedDate'))
leads_df_snowflake = leads_df_snowflake.withColumn('CreatedWeek', F.weekofyear('CreatedDate'))
leads_df_snowflake = leads_df_snowflake.withColumn('CreatedHour', F.hour('CreatedDate'))

# Day of week as Monday=1 ... Sunday=7.
# With the default WEEK_START setting Snowflake's DAYOFWEEK returns Sunday=0 ...
# Saturday=6. The previous adjustment `(day % 7) + 1` produced Sunday=1 ...
# Saturday=7, which contradicted the intended Monday=1 ... Sunday=7 numbering.
# Shifting by 6 before the modulo maps Sunday (0) to 7 and Monday (1) to 1; it
# also leaves an already ISO-numbered value (Monday=1 ... Sunday=7) unchanged.
raw_day_of_week = F.dayofweek('CreatedDate')
leads_df_snowflake = leads_df_snowflake.withColumn('CreatedDay', ((raw_day_of_week + 6) % 7) + 1)
# leads_df_snowflake.write.mode('overwrite').save_as_table('tmp_leadsonly')
# leads_df_snowflake.show()



In [None]:
# Drop the raw contact fields, the parsed date and the helper columns that
# are not carried forward into the encoding step.
columns_no_longer_needed = [
    'MOBILE', 'PHONE', 'PHONENUMBER',
    'LEADSTATUSID', 'EMAIL', 'COUNTRYCODE',
    'CUSTOMFIELDS', 'CREATEDYEAR', 'CREATEDDATE',
]
leads_df_snowflake = leads_df_snowflake.drop(*columns_no_longer_needed)
# leads_df_snowflake.write.mode('overwrite').save_as_table('TMP_LEADSONLY')


In [None]:
# Preview the remaining columns after the drop above.
leads_df_snowflake.show()

In [None]:
# One-hot encode the categorical columns. Each output name acts as a prefix
# for the generated indicator columns.
# NOTE(review): the encoder is fitted on the full data set, before the
# train/test split.
categorical_inputs = ["LEADSOURCEID", "ADDRESSCOUNTRY", "EMAIL_TYPE"]
encoded_prefixes = ["LS_", "AD_", "ET_"]

snowml_ohe = snowml.OneHotEncoder(input_cols=categorical_inputs, output_cols=encoded_prefixes)
snowml_ohe.fit(leads_df_snowflake)
leads_df_snowflake = snowml_ohe.transform(leads_df_snowflake)

# Show the resulting column names
np.array(leads_df_snowflake.columns)

In [None]:
# Remove the row identifier and the original categorical columns now that
# their one-hot encoded counterparts exist, then list what is left.
encoded_source_columns = ['ID', 'LEADSOURCEID', 'ADDRESSCOUNTRY', 'EMAIL_TYPE']
leads_df_snowflake = leads_df_snowflake.drop(*encoded_source_columns)
np.array(leads_df_snowflake.columns)
# transformed_leads_df.show()

In [None]:
# Column groups used for modeling.

# Numeric features fed to the model as-is
NUMERICAL_COLUMNS = [
    "CREATEDMONTH",
    "CREATEDWEEK",
    "CREATEDHOUR",
    "CREATEDDAY",
    "LEN_LEADDESC_NORM",
]

# Categorical source columns that are one-hot encoded, and their output names
ONE_HOT_ENCODED_COLUMNS = [
    "LEADSOURCEID",
    "ADDRESSCOUNTRY",
    "EMAIL_TYPE",
]
ONE_HOT_ENCODED_COLUMNS_OUTPUT = ["OHE"] * 3

# Target column and the name reserved for its prediction
LABEL_COLUMNS = ['O_OPPORTUNITYSTATE']
OUTPUT_COLUMNS = ['PREDICTED_OPPORTUNITYSTATE']


In [None]:
# 90/10 train/test split with a fixed seed
train_test_parts = leads_df_snowflake.random_split(weights=[0.9, 0.1], seed=0)
leads_df_snowflake_train, leads_df_snowflake_test = train_test_parts

# Run the train and test sets through the Pipeline object we defined earlier
# train_df = preprocessing_pipeline.fit(leads_df_snowflake_train).transform(leads_df_snowflake_train)
# test_df = preprocessing_pipeline.transform(leads_df_snowflake_test)

In [None]:
# List the column names of the training split.
np.array(leads_df_snowflake_train.columns)

In [None]:
# Collect the one-hot encoded feature columns of the training split.
# Snowflake wraps an identifier in double quotes only when it needs to, so the
# same prefix can show up as LS__... or "LS__...". The quotes are stripped
# first and the bare name is matched, which makes the selection independent of
# quoting. The previous check matched 'LS__' only unquoted and 'AD__'/'ET__'
# only quoted, and silently skipped any column in the other form.
one_hot_prefixes = ('LS__', 'AD__', 'ET__')
unquoted_train_columns = [col.replace('"', '') for col in leads_df_snowflake_train.columns]

one_hot_encoded_columns = [
    name for name in unquoted_train_columns
    if name.startswith(one_hot_prefixes)
]

# one_hot_encoded_columns = [col for col in leads_df_snowflake_train.columns if 'OHE_' in col]

In [None]:
# Print the object type of the test and train splits (in that order)
for split_frame in (leads_df_snowflake_test, leads_df_snowflake_train):
    print(type(split_frame))


In [None]:
# Train an XGBoost classifier on the numeric and one-hot encoded features.
feature_columns = NUMERICAL_COLUMNS + one_hot_encoded_columns

# Materialise both splits as pandas DataFrames. The isinstance guard keeps the
# cell re-runnable: after the first run these names already hold pandas
# frames, which have no .to_pandas() method.
if not isinstance(leads_df_snowflake_train, pd.DataFrame):
    leads_df_snowflake_train = leads_df_snowflake_train.to_pandas()
if not isinstance(leads_df_snowflake_test, pd.DataFrame):
    leads_df_snowflake_test = leads_df_snowflake_test.to_pandas()

# feature_columns holds quote-free names, so strip any double quotes from the
# column labels of BOTH frames before selecting by name. Previously only the
# test frame was normalised, and only after training.
leads_df_snowflake_train.columns = [name.replace('"', '') for name in leads_df_snowflake_train.columns]
leads_df_snowflake_test.columns = [name.replace('"', '') for name in leads_df_snowflake_test.columns]

X_train = leads_df_snowflake_train[feature_columns]
y_train = leads_df_snowflake_train['O_OPPORTUNITYSTATE']

xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train)

# Now fit the model
# XGBClassifier.fit(leads_df_snowflake_train[feature_columns], leads_df_snowflake_train['O_OPPORTUNITYSTATE'])

# Predict
# predictions = classifier.predict(leads_df_snowflake_test[feature_columns])


In [None]:
# Strip double quotes from the column labels of the pandas test frame so they
# line up with the quote-free names in `feature_columns`.
leads_df_snowflake_test.columns = leads_df_snowflake_test.columns.str.replace('"', '', regex=False)



In [None]:
# Predict the label for every row of the test split using the trained classifier.
predictions = xgb_classifier.predict(leads_df_snowflake_test[feature_columns])

In [None]:
# Get feature importance: one score per entry in `feature_columns`,
# in the same order as the columns the classifier was fitted on.
feature_importance = xgb_classifier.feature_importances_

In [None]:
# Pair each feature with its importance score and rank from most to least important
importance_table = {'Feature': feature_columns, 'Importance': feature_importance}
feature_importance_df = pd.DataFrame(importance_table).sort_values(by='Importance', ascending=False)

# Display feature importance
print(feature_importance_df)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Score the predictions against the true labels held in the
# 'O_OPPORTUNITYSTATE' column of the test split.
y_test = leads_df_snowflake_test['O_OPPORTUNITYSTATE']

# Scalar metrics, computed with the scorers' default settings
accuracy, precision, recall, f1 = (
    scorer(y_test, predictions)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, predictions)

# Display metrics
for metric_label, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_label}: {metric_value}')
print(f'Confusion Matrix:\n{conf_matrix}')


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# True labels and model predictions from the evaluation above
y_true = y_test
y_pred = predictions

# Generate the confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Draw it as an annotated heatmap with integer cell labels
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_title('Confusion Matrix')
plt.show()


In [None]:
# !pip install shap

In [None]:
import shap

# Build a SHAP explainer around the trained classifier
explainer = shap.Explainer(xgb_classifier)

# SHAP values for every row of the test features
test_features = leads_df_snowflake_test[feature_columns]
shap_values = explainer.shap_values(test_features)

# Summary plot of the per-feature effects across the test set
shap.summary_plot(shap_values, test_features)


In [None]:
# Print the first five test rows as JSON Lines (one record per line)
snippet = leads_df_snowflake_test.head(5)
snippet_json_lines = snippet.to_json(orient='records', lines=True)
print(snippet_json_lines)



In [None]:
# Load SHAP's JavaScript bundle into the notebook. The interactive force plot
# is rendered with JavaScript; without this call the output cell only shows a
# "Javascript library not loaded" notice instead of the plot.
shap.initjs()

# Choose a specific instance (here: the first row of the test set)
instance_to_explain = leads_df_snowflake_test[feature_columns].iloc[0]

# Calculate SHAP values for this instance
shap_values_instance = explainer.shap_values(instance_to_explain)

# Visualize the explanation of this single prediction. Kept as the last
# expression so the notebook displays the returned plot object.
shap.force_plot(explainer.expected_value, shap_values_instance, instance_to_explain)


In [None]:
# Close the Snowpark session opened at the top of the notebook.
session.close()