In [None]:
!pip install snowflake-connector-python pandas
!pip install --upgrade snowflake-connector-python


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import os

import numpy as np
import pandas as pd
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

In [1]:
# Snowflake connection settings.
# Each value is read from an environment variable named SNOWFLAKE_<KEY>
# (e.g. SNOWFLAKE_ACCOUNT, SNOWFLAKE_PASSWORD) so that real credentials never
# have to be typed into, or saved with, the notebook. When a variable is not
# set the original placeholder is used, which keeps the dict shape unchanged.
_SNOWFLAKE_PLACEHOLDER = "XXXXXXXXXXXXXXXX"

snowflake_options = {
    key: os.environ.get(f"SNOWFLAKE_{key.upper()}", _SNOWFLAKE_PLACEHOLDER)
    for key in ("account", "user", "password", "database", "schema", "warehouse")
}

In [None]:
# Open the Snowflake session used to read the source tables.
# The keys of snowflake_options (account, user, password, database, schema,
# warehouse) are exactly the keyword names passed to connect(), so the dict
# is unpacked directly.
ctx = snowflake.connector.connect(**snowflake_options)

# Full-table reads of the four tables in salesforce_db.pre_data.
account_query     = "select * from salesforce_db.pre_data.account"
contact_query     = "select * from salesforce_db.pre_data.contact"
opportunity_query = "select * from salesforce_db.pre_data.opportunity"
task_query        = "select * from salesforce_db.pre_data.task"

In [None]:
def get_table(ctx, query):
    """Run ``query`` on the connection ``ctx`` and return the result as a DataFrame.

    Args:
        ctx: An open connection object exposing ``cursor()``.
        query: SQL text to execute.

    Returns:
        Whatever the cursor's ``fetch_pandas_all()`` returns for the query.

    The cursor is always closed, including when execution or fetching raises.
    """
    cursor = ctx.cursor()
    try:
        cursor.execute(query)
        return cursor.fetch_pandas_all()
    finally:
        cursor.close()

In [None]:
# Pull each source table into its own DataFrame over the open connection.
# NOTE(review): `contact` and `task` are loaded here but are not referenced
# again in the visible part of this notebook — confirm they are still needed.
account = get_table(ctx, account_query)
opportunity = get_table(ctx, opportunity_query)
contact = get_table(ctx, contact_query)
task = get_table(ctx, task_query)

In [None]:
# All source tables are in memory; release the read connection.
ctx.close()

In [None]:
def encode_categorical_columns(data, categorical_columns):
    """Label-encode the given columns and return the fitted encoders.

    Args:
        data: DataFrame containing every column named in ``categorical_columns``.
        categorical_columns: Iterable of column names to encode.

    Returns:
        tuple: ``(encoded, label_encoders)`` where ``encoded`` is a copy of
        ``data`` with each listed column replaced by its integer codes, and
        ``label_encoders`` maps column name -> the ``LabelEncoder`` fitted on
        that column.

    The input frame is left untouched: encoding happens on a copy, so passing
    a filtered slice of another frame neither rewrites the caller's data nor
    triggers pandas' SettingWithCopyWarning.
    """
    data = data.copy()
    label_encoders = {}

    for column in categorical_columns:
        # A fresh encoder per column, kept so codes can be mapped back later.
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column])
        label_encoders[column] = le

    return data, label_encoders

def drop_columns(data, columns_to_drop):
    """Return a new DataFrame without the columns named in ``columns_to_drop``."""
    return data.drop(labels=columns_to_drop, axis='columns')

def _make_dense_onehot_encoder():
    """Build a OneHotEncoder that returns dense arrays on old and new scikit-learn."""
    try:
        # Newer scikit-learn names the option `sparse_output`.
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older releases only accept the original `sparse` keyword.
        return OneHotEncoder(sparse=False)


def onehot_encode_categorical_columns(data, categorical_columns, label_encoders):
    """Replace label-encoded columns with one-hot indicator columns.

    Args:
        data: DataFrame whose ``categorical_columns`` hold integer codes
            produced by the matching entry of ``label_encoders``.
        categorical_columns: Names of the columns to expand.
        label_encoders: Mapping of column name -> fitted label encoder, used
            to turn codes back into category names for the new column labels.

    Returns:
        tuple: ``(encoded, onehot_encoder)``. ``encoded`` has a fresh
        0..n-1 index, the categorical columns removed, and one
        ``"<column>_<category>"`` column appended per category.
        ``onehot_encoder`` is the encoder fitted on the *last* column
        processed (kept for backward compatibility), or ``None`` when
        ``categorical_columns`` is empty.
    """
    onehot_encoder = None  # stays None when there is nothing to encode
    onehot_encoded_df_list = []

    for column in categorical_columns:
        le = label_encoders[column]

        onehot_encoder = _make_dense_onehot_encoder()
        onehot_encoded_column = onehot_encoder.fit_transform(data[[column]])

        # Name the indicator columns from the codes the encoder actually saw,
        # so the number of names always matches the number of output columns
        # even if some label classes are absent from `data`.
        seen_codes = np.asarray(onehot_encoder.categories_[0]).astype(int)
        categories = le.inverse_transform(seen_codes)
        onehot_encoded_df = pd.DataFrame(
            onehot_encoded_column,
            columns=[f"{column}_{category}" for category in categories],
        )
        onehot_encoded_df_list.append(onehot_encoded_df)

    data = data.drop(columns=categorical_columns)
    # Indicator frames carry a default RangeIndex; reset ours so rows line up.
    data = pd.concat([data.reset_index(drop=True)] + onehot_encoded_df_list, axis=1)

    return data, onehot_encoder

In [None]:
# Columns removed from the merged dataset before modelling.
columns_to_drop = [
    'YEARSTARTED',
    'NAME_ACC',
    'FORECASTCATEGORYNAME',
    'FORECASTCATEGORY',
    'HASOVERDUETASK',
    'ANNUALREVENUE',
    'AMOUNT',
    'ISWON',
    'ISCLOSED',
    'ACCOUNTID',
    'PROBABILITY',
    'LEADSOURCE',
    'EXPECTEDREVENUE',
    'ACCOUNTSOURCE',
    'FISCALYEAR',
]

# Continuous inputs that are standardised before training.
numerical_columns = [
    'GROWTH_RATE__C',
    'TOTAL_FUNDING_TO_DATE__C',
    'NUMBEROFEMPLOYEES',
]

# Columns that are label-encoded and then one-hot encoded.
categorical_columns = [
    'RATING',
    'OWNERSHIP',
    'TYPE_ACC',
    'INDUSTRY',
    'HQ_LOCATION__C',
    'RANGE_ANNUALREVENUE',
]

# Columns that are label-encoded only (no one-hot expansion).
binary_columns = ['OWNER_INTENT_TO_SELL__C', 'TYPE_OPP']

# Row identifiers; excluded from the feature matrix.
identificators = ['ID_ACC', 'ID_OPP']

# Label-encoded together with the categorical and binary columns.
targets = ['STAGENAME', 'NAME_OPP']

In [None]:
# Coarse industry buckets: bucket name -> raw INDUSTRY values that belong to it.
# Consumed by map_industry(), which returns 'Unknown' for any value that is
# not listed under one of these buckets.
industry_mapping = {
    'Technology & Communications': [
        'Media', 'Telecommunications', 'Technology', 'Electronics', 'Communications'
    ],
    'Finance & Insurance': [
        'Finance', 'Banking', 'Insurance'
    ],
    'Consumer & Services': [
        'Not For Profit', 'Transportation', 'Food & Beverage', 'Environmental', 'Consulting', 
        'Shipping', 'Recreation', 'Education', 'Retail', 'Hospitality', 'Entertainment', 
        'Healthcare', 'Government', 'Apparel'
    ],
    'Industrial & Other': [
        'Utilities', 'Biotechnology', 'Engineering', 'Manufacturing', 'Machinery', 'Construction', 
        'Agriculture', 'Energy', 'Chemicals', 'Other'
    ]
}

In [None]:
# US region buckets: region name -> full state names (no abbreviations).
# Consumed by map_region(), which returns 'Unknown' for anything not listed
# here (the District of Columbia, for example, is not in any list).
region_mapping = {
    'Northeast': [
        'Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island', 'Vermont',
        'New Jersey', 'New York', 'Pennsylvania'
    ],
    'Midwest': [
        'Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin',
        'Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota'
    ],
    'Southeast': [
        'Delaware', 'Florida', 'Georgia', 'Maryland', 'North Carolina', 'South Carolina', 'Virginia',
        'West Virginia', 'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas', 'Louisiana'
    ],
    'Southwest': [
        'Arizona', 'New Mexico', 'Oklahoma', 'Texas'
    ],
    'West': [
        'Alaska', 'California', 'Colorado', 'Hawaii', 'Idaho', 'Montana', 'Nevada',
        'Oregon', 'Utah', 'Washington', 'Wyoming'
    ]
}

In [None]:
def map_industry(industry):
    """Return the ``industry_mapping`` bucket that lists ``industry``.

    The first bucket (in mapping order) containing the value wins; values
    found in no bucket yield ``'Unknown'``.
    """
    return next(
        (
            category
            for category, industries in industry_mapping.items()
            if industry in industries
        ),
        'Unknown',
    )

def categorize_revenue(revenue):
    """Bucket an annual revenue figure into one of four named bands.

    Each band is defined by an inclusive upper bound; the first bound that
    ``revenue`` does not exceed decides the label. Anything that passes none
    of the comparisons — larger values, and also NaN, for which every ``<=``
    is false — is labelled ``'High'``.
    """
    # (inclusive upper bound, label), checked in ascending order.
    bands = (
        (8.042523e6, 'Low'),
        (6.428692e7, 'Lower-Middle'),
        (3.813766e8, 'Upper-Middle'),
    )
    for upper_bound, label in bands:
        if revenue <= upper_bound:
            return label
    return 'High'

def map_region(state):
    """Return the ``region_mapping`` region that lists ``state``.

    The first region (in mapping order) containing the value wins; values
    found in no region yield ``'Unknown'``.
    """
    matches = (
        region
        for region, states in region_mapping.items()
        if state in states
    )
    return next(matches, 'Unknown')

In [None]:
def prepare_dataset(is_train=True):
    """Build the modelling table from the module-level ``account`` and ``opportunity`` frames.

    Steps: left-join opportunities onto accounts (``ID`` = ``ACCOUNTID``,
    overlapping names suffixed ``_ACC`` / ``_OPP``), keep the requested rows,
    bucket INDUSTRY / ANNUALREVENUE / HQ_LOCATION__C, label-encode the
    categorical, target and binary columns, drop ``columns_to_drop`` and
    one-hot encode ``categorical_columns``.

    Args:
        is_train: When True keep only rows whose STAGENAME is 'Closed Won' or
            'Closed Lost'; when False keep every other row.

    Returns:
        tuple: ``(dataset, onehot_encoder)`` exactly as returned by
        ``onehot_encode_categorical_columns``.

    Note:
        Encoders are fitted from scratch on every call, so the integer codes
        of a training call and a prediction call are not guaranteed to agree.
    """
    merged = account.merge(opportunity, left_on='ID', right_on='ACCOUNTID', how='left', suffixes=('_ACC', '_OPP'))
    is_closed = merged['STAGENAME'].isin(['Closed Won', 'Closed Lost'])

    # Take an explicit copy of the filtered rows: the column assignments below
    # would otherwise write to a slice of `merged` (SettingWithCopyWarning).
    dataset = merged[is_closed if is_train else ~is_closed].copy()

    dataset['INDUSTRY'] = dataset['INDUSTRY'].apply(map_industry)
    dataset['RANGE_ANNUALREVENUE'] = dataset['ANNUALREVENUE'].apply(categorize_revenue)
    dataset['HQ_LOCATION__C'] = dataset['HQ_LOCATION__C'].apply(map_region)

    dataset, label_encoders = encode_categorical_columns(dataset, categorical_columns + targets + binary_columns)
    dataset = drop_columns(dataset, columns_to_drop)
    dataset, onehot_encoder = onehot_encode_categorical_columns(dataset, categorical_columns, label_encoders)
    return dataset, onehot_encoder

In [None]:
# Training table: is_train defaults to True, so only 'Closed Won' / 'Closed Lost' rows are kept.
dataset, onehot_encoder = prepare_dataset()

In [None]:
# Separate the label and the row identifiers from the model inputs.
# NOTE(review): the label-encoded NAME_OPP column stays in `features` —
# confirm that an opportunity-name code is really meant to be a predictor.
target = dataset['STAGENAME']
ids = dataset[identificators]
features = dataset.drop(columns=[*identificators, 'STAGENAME'])

In [None]:
# Split first, then scale: the scaler must learn its mean/std from the
# training rows only, otherwise statistics of the held-out rows leak into
# training and the evaluation below is optimistic.
X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(
    features, target, ids, test_size=0.3, random_state=42
)

# Work on copies so `features` itself is never modified; this also keeps the
# cell idempotent (re-running it does not rescale already-scaled data).
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
# Apply the training-set statistics to the held-out rows.
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [None]:
# Logistic-regression classifier on the training split; max_iter is raised to 1000.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Score the held-out split.
y_pred = model.predict(X_test)

# Labels in the report are the integer codes STAGENAME received from label encoding.
# NOTE(review): presumably 0 = 'Closed Lost' and 1 = 'Closed Won' — verify against the fitted encoder.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.7185185185185186
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.52      0.59        52
           1       0.74      0.84      0.79        83

    accuracy                           0.72       135
   macro avg       0.71      0.68      0.69       135
weighted avg       0.71      0.72      0.71       135

Confusion Matrix:
 [[27 25]
 [13 70]]


In [None]:
# First row of the fitted coefficient matrix, paired below with the feature column names.
coefficients = model.coef_[0]
feature_names = features.columns

In [None]:
# One row per model input, ordered from the largest coefficient to the smallest.
# Column names are upper-case to match the COMPANIES_FEATURES table.
feature_importance_df = (
    pd.DataFrame({'FEATURE': feature_names, 'COEFFICIENT': coefficients})
    .sort_values(by='COEFFICIENT', ascending=False)
)

In [None]:
# DDL for the coefficient table. CREATE OR REPLACE discards any existing
# companies_features table (and its rows) each time it is executed.
feature_importance_query = """
    CREATE OR REPLACE TABLE companies_features (
        FEATURE STRING,
        COEFFICIENT FLOAT
    )
"""

In [None]:
# DDL for the per-opportunity feature table. CREATE OR REPLACE discards any
# existing `features` table each time it is executed.
# Every column, including the numeric ones, is declared STRING.
# NOTE(review): the one-hot column list is fixed here; it presumably has to
# match the normalised column names of the frame uploaded later — confirm
# whenever the category buckets change.
features_query = """
    CREATE OR REPLACE TABLE features (
        GROWTH_RATE__C STRING,
        TOTAL_FUNDING_TO_DATE__C STRING,
        NUMBEROFEMPLOYEES STRING,
        OWNER_INTENT_TO_SELL__C STRING,
        ID_ACC STRING,
        TYPE_OPP STRING,
        STAGENAME STRING,
        NAME_OPP STRING,
        RATING_COLD STRING,
        RATING_HOT STRING,
        RATING_WARM STRING,
        OWNERSHIP_PRIVATE STRING,
        OWNERSHIP_PUBLIC STRING,
        OWNERSHIP_SUBSIDIARY STRING,
        TYPE_ACC_ESTABLISHED STRING,
        TYPE_ACC_GROWTH_STAGE STRING,
        TYPE_ACC_STARTUP STRING,
        INDUSTRY_CONSUMER_AND_SERVICES STRING,
        INDUSTRY_FINANCE_AND_INSURANCE STRING,
        INDUSTRY_INDUSTRIAL_AND_OTHER STRING,
        INDUSTRY_TECHNOLOGY_AND_COMMUNICATIONS STRING,
        HQ_LOCATION__C_MIDWEST STRING,
        HQ_LOCATION__C_NORTHEAST STRING,
        HQ_LOCATION__C_SOUTHEAST STRING,
        HQ_LOCATION__C_SOUTHWEST STRING,
        HQ_LOCATION__C_WEST STRING,
        RANGE_ANNUALREVENUE_HIGH STRING,
        RANGE_ANNUALREVENUE_LOW STRING,
        RANGE_ANNUALREVENUE_LOWER_MIDDLE STRING,
        RANGE_ANNUALREVENUE_UPPER_MIDDLE STRING
    )
"""

In [None]:
# Point the shared options at the output schema and reconnect.
# The change to snowflake_options persists for the rest of the session, so
# every later connect() call that reads this dict also lands in 'feature_data'.
snowflake_options['schema'] = 'feature_data'
ctx = snowflake.connector.connect(**snowflake_options)

In [None]:
# (Re)create both output tables. The cursor stays open until the next cell closes it.
cursor = ctx.cursor()
cursor.execute(feature_importance_query)
cursor.execute(features_query)

<snowflake.connector.cursor.SnowflakeCursor at 0x7fa324a92380>

In [None]:
# Release the DDL cursor first, then the connection it belongs to.
cursor.close()
ctx.close()

In [None]:
# Fresh connection for the uploads below. It reads the same options dict, so
# it targets whatever schema snowflake_options['schema'] currently holds
# (set to 'feature_data' in an earlier cell).
ctx = snowflake.connector.connect(**snowflake_options)

In [None]:
def upload_to_snowflake(df, table_name):
    """Write ``df`` to ``table_name`` through the module-level connection ``ctx``.

    Returns:
        tuple: ``(success, nchunks, nrows)`` — the first three items of the
        four-item result returned by ``write_pandas``; the fourth is discarded.
    """
    result = write_pandas(ctx, df, table_name)
    return result[:3]

In [None]:
# Rows whose stage is not 'Closed Won' / 'Closed Lost', prepared with the same pipeline.
dataset_for_pred, _ = prepare_dataset(is_train=False)

# Normalise column names (upper-case; ' ' and '-' -> '_'; '&' -> 'AND'),
# then drop the opportunity id before upload.
dataset_for_pred = (
    dataset_for_pred
    .reset_index(drop=True)
    .rename(columns=lambda name: name.upper().replace(' ', '_').replace('&', 'AND').replace('-', '_'))
    .drop(columns=['ID_OPP'])
)

In [None]:
# Load the prepared rows into FEATURES; the cell displays (success, nchunks, nrows).
upload_to_snowflake(dataset_for_pred, 'FEATURES')

(True, 1, 150)

In [None]:
# Renumber rows after the sort and normalise the feature names the same way
# as the uploaded column names (upper-case; ' ' and '-' -> '_'; '&' -> 'AND').
feature_importance_df = feature_importance_df.reset_index(drop=True)
feature_importance_df['FEATURE'] = (
    feature_importance_df['FEATURE']
    .str.upper()
    .str.replace(' ', '_')
    .str.replace('&', 'AND')
    .str.replace('-', '_')
)

In [None]:
# Load the coefficient table into COMPANIES_FEATURES; the cell displays (success, nchunks, nrows).
upload_to_snowflake(feature_importance_df, 'COMPANIES_FEATURES')

(True, 1, 28)

In [None]:
# Uploads are done; release the upload connection.
ctx.close()