In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

In [2]:
# Load the four CSV exports from the local data/ directory, one frame per file.
account, contact, opportunity, task = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/account.csv',
        'data/contact.csv',
        'data/opportunity.csv',
        'data/task.csv',
    )
)

In [3]:
# Left-join accounts to their opportunities (account.ID == opportunity.ACCOUNTID).
# Accounts with no matching opportunity are kept, with empty opportunity fields;
# column names present in both frames receive the _ACC / _OPP suffix.
dataset = pd.merge(
    account,
    opportunity,
    how='left',
    left_on='ID',
    right_on='ACCOUNTID',
    suffixes=('_ACC', '_OPP'),
)

In [4]:
dataset.head()

Unnamed: 0,HQ_LOCATION__C,GROWTH_RATE__C,TOTAL_FUNDING_TO_DATE__C,YEARSTARTED,ACCOUNTSOURCE,ANNUALREVENUE,RATING,NUMBEROFEMPLOYEES,OWNERSHIP,INDUSTRY,...,ISWON,ISCLOSED,LEADSOURCE,TYPE_OPP,EXPECTEDREVENUE,PROBABILITY,AMOUNT,STAGENAME,NAME_OPP,ACCOUNTID
0,Florida,28.0,12618042.5,2024,Incubator/Accelerator Programs,9210250.0,Hot,47,Private,Media,...,False,True,Incubator/Accelerator Programs,New Business,0.0,0.0,1013128.0,Closed Lost,Investment,001ak00000IqvqPAAR
1,South Dakota,240.0,36166195.73,2018,Incubator/Accelerator Programs,27607783.0,Cold,139,Public,Technology,...,False,False,Incubator/Accelerator Programs,Existing Business,11043113.2,20.0,55215570.0,Needs Analysis,Acquisition,001ak00000IqvqQAAR
2,Alaska,181.0,18847734.44,2016,Direct Referrals,22985042.0,Cold,115,Public,Recreation,...,False,False,Direct Referrals,Existing Business,1953728.57,50.0,3907457.0,Value Proposition,Investment,001ak00000IqvqRAAR
3,Indiana,82.0,5496170.0,2022,Networking Events,5496170.0,Warm,28,Subsidiary,Healthcare,...,False,True,Networking Events,New Business,0.0,0.0,769463.8,Closed Lost,Investment,001ak00000IqvqSAAR
4,New Hampshire,32.0,28215066.73,2015,Business Brokers,47822147.0,Warm,240,Private,Finance,...,False,True,Business Brokers,Existing Business,0.0,0.0,114294900.0,Closed Lost,Acquisition,001ak00000IqvqTAAR


In [60]:
# Column roles used by the preprocessing and modelling cells below.

# Removed from the merged frame by drop_columns() before modelling.
# NOTE(review): ISWON / ISCLOSED / PROBABILITY / EXPECTEDREVENUE presumably encode the
# deal outcome and are dropped to avoid target leakage - confirm.
columns_to_drop = ['HQ_LOCATION__C', 'YEARSTARTED', 'NAME_ACC', 'TRACKINGNUMBER__C', 'FORECASTCATEGORYNAME', 'FORECASTCATEGORY',
                   'ISWON', 'ISCLOSED', 'ACCOUNTID', 'PROBABILITY', 'LEADSOURCE', 'INDUSTRY', 'EXPECTEDREVENUE', 'ACCOUNTSOURCE']
# Continuous features that are standardised with StandardScaler.
numerical_columns = ['GROWTH_RATE__C', 'TOTAL_FUNDING_TO_DATE__C', 'ANNUALREVENUE', 'NUMBEROFEMPLOYEES', 'AMOUNT'] # not used yet: PROBABILITY, EXPECTEDREVENUE, LEADSOURCE, INDUSTRY, ACCOUNTSOURCE (all currently in columns_to_drop)
# Label-encoded first, then expanded into one-hot indicator columns.
categorical_columns = ['RATING', 'OWNERSHIP', 'TYPE_ACC']
# Label-encoded only (kept as a single integer column each).
binary_columns = ['OWNER_INTENT_TO_SELL__C', 'TYPE_OPP']
# Row identifiers: excluded from the feature matrix but kept to trace predictions back.
identificators = ['ID_ACC', 'ID_OPP']
# Both are label-encoded; only STAGENAME is used as the model label below,
# NAME_OPP stays in the feature matrix.
targets = ['STAGENAME', 'NAME_OPP']
# TODO: bring INDUSTRY back as a feature after grouping its values into a few broader categories.
# TODO: also bring back HQ_LOCATION__C, and derive a revenue-range feature from ANNUALREVENUE.

In [6]:
dataset.NAME_OPP.value_counts()

Investment     260
Acquisition    240
Name: NAME_OPP, dtype: int64

In [7]:
def encode_categorical_columns(data, categorical_columns):
    """Label-encode the given columns and return the encoded frame plus the fitted encoders.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``categorical_columns``.
    categorical_columns : iterable of str
        Columns to replace with integer codes.

    Returns
    -------
    (pandas.DataFrame, dict)
        A new frame with the listed columns replaced by their integer codes, and a
        ``{column: fitted LabelEncoder}`` mapping so codes can be decoded later.

    Notes
    -----
    The input frame is copied before any column is overwritten. Callers often pass a
    filtered slice of another frame; writing into that slice directly would mutate
    the caller's object and trigger pandas' SettingWithCopyWarning.
    """
    encoded = data.copy()
    label_encoders = {}

    for column in categorical_columns:
        encoder = LabelEncoder()
        encoded[column] = encoder.fit_transform(encoded[column])
        label_encoders[column] = encoder

    return encoded, label_encoders

In [8]:
def drop_columns(data, columns_to_drop):
    """Return a new frame without the given columns; ``data`` itself is not modified."""
    return data.drop(labels=columns_to_drop, axis='columns')

In [9]:
def onehot_encode_categorical_columns(data, categorical_columns, label_encoders):
    """Expand label-encoded columns into one-hot indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``categorical_columns`` already hold the integer codes produced
        by the matching entry of ``label_encoders``.
    categorical_columns : list of str
        Columns to expand; they are removed from the result.
    label_encoders : dict
        ``{column: fitted LabelEncoder}`` used to turn codes back into readable
        category names for the new column headers (``"<column>_<category>"``).

    Returns
    -------
    (pandas.DataFrame, OneHotEncoder or None)
        The frame with a fresh 0..n-1 index and the indicator columns appended, and
        the encoder fitted for the LAST column processed (``None`` when
        ``categorical_columns`` is empty). Only the last encoder is returned, to
        stay compatible with existing callers.
    """
    onehot_encoded_df_list = []
    # Pre-bind so an empty column list returns None instead of raising UnboundLocalError.
    onehot_encoder = None

    for column in categorical_columns:
        le = label_encoders[column]

        onehot_encoder = OneHotEncoder(sparse_output=False)
        onehot_encoded_column = onehot_encoder.fit_transform(data[[column]])

        # Name columns from the codes the one-hot encoder actually saw. Using every
        # class known to the label encoder breaks with a shape mismatch whenever a
        # class is absent from `data` (e.g. after filtering rows).
        present_codes = np.asarray(onehot_encoder.categories_[0]).astype(int)
        categories = le.inverse_transform(present_codes)
        onehot_encoded_df = pd.DataFrame(
            onehot_encoded_column,
            columns=[f"{column}_{category}" for category in categories],
        )
        onehot_encoded_df_list.append(onehot_encoded_df)

    data = data.drop(columns=categorical_columns)
    # The indicator frames carry a fresh RangeIndex, so the base frame's index is
    # reset as well to keep the column-wise concat row-aligned.
    data = pd.concat([data.reset_index(drop=True)] + onehot_encoded_df_list, axis=1)

    return data, onehot_encoder

In [10]:
# Keep only decided deals. `.copy()` detaches the filtered rows from the merged frame,
# so the encoding step below writes into an independent object instead of a slice
# (which would trigger pandas' SettingWithCopyWarning).
# NOTE: this cell overwrites `dataset`. Re-running it without re-running the merge cell
# filters on already-encoded STAGENAME values and yields an empty frame.
dataset = dataset[dataset.STAGENAME.isin(['Closed Won', 'Closed Lost'])].copy()
# Integer-encode every non-numeric column that is kept (categoricals, targets, binaries).
dataset, label_encoders = encode_categorical_columns(dataset, categorical_columns + targets + binary_columns)
dataset = drop_columns(dataset, columns_to_drop)
# Expand the multi-class categoricals into indicator columns.
dataset, onehot_encoder = onehot_encode_categorical_columns(dataset, categorical_columns, label_encoders)

In [11]:
dataset.head()

Unnamed: 0,GROWTH_RATE__C,TOTAL_FUNDING_TO_DATE__C,ANNUALREVENUE,NUMBEROFEMPLOYEES,OWNER_INTENT_TO_SELL__C,ID_ACC,ID_OPP,TYPE_OPP,AMOUNT,STAGENAME,NAME_OPP,RATING_Cold,RATING_Hot,RATING_Warm,OWNERSHIP_Private,OWNERSHIP_Public,OWNERSHIP_Subsidiary,TYPE_ACC_Established,TYPE_ACC_Growth Stage,TYPE_ACC_Startup
0,28.0,12618040.0,9210250.0,47,1,001ak00000IqvqPAAR,006ak000002sEVlAAM,1,1013128.0,0,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,82.0,5496170.0,5496170.0,28,0,001ak00000IqvqSAAR,006ak000002sEVoAAM,1,769463.8,0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,32.0,28215070.0,47822147.0,240,1,001ak00000IqvqTAAR,006ak000002sEVpAAM,0,114294900.0,0,0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,117.0,53896920.0,70917002.0,355,0,001ak00000IqvqVAAR,006ak000002sEVrAAM,0,158144900.0,0,0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,141.0,1162368000.0,867439110.0,4338,1,001ak00000IqvqZAAR,006ak000002sEVwAAM,0,1908366000.0,0,0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [12]:
for variable, label_encoder in label_encoders.items():
    print(variable, ': ', label_encoder.classes_)
    print('===================================')

RATING :  ['Cold' 'Hot' 'Warm']
OWNERSHIP :  ['Private' 'Public' 'Subsidiary']
TYPE_ACC :  ['Established' 'Growth Stage' 'Startup']
STAGENAME :  ['Closed Lost' 'Closed Won']
NAME_OPP :  ['Acquisition' 'Investment']
OWNER_INTENT_TO_SELL__C :  [False  True]
TYPE_OPP :  ['Existing Business' 'New Business']


In [13]:
dataset.columns

Index(['GROWTH_RATE__C', 'TOTAL_FUNDING_TO_DATE__C', 'ANNUALREVENUE',
       'NUMBEROFEMPLOYEES', 'OWNER_INTENT_TO_SELL__C', 'ID_ACC', 'ID_OPP',
       'TYPE_OPP', 'AMOUNT', 'STAGENAME', 'NAME_OPP', 'RATING_Cold',
       'RATING_Hot', 'RATING_Warm', 'OWNERSHIP_Private', 'OWNERSHIP_Public',
       'OWNERSHIP_Subsidiary', 'TYPE_ACC_Established', 'TYPE_ACC_Growth Stage',
       'TYPE_ACC_Startup'],
      dtype='object')

In [14]:
dataset[['STAGENAME', 'NAME_OPP']]

Unnamed: 0,STAGENAME,NAME_OPP
0,0,1
1,0,1
2,0,0
3,0,0
4,0,0
...,...,...
348,0,0
349,0,1
350,1,1
351,0,0


In [15]:
# Separate the frame into row identifiers, the label, and the model inputs.
ids = dataset[identificators]
target = dataset['STAGENAME']
features = dataset.drop(columns=[*identificators, 'STAGENAME'])

In [16]:
scaler = StandardScaler()
features[numerical_columns] = scaler.fit_transform(features[numerical_columns])

In [17]:
X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(features, target, ids, test_size=0.4, random_state=42)

In [None]:
# LightGBM baseline: gradient-boosted trees with a binary objective, evaluated on log-loss.
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 15,
    'verbose': -1,
}
train_data = lgb.Dataset(X_train, label=y_train)
model = lgb.train(params, train_data, num_boost_round=100)

In [None]:
# Score the test rows, then turn scores into 0/1 labels: strictly above 0.5 -> 1.
y_pred = model.predict(X_test)
y_pred_binary = np.where(y_pred > 0.5, 1, 0).tolist()

In [None]:
accuracy = accuracy_score(y_test, y_pred_binary)
report = classification_report(y_test, y_pred_binary)
conf_matrix = confusion_matrix(y_test, y_pred_binary)

In [None]:
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

In [None]:
y_test.value_counts()

In [None]:
pd.Series(y_pred_binary).value_counts()

In [None]:
# Feature importance
importance = model.feature_importance()
feature_names = features.columns

In [None]:
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

In [None]:
feature_importance_df.head(40)

In [None]:
# smote = SMOTE(random_state=42)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [63]:
# Train a Logistic Regression Classifier
# Logistic-regression classifier; class_weight='balanced' reweights the two classes
# inversely to their frequency in y_train.
model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)

In [64]:
y_pred = model.predict(X_test)

In [65]:
# Compute the three evaluation artefacts for the test predictions in one pass.
accuracy, report, conf_matrix = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.5985915492957746
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.67      0.74       123
           1       0.07      0.16      0.10        19

    accuracy                           0.60       142
   macro avg       0.45      0.41      0.42       142
weighted avg       0.73      0.60      0.66       142

Confusion Matrix:
 [[82 41]
 [16  3]]


In [22]:
# Attach each test prediction to the account id of its row (index of ids_test is kept).
predictions_df = ids_test[['ID_ACC']].assign(Prediction=y_pred)

In [23]:
# Feature importance (coefficients)
coefficients = model.coef_[0]
feature_names = features.columns

In [24]:
# Create a DataFrame for the feature importance
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
}).sort_values(by='Coefficient', ascending=False)

In [66]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
7,NAME_OPP,0.713761
10,RATING_Warm,0.627943
13,OWNERSHIP_Subsidiary,0.298136
14,TYPE_ACC_Established,0.292493
4,OWNER_INTENT_TO_SELL__C,0.25353
0,GROWTH_RATE__C,0.177965
6,AMOUNT,0.133195
16,TYPE_ACC_Startup,0.105643
5,TYPE_OPP,0.105643
11,OWNERSHIP_Private,0.094133


In [26]:
feature_importance_df[feature_importance_df.Feature == 'NAME_OPP']

Unnamed: 0,Feature,Coefficient
7,NAME_OPP,0.713761


In [27]:
label_encoders['NAME_OPP'].classes_

array(['Acquisition', 'Investment'], dtype=object)

In [40]:
# Predict probabilities
y_prob = model.predict_proba(X_test)[:, 1]

# Threshold
threshold = 0.5
y_pred_custom = (y_prob >= threshold).astype(int)

# Evaluate the model with the custom threshold
accuracy = accuracy_score(y_test, y_pred_custom)
report = classification_report(y_test, y_pred_custom)
conf_matrix = confusion_matrix(y_test, y_pred_custom)

print("Accuracy with custom threshold:", accuracy)
print("Classification Report with custom threshold:\n", report)
print("Confusion Matrix with custom threshold:\n", conf_matrix)

Accuracy with custom threshold: 0.5985915492957746
Classification Report with custom threshold:
               precision    recall  f1-score   support

           0       0.84      0.67      0.74       123
           1       0.07      0.16      0.10        19

    accuracy                           0.60       142
   macro avg       0.45      0.41      0.42       142
weighted avg       0.73      0.60      0.66       142

Confusion Matrix with custom threshold:
 [[82 41]
 [16  3]]


In [45]:
# Pull the test row at position 8 out as a one-row frame for a what-if probe.
# The non-inplace reset_index returns a new frame rather than mutating the iloc
# slice in place, so later edits to data_sample do not act on a slice of X_test
# (the source of the SettingWithCopyWarning seen further down).
data_sample = X_test.iloc[[8]].reset_index(drop=True)

In [46]:
print(label_encoders['NAME_OPP'].classes_[data_sample.NAME_OPP])
model.predict_proba(data_sample)[:, 1]

['Investment']


array([0.50870644])

In [47]:
# Flip the encoded opportunity type (0 <-> 1) to see how the predicted probability reacts.
# assign() builds a new frame instead of setting an attribute on a slice, which is
# what raised SettingWithCopyWarning. Note the cell toggles the value on every run.
data_sample = data_sample.assign(NAME_OPP=1 - data_sample['NAME_OPP'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sample.NAME_OPP = 1 - data_sample.NAME_OPP


In [48]:
print(label_encoders['NAME_OPP'].classes_[data_sample.NAME_OPP])
model.predict_proba(data_sample)[:, 1]

['Acquisition']


array([0.33649974])

In [54]:
dataset[dataset.OWNERSHIP_Public == 1.0].STAGENAME.value_counts()

0    102
1     16
Name: STAGENAME, dtype: int64

In [55]:
dataset[dataset.OWNERSHIP_Subsidiary == 1.0].STAGENAME.value_counts()

0    95
1    25
Name: STAGENAME, dtype: int64

In [56]:
dataset[dataset.OWNERSHIP_Private == 1.0].STAGENAME.value_counts()

0    90
1    25
Name: STAGENAME, dtype: int64

In [57]:
dataset[dataset['TYPE_ACC_Growth Stage'] == 1.0].STAGENAME.value_counts()

0    40
1     7
Name: STAGENAME, dtype: int64

In [58]:
dataset[dataset['TYPE_ACC_Established'] == 1.0].STAGENAME.value_counts()

0    157
1     39
Name: STAGENAME, dtype: int64

In [70]:
dataset.ANNUALREVENUE.describe()

count    3.530000e+02
mean     2.236609e+08
std      3.004198e+08
min      1.197959e+06
25%      8.042523e+06
50%      6.428692e+07
75%      3.813766e+08
max      9.960538e+08
Name: ANNUALREVENUE, dtype: float64

In [72]:
account.INDUSTRY.value_counts()

Media                 25
Utilities             25
Finance               24
Not For Profit        22
Transportation        20
Banking               20
Food & Beverage       19
Environmental         19
Telecommunications    17
Consulting            17
Shipping              17
Biotechnology         16
Recreation            15
Electronics           15
Education             15
Engineering           15
Retail                14
Technology            14
Hospitality           14
Other                 14
Entertainment         13
Healthcare            13
Government            13
Insurance             12
Manufacturing         12
Communications        12
Machinery             12
Apparel               12
Construction          11
Agriculture           11
Energy                11
Chemicals             11
Name: INDUSTRY, dtype: int64

In [73]:
# Coarse industry groups: each key is the group label, each value lists the raw
# INDUSTRY values folded into it. Consumed by map_industry(), which scans the groups
# in insertion order and returns the first one containing the value.
industry_mapping = {
    'Technology & Communications': [
        'Media', 'Telecommunications', 'Technology', 'Electronics', 'Communications'
    ],
    'Finance & Insurance': [
        'Finance', 'Banking', 'Insurance'
    ],
    'Consumer & Services': [
        'Not For Profit', 'Transportation', 'Food & Beverage', 'Environmental', 'Consulting', 
        'Shipping', 'Recreation', 'Education', 'Retail', 'Hospitality', 'Entertainment', 
        'Healthcare', 'Government', 'Apparel'
    ],
    # 'Other' is grouped here as the catch-all bucket.
    'Industrial & Other': [
        'Utilities', 'Biotechnology', 'Engineering', 'Manufacturing', 'Machinery', 'Construction', 
        'Agriculture', 'Energy', 'Chemicals', 'Other'
    ]
}

In [74]:
def map_industry(industry):
    """Return the industry_mapping group that lists ``industry``, or 'Unknown' if none does."""
    return next(
        (group for group, members in industry_mapping.items() if industry in members),
        'Unknown',
    )

In [76]:
account['INDUSTRY'].apply(map_industry).value_counts()

Consumer & Services            223
Industrial & Other             138
Technology & Communications     83
Finance & Insurance             56
Name: INDUSTRY, dtype: int64

In [77]:
def categorize_revenue(revenue):
    """Bucket an annual-revenue figure into one of four labelled bands.

    The cut points equal the 25% / 50% / 75% values printed by
    ``dataset.ANNUALREVENUE.describe()`` earlier in this notebook; each upper bound
    is inclusive.

    Parameters
    ----------
    revenue : float or None
        A single revenue value (this is applied element-wise via ``Series.apply``).

    Returns
    -------
    str
        'Low', 'Lower-Middle', 'Upper-Middle' or 'High'; 'Unknown' when the value
        is missing (None / NaN).
    """
    # NaN fails every `<=` comparison, so without this guard a missing revenue
    # would fall through to the final branch and be reported as 'High'.
    if pd.isna(revenue):
        return 'Unknown'
    if revenue <= 8.042523e6:
        return 'Low'
    elif revenue <= 6.428692e7:
        return 'Lower-Middle'
    elif revenue <= 3.813766e8:
        return 'Upper-Middle'
    else:
        return 'High'

In [79]:
account['ANNUALREVENUE'].apply(categorize_revenue).value_counts()

Low             135
Lower-Middle    123
High            123
Upper-Middle    119
Name: ANNUALREVENUE, dtype: int64

In [80]:
account.HQ_LOCATION__C.value_counts()

Alaska            17
Tennessee         17
Connecticut       16
Louisiana         16
Arizona           16
Delaware          14
Wyoming           14
New Jersey        14
North Dakota      13
Minnesota         13
Pennsylvania      12
Colorado          12
Alabama           12
New Mexico        11
Florida           11
Hawaii            11
New Hampshire     11
Idaho             11
Nebraska          11
Arkansas          11
Oklahoma          11
West Virginia     10
Texas             10
Mississippi       10
Utah              10
Georgia           10
Nevada             9
Kentucky           9
North Carolina     9
Missouri           9
Kansas             9
Ohio               9
Montana            9
Vermont            8
Wisconsin          8
Massachusetts      8
Virginia           8
South Carolina     8
Oregon             8
New York           7
Michigan           7
Maryland           7
California         7
Iowa               6
Washington         6
Illinois           5
South Dakota       5
Indiana      

In [81]:
# Region groups: each key is a region label, each value lists the HQ_LOCATION__C
# state names assigned to it. Consumed by map_region(), which returns the first
# region whose list contains the state.
region_mapping = {
    'Northeast': [
        'Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island', 'Vermont',
        'New Jersey', 'New York', 'Pennsylvania'
    ],
    'Midwest': [
        'Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin',
        'Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota'
    ],
    'Southeast': [
        'Delaware', 'Florida', 'Georgia', 'Maryland', 'North Carolina', 'South Carolina', 'Virginia',
        'West Virginia', 'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas', 'Louisiana'
    ],
    'Southwest': [
        'Arizona', 'New Mexico', 'Oklahoma', 'Texas'
    ],
    'West': [
        'Alaska', 'California', 'Colorado', 'Hawaii', 'Idaho', 'Montana', 'Nevada',
        'Oregon', 'Utah', 'Washington', 'Wyoming'
    ]
}

In [82]:
def map_region(state):
    """Return the region_mapping region that lists ``state``, or 'Unknown' if none does."""
    matches = (region for region, members in region_mapping.items() if state in members)
    return next(matches, 'Unknown')

In [84]:
account['HQ_LOCATION__C'].apply(map_region).value_counts()

Southeast    152
West         114
Midwest      100
Northeast     86
Southwest     48
Name: HQ_LOCATION__C, dtype: int64