# Import Dataset and Libraries

In [164]:
import os
import subprocess
import sys

# Third-party packages this notebook needs on top of the base environment.
packages = ["imbalanced-learn", "catboost", "pycountry"]

# Install through the interpreter that is running this kernel so the packages
# land in the environment the notebook actually imports from (a bare `pip` on
# PATH may belong to a different Python). `check_call` raises
# CalledProcessError if pip fails instead of only reporting an exit status,
# and evaluates to 0 on success.
subprocess.check_call([sys.executable, "-m", "pip", "install", *packages])

0

In [190]:
import numpy as np
import pandas as pd

# Seed shared by every stochastic step in this notebook (CatBoost, SMOTE and
# the StratifiedKFold shuffle all receive it).
RANDOM_STATE = 42
# Input CSV files, given relative to the notebook's working directory.
TRAIN_FILE = "./train.csv"
TEST_FILE = "./test.csv"

In [191]:
# Read the training file once and the test file twice: `df_test` feeds the
# feature pipeline, while `df_submission` is an independent copy of the raw
# test rows that later receives the predicted `is_converted` column.
df_train, df_test, df_submission = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_FILE, TEST_FILE, TEST_FILE)
)

In [192]:
# Separate the target from the training features; the test frame additionally
# loses its `id` column so it ends up with the same feature columns as `X`.
y = df_train["is_converted"]
X = df_train.drop(columns="is_converted")
X_test = df_test.drop(columns=["is_converted", "id"])

# Data Preprocessing

## 1. Data Imputation

In [193]:
# Per-column fill values: every column defaults to 0, then three columns get
# a more specific replacement. The mode of `historical_existing_cnt` is taken
# from the training features and reused for the test features.
values = dict.fromkeys(X.columns, 0)
values.update(
    inquiry_type="Quotation or Purchase Consultation",
    customer_country="Not Found",
    historical_existing_cnt=X["historical_existing_cnt"].mode()[0],
)

# Rebind instead of mutating in place; the filled frames replace the originals.
X = X.fillna(value=values)
X_test = X_test.fillna(value=values)

## 2. Data Cleaning 

### Feature: Customer Country ("customer_country")

In [194]:
import pycountry


def preprocess_country(country):
    """Map a raw ``customer_country`` value to pycountry's ``alpha_3`` code.

    The raw value is treated as a "/"-separated string whose last segment is
    the country name. That segment is stripped of surrounding whitespace and
    looked up by exact ``name`` in ``pycountry.countries``.

    Args:
        country: Raw cell value; anything that is not a ``str`` is accepted.

    Returns:
        The matched country's ``alpha_3`` attribute, or ``"Not Found"`` when
        the value is not a string or the lookup returns ``None``.
    """
    if isinstance(country, str):
        candidate = country.split("/")[-1].strip()
        match = pycountry.countries.get(name=candidate)
        if match is not None:
            return match.alpha_3
    return "Not Found"


# Replace the raw country strings with their normalised codes in both frames.
for frame in (X, X_test):
    frame["customer_country"] = frame["customer_country"].map(preprocess_country)

### Feature: Business Unit ("business_unit")

In [195]:
# One categorical dtype shared by train and test so both frames carry the same
# fixed category list (values outside the list become missing after the cast).
business_unit_dtype = pd.CategoricalDtype(
    categories=["ID", "AS", "IT", "Solution", "CM"]
)
X["business_unit"] = X["business_unit"].astype(business_unit_dtype)
X_test["business_unit"] = X_test["business_unit"].astype(business_unit_dtype)

### Feature: Inquiry Type ("inquiry_type")

In [196]:
# Raw inquiry strings that occur fewer than 10 times in the training file are
# treated as "other"; the counts are computed once and reused below.
inquiry_counts = df_train["inquiry_type"].value_counts()
rare_inquiry_types = inquiry_counts[inquiry_counts < 10].index.tolist()

# Maps each short code to the raw `inquiry_type` strings it replaces.
inquiry_type_remap = {
    # Quotation: Asking about the product itself.
    "Q": [
        "Quotation or Purchase Consultation",
        "Quotation or purchase consultation",
        "Quotation or Purchase consultation",
        "quotation_or_purchase_consultation",
        "Purchase or Quotation",
        "Purchase",
        "Request for quotation or purchase",
        "Sales Inquiry",
        "sales",
        "quotation_",
    ],
    # Usage: Asking about how to use the product.
    "U": [
        "Usage or Technical Consultation",
        "Usage or technical consultation",
        "usage or technical consultation",
        "usage_or_technical_consultation",
        "Technical Consultation",
        "Technical Support",
        "technical",
        "technical_consultation",
        "Request for technical consulting",
        "Product Information",
        "i want to know the details about it",
        "first Info and pricing",
        "Toi muon tim hieu thong tin ky thuat, gia ca cua sp de su dung",
        "tôi cần tham khảo giá và giải pháp từ LG",
        "Trainings",
    ],
    # Other: explicit "other"-style answers plus every rare raw value.
    "O": [
        "Other",
        "Others",
        "other_",
        "other",
        "others",
        "Etc.",
        "ETC.",
        "Not specified",
        "(Select ID_Needs)",
        "IDB",
        "Services",
        "Request for Partnership",
        "Request a Demo",
        "Request for Distributorship",
        "Customer Suggestions",
        *rare_inquiry_types,
    ],
}

# Collapse the raw inquiry strings into the short codes defined by
# `inquiry_type_remap`. Codes are applied in dictionary order, so a raw value
# listed under an earlier code keeps that code even if it also appears in a
# later list.
for frame in (X, X_test):
    for key, value in inquiry_type_remap.items():
        frame.loc[frame["inquiry_type"].isin(value), "inquiry_type"] = key

    # Anything still unmapped (for example a string that only occurs in the
    # test file) would otherwise survive as its own category and yield a dummy
    # column that exists in one frame but not the other. Route it to "O".
    # NOTE(review): the recorded training columns show only O/Q/U, so this is
    # expected to be a no-op for the training frame; confirm on new data.
    unmapped = ~frame["inquiry_type"].isin(list(inquiry_type_remap))
    frame.loc[unmapped, "inquiry_type"] = "O"

## 3. Feature Engineering

In [197]:
# Frequency and target (mean conversion rate) encodings for the two
# high-cardinality columns. All statistics are learned from the training
# frame only and then looked up for the test frame.
#
# NOTE(review): the target means are computed on the full training set before
# the cross-validation split further down, so each validation fold has
# contributed its own labels to these features and the CV scores are
# optimistic. Computing them inside each fold would remove that leak.
country_frequency = X["customer_country"].value_counts(normalize=True)
country_target = y.groupby(X["customer_country"]).mean()
owner_frequency = X["lead_owner"].value_counts(normalize=True)
owner_target = y.groupby(X["lead_owner"]).mean()

# Fallback for categories that never occur in training: they have no learned
# conversion rate, so use the overall training rate instead of leaving NaN.
overall_conversion_rate = y.mean()

for source_column, frequency, target_rate in (
    ("customer_country", country_frequency, country_target),
    ("lead_owner", owner_frequency, owner_target),
):
    X[f"{source_column}_frequency"] = X[source_column].map(frequency)
    # An unseen category has, by definition, a training frequency of 0.
    X_test[f"{source_column}_frequency"] = (
        X_test[source_column].map(frequency).fillna(0.0)
    )

    X[f"{source_column}_target"] = X[source_column].map(target_rate)
    X_test[f"{source_column}_target"] = (
        X_test[source_column].map(target_rate).fillna(overall_conversion_rate)
    )


In [198]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Replace each value with the rank of its string form.

    Every element is converted to ``str``; the distinct strings are sorted and
    numbered from 0 in that order.

    Args:
        series: Values to encode.

    Returns:
        A series with the same index holding the integer code of each value.
    """
    # Convert every element to a string so mixed types sort consistently.
    as_text = series.astype(str)
    codes = {label: code for code, label in enumerate(sorted(as_text.unique()))}
    return as_text.map(codes)

def categorize(num):
    """Bucket a count into one of three labels.

    Args:
        num: Numeric count to classify.

    Returns:
        ``'converted_high'`` when ``num`` is above 4000, ``'converted_middle'``
        when it is above 288 (and at most 4000), otherwise ``'converted_low'``.
    """
    if num > 4000:
        return 'converted_high'
    if num > 288:
        return 'converted_middle'
    return 'converted_low'

In [199]:
# Number of training rows per `response_corporate` value (despite the name,
# this counts rows, not conversions).
converted_counts = X.groupby('response_corporate').size().to_dict()

# Both frames are bucketed from the *training* counts. `categorize` compares
# against fixed thresholds, so counting rows inside the test frame would put
# the same corporate in a different bucket purely because the test file has a
# different number of rows. Corporates absent from training get a count of 0.
for frame in (X, X_test):
    frame['corporate_corporate_num'] = (
        frame['response_corporate'].map(converted_counts).fillna(0).astype(int)
    )
    frame['corporate_converted_category'] = frame['corporate_corporate_num'].apply(categorize)


In [200]:
# `categorize` can only emit these three labels. Encoding each frame with its
# own `label_encoding` call numbers whatever labels happen to be present, so a
# frame missing one label would assign different integers to the others. A
# single fixed mapping keeps the codes identical in train and test; the
# integers follow the sorted label order that `label_encoding` produces when
# all three labels occur.
category_codes = {
    "converted_high": 0,
    "converted_low": 1,
    "converted_middle": 2,
}
X['category'] = X["corporate_converted_category"].map(category_codes)
X_test['category'] = X_test["corporate_converted_category"].map(category_codes)

In [201]:
# The text label has been encoded into `category`; drop the helper column.
X = X.drop(columns='corporate_converted_category')
X_test = X_test.drop(columns='corporate_converted_category')

In [202]:
# Display the training feature columns after feature engineering, before encoding.
X.columns

Index(['bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'customer_country_frequency',
       'customer_country_target', 'lead_owner_frequency', 'lead_owner_target',
       'corporate_corporate_num', 'category'],
      dtype='object')

## 4. Feature Encoding

In [203]:
def encode_features(
    df: pd.DataFrame,
    features=["enterprise", "business_unit", "inquiry_type", "bant_submit"],
) -> pd.DataFrame:
    """One-hot encode ``features`` and append the indicators to ``df``.

    Args:
        df: Frame containing every column named in ``features``.
        features: Columns to replace with 0/1 indicator columns. The default
            list is only read, never mutated.

    Returns:
        A new frame with the ``features`` columns removed and one ``int8``
        indicator column per observed (or declared categorical) value appended
        on the right, named ``<feature>_<value>``.
    """
    # Cast the indicators straight to int8. Going through
    # `astype("category").cat.codes` would number the *distinct values* of each
    # indicator column, so a column that is True on every row would collapse
    # to code 0 instead of 1.
    indicators = pd.get_dummies(df[features], columns=features).astype("int8")
    return pd.concat([df.drop(features, axis=1), indicators], axis=1)


X = encode_features(X)
# Indicator columns are derived from the values present in each frame, so the
# test frame can come out with extra or missing columns. Align it to the
# training schema: indicators absent from the test data are added as all-zero
# columns, and indicators that were never seen in training are dropped.
X_test = encode_features(X_test).reindex(columns=X.columns, fill_value=0)

In [204]:
# NOTE(review): this repeats the `label_encoding` definition from the feature
# engineering section and silently shadows it; nothing below this cell calls
# it. Consider keeping a single definition.
def label_encoding(series: pd.Series) -> pd.Series:
    """Encode values as integers ranked by the sort order of their string form.

    Args:
        series: Values to encode.

    Returns:
        A series with the same index where each value is replaced by the
        position of its string form among the sorted distinct strings.
    """
    # Convert every element to a string before ranking.
    text = series.astype(str)
    ordered_labels = sorted(text.unique())
    lookup = dict(zip(ordered_labels, range(len(ordered_labels))))
    return text.map(lookup)

## 5. Feature Normalization

In [205]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Only these two columns are standardised; every other column passes through
# unchanged and keeps its original name.
scaled_features = ["lead_desc_length", "historical_existing_cnt"]

standard_transformer = Pipeline(steps=[("standardize", StandardScaler())])
column_transformer = ColumnTransformer(
    transformers=[("std", standard_transformer, scaled_features)],
    remainder="passthrough",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Fit the scaler on the training features only, then reuse it for the test set.
X = column_transformer.fit_transform(X)
X_test = column_transformer.transform(X_test)

## 6. Outlier Removal

No outlier removal is applied in this notebook.

## 7. Feature Removal

In [206]:
# Raw columns that are removed before modelling. Several of them
# (`customer_country`, `lead_owner`, `response_corporate`) are already
# represented by the engineered features created earlier.
features_to_drop = [
    "customer_country",
    "business_subarea",
    "business_area",
    "customer_idx",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "response_corporate",
    "customer_country.1",
    "customer_position",
    "customer_job",
    "customer_type",
    "expected_timeline",
    "lead_owner",
    "corporate_corporate_num",
]

X = X.drop(columns=features_to_drop)  # type: ignore
X_test = X_test.drop(columns=features_to_drop)  # type: ignore

In [207]:
# Display the final list of model input features as a plain Python list.
X.columns.tolist()

['lead_desc_length',
 'historical_existing_cnt',
 'com_reg_ver_win_rate',
 'id_strategic_ver',
 'it_strategic_ver',
 'idit_strategic_ver',
 'ver_cus',
 'ver_pro',
 'ver_win_rate_x',
 'ver_win_ratio_per_bu',
 'customer_country_frequency',
 'customer_country_target',
 'lead_owner_frequency',
 'lead_owner_target',
 'category',
 'enterprise_Enterprise',
 'enterprise_SMB',
 'business_unit_ID',
 'business_unit_AS',
 'business_unit_IT',
 'business_unit_Solution',
 'business_unit_CM',
 'inquiry_type_O',
 'inquiry_type_Q',
 'inquiry_type_U',
 'bant_submit_0.0',
 'bant_submit_0.25',
 'bant_submit_0.5',
 'bant_submit_0.75',
 'bant_submit_1.0']

# Modeling

In [208]:
from catboost import CatBoostClassifier

# Single classifier instance reused by the cross-validation loop and the final
# fit. It is seeded with the notebook-wide RANDOM_STATE; `verbose=False` is
# presumably there to keep CatBoost's training log out of the cell output.
model = CatBoostClassifier(random_state=RANDOM_STATE, verbose=False)

# K-Fold Cross Validation

In [209]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold cross-validation. `scores` collects the per-fold F1.
#
# NOTE(review): the target/frequency encodings and the scaler were fitted on
# the full training set before this split, so the validation folds are not
# fully unseen and these metrics are optimistic.
kfolds = StratifiedKFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True)
scores = []

for fold_idx, (train_idx, val_idx) in enumerate(kfolds.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Oversample the minority class in the training part of the fold only;
    # the validation part keeps its original class balance.
    sm = SMOTE(sampling_strategy="minority", random_state=RANDOM_STATE)
    X_train, y_train = sm.fit_resample(X_train, y_train)  # type: ignore

    model.fit(X_train, y_train)

    # The predicted labels may come back as strings ("True"/"False"), booleans
    # or integers depending on how the target was typed. Normalise through
    # their string form so the comparison does not silently yield all-False.
    raw_prediction = np.asarray(model.predict(X_val)).ravel().astype(str)
    prediction = np.isin(raw_prediction, ["True", "1"])
    positive_proba = model.predict_proba(X_val)[:, 1]
    fold_f1 = f1_score(y_val, prediction)

    print(f"Accuracy (Fold {fold_idx}): {accuracy_score(y_val, prediction)}")
    print(f"Precision (Fold {fold_idx}): {precision_score(y_val, prediction)}")
    print(f"Recall (Fold {fold_idx}): {recall_score(y_val, prediction)}")
    print(f"F1-Score (Fold {fold_idx}): {fold_f1}")
    print(
        f"ROC-AUC Score (Fold {fold_idx}): "
        + f"{roc_auc_score(y_val, positive_proba)}\n"
    )

    scores.append(fold_f1)

print(f"Cross-Validation Average F1-Score: {np.mean(scores)}")

Accuracy (Fold 0): 0.9391231028667791
Precision (Fold 0): 0.6001615508885298
Recall (Fold 0): 0.765979381443299
F1-Score (Fold 0): 0.6730072463768116
ROC-AUC Score (Fold 0): 0.9667662567568847

Accuracy (Fold 1): 0.94106239460371
Precision (Fold 1): 0.6056118472330475
Recall (Fold 1): 0.8010309278350516
F1-Score (Fold 1): 0.6897470039946738
ROC-AUC Score (Fold 1): 0.9711095490992399

Accuracy (Fold 2): 0.9365092748735244
Precision (Fold 2): 0.5810306198655714
Recall (Fold 2): 0.8020618556701031
F1-Score (Fold 2): 0.6738847986141187
ROC-AUC Score (Fold 2): 0.9664257855026366

Accuracy (Fold 3): 0.9392074198988195
Precision (Fold 3): 0.5968871595330739
Recall (Fold 3): 0.790721649484536
F1-Score (Fold 3): 0.6802660753880266
ROC-AUC Score (Fold 3): 0.9678598544015602

Accuracy (Fold 4): 0.9395395901846699
Precision (Fold 4): 0.5993715632364494
Recall (Fold 4): 0.7865979381443299
F1-Score (Fold 4): 0.6803388319215337
ROC-AUC Score (Fold 4): 0.9649714125576458

Cross-Validation Average F1-Score: 0.6794487912590329

# Create Submission

In [210]:
# Final fit on the full, oversampled training set. The resampled data gets its
# own names so `X` and `y` still hold the original training set and this cell
# (or the cross-validation cell) can be re-run without resampling data that
# was already resampled.
sm = SMOTE(sampling_strategy="minority", random_state=RANDOM_STATE)
X_resampled, y_resampled = sm.fit_resample(X, y)  # type: ignore
model.fit(X_resampled, y_resampled)

# Convert the predicted labels to booleans the same way the cross-validation
# cell must: that cell matches on the string "True", so comparing the labels
# with the integer 1 here would mark every row as False. Going through the
# string form handles string, boolean and integer labels alike.
raw_prediction = np.asarray(model.predict(X_test)).ravel().astype(str)
prediction = np.isin(raw_prediction, ["True", "1"])

df_submission["is_converted"] = prediction
df_submission.to_csv("submission.csv", index=False)
df_submission.head()

Unnamed: 0,id,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,19844,0.0,/ / Brazil,ID,0.073248,47466,End Customer,Enterprise,53.0,,...,LGESP,,1,0,0.001183,0.04984,retail,Electronics & Telco,278,False
1,9738,0.25,400 N State Of Franklin Rd Cloud IT / Johnson...,IT,,5405,End Customer,SMB,,,...,LGEUS,,0,0,1.3e-05,,transportation,Others,437,False
2,8491,1.0,/ / U.A.E,ID,,13597,Specifier/ Influencer,SMB,,,...,LGEGF,less than 3 months,0,0,6e-05,0.131148,hospital & health care,General Hospital,874,False
3,19895,0.5,/ Madison / United States,ID,0.118644,17204,,Enterprise,,,...,LGEUS,more than a year,0,0,0.001183,0.04984,retail,,194,False
4,10465,1.0,/ Sao Paulo / Brazil,ID,0.074949,2329,End Customer,Enterprise,2.0,1.0,...,LGESP,less than 3 months,1,1,0.003079,0.064566,corporate / office,Engineering,167,False


In [None]:
# F1-Score: 0.6794487912590329