In [1]:
import os
import re
import random
import unicodedata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from scripts.data_cleaning import clean_data

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [4]:
SEED = 1971

In [5]:
# clean data 
train.set_index("id", inplace=True)
test.set_index("id", inplace=True)
cleaned_train_orig = clean_data(train)
cleaned_test_orig = clean_data(test)

In [6]:
# chnage pass to cat col
cleaned_test_orig["passport"] = cleaned_test_orig["passport"].astype('object')
cleaned_train_orig["passport"] = cleaned_train_orig["passport"].astype('object')

In [7]:
cleaned_train_orig.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3489 entries, 0 to 3488
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        3389 non-null   float64
 1   typeof_contact             3483 non-null   object 
 2   city_tier                  3489 non-null   int64  
 3   duration_of_pitch          3368 non-null   float64
 4   occupation                 3489 non-null   object 
 5   gender                     3489 non-null   object 
 6   number_of_person_visiting  3489 non-null   float64
 7   number_of_followups        3456 non-null   float64
 8   product_pitched            3489 non-null   object 
 9   preferred_property_star    3489 non-null   float64
 10  number_of_trips            3467 non-null   float64
 11  passport                   3489 non-null   object 
 12  pitch_satisfaction_score   3489 non-null   int64  
 13  designation                3489 non-null   objec

In [8]:
CAT_COL  = cleaned_test_orig.select_dtypes(include=['object', 'category']).columns
NUM_COL = [col for col in cleaned_test_orig.columns if col not in CAT_COL]

In [9]:
cleaned_train_orig[CAT_COL].isna().sum()

typeof_contact     6
occupation         0
gender             0
product_pitched    0
passport           0
designation        0
marital_status     0
car_ownership      0
children           0
dtype: int64

In [10]:
cleaned_test_orig[CAT_COL].isna().sum()

typeof_contact     12
occupation          0
gender              0
product_pitched     0
passport            0
designation         0
marital_status      0
car_ownership       0
children            0
dtype: int64

In [11]:
# create new category for type of contact
cleaned_test_orig.fillna("unknown", inplace = True)
cleaned_train_orig.fillna("unknown", inplace = True)

In [12]:
target_name = "prod_taken"

In [22]:
CAT_COL = CAT_COL.tolist()

AttributeError: 'list' object has no attribute 'tolist'

In [23]:
CAT_COL

['typeof_contact',
 'occupation',
 'gender',
 'product_pitched',
 'passport',
 'designation',
 'marital_status',
 'car_ownership',
 'children',
 'prod_taken']

In [27]:
CAT_COL

['typeof_contact',
 'occupation',
 'gender',
 'product_pitched',
 'passport',
 'designation',
 'marital_status',
 'car_ownership',
 'children']

In [24]:
# data preparation 
data = cleaned_train_orig[CAT_COL]
# y = cleaned_train_orig["prod_taken"]

In [32]:
import h2o
h2o.init()
from h2o.estimators import H2OTargetEncoderEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

seed = 1971
#Import the  dataset
data = h2o.H2OFrame(data)
# Set prod_taken column as a factor
data['prod_taken'] = data['prod_taken'].asfactor()
response='prod_taken'

# Split the dataset into train and test
train, test = data.split_frame(ratios = [.8], seed = seed)

# Choose which columns to encode
encoded_columns = CAT_COL

# For k_fold strategy we need to provide fold column
fold_column = "kfold_column"
train[fold_column] = train.kfold_column(n_folds=5, seed=seed)

# Train a TE model
titanic_te = H2OTargetEncoderEstimator(fold_column=fold_column,
                                       data_leakage_handling="k_fold",
                                       blending=True,
                                       inflection_point=1,
                                       smoothing=1,
                                       noise=0.0,     # In general, the less data you have the more regularization you need
                                       seed=seed)

titanic_te.train(x=encoded_columns,
                 y=response,
                 training_frame=train)

# New target encoded train and test sets
train_te = titanic_te.transform(frame=train, as_training=True)
test_te = titanic_te.transform(frame=test, noise=0)

gbm_with_te=H2OGradientBoostingEstimator(fold_column=fold_column,
                                         model_id="gbm_with_te")

# Training is based on training data with early stopping based on xval performance
# x_with_te = ["pclass", "sex", "age", "sibsp", "parch", "fare", "cabin_te", "embarked_te", "home.dest_te"]
gbm_with_te.train(x=CAT_COL, y=response, training_frame=train_te)

# To prevent overly optimistic results ( overfitting to xval metrics ) metric is computed on yet unseen test split
my_gbm_metrics = gbm_with_te.model_performance(test_te)
auc_with_te = my_gbm_metrics.auc()

print("auc_with_te", auc_with_te)

# Train a GBM estimator
gbm_baseline=H2OGradientBoostingEstimator(fold_column=fold_column,
                                          model_id="gbm_baseline")

# x_baseline = ["pclass", "sex", "age", "sibsp", "parch", "fare", "cabin", "embarked", "home.dest"]
gbm_baseline.train(x=CAT_COL, y=response, training_frame=train)

# Measuring performance on a test split
gbm_baseline_metrics = gbm_baseline.model_performance(test)
auc_baseline = gbm_baseline_metrics.auc()

print(auc_baseline)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,17 mins 02 secs
H2O_cluster_timezone:,Asia/Tokyo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.4
H2O_cluster_version_age:,1 month and 7 days
H2O_cluster_name:,H2O_from_python_bulbul_xpnwy7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.506 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


targetencoder Model Build progress: |████████████████████████████████████████████| (done) 100%
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
auc_with_te 0.8069209167497056
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
0.8069209167497056
