# AutoML: TPOT and Hyperopt Sklearn

#### Import Libraries

In [523]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

#### Import Data

In [526]:
df = pd.read_csv('counterfeit_transactions.csv')

#### Data Structure

In [529]:
df.shape

(3000, 20)

In [531]:
df.head()

Unnamed: 0,transaction_id,customer_id,transaction_date,customer_age,customer_location,quantity,unit_price,total_amount,payment_method,shipping_speed,customer_history_orders,discount_applied,discount_percentage,shipping_cost,delivery_time_days,refund_requested,velocity_flag,geolocation_mismatch,device_fingerprint_new,involves_counterfeit
0,TXN_957334,CUST_11907,2024-10-12 03:52:13,22,JP,1,199.66,199.66,Debit Card,Express,17,False,0.0,5.39,26,False,False,False,False,False
1,TXN_246397,CUST_27641,2024-08-18 00:36:57,62,DE,4,116.01,464.03,PayPal,Express,36,False,0.0,8.18,25,False,False,False,False,False
2,TXN_403072,CUST_78628,2024-08-19 22:21:30,75,DE,1,42.04,42.04,Credit Card,Priority,21,False,0.0,8.61,27,False,False,False,False,False
3,TXN_848560,CUST_98579,2025-04-16 18:49:39,56,BR,3,147.69,443.08,Credit Card,Standard,14,True,48.7,17.0,26,False,False,False,False,False
4,TXN_270817,CUST_67519,2024-10-06 13:30:52,19,IN,4,40.7,162.8,PayPal,Priority,48,False,0.0,20.53,1,False,False,True,False,False


In [533]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   transaction_id           3000 non-null   object 
 1   customer_id              3000 non-null   object 
 2   transaction_date         3000 non-null   object 
 3   customer_age             3000 non-null   int64  
 4   customer_location        3000 non-null   object 
 5   quantity                 3000 non-null   int64  
 6   unit_price               3000 non-null   float64
 7   total_amount             3000 non-null   float64
 8   payment_method           3000 non-null   object 
 9   shipping_speed           3000 non-null   object 
 10  customer_history_orders  3000 non-null   int64  
 11  discount_applied         3000 non-null   bool   
 12  discount_percentage      3000 non-null   float64
 13  shipping_cost            3000 non-null   float64
 14  delivery_time_days      

In [535]:
df_categorical = df.select_dtypes(include='object')
df_numerical = df.select_dtypes(include=np.number)

In [537]:
df_categorical.describe()

Unnamed: 0,transaction_id,customer_id,transaction_date,customer_location,payment_method,shipping_speed
count,3000,3000,3000,3000,3000,3000
unique,2997,2953,3000,10,6,4
top,TXN_152377,CUST_23163,2024-10-12 03:52:13,GB,PayPal,Standard
freq,2,3,1,325,771,1099


In [539]:
df_numerical.describe()

Unnamed: 0,customer_age,quantity,unit_price,total_amount,customer_history_orders,discount_percentage,shipping_cost,delivery_time_days
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,48.665667,4.372333,133.781473,431.415573,19.355667,8.5731,14.87478,15.046333
std,17.899889,4.391901,85.403283,331.140587,15.652247,14.689838,5.658677,8.320813
min,18.0,1.0,5.2,6.28,0.0,0.0,5.0,1.0
25%,33.0,2.0,60.145,171.685,4.0,0.0,9.95,8.0
50%,48.0,3.0,112.465,322.585,18.0,0.0,14.78,15.0
75%,64.0,4.0,210.3725,623.16,33.0,13.7,19.765,22.0
max,79.0,19.0,299.59,1865.66,49.0,50.0,24.99,29.0


In [541]:
df.isna().sum()

transaction_id             0
customer_id                0
transaction_date           0
customer_age               0
customer_location          0
quantity                   0
unit_price                 0
total_amount               0
payment_method             0
shipping_speed             0
customer_history_orders    0
discount_applied           0
discount_percentage        0
shipping_cost              0
delivery_time_days         0
refund_requested           0
velocity_flag              0
geolocation_mismatch       0
device_fingerprint_new     0
involves_counterfeit       0
dtype: int64

In [543]:
df.duplicated().sum()

0

#### Data Preprocessing

In [422]:
cols_to_drop = ['transaction_id', 'customer_id']
df = df.drop(cols_to_drop, axis=1)

In [424]:
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

In [426]:
boolean_columns = ['discount_applied',
              'refund_requested',
              'velocity_flag',
              'geolocation_mismatch',
              'device_fingerprint_new',
              'involves_counterfeit'
            ]

for col in boolean_columns:
    df[boolean_columns] = df[boolean_columns].astype(int)
    

In [428]:
categorical_cols = ['customer_location', 'payment_method', 'shipping_speed']

df = pd.get_dummies(df, columns=categorical_cols)

In [430]:
df = df.astype(int)

In [432]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 35 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   transaction_date               3000 non-null   int64
 1   customer_age                   3000 non-null   int64
 2   quantity                       3000 non-null   int64
 3   unit_price                     3000 non-null   int64
 4   total_amount                   3000 non-null   int64
 5   customer_history_orders        3000 non-null   int64
 6   discount_applied               3000 non-null   int64
 7   discount_percentage            3000 non-null   int64
 8   shipping_cost                  3000 non-null   int64
 9   delivery_time_days             3000 non-null   int64
 10  refund_requested               3000 non-null   int64
 11  velocity_flag                  3000 non-null   int64
 12  geolocation_mismatch           3000 non-null   int64
 13  device_fingerprint

In [545]:
X = df.drop('involves_counterfeit',axis=1)
y = df['involves_counterfeit']

In [547]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TPOT

In [371]:
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
import sklearn.metrics
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics
import timeit

In [373]:
tpot = TPOTClassifier(verbosity=3, 
                      scoring="balanced_accuracy", 
                      random_state=23, 
                      periodic_checkpoint_folder="tpot_trnsctns.txt", 
                      n_jobs=-1)
'''
    Args:
        verbosity: How much information to print during the optimization process. 
        scoring: A scorer or list of scorers to be used in the cross-validation process.
        random_state: A seed for reproducability of experiments.
        periodic_checkpoint_folder: Folder to save the population to periodically.
        n_jobs: Number of processes to run in parallel.
        
'''
winning_pipes = []
times = []
scores = []

# run three iterations and time them
for x in range(3):
    start_time = timeit.default_timer()
    tpot.fit(X_train, y_train)
    elapsed = timeit.default_timer() - start_time
    times.append(elapsed)
    winning_pipes.append(tpot.fitted_pipeline_)
    scores.append(tpot.score(X_test, y_test))
    tpot.export('tpot_trnsctns_pipeline.py')

is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier




is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
32 operators have been imported by TPOT.


Version 0.12.2 of tpot is outdated. Version 1.1.0 was released Thursday July 03, 2025.


Optimization Progress:   0%|          | 0/1100 [00:00<?, ?pipeline/s]

_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required by LinearSVC..
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required by SGDClassifier..
_pre_test decorator: _random_mutation_operator: num_test=0 The 'step' parameter of RFE must be an int in the range (0, inf) or a float in the range (0.0, 1.0). Got 1.0 instead..
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required by MultinomialNB..
_pre_test decorator: _random_mutation_operator: num_test=0 FeatureAgglomeration.__init__() got an unexpected keyword argument 'affinity'.
_pre_test decorator: _random_mutation_operator: num_test=0 The 'loss' parameter of SGDClassifier must be a str among {'modified_huber', 'huber', 'squared_hinge', 'epsilon_insensitive', 'perceptron', 'hinge', 'squared_



is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
32 operators have been imported by TPOT.


Version 0.12.2 of tpot is outdated. Version 1.1.0 was released Thursday July 03, 2025.


Optimization Progress:   0%|          | 0/1100 [00:00<?, ?pipeline/s]

_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required by LinearSVC..
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required by SGDClassifier..
_pre_test decorator: _random_mutation_operator: num_test=0 The 'step' parameter of RFE must be an int in the range (0, inf) or a float in the range (0.0, 1.0). Got 1.0 instead..
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required by MultinomialNB..
_pre_test decorator: _random_mutation_operator: num_test=0 FeatureAgglomeration.__init__() got an unexpected keyword argument 'affinity'.
_pre_test decorator: _random_mutation_operator: num_test=0 The 'loss' parameter of SGDClassifier must be a str among {'modified_huber', 'huber', 'squared_hinge', 'epsilon_insensitive', 'perceptron', 'hinge', 'squared_



is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
32 operators have been imported by TPOT.


Version 0.12.2 of tpot is outdated. Version 1.1.0 was released Thursday July 03, 2025.


Optimization Progress:   0%|          | 0/1100 [00:00<?, ?pipeline/s]

_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required by LinearSVC..
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required by SGDClassifier..
_pre_test decorator: _random_mutation_operator: num_test=0 The 'step' parameter of RFE must be an int in the range (0, inf) or a float in the range (0.0, 1.0). Got 1.0 instead..
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required by MultinomialNB..
_pre_test decorator: _random_mutation_operator: num_test=0 FeatureAgglomeration.__init__() got an unexpected keyword argument 'affinity'.
_pre_test decorator: _random_mutation_operator: num_test=0 The 'loss' parameter of SGDClassifier must be a str among {'modified_huber', 'huber', 'squared_hinge', 'epsilon_insensitive', 'perceptron', 'hinge', 'squared_

In [496]:
print('Times:', times)
print('Scores:', scores)   
print('Winning pipelines:', winning_pipes)

Times: [332.3947922079824, 322.77182475011796, 324.0405525842216]
Scores: [0.9966887417218543, 0.9966887417218543, 0.9966887417218543]
Winning pipelines: [Pipeline(steps=[('xgbclassifier',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None, device=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, feature_weights=None,
                               gamma=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=0.01,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=

In [611]:
model = winning_pipes[0]
model

0,1,2
,steps,"[('xgbclassifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [629]:
model_1 = winning_pipes[1]
model_1

0,1,2
,steps,"[('xgbclassifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [631]:
model_2 = winning_pipes[2]
model_2

0,1,2
,steps,"[('xgbclassifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False
