# TPOT AutoML

## Import Library

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot
import yaml
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from tpot import TPOTRegressor

## Read config

In [5]:
with open('../params.yaml') as conf_file:
    config = yaml.safe_load(conf_file)

## DuckDB Registration

In [8]:
import duckdb  
conn = duckdb.connect(':memory:')

In [9]:
joined_parquet_dataset = '../' + config["data_load"]["joined_parquet"]
conn.sql('CREATE TABLE evi_data AS SELECT * FROM read_parquet(\'' + joined_parquet_dataset + '\')')

In [None]:
conn.execute('''  
SELECT 
    DISCOVER_DAY,
    DISCOVER_HOUR,
    IP_ADDRESS,
    LONGITUDE,
    LATITUDE,
    NAME,
    PAIR_NAME
FROM
   evi_data
LIMIT 5
''').df()

In [11]:
conn.execute('''  
SELECT 
    Min(DISCOVER_YEAR),
    Max(DISCOVER_YEAR),
    Min(DISCOVER_MONTH),
    Max(DISCOVER_MONTH),
    Min(DISCOVER_DAY),    
    Max(DISCOVER_DAY),
FROM
   evi_data
LIMIT 5
''').df()

Unnamed: 0,"min(""DISCOVER_YEAR"")","max(""DISCOVER_YEAR"")","min(""DISCOVER_MONTH"")","max(""DISCOVER_MONTH"")","min(""DISCOVER_DAY"")","max(""DISCOVER_DAY"")"
0,2022,2022,12,12,15,21


In [12]:
clean_data_df = conn.execute('''  
SELECT 
    DISCOVER_DAY,
    DISCOVER_HOUR,
    IP_ADDRESS,
    LONGITUDE,
    LATITUDE,
    NAME,
    PAIR_NAME,
    COUNT(REG_NO) AS VEHICLES
FROM
   evi_data
GROUP BY DISCOVER_DAY, DISCOVER_HOUR, IP_ADDRESS, LONGITUDE, LATITUDE, NAME, PAIR_NAME

''').df()

In [None]:
clean_data_df.head()

## Data Split

In [64]:
clean_data_df.shape

(2464, 8)

In [65]:
df = pd.get_dummies(clean_data_df, columns=['IP_ADDRESS', 'NAME', 'PAIR_NAME'])

In [66]:
df.shape

(2464, 46)

In [None]:
df.sample(5)

In [68]:
X = df.drop("VEHICLES", axis=1)
y = df["VEHICLES"]

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
X_test.sample(5)

In [70]:
print (X_train.shape, y_train.shape)

(1724, 45) (1724,)


In [82]:
print (X_test.shape, y_test.shape)

(740, 45) (740,)


## TPOT AutoML Experiments

In [30]:
# define model evaluation
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=3, random_state=42)
# define search
tpot = TPOTRegressor(generations=10, population_size=20, #max_time_mins = 60, 
                     early_stop = 5,
                     cv=cv, scoring='neg_root_mean_squared_error', 
                     verbosity=2, random_state=1, n_jobs=-1) 

tpot.fit(X_train, y_train)

mae = tpot.score(X_test, y_test)

Optimization Progress:   0%|          | 0/220 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -107.90704337704693

Generation 2 - Current best internal CV score: -107.90704337704693

Generation 3 - Current best internal CV score: -107.86131682785953

Generation 4 - Current best internal CV score: -107.86131682785953

Generation 5 - Current best internal CV score: -94.32137105129932

Generation 6 - Current best internal CV score: -94.32137105129932

Generation 7 - Current best internal CV score: -94.32137105129932

Generation 8 - Current best internal CV score: -94.32137105129932

Generation 9 - Current best internal CV score: -93.12628778853252

Generation 10 - Current best internal CV score: -92.39597470526708

Best pipeline: XGBRegressor(MinMaxScaler(LinearSVR(input_matrix, C=10.0, dual=True, epsilon=0.0001, loss=squared_epsilon_insensitive, tol=0.001)), learning_rate=0.1, max_depth=9, min_child_weight=11, n_estimators=100, n_jobs=1, objective=reg:squarederror, subsample=0.7500000000000001, verbosity=0)




In [96]:
print("MAE: %.3f" % -mae)

MAE: 89.093


In [33]:
tpot.export('tpot_evi_dentisity_pipeline.py')

## TPOT Exported Pipeline

In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor
from tpot.export_utils import set_param_recursive


# Average CV score on the training set was: -92.39597470526708
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LinearSVR(C=10.0, dual=True, epsilon=0.0001, loss="squared_epsilon_insensitive", tol=0.001)),
    MinMaxScaler(),
    XGBRegressor(learning_rate=0.1, max_depth=9, min_child_weight=11, n_estimators=100, n_jobs=1, objective="reg:squarederror", subsample=0.7500000000000001, verbosity=0)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1)

exported_pipeline.fit(X_train, y_train)
results = exported_pipeline.predict(X_test)



In [36]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test, results))

89.0931600310122

In [84]:
single_df = X_test.iloc[[700]].reset_index(drop=True)

In [85]:
single_df.shape

(1, 45)

In [86]:
df.shape

(2464, 46)

In [87]:
single_df.head()

Unnamed: 0,DISCOVER_DAY,DISCOVER_HOUR,LONGITUDE,LATITUDE,IP_ADDRESS_192.168.250.10,IP_ADDRESS_192.168.250.11,IP_ADDRESS_192.168.250.18,IP_ADDRESS_192.168.250.19,IP_ADDRESS_192.168.250.2,IP_ADDRESS_192.168.250.26,IP_ADDRESS_192.168.250.27,IP_ADDRESS_192.168.250.3,IP_ADDRESS_192.168.250.34,IP_ADDRESS_192.168.250.35,IP_ADDRESS_192.168.250.42,IP_ADDRESS_192.168.250.43,IP_ADDRESS_192.168.250.50,IP_ADDRESS_192.168.250.51,IP_ADDRESS_192.168.250.58,IP_ADDRESS_192.168.250.59,IP_ADDRESS_192.168.250.67,NAME_To Aminbazar New,NAME_To Bijoy Nagar New,NAME_To Bishaw Road New,NAME_To Dhanmondi New,NAME_To Kakoli New,NAME_To Kakrail Mosque New,NAME_To Kollyanpur New,NAME_To Mohakhali Circle New,NAME_To Motsho Bhaban New,NAME_To New Market New,NAME_To Notun Baazar New,NAME_To Paltan New,NAME_To Shahbag Circle New,NAME_To Tatibazar New,NAME_To Zero Point Circle New,PAIR_NAME_Buriganga Bridge,PAIR_NAME_Gabtoli Mazar Road,PAIR_NAME_Gulshan 2,PAIR_NAME_Kakrail,PAIR_NAME_Kuril Bishawroad,PAIR_NAME_Mohakhali,PAIR_NAME_Science Lab,PAIR_NAME_Shahbag Circle,PAIR_NAME_Zero Point
0,18,8,90.421319,23.809342,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [88]:
exported_pipeline.predict(single_df)



array([770.457], dtype=float32)

In [93]:
y_test.iloc[[700]]

1503    708
Name: VEHICLES, dtype: int64

In [92]:
X_test.iloc[[700]]

Unnamed: 0,DISCOVER_DAY,DISCOVER_HOUR,LONGITUDE,LATITUDE,IP_ADDRESS_192.168.250.10,IP_ADDRESS_192.168.250.11,IP_ADDRESS_192.168.250.18,IP_ADDRESS_192.168.250.19,IP_ADDRESS_192.168.250.2,IP_ADDRESS_192.168.250.26,IP_ADDRESS_192.168.250.27,IP_ADDRESS_192.168.250.3,IP_ADDRESS_192.168.250.34,IP_ADDRESS_192.168.250.35,IP_ADDRESS_192.168.250.42,IP_ADDRESS_192.168.250.43,IP_ADDRESS_192.168.250.50,IP_ADDRESS_192.168.250.51,IP_ADDRESS_192.168.250.58,IP_ADDRESS_192.168.250.59,IP_ADDRESS_192.168.250.67,NAME_To Aminbazar New,NAME_To Bijoy Nagar New,NAME_To Bishaw Road New,NAME_To Dhanmondi New,NAME_To Kakoli New,NAME_To Kakrail Mosque New,NAME_To Kollyanpur New,NAME_To Mohakhali Circle New,NAME_To Motsho Bhaban New,NAME_To New Market New,NAME_To Notun Baazar New,NAME_To Paltan New,NAME_To Shahbag Circle New,NAME_To Tatibazar New,NAME_To Zero Point Circle New,PAIR_NAME_Buriganga Bridge,PAIR_NAME_Gabtoli Mazar Road,PAIR_NAME_Gulshan 2,PAIR_NAME_Kakrail,PAIR_NAME_Kuril Bishawroad,PAIR_NAME_Mohakhali,PAIR_NAME_Science Lab,PAIR_NAME_Shahbag Circle,PAIR_NAME_Zero Point
1503,18,8,90.421319,23.809342,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [94]:
X.iloc[[1503]]

Unnamed: 0,DISCOVER_DAY,DISCOVER_HOUR,LONGITUDE,LATITUDE,IP_ADDRESS_192.168.250.10,IP_ADDRESS_192.168.250.11,IP_ADDRESS_192.168.250.18,IP_ADDRESS_192.168.250.19,IP_ADDRESS_192.168.250.2,IP_ADDRESS_192.168.250.26,IP_ADDRESS_192.168.250.27,IP_ADDRESS_192.168.250.3,IP_ADDRESS_192.168.250.34,IP_ADDRESS_192.168.250.35,IP_ADDRESS_192.168.250.42,IP_ADDRESS_192.168.250.43,IP_ADDRESS_192.168.250.50,IP_ADDRESS_192.168.250.51,IP_ADDRESS_192.168.250.58,IP_ADDRESS_192.168.250.59,IP_ADDRESS_192.168.250.67,NAME_To Aminbazar New,NAME_To Bijoy Nagar New,NAME_To Bishaw Road New,NAME_To Dhanmondi New,NAME_To Kakoli New,NAME_To Kakrail Mosque New,NAME_To Kollyanpur New,NAME_To Mohakhali Circle New,NAME_To Motsho Bhaban New,NAME_To New Market New,NAME_To Notun Baazar New,NAME_To Paltan New,NAME_To Shahbag Circle New,NAME_To Tatibazar New,NAME_To Zero Point Circle New,PAIR_NAME_Buriganga Bridge,PAIR_NAME_Gabtoli Mazar Road,PAIR_NAME_Gulshan 2,PAIR_NAME_Kakrail,PAIR_NAME_Kuril Bishawroad,PAIR_NAME_Mohakhali,PAIR_NAME_Science Lab,PAIR_NAME_Shahbag Circle,PAIR_NAME_Zero Point
1503,18,8,90.421319,23.809342,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [95]:
y.iloc[[1503]]

1503    708
Name: VEHICLES, dtype: int64