# TPOT AutoML

## Import Library

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot
import yaml
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from tpot import TPOTRegressor




## Read config

In [2]:
with open('../params.yaml') as conf_file:
    config = yaml.safe_load(conf_file)

## Read Clean Data

In [3]:
clean_data_df = pd.read_csv('../' + config["featurize"]["clean_data"])

In [4]:
clean_data_df.shape

(85409, 9)

In [None]:
clean_data_df.sample(5)

## Data Split

In [6]:
clean_data_df.shape

(85409, 9)

In [7]:
df = pd.get_dummies(clean_data_df, columns=['IP_ADDRESS', 'NAME', 'PAIR_NAME'])

In [8]:
df.shape

(85409, 60)

In [None]:
df.sample(5)

In [10]:
X = df.drop("VEHICLES", axis=1)
y = df["VEHICLES"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
X_test.sample(5)

In [13]:
print (X_train.shape, y_train.shape)

(59786, 59) (59786,)


In [14]:
print (X_test.shape, y_test.shape)

(25623, 59) (25623,)


## TPOT AutoML Experiments

In [15]:
# define model evaluation
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=3, random_state=42)
# define search
tpot = TPOTRegressor(generations=10, population_size=20, #max_time_mins = 60, 
                     early_stop = 5,
                     cv=cv, scoring='neg_root_mean_squared_error', 
                     verbosity=2, random_state=1, n_jobs=-1) 

tpot.fit(X_train, y_train)

mae = tpot.score(X_test, y_test)

Optimization Progress:   0%|          | 0/220 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -164.04555322887072

Generation 2 - Current best internal CV score: -162.30070642198348

Generation 3 - Current best internal CV score: -162.30070642198348

Generation 4 - Current best internal CV score: -162.30070642198348

Generation 5 - Current best internal CV score: -161.6008085317183

Generation 6 - Current best internal CV score: -161.6008085317183

Generation 7 - Current best internal CV score: -160.34426157027337

Generation 8 - Current best internal CV score: -160.34426157027337

Generation 9 - Current best internal CV score: -160.3433949128129

Generation 10 - Current best internal CV score: -160.3433949128129

Best pipeline: XGBRegressor(DecisionTreeRegressor(input_matrix, max_depth=2, min_samples_leaf=7, min_samples_split=13), learning_rate=0.5, max_depth=4, min_child_weight=6, n_estimators=100, n_jobs=1, objective=reg:squarederror, subsample=0.9500000000000001, verbosity=0)





In [16]:
print("MAE: %.3f" % -mae)

MAE: 162.733


In [17]:
tpot.export('tpot_evi_dentisity_pipeline.py')

## TPOT Exported Pipeline

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeRegressor
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor
from tpot.export_utils import set_param_recursive

# Average CV score on the training set was: -160.3433949128129
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=DecisionTreeRegressor(max_depth=2, min_samples_leaf=7, min_samples_split=13)),
    XGBRegressor(learning_rate=0.5, max_depth=4, min_child_weight=6, n_estimators=100, n_jobs=1, objective="reg:squarederror", subsample=0.9500000000000001, verbosity=0)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1)

exported_pipeline.fit(X_train, y_train)
results = exported_pipeline.predict(X_test)






In [19]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test, results))

162.7325180643