#### Automatic machine learning tool (TPOT) using genetic programming to optimise the model pipeline

In [5]:
import pickle
import pandas as pd
import numpy as np
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split, LeaveOneOut
import sklearn

In [3]:
# Read the sampled per-gridcell data frame back from its pickle on disk.
# NOTE(review): unpickling can execute arbitrary code, so only load pickles
# that come from a trusted source.
gridcell_pickle_path = '/nfs/see-fs-02_users/earlacoa/emulator/dfs_gridcell_sample.pickle'
with open(gridcell_pickle_path, 'rb') as ds:
    df_gridcells = pickle.load(ds)

# Last expression of the cell: show the first rows as a quick sanity check.
df_gridcells.head()

Unnamed: 0,lat,lon,RES,IND,TRA,AGR,POW,PM2_5_DRY
0,30.5,114.25,0.23229,0.20507,0.29904,0.97374,0.55184,82.050903
1,30.5,114.25,1.1212,1.1098,0.8701,1.2222,1.252,260.18296
2,30.5,114.25,0.79843,1.2946,0.090709,0.75401,0.1942,209.802636
3,30.5,114.25,0.34445,1.4224,0.75112,1.2409,1.2113,164.002287
4,30.5,114.25,0.62562,1.3589,1.0583,0.04688,0.64196,196.083809


In [2]:
"""
Custom TPOT config based on regressor.py
- Including Gaussian Process Regressor (Matern kernel over a range of nu and length_scale values)
- Including additional preprocessors
    - PowerTransformer, QuantileTransformer
- Excluding other regressors
- Excluding dimensionality reduction
    - FeatureAgglomeration, PCA, FastICA
- Excluding some feature selectors (SelectFromModel is kept)
    - VarianceThreshold, SelectPercentile, SelectFwe
"""

# The TPOT search space is assembled from three sections (regressor,
# preprocessors, selectors) and merged at the end in that order.
# Every key is the dotted import path of an operator; every value maps that
# operator's hyperparameter names to the candidate values TPOT may pick from.
# An empty dict means the operator is used with its default arguments only.

# Regressor: a Gaussian process is the only top-level regressor entry.
_regressor_space = {
    'sklearn.gaussian_process.GaussianProcessRegressor': {
        # Nested entry: the kernel is itself an operator with its own options.
        'kernel': {
            'sklearn.gaussian_process.kernels.Matern': {
                'nu': [0.5, 1.5, 2.5],
                'length_scale': np.arange(0.1, 5, 0.1),
            },
        },
        'normalize_y': [True, False],
        'n_restarts_optimizer': np.arange(5, 300, 5),
    },
}

# Preprocessors: scalers, transformers and feature-expansion steps.
_preprocessor_space = {
    'sklearn.preprocessing.PowerTransformer': {},
    'sklearn.preprocessing.QuantileTransformer': {
        'output_distribution': ['uniform', 'normal'],
    },
    'sklearn.preprocessing.Binarizer': {
        'threshold': np.arange(0.0, 1.01, 0.05),
    },
    'sklearn.preprocessing.MaxAbsScaler': {},
    'sklearn.preprocessing.MinMaxScaler': {},
    'sklearn.preprocessing.Normalizer': {
        'norm': ['l1', 'l2', 'max'],
    },
    'sklearn.kernel_approximation.Nystroem': {
        'kernel': [
            'rbf', 'cosine', 'chi2', 'laplacian', 'polynomial',
            'poly', 'linear', 'additive_chi2', 'sigmoid',
        ],
        'gamma': np.arange(0.0, 1.01, 0.05),
        'n_components': range(1, 11),
    },
    'sklearn.preprocessing.PolynomialFeatures': {
        'degree': [2],
        'include_bias': [False],
        'interaction_only': [False],
    },
    'sklearn.kernel_approximation.RBFSampler': {
        'gamma': np.arange(0.0, 1.01, 0.05),
    },
    'sklearn.preprocessing.RobustScaler': {},
    'sklearn.preprocessing.StandardScaler': {},
    'tpot.builtins.ZeroCount': {},
    'tpot.builtins.OneHotEncoder': {
        'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25],
        'sparse': [False],
        'threshold': [10],
    },
}

# Selectors: model-based feature selection driven by an extra-trees regressor.
_selector_space = {
    'sklearn.feature_selection.SelectFromModel': {
        'threshold': np.arange(0, 1.01, 0.05),
        'estimator': {
            'sklearn.ensemble.ExtraTreesRegressor': {
                'n_estimators': [100],
                'max_features': np.arange(0.05, 1.01, 0.05),
            },
        },
    },
}

# Merge the sections; key order is regressor, preprocessors, selectors.
tpot_config = {**_regressor_space, **_preprocessor_space, **_selector_space}

In [None]:
# Train one TPOT-optimised emulator per unique (lat, lon) grid cell and export
# each winning pipeline as a standalone Python script.

# Unique grid cells, computed once and reused for lats/lons and the loop below.
unique_gridcells = df_gridcells[['lat', 'lon']].drop_duplicates()
lats = unique_gridcells['lat'].values
lons = unique_gridcells['lon'].values

features = ['RES', 'IND', 'TRA', 'AGR', 'POW']
target = 'PM2_5_DRY'

# `index` numbers the grid cells in order of first appearance. It is needed for
# the export filename; previously it was never defined, so the export raised a
# NameError only after the whole optimisation had already run.
for index, gridcell in enumerate(unique_gridcells.values):
    lat, lon = gridcell
    print(f"grid cell {index}: lat={lat}, lon={lon}")

    # Select this grid cell's rows with a single combined mask, rather than
    # chaining .loc calls with a mask built on the unfiltered frame.
    df_gridcell = df_gridcells.loc[
        (df_gridcells.lat == lat) & (df_gridcells.lon == lon)
    ]

    X = df_gridcell[features].values
    y = df_gridcell[target].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=123
    )

    # Leave-one-out CV (cv = number of training samples) produced all-NaN
    # scores, most likely because R^2 is not defined on a single-sample
    # validation fold. Using 5-fold CV for the time being.
    cv = 5

    emulator = TPOTRegressor(
        generations=200,
        population_size=200,
        verbosity=2,
        random_state=123,
        use_dask=True,
        n_jobs=-1,
        scoring='r2',
        config_dict=tpot_config,
        cv=cv)

    emulator.fit(X_train, y_train)
    print('CV score: the final one of the above')
    print(f"test score: {emulator.score(X_test, y_test):.4f}")

    # One exported pipeline per grid cell, keyed by the grid cell's index.
    emulator.export(
        f'/nfs/see-fs-02_users/earlacoa/emulator/tpot_emulator_pipeline_{target}_{index}.py'
    )

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=40200.0, style=ProgressStyle(…

Generation 1 - Current best internal CV score: -18.65649390449532
Generation 2 - Current best internal CV score: -18.65649390449532
Generation 3 - Current best internal CV score: -18.65649390449532
Generation 4 - Current best internal CV score: -18.605000958751187
Generation 5 - Current best internal CV score: -18.580005584997526
Generation 6 - Current best internal CV score: -18.52065496851595
