In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

train = pd.read_csv('data/train.csv', index_col=0)

X = train.drop('target', axis=1)
y = train.target

### PCA

from sklearn.decomposition import PCA

pca = PCA(
    copy=True, iterated_power=7, n_components=100, 
    random_state=None, svd_solver='auto', tol=0.0, whiten=False
)
X = pca.fit_transform(X)

### Scaler

from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()
X = minmax.fit_transform(X)
y = np.log1p(y)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

def rmsle_metric(y_test, y_pred) : 
    assert len(y_test) == len(y_pred)
    y_test = np.exp(y_test)-1
    y_pred = np.exp(y_pred)-1
    rmsle = np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_test))**2))

In [4]:
from tpot import TPOTRegressor


model = TPOTRegressor(
    generations=10,
    population_size=100,
    n_jobs=4,
    verbosity=2,
    cv=3,
    early_stop=3
)
model.fit(X_train, y_train.values)

Optimization Progress:  18%|█▊        | 203/1100 [26:04<2:09:11,  8.64s/pipeline]

Generation 1 - Current best internal CV score: -2.322333776687301


Optimization Progress:  28%|██▊       | 303/1100 [32:06<34:42,  2.61s/pipeline]  

Generation 2 - Current best internal CV score: -2.290753392088593


Optimization Progress:  37%|███▋      | 403/1100 [40:47<1:22:16,  7.08s/pipeline]

Generation 3 - Current best internal CV score: -2.290753392088593


Optimization Progress:  46%|████▌     | 503/1100 [48:39<34:42,  3.49s/pipeline]  

Generation 4 - Current best internal CV score: -2.2819677619356895


Optimization Progress:  55%|█████▍    | 604/1100 [1:03:48<26:41,  3.23s/pipeline]  

Generation 5 - Current best internal CV score: -2.2819677619356895


Optimization Progress:  64%|██████▍   | 704/1100 [1:12:11<30:19,  4.59s/pipeline]  

Generation 6 - Current best internal CV score: -2.2819677619356895


Optimization Progress:  73%|███████▎  | 808/1100 [1:30:44<46:11,  9.49s/pipeline]  

Generation 7 - Current best internal CV score: -2.2621096327377646


Optimization Progress:  83%|████████▎ | 909/1100 [1:46:04<28:50,  9.06s/pipeline]  

Generation 8 - Current best internal CV score: -2.2621096327377646


Optimization Progress:  92%|█████████▏| 1010/1100 [2:05:31<10:07,  6.75s/pipeline] 

Generation 9 - Current best internal CV score: -2.2621096327377646


                                                                                  

Generation 10 - Current best internal CV score: -2.2621096327377646

Best pipeline: ExtraTreesRegressor(PolynomialFeatures(input_matrix, degree=2, include_bias=False, interaction_only=False), bootstrap=False, max_features=0.3, min_samples_leaf=2, min_samples_split=15, n_estimators=100)


TPOTRegressor(config_dict={'sklearn.ensemble.GradientBoostingRegressor': {'max_features': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]), 'max_depth': range(1, 11), 'loss': ['ls', 'lad', 'huber', 'quantile'], 'subsample': arra....045, 0.046, 0.047, 0.048, 0.049]), 'score_func': {'sklearn.feature_selection.f_regression': None}}},
       crossover_rate=0.1, cv=3, disable_update_check=False, early_stop=3,
       generations=10, max_eval_time_mins=5, max_time_mins=None,
       memory=None, mutation_rate=0.9, n_jobs=4, offspring_size=100,
       periodic_checkpoint_folder=None, population_size=100,
       random_state=None, scoring=None, subsample=1.0, verbosity=2,
       warm_start=False)

In [5]:
def rmsle_metric(y_test, y_pred) : 
    assert len(y_test) == len(y_pred)
    y_test = np.exp(y_test)-1
    y_pred = np.exp(y_pred)-1
    rmsle = np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_test))**2))
    return rmsle

y_pred = model.predict(X_test)
print(rmsle_metric(y_test, y_pred))

1.4759588230454297


In [6]:
from sklearn.externals import joblib

joblib.dump(model.fitted_pipeline_, 'PCA_y_log_TPOT_1_475.pkl')

['PCA_y_log_TPOT_1_475.pkl']