In [1]:
%reset -sf

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
     |████████████████████████████████| 24.8 MB 1.8 MB/s             
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.2
    Uninstalling scikit-learn-0.23.2:
      Successfully uninstalled scikit-learn-0.23.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pdpbox 0.2.1 requires matplotlib==3.1.1, but you have matplotlib 3.5.0 which is incompatible.
hypertools 0.7.0 requires scikit-learn!=0.22,<0.24,>=0.19.1, but you have scikit-learn 1.0.2 which is incompatible.[0m
Successfully installed scikit-learn-1.0.2


In [4]:
# Patch Xeon Intel OneAPI Scikit accelerator
!pip install scikit-learn-intelex
from sklearnex import patch_sklearn
patch_sklearn()

Collecting scikit-learn-intelex
  Downloading scikit_learn_intelex-2021.5.1-py37-none-manylinux1_x86_64.whl (69 kB)
     |████████████████████████████████| 69 kB 315 kB/s            
Collecting daal4py==2021.5.1
  Downloading daal4py-2021.5.1-py37-none-manylinux1_x86_64.whl (22.5 MB)
     |████████████████████████████████| 22.5 MB 1.8 MB/s            
[?25hCollecting daal==2021.5.1
  Downloading daal-2021.5.1-py2.py3-none-manylinux1_x86_64.whl (284.3 MB)
     |████████████████████████████████| 284.3 MB 1.8 kB/s            
Collecting tbb==2021.*
  Downloading tbb-2021.4.0-py2.py3-none-manylinux1_x86_64.whl (4.0 MB)
     |████████████████████████████████| 4.0 MB 39.9 MB/s            
Installing collected packages: tbb, daal, daal4py, scikit-learn-intelex
Successfully installed daal-2021.5.1 daal4py-2021.5.1 scikit-learn-intelex-2021.5.1 tbb-2021.4.0


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [5]:
from numpy import array, linspace, mean, column_stack
from pandas import read_parquet, DataFrame, Series, to_datetime, concat
from pathlib import Path
from itertools import cycle
from random import randint
from gc import collect
from joblib import dump, load

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, classification_report
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import MinMaxScaler

from deap import creator as ga_cr, base as ga_b, algorithms as ga_algo, tools as ga_t

In [6]:
p = Path().cwd()
p = p.parent
p = p / 'input' / 'kaggle-pog-series-s01e01'

train = read_parquet(p / 'train.parquet')
test = read_parquet(p / 'test.parquet')

In [7]:
def clean_train(df):
    df['publishedAt'] = to_datetime(df['publishedAt'], errors='coerce').dt.tz_localize(None)
    df['trending_date'] = to_datetime(df['trending_date'], errors='coerce').dt.tz_localize(None) 
    return df
    
train = clean_train(train)
test = clean_train(test)

In [8]:
def feat_eng(df):
    df['t_delta'] = (df['trending_date'] - df['publishedAt']).dt.seconds
    return df

train = feat_eng(train)
test = feat_eng(test)

In [9]:
class GA_Scikit():
    def __init__(self, 
                 estimator, 
                 #
                 params, 
                 eval_func, 
                 eval_weights, 
                 #
                 train_df, 
                 #valid_df, 
                 score, 
                 #
                 sel_tournsize=2, 
                 cx_uniform_prob=0.5, 
                 mut_shuffle_idx_prob=0.1, 
                 n_pop=20, 
                 n_gen=10, 
                 n_hof=1, 
                 cx_prob=0.5, 
                 mut_prob=0.1, 
                 n_jobs=4  # not good with HistGradBoosReg...
                ):
        
        self.est = estimator
        self.params = params
        self.eval_func = eval_func
        self.eval_weights = eval_weights
        #
        self.train_df = train_df
        #self.valid_df = valid_df
        self.score = score
        #
        self.sel_tournsize = sel_tournsize
        self.cx_uniform_prob = cx_uniform_prob
        self.mut_shuffle_idx_prob = mut_shuffle_idx_prob
        self.n_pop = n_pop
        self.n_gen = n_gen
        self.n_hof = n_hof
        self.cx_prob = cx_prob
        self.mut_prob = mut_prob
        self.n_jobs = n_jobs
        
        self._pad_params()
        self._create_fitness_and_indiv()
        self._register_indiv_and_pop_generators()
        self._register_eval_func()
        self._register_selection_crossover_mutation_methods()
        
    def _pad_params(self):
        """Pad params for crossover shuffle idx method"""
        assert isinstance(self.params, dict), 'Params must be a dict, i.e. estimator.get_params()'
        params_count = {k: len(v) for k,v in self.params.items()}
        max_length, max_key = -99, ''
        for k, v in params_count.items():
            if v <= max_length:
                continue
            else:
                max_key = k
                max_length = v
        assert isinstance(max_length, int), 'The max length between all params must be an int'
        # cycle through params for max length param, otherwise infinite cycle
        values_padded = (cycle(v) if k!=max_key else v for k,v in self.params.items())
        values_padded = zip(*values_padded)  # ('a', 1, 14), ('b', 2, 16), ('c', 3, 16) ...
        values_padded = zip(*values_padded)  # ('a', 'b', 'c'), (1, 2, 3), (14, 15, 16)...
        padded_params = {}
        for k, v in zip(self.params, values_padded):
            padded_params[k] = v
        self.padded_params = padded_params
        
    def _create_fitness_and_indiv(self):
        """Create GA individual and fitness entities (classes)"""
        ga_cr.create('Fitness', ga_b.Fitness, weights=self.eval_weights)
        ga_cr.create('Individual', list, fitness=ga_cr.Fitness)

    def _gen_params_to_ga(self):
        """Generate index for each param for individual"""
        max_dict = len(self.padded_params)
        max_length = len(list(self.padded_params.values())[0])
        idxs = [randint(0, max_length-1) for _ in range(max_dict)]
        return idxs
            
    def _register_indiv_and_pop_generators(self):
        """Register GA individual and population generators"""
        self.tb = ga_b.Toolbox()

        if self.n_jobs > 1:
            from multiprocessing import Pool
            pool = Pool()
            self.tb.register("map", pool.map)

        self.tb.register("individual", ga_t.initIterate, ga_cr.Individual, self._gen_params_to_ga)
        #print('indiv', self.tb.individual())
        self.tb.register("population", ga_t.initRepeat, list, self.tb.individual)
        #print('population', self.tb.population(n=2))
    
    def _register_eval_func(self):
        """Set GA evaluate individual function"""
        self.tb.register("evaluate",
                        self.eval_func,
                        padded_params=self.padded_params,
                        est=self.est,
                        train_df=self.train_df,
                        score=self.score
                        )
        #print(list(self.tb.evaluate(indiv) for indiv in self.tb.population(3)))
        
    def _register_selection_crossover_mutation_methods(self):
        """Register GA select/mate/mutate methods"""
        self.tb.register("select", ga_t.selTournament, tournsize=self.sel_tournsize)
        self.tb.register("mate", ga_t.cxUniform, indpb=self.cx_uniform_prob)
        self.tb.register("mutate", ga_t.mutShuffleIndexes, indpb=self.mut_shuffle_idx_prob)
        
    def run_ga_search(self):
        """Run search"""
        pop = self.tb.population(n=self.n_pop)
        hof = ga_t.HallOfFame(self.n_hof)

        # Stats stdout
        #stats = ga_t.Statistics(lambda ind: ind.fitness.values )
        stats1 = ga_t.Statistics(lambda ind: ind.fitness.values[0] )
        stats2 = ga_t.Statistics(lambda ind: ind.fitness.values[1] )
        stats3 = ga_t.Statistics(lambda ind: ind.fitness.values[2] )
        stats = ga_t.MultiStatistics(v_loss=stats1, max_depth=stats2, n_est=stats3)
        stats.register("avg", mean)
        #stats.register("std", np.std)
        
        # GA Run
        pop, log = ga_algo.eaSimple(pop, self.tb, cxpb=self.cx_prob, 
                                    mutpb=self.mut_prob, ngen=self.n_gen, 
                                    stats=stats, halloffame=hof, verbose=True)
        
        # Convert back params
        hof_ = {}
        for i in range(self.n_hof):
            hof_['hof_' + str(i)] = self._ga_to_params(hof[i])

        return pop, log, hof_
    
    def _ga_to_params(self, idx_params):
        """Helper to convert to readable params"""
        res = {}
        for (k,v), idx in zip(self.padded_params.items(), idx_params):
            res[k] = v[idx]
        return res

In [10]:
def prep_data(params, train_df, train_data=True):
    train_X = train_df[['title', 'duration_seconds', 't_delta']].copy()

    cv_params = {k:v for k,v in params.items() if k != 'q'}
    cv_params = {**cv_params, **{'strip_accents': 'ascii',
                                 'lowercase': True,
                                 'stop_words': 'english',
                                 'analyzer': 'word',
                                 #'token_pattern': '(?u)\b\w\w\w+\b',  # 3 or more letters
                                 }}
    #cv = CountVectorizer(**)
    tdf = TfidfVectorizer(**cv_params)
    
    q_param = {k:v for k,v in params.items() if k == 'q'}
    q = train_X['duration_seconds'].quantile(**q_param)
    mask = train_X['duration_seconds'].isna()
    train_X.loc[mask, 'duration_seconds'] = q
    
    mms = MinMaxScaler()
    
    train_X = column_stack((
        tdf.fit_transform(train_X['title']).toarray(),
        mms.fit_transform(train_X[['duration_seconds']]),
        mms.fit_transform(train_X[['t_delta']]),
    ))
    
    if train_data:
        train_y = train_df[['likes', 'view_count']]
        train_y = train_y['likes'] / train_y['view_count']
        return train_X, train_y
    else:
        return train_X

In [11]:
def train_model(train_X, train_y, est, score, give_model=False, folds=False):
    if not give_model:
        if not folds:
            train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y, train_size=0.8)
            est.fit(train_X, train_y)
            pred_v = est.predict(valid_X)
            obj = score(valid_y, pred_v)
            return obj
        else:
            ss = ShuffleSplit(n_splits=5, test_size=0.2)
            #ss = TimeSeriesSplit(n_splits=2)
            objs = []
            for train_idx, valid_idx in ss.split(train_X):
                est.fit(train_X[train_idx, :], train_y.iloc[train_idx])
                pred_v = est.predict(train_X[valid_idx, :])
                obj = score(train_y.iloc[valid_idx], pred_v)
                objs.append(obj)
            obj = mean(objs)
            return obj
    else:
        est.fit(train_X, train_y)
        return est

In [12]:
# Estimator, params and requirements

est = RandomForestRegressor(random_state=16)

cv_params = {
             'n_estimators': linspace(1,100,100).astype(int),
             'max_depth': linspace(1,100,100).astype(int),
             #
             'q': linspace(0.01,0.99,100),
             #
             #'ngram_range_': linspace(1,10,10).astype(int),
             'min_df': linspace(1,5,10).astype(int),
             'max_df': linspace(5,10,10).astype(int),
             'max_features': linspace(10,300,100).astype(int),
             'binary': [True, False],
            }

def cv_eval_indiv(individual, padded_params, est, train_df, score):
    """Evaluate individual's genes (estimator's params)"""
    est_params = {k : list(v)[idx] for (k,v), idx in zip(padded_params.items(), individual) if k in ['n_estimators', 'max_depth']}
    _ = est.set_params(**{**est_params, **{'min_samples_leaf': 0.1}})
    
    cv_params = {k : list(v)[idx] for (k,v), idx in zip(padded_params.items(), individual) if k not in ['n_estimators', 'max_depth']}
    train_X, train_y = prep_data(cv_params, train_df, train_data=True)

    #model_params = {k : list(v)[idx] for (k,v), idx in zip(padded_params.items(), individual) if k != '#_lags'}
    #est.set_params(**model_params)  # seems can't parallelize this simultaneous with GA  # **{**model_params, **{'n_jobs': 1} }

    obj = train_model(train_X, train_y, est, score, folds=False)
    
    collect()
    
    return obj, est_params['n_estimators'], est_params['max_depth'],
        
cv_weights = -0.1, -1, -1,

In [13]:
ga_params = GA_Scikit(est,
                      cv_params,
                      cv_eval_indiv,
                      cv_weights,
                      #
                      train[['title', 'duration_seconds', 't_delta', 'likes', 'view_count']],
                      mean_absolute_error,
                      )
pop, log, hof = ga_params.run_ga_search()

   	      	      max_depth       	        n_est         	            v_loss            
   	      	----------------------	----------------------	------------------------------
gen	nevals	avg  	gen	nevals	avg 	gen	nevals	avg      	gen	nevals
0  	20    	55.05	0  	20    	55.8	0  	20    	0.0293627	0  	20    
1  	12    	66.5 	1  	12    	61.3	1  	12    	0.0293905	1  	12    
2  	9     	77.85	2  	9     	74.05	2  	9     	0.029386 	2  	9     
3  	8     	74.5 	3  	8     	75.3 	3  	8     	0.0293527	3  	8     
4  	12    	68.4 	4  	12    	83.9 	4  	12    	0.0293839	4  	12    
5  	8     	57.05	5  	8     	81.9 	5  	8     	0.0293161	5  	8     
6  	11    	62.9 	6  	11    	84.3 	6  	11    	0.0293335	6  	11    
7  	16    	78.85	7  	16    	86.65	7  	16    	0.0293819	7  	16    
8  	12    	86.4 	8  	12    	85   	8  	12    	0.0293565	8  	12    
9  	12    	89.95	9  	12    	79.1 	9  	12    	0.0294365	9  	12    
10 	8     	95.4 	10 	8     	81.3 	10 	8     	0.029324 	10 	8     


In [14]:
# Save HOF params
hof
dump(hof['hof_0'], 'best_params.json')

{'hof_0': {'n_estimators': 14,
  'max_depth': 76,
  'q': 0.9603030303030302,
  'min_df': 5,
  'max_df': 8,
  'max_features': 53,
  'binary': False}}

['best_params.json']

In [15]:
# Full train
params = load('best_params.json')

est_params = {k:v for k,v in params.items() if k in ['n_estimators', 'max_depth']}
est_params
est = RandomForestRegressor(random_state=16)
est.set_params(**est_params)

cv_params = {k:v for k,v in params.items() if k not in ['n_estimators', 'max_depth']}
cv_params
train_X, train_y = prep_data(cv_params, train)
est = train_model(train_X, train_y, est, mean_absolute_error, give_model=True)

# Submission
test_X = prep_data(cv_params, test, train_data=False)
pred = est.predict(test_X)
subm = concat([test['id'], Series(pred)], axis=1)
subm.columns = ['id', 'target']
subm.to_csv('submission.csv', index=False)

{'n_estimators': 14, 'max_depth': 76}

RandomForestRegressor(max_depth=76, n_estimators=14, random_state=16)

{'q': 0.9603030303030302,
 'min_df': 5,
 'max_df': 8,
 'max_features': 53,
 'binary': False}

In [16]:
subm

Unnamed: 0,id,target
0,_wNsZEqpKUA_2021-12-01,0.017209
1,2jfbXZiE6Lc_2021-12-01,0.058022
2,F1Hq8eVOMHs_2021-12-01,0.005870
3,GQXVQmcGQUY_2021-12-01,0.017971
4,n4XojTb6pfs_2021-12-01,0.065540
...,...,...
5795,hJfpCXAMYPM_2021-12-30,0.010629
5796,MbmTMEYnEzo_2021-12-30,0.010477
5797,VG7arSAYvQI_2021-12-30,0.016680
5798,SNb-g-hNYYs_2021-12-30,0.056530
