# Stacking submissions

In [1]:
# %matplotlib inline
import os
import random
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns

from colour.notation import HEX_to_RGB
from colour import convert
from sklearn.cluster import KMeans

# Lift pandas' display limits so frames are shown in full
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Seed the stdlib and the NumPy global generators
random.seed(1)
np.random.seed(1)

# Clean data

## Create and drop features, manage categorical variables

In [2]:
# Function to prepare data

from sklearn.base import TransformerMixin

def create_new_features(X, reference_date=None):
    """Return a copy of ``X`` with the engineered feature columns added.

    Columns added:
      * ``Sin_UTC`` / ``Cos_UTC`` - sine and cosine of the UTC offset
        (treated as seconds, shifted by 11 hours) mapped onto a 24-hour circle;
      * ``Duration`` - whole days between the profile creation timestamp and
        ``reference_date``;
      * ``Has Personal URL`` - True where ``Personal URL`` is not null;
      * ``log <feature>`` - ``log1p`` of each count/duration feature;
      * ``Lab <colour column>`` - CIE Lab conversion of each hex colour.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw profile table. It is not modified.
    reference_date : optional
        Timezone-naive date the account age is measured against; anything
        ``pd.to_datetime`` accepts. Defaults to the current day/time, which
        makes ``Duration`` depend on when the notebook is run.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original and the engineered columns.
    """
    # Work on a copy so the caller's frame is left untouched.
    X = X.copy()

    # transform utc offset: one angle, reused for both coordinates
    angle = (11 * 3600 + X['UTC Offset']) / (24 * 3600) * 2 * np.pi
    X['Sin_UTC'] = np.sin(angle)
    X['Cos_UTC'] = np.cos(angle)

    # time since creation in days
    now = pd.to_datetime('today') if reference_date is None else pd.to_datetime(reference_date)
    created = pd.to_datetime(X['Profile Creation Timestamp']).dt.tz_localize(None)
    X['Duration'] = (now - created).dt.days

    # convert personal url into True/False (NaNs or unique)
    X['Has Personal URL'] = X['Personal URL'].notna()

    # log(x + 1) of numerical features
    for feature in ['Num of Followers', 'Num of People Following',
                    'Num of Status Updates', 'Num of Direct Messages',
                    'Avg Daily Profile Visit Duration in seconds',
                    'Avg Daily Profile Clicks']:
        X[f'log {feature}'] = np.log1p(X[feature])

    # Missing colours fall back to RGB (1, 1, 1). Its Lab value does not
    # depend on the row, so it is converted once instead of once per NaN.
    default_lab = convert((1, 1, 1), 'RGB', 'CIE Lab')
    for col in ['Profile Text Color', 'Profile Page Color', 'Profile Theme Color']:
        X[f'Lab {col}'] = X[col].apply(
            # np.array(...) hands every row its own copy of the fallback value
            lambda hex_code: np.array(default_lab) if pd.isnull(hex_code)
            else convert(HEX_to_RGB(hex_code), 'RGB', 'CIE Lab'))

    return X
    
def clean_features(X):
    """Return a cleaned copy of ``X``; the input frame is left unchanged.

    * ``Location Public Visibility`` is lower-cased so that spellings that
      differ only by capitalisation become the same category.
    * ``Profile Category`` values equal to a single space are replaced by
      ``'unknown'``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the two columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with both columns normalised.
    """
    # Copy first: assigning into the argument would silently alter the
    # caller's data.
    X = X.copy()
    # merge categories names with and without cap letter
    X['Location Public Visibility'] = X['Location Public Visibility'].str.lower()
    X.loc[X['Profile Category'] == " ", 'Profile Category'] = 'unknown'
    return X

def drop_features(X, l_features):
    """Return ``X`` without the columns listed in ``l_features``."""
    trimmed = X.drop(labels=l_features, axis='columns')
    return trimmed
    
class RemoveCategories(TransformerMixin):
    """Merge rare levels of selected categorical columns into ``'other'``.

    ``fit`` records, per column, the levels seen at least ``min_obs`` times
    (missing values are counted under the label ``'missing'``). ``transform``
    applies the same ``'missing'`` fill and replaces every level that was not
    recorded with ``'other'``.

    Parameters
    ----------
    min_obs : int, default 10
        Minimum number of occurrences for a level to be kept.
    l_cols : list of str, optional
        Columns to process. Defaults to no columns.
    """

    def __init__(self, min_obs=10, l_cols=None):
        self.min_obs = min_obs
        # kept levels per column, filled in by fit()
        self.l_cats = {}
        # A fresh list per instance: a mutable default argument would be
        # shared by every instance created without ``l_cols``.
        self.l_cols = list(l_cols) if l_cols is not None else []

    def fit(self, X, y=None):
        """Learn which levels to keep; ``y`` is accepted and ignored."""
        # assumes all columns of df_cat are strings
        for col in self.l_cols:
            val_counts = X[col].fillna('missing').value_counts()
            self.l_cats[col] = val_counts[val_counts >= self.min_obs].index.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with unrecorded levels set to ``'other'``."""
        # assumes X is a DataFrame
        X = X.copy()  # do not modify the caller's frame
        for col in self.l_cols:
            # Same fill as in fit(), so a frequent 'missing' level survives
            # instead of NaN always being turned into 'other'.
            filled = X[col].fillna('missing')
            X[col] = filled.where(filled.isin(self.l_cats[col]), 'other')
        return X

class ClusterColors(TransformerMixin):
    """Cluster each colour column with its own KMeans model.

    The data in ``X[ccols]`` should be CIE Lab color data: every cell holds
    one colour vector. ``transform`` adds a ``'Clustered <column>'`` column
    with the predicted cluster label for each input column.

    Parameters
    ----------
    ccols : list of str, optional
        Colour columns to cluster. Defaults to the three ``'Lab Profile ...'``
        columns.
    *args, **kwargs
        Forwarded unchanged to every ``KMeans`` constructor.
    """

    def __init__(self, ccols=None, *args, **kwargs):
        if ccols is None:
            ccols = ['Lab Profile Text Color', 'Lab Profile Page Color',
                     'Lab Profile Theme Color']
        self.columns = list(ccols)
        # Build a separate estimator per column. Repeating a one-element list
        # (``[KMeans(...)] * n``) would store the same object n times, so each
        # fit would overwrite the previous one and all columns would end up
        # being predicted with the clusters of the last column.
        self.clusterers = {col: KMeans(*args, **kwargs) for col in self.columns}

    def set_params(self, **params):
        """Apply ``params`` to every per-column KMeans model."""
        for clusterer in self.clusterers.values():
            clusterer.set_params(**params)
        return self

    def fit(self, X, y=None, **fit_params):
        """Fit one KMeans per colour column; ``y`` and ``fit_params`` are unused."""
        for col in self.columns:
            # list(...) turns the column of per-row vectors into a list of rows
            self.clusterers[col].fit(list(X[col].to_numpy()))
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one ``'Clustered <column>'`` per column."""
        X = X.copy()  # do not modify the caller's frame
        for col in self.columns:
            X[f'Clustered {col}'] = self.clusterers[col].predict(list(X[col].to_numpy()))
        return X


In [3]:
# data cleaning and feature engineering

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Categorical columns whose infrequent levels are merged by RemoveCategories
l_cat_small = ['User Language', 'Location']

# Columns removed at the end of the pipeline, grouped by reason:
# identifiers / raw inputs of engineered features ...
l_drop = [
    'Id', 'User Name', 'Profile Image', 'Personal URL',
    'Profile Creation Timestamp', 'Location', 'UTC Offset',
]
# ... raw counts that have a 'log <feature>' counterpart ...
l_drop_logs = [
    'Num of Followers', 'Num of People Following',
    'Num of Status Updates', 'Num of Direct Messages',
    'Avg Daily Profile Visit Duration in seconds',
    'Avg Daily Profile Clicks',
]
# ... and hex colours that have a 'Lab <column>' counterpart.
l_drop_hex = [
    'Profile Text Color', 'Profile Page Color',
    'Profile Theme Color',
]
features_to_drop = {'l_features': l_drop + l_drop_logs + l_drop_hex}

# Feature-engineering pipeline; the steps run in the order listed.
pipe = Pipeline(steps=[
    ('create new features', FunctionTransformer(func=create_new_features)),
    ('clean data', FunctionTransformer(func=clean_features)),
    ('cluster', ClusterColors(n_clusters=7, n_init=10, max_iter=300, tol=1e-4)),
    ('remove categories', RemoveCategories(min_obs=2, l_cols=l_cat_small)),
    ('drop variables', FunctionTransformer(func=drop_features,
                                           kw_args=features_to_drop)),
])

## Imputation and encoding/scaling

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Numerical features: mean-impute missing values, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: impute with the most frequent level, then one-hot
# encode into a dense array; levels unseen during fit are ignored.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [5]:
# train data
df = pd.read_csv('data/train.csv')
# The target is modelled on the log1p scale (create_submission inverts it
# with expm1).
y = np.log1p(df['Num of Profile Likes'].values)
X = df.drop(columns='Num of Profile Likes')

# Feature engineering: fits the colour clusterers and the rare-level filter.
X = pipe.fit_transform(X)

# Raw Lab colour vectors are only an input to the clustering step; their
# 'Clustered ...' labels are what the model sees, as categorical features.
ccols = ['Lab Profile Text Color', 'Lab Profile Page Color', 'Lab Profile Theme Color',]
catcols = list(X.select_dtypes(include=['object', 'bool']).columns) + ['Clustered '+c for c in ccols]
# dict.fromkeys de-duplicates while keeping order, in case a cluster column
# is also matched by the dtype selection above.
cat_features = list(dict.fromkeys(col for col in catcols if col not in ccols))

# Numeric features are the remaining non-object/non-bool columns. The cluster
# labels are integers, so a plain dtype selection would also pick them up and
# they would be standard-scaled as well as one-hot encoded; exclude every
# column that is already handled as categorical.
num_features = [col for col in X.select_dtypes(exclude=['object', 'bool']).columns
                if col not in cat_features]

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, num_features),
                  ('cat', categorical_transformer, cat_features)])

X = preprocessor.fit_transform(X)

# Stacking

In [6]:
def create_submission(estimator, number, test_path='data/test.csv', out_dir='submissions'):
    """Predict on the test set and write a submission CSV.

    The test file is read, passed through the module-level ``pipe`` and
    ``preprocessor`` (both must already be fitted), and scored with
    ``estimator``. Predictions are mapped back with ``expm1`` and written,
    together with the ``Id`` column, to ``<out_dir>/submission<number>.csv``.
    If that file already exists, ``number`` is incremented until an unused
    name is found, so earlier submissions are never overwritten.

    Parameters
    ----------
    estimator : fitted estimator
        Object exposing ``predict``.
    number : int
        First submission number to try.
    test_path : str, default 'data/test.csv'
        Location of the test CSV; it must contain an ``Id`` column.
    out_dir : str, default 'submissions'
        Output directory; created if it does not exist.
    """
    X_test = pd.read_csv(test_path)
    Id = X_test['Id']
    X_test = pipe.transform(X_test)
    X_test = preprocessor.transform(X_test)
    y_pred = estimator.predict(X_test)
    # invert the log1p applied to the training target
    y_pred_test = pd.Series(np.expm1(y_pred))
    submission = pd.DataFrame({'Id': Id, 'Predicted': y_pred_test})

    # to_csv does not create missing directories
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, f'submission{number}.csv')
    while os.path.exists(path):
        number += 1
        path = os.path.join(out_dir, f'submission{number}.csv')
    submission.to_csv(path, index=False)
    print(f'submission #{number} created')

In [7]:
from mlxtend.regressor import StackingCVRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR

# Base learners of the stack.
# Ridge: `normalize` is no longer passed. False was its default, so behaviour
# is unchanged, and the argument was deprecated and later removed from
# scikit-learn, where passing it raises a TypeError.
ridge = Ridge(alpha=36.7)
rf = RandomForestRegressor(n_estimators=700, min_samples_split=3, min_samples_leaf=3, max_depth=18)
# n_estimators is only an upper bound here: n_iter_no_change enables early stopping.
gb = GradientBoostingRegressor(n_iter_no_change=6, n_estimators=10000,
                                     min_samples_split=21, min_samples_leaf=9,
                                     max_depth=5, learning_rate=0.06)
bagged_svr =  BaggingRegressor(base_estimator=SVR(kernel='rbf', C=1.37))

# The base learners' predictions are combined by a RidgeCV meta-learner;
# passthrough=False means it does not also receive the original features.
# NOTE(review): n_jobs=100 is a fixed worker count - confirm it suits the machine.
estimators = [('ridge', ridge), ('rf', rf), ('gb', gb), ('bagged_svr', bagged_svr)]
stack = StackingRegressor(estimators=estimators, final_estimator=RidgeCV(), verbose=10, n_jobs=100, passthrough=False)


In [8]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit


# Search space. Keys follow the '<estimator name>__<parameter>' convention of
# the stack; each value is the list of candidates to sample from.
parameters = {
    'ridge__alpha': np.logspace(-2,2,10),
    'rf__n_estimators': range(400,1001,100),
    'rf__max_depth': range(10,30,5),
    'gb__learning_rate': np.linspace(0.1,0.5,5),
    'bagged_svr__base_estimator__C': np.logspace(-2,2,5),
    # Wrapped in a list so RidgeCV receives the whole array of alphas to
    # choose from. A bare array would be read as five separate scalar
    # candidates, one of which would be handed to RidgeCV per draw.
    'final_estimator__alphas': [np.logspace(-2,2,5)],
    'final_estimator__cv': [ShuffleSplit(n_splits=5, test_size=0.3, random_state=0),],
    }

# RandomizedSearchCV takes the search space as `param_distributions`
# (`param_grid` is the GridSearchCV argument and raises a TypeError here).
stacked_cv = RandomizedSearchCV(estimator=stack, param_distributions=parameters, scoring='neg_mean_squared_error', n_iter=100, n_jobs=-1)
stacked_cv.fit(X, y)
# best_score_ is a negated MSE: flip the sign and take the root to report RMSE
print(np.sqrt(-stacked_cv.best_score_), stacked_cv.best_params_)

## Make submission

In [None]:
# Take the best estimator found by the search and write its test predictions
stack = stacked_cv.best_estimator_
create_submission(estimator=stack, number=13)
