In [1]:
%matplotlib inline
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_columns = None
pd.options.display.max_rows = None

# set seeds
np.random.seed(1)
random.seed(1)
# sns.set(style = 'darkgrid')

In [2]:
# Function to prepare data

from colour.notation import HEX_to_RGB
from colour import convert
from sklearn.cluster import KMeans

from sklearn.base import TransformerMixin

def create_new_features(X):
    # transform utc offset:
    X['Sin_UTC'] = np.sin((11 * 3600 + X['UTC Offset']) / (24 * 3600) * 2 * np.pi)
    X['Cos_UTC'] = np.cos((11 * 3600 + X['UTC Offset']) / (24 * 3600) * 2 * np.pi)
    
    # time since creation in days
    duration = (pd.to_datetime('today') - 
                pd.to_datetime(X['Profile Creation Timestamp']).dt.tz_localize(None))
    X['Duration'] = duration.apply(lambda x: x.days)
    
    # convert personal url into True/False (NaNs or unique)
    X['Has Personal URL'] = X['Personal URL'].notna()
    
    # log(x + 1) of numerical features
    for feature in ['Num of Followers', 'Num of People Following',
                    'Num of Status Updates', 'Num of Direct Messages',
                    'Avg Daily Profile Visit Duration in seconds',
                    'Avg Daily Profile Clicks']:
        X[f'log {feature}'] = np.log1p(X[feature])
    
    for col in ['Profile Text Color', 'Profile Page Color', 'Profile Theme Color']:
        X[f'Lab {col}'] = X[col].apply(lambda x: convert(HEX_to_RGB(x), 'RGB', 'CIE Lab') if not pd.isnull(x) 
                else convert((1,1,1), 'RGB', 'CIE Lab'))

    return X
    
def clean_features(X):
    # merge categories names with and without cap letter
    X['Location Public Visibility'] = X['Location Public Visibility'].str.lower()
    X.loc[X['Profile Category'] == " ", 'Profile Category'] = 'unknown'
    return X

def drop_features(X, l_features):
    return X.drop(columns=l_features)
    
class RemoveCategories(TransformerMixin):
    def __init__(self, min_obs=10, l_cols=[]):
        self.min_obs = min_obs
        self.l_cats = {}
        self.l_cols = l_cols

    def fit(self, X):
        # assumes all columns of df_cat are strings
        for col in self.l_cols:
            val_counts = X[col].fillna('missing').value_counts()
            self.l_cats[col] = val_counts[val_counts >= self.min_obs].index.tolist()
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        for col in self.l_cols:
            X.loc[~ X[col].isin(self.l_cats[col]), col] = 'other'        
        return X

class ClusterColors(TransformerMixin):
    """ The data in X[ccols] should be CIE Lab color data"""
    def __init__(self, ccols=['Lab Profile Text Color', 'Lab Profile Page Color', 'Lab Profile Theme Color'], *args, **kwargs):
        self.clusterers = dict(zip(ccols, [KMeans(*args, **kwargs)]*len(ccols)))
        self.columns = ccols

    def set_params(self, **params):
        for clusterer in self.clusterers.values():
            clusterer.set_params(**params)
        return self

    def fit(self, X, y=None, **fit_params):        
        for col in self.columns:
            self.clusterers[col].fit(list(X[col].to_numpy()))
        return self

    def transform(self, X):
        for col in self.columns:
            X[f'Clustered {col}'] = self.clusterers[col].predict(list(X[col].to_numpy()))
        return X


In [3]:
# data cleaning and feature engineering

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

l_cat_small = ['User Language', 'Location']
l_drop = ['Id', 'User Name', 'Profile Image', 'Personal URL',
          'Profile Creation Timestamp', 'Location', 'UTC Offset']
l_drop_logs = ['Num of Followers', 'Num of People Following',
               'Num of Status Updates', 'Num of Direct Messages',
               'Avg Daily Profile Visit Duration in seconds', 
               'Avg Daily Profile Clicks']
l_drop_hex = ['Profile Text Color', 'Profile Page Color',
               'Profile Theme Color'
               ]
features_to_drop =dict(l_features = l_drop + l_drop_logs + l_drop_hex)

pipe = Pipeline([('create new features',
                  FunctionTransformer(create_new_features)),
                 ('clean data', FunctionTransformer(clean_features)),
                 ('cluster', ClusterColors(n_clusters=7, n_init=10, max_iter=300, tol=1e-4)),
                 ('remove categories', RemoveCategories(min_obs=2, 
                                                        l_cols=l_cat_small)),
                 ('drop variables', FunctionTransformer(drop_features,
                                    kw_args=features_to_drop))
                ])

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# numerical features
ccols = ['Lab Profile Text Color', 'Lab Profile Page Color',
               'Lab Profile Theme Color',]
num_features = list(X.select_dtypes(exclude=['object', 'bool']).columns)
# num_features.append(pd.Index(ccols))
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
    ])

catcols = list(X.select_dtypes(include=['object', 'bool']).columns) #+ ['Clustered '+c for c in ccols]
print(catcols)
cat_features = list([col for col in catcols if col not in ccols])

# categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(sparse=False))])

# preprocessing
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, num_features),
                  ('cat', categorical_transformer, cat_features)])

# X2 = pd.DataFrame(X_t, columns=get_ct_feature_names(preprocessor))

['Profile Cover Image Status', 'Profile Verification Status', 'Is Profile View Size Customized?', 'Location Public Visibility', 'User Language', 'User Time Zone', 'Profile Category', 'Has Personal URL', 'Lab Profile Text Color', 'Lab Profile Page Color', 'Lab Profile Theme Color']


In [7]:
df = pd.read_csv('data/train.csv')
y = np.log1p(df['Num of Profile Likes'].values)
X = df.drop(columns='Num of Profile Likes')

X = pipe.fit_transform(X)

X = preprocessor.fit_transform(X)



    convert(*args, **kwargs)
  warn(*args, **kwargs)


In [13]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR

parameters = {
    'base_estimator__C': np.logspace(-3,3,100),
    'base_estimator__epsilon': np.logspace(-4,-1,100),
    'max_features': np.linspace(0.1, 1.0, 10),
    'max_samples': np.linspace(0.1, 1.0, 10),
    'bootstrap_features': [True, False],
    'bootstrap': [True, False],
    }

br = BaggingRegressor(base_estimator=SVR())
br_cv = RandomizedSearchCV(br, parameters, scoring='neg_mean_squared_error',
                           n_jobs=50, n_iter=100)
br_cv.fit(X, y)
print(np.sqrt(-br_cv.best_score_), br_cv.best_params_)

1.75351258114 {'max_samples': 0.70000000000000007, 'max_features': 0.90000000000000002, 'bootstrap_features': False, 'bootstrap': True, 'base_estimator__epsilon': 0.086974900261778343, 'base_estimator__C': 6.5793322465756825}


In [14]:
1.75313391282 {'max_samples': 0.40000000000000002, 'max_features': 0.90000000000000002,
               'bootstrap_features': False, 'bootstrap': False,
               'base_estimator__epsilon': 0.00057223676593502206,
               'base_estimator__C': 5.72236765935022}

SyntaxError: invalid syntax (<ipython-input-14-72d372246e14>, line 1)