# Fourth submission: Support Vector Regression
 * Minimum observations in each category: 2

In [1]:
%matplotlib inline
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Disable pandas' display truncation: frames are shown with every column and row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# set seeds
# Seed both NumPy's global RNG and the stdlib RNG so stochastic steps are repeatable.
np.random.seed(1)
random.seed(1)

# Clean data

## Create and drop features, manage categorical

In [2]:
# Function to prepare data

from sklearn.base import TransformerMixin

def hex2rgb(hex_code):
    """Convert a six-digit hex colour string (no leading '#') to an RGB array.

    Each consecutive pair of characters is parsed as a base-16 integer, giving
    a NumPy array ``[red, green, blue]``.
    """
    channels = []
    for start in range(0, 6, 2):
        pair = hex_code[start:start + 2]
        channels.append(int(pair, 16))
    return np.array(channels)

def hex2dominantChannel(hex_code, label=None):
    """Name the strongest RGB channel of a hex colour.

    Parameters
    ----------
    hex_code : str or float
        Six-digit hex colour string. Any float (in practice the NaN pandas
        uses for missing values) is treated as "no colour".
    label : str, optional
        Prefix for the result, e.g. ``label="text"`` gives ``"text red"``.

    Returns
    -------
    str or float
        ``'red'``, ``'green'`` or ``'blue'`` (prefixed with ``label`` and a
        space when given), or ``np.nan`` when ``hex_code`` is a float.
        Ties go to the earliest channel because ``np.argmax`` returns the
        first maximum.
    """
    # isinstance (rather than an exact type comparison) also catches float
    # subclasses such as np.float64, which would otherwise reach hex2rgb and
    # fail when sliced.
    if isinstance(hex_code, float):
        return np.nan
    colors_map = {0: 'red', 1: 'green', 2: 'blue'}
    dominant = colors_map[int(np.argmax(hex2rgb(hex_code)))]
    if label is None:
        return dominant
    return label + " " + dominant

def create_new_features(X):
    """Add the engineered columns used by the model and return the new frame.

    Works on a copy of ``X`` so the caller's frame is left untouched. Adds:

    * ``Sin_UTC`` / ``Cos_UTC``: sine and cosine of ``UTC Offset`` mapped onto
      a 24-hour circle after an 11-hour shift (offset presumably in seconds,
      given the ``3600`` factors; confirm against the data dictionary);
    * ``Duration``: whole days between today and
      ``Profile Creation Timestamp``;
    * ``Has Personal URL``: True where ``Personal URL`` is not null;
    * ``log <feature>``: ``log1p`` of each count/duration feature.

    The three profile colour columns are replaced by their dominant-channel
    labels (``"text red"``, ``"theme blue"``, ...).

    Parameters
    ----------
    X : pandas.DataFrame
        Raw profile data containing the columns referenced above.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the additional / replaced columns.
    """
    X = X.copy()

    # transform utc offset: cyclical encoding so both ends of the day are adjacent
    utc_angle = (11 * 3600 + X['UTC Offset']) / (24 * 3600) * 2 * np.pi
    X['Sin_UTC'] = np.sin(utc_angle)
    X['Cos_UTC'] = np.cos(utc_angle)

    # time since creation in days (timezone info is stripped before subtracting)
    # NOTE: depends on the current date, so values shift between runs.
    created = pd.to_datetime(X['Profile Creation Timestamp']).dt.tz_localize(None)
    X['Duration'] = (pd.to_datetime('today') - created).dt.days

    # convert personal url into True/False (NaNs or unique)
    X['Has Personal URL'] = X['Personal URL'].notna()

    # Group color into Red, Blue, and Green.
    # Colours are read from X itself: reading them from the notebook-global
    # training frame would give the test set the training rows' colours.
    for column, label in (('Profile Text Color', 'text'),
                          ('Profile Theme Color', 'theme'),
                          ('Profile Page Color', 'page')):
        X[column] = X[column].apply(
            lambda code, tag=label: hex2dominantChannel(code, label=tag))

    # log(x + 1) of numerical features
    for feature in ['Num of Followers', 'Num of People Following',
                    'Num of Status Updates', 'Num of Direct Messages',
                    'Avg Daily Profile Visit Duration in seconds',
                    'Avg Daily Profile Clicks']:
        X[f'log {feature}'] = np.log1p(X[feature])

    return X
    
def clean_features(X):
    """Normalise two categorical columns of ``X`` in place and return ``X``.

    * ``Location Public Visibility`` is lower-cased so values that differ
      only by capitalisation collapse into one category.
    * A ``Profile Category`` equal to a single space becomes ``'unknown'``.
    """
    visibility = X['Location Public Visibility']
    X['Location Public Visibility'] = visibility.str.lower()

    blank_category = X['Profile Category'].eq(" ")
    X.loc[blank_category, 'Profile Category'] = 'unknown'
    return X

def drop_features(X, l_features):
    """Return a new frame equal to ``X`` without the columns in ``l_features``."""
    trimmed = X.drop(labels=l_features, axis=1)
    return trimmed
    
class RemoveCategories(TransformerMixin):
    """Collapse rare categories of selected columns into ``'other'``.

    ``fit`` records, per column, the values seen at least ``min_obs`` times;
    ``transform`` replaces every other value with the string ``'other'``.

    Parameters
    ----------
    min_obs : int, default 10
        Minimum number of occurrences for a category to be kept.
    l_cols : list of str, optional
        Columns to process. Defaults to no columns.

    Attributes
    ----------
    l_cats : dict
        Maps each fitted column name to the list of categories kept.
    """

    def __init__(self, min_obs=10, l_cols=None):
        self.min_obs = min_obs
        self.l_cats = {}
        # Copy instead of sharing a mutable default / the caller's list.
        self.l_cols = list(l_cols) if l_cols is not None else []

    def fit(self, X, y=None):
        """Learn which categories are frequent enough to keep.

        ``y`` is accepted and ignored so the transformer can be used in
        pipelines that are fitted with a target.
        """
        # assumes all columns of df_cat are strings
        # Start from a clean slate so a refit never keeps stale columns.
        self.l_cats = {}
        for col in self.l_cols:
            val_counts = X[col].fillna('missing').value_counts()
            self.l_cats[col] = val_counts[val_counts >= self.min_obs].index.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with infrequent categories set to ``'other'``.

        NOTE(review): missing values are counted as ``'missing'`` during
        ``fit`` but compared raw here, so NaN entries never match a kept
        category and are relabelled ``'other'``. Kept as-is to preserve the
        existing feature set; confirm this is intended.
        """
        # assumes X is a DataFrame
        X = X.copy()  # do not modify the caller's frame
        for col in self.l_cols:
            X.loc[~X[col].isin(self.l_cats[col]), col] = 'other'
        return X

In [3]:
# data cleaning and feature engineering

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Categorical columns whose infrequent values are collapsed by RemoveCategories.
# NOTE(review): 'Location' is grouped here but also listed in l_drop below, so
# grouping it has no effect on the final frame.
l_cat_small = ['Profile Text Color', 'Profile Page Color',
               'Profile Theme Color', 'User Language', 'Location',
               'Profile Image Dominant Color','Profile Image Avatar (18)']
# Identifier / raw columns removed once the engineered features exist.
l_drop = ['Id', 'User Name', 'Profile Image', 'Personal URL',
          'Profile Creation Timestamp', 'Location', 'UTC Offset']
# Raw numeric columns dropped because create_new_features adds a
# 'log <name>' version of each.
l_drop_logs = ['Num of Followers', 'Num of People Following',
               'Num of Status Updates', 'Num of Direct Messages',
               'Avg Daily Profile Visit Duration in seconds', 
               'Avg Daily Profile Clicks']

# Keyword arguments forwarded to drop_features by the last pipeline step.
features_to_drop =dict(l_features = l_drop + l_drop_logs)

# Feature-engineering pipeline: create features -> clean -> group rare
# categories -> drop unused columns.
# NOTE(review): min_obs=10 here, while the notebook header mentions 2 - confirm.
pipe = Pipeline([('create new features',
                  FunctionTransformer(create_new_features)),
                 #('transform color features', FunctionTransformer(transform_color_features)),
                 ('clean data', FunctionTransformer(clean_features)),
                 ('remove categories', RemoveCategories(min_obs=10, 
                                                        l_cols=l_cat_small)),
                 ('drop variables', FunctionTransformer(drop_features,
                                    kw_args=features_to_drop))
                ])

## Imputation and encoding/scaling

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# numerical features
# Fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
    ])

# categorical features
# Fill missing values with the most frequent value, then one-hot encode.
# handle_unknown='ignore' lets transform accept categories not seen during fit.
# NOTE(review): `sparse=False` is the older spelling of this option; newer
# scikit-learn releases call it `sparse_output` - confirm against the pinned version.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

In [5]:
# train data
# Image-derived features live in a separate CSV and are attached column-wise.
# Assumes both files list the profiles in the same row order - TODO confirm.
im_df = pd.read_csv('../data/train_profile_images_data.csv')
df = pd.read_csv('../data/train.csv')
df[['Profile Image Avatar (18)', 'Profile Image Dominant Color']] = im_df[['Profile Image Avatar (18)','Profile Image Dominant Color']]

# The target is modelled on the log1p scale; create_submission inverts it with expm1.
y = np.log1p(df['Num of Profile Likes'].values)
X = df.drop(columns='Num of Profile Likes')

# Fit the feature-engineering pipeline and apply it (X is now the engineered DataFrame).
X = pipe.fit_transform(X)

# Route columns by dtype: object/bool columns go to the categorical branch,
# everything else to the numeric branch.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer,
                   X.select_dtypes(exclude=['object', 'bool']).columns),
                  ('cat', categorical_transformer,
                   X.select_dtypes(include=['object', 'bool']).columns)])

# X is rebound once more: from here on it is the preprocessor's output, not a DataFrame.
X = preprocessor.fit_transform(X)
X.shape

(7500, 219)

In [6]:
# to get the columns names back

from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin
def get_feature_out(estimator, feature_in):
    """Return the feature names produced by ``estimator`` for ``feature_in``.

    * Estimators exposing ``get_feature_names``: vectorizers yield their own
      names prefixed with ``vec_``; any other estimator is asked for names
      derived from ``feature_in``.
    * Feature selectors (no ``get_feature_names``): the subset of
      ``feature_in`` flagged by ``get_support()``.
    * Anything else: ``feature_in`` unchanged.
    """
    if not hasattr(estimator, 'get_feature_names'):
        if isinstance(estimator, SelectorMixin):
            keep_mask = estimator.get_support()
            return np.array(feature_in)[keep_mask]
        return feature_in

    if isinstance(estimator, _VectorizerMixin):
        # handling all vectorizers
        return [f'vec_{token}' for token in estimator.get_feature_names()]
    return estimator.get_feature_names(feature_in)
def get_ct_feature_names(ct):
    """Collect the output feature names of a fitted ColumnTransformer.

    Handles plain estimators and Pipelines inside the ColumnTransformer; for
    a Pipeline the names are threaded through every step in order.
    Doesn't work when remainder == 'passthrough', which requires the input
    column names.
    """
    collected = []
    for name, estimator, features in ct.transformers_:
        if name == 'remainder':
            if estimator == 'passthrough':
                collected.extend(ct._feature_names_in[features])
            continue

        if isinstance(estimator, Pipeline):
            names = features
            for step in estimator:
                names = get_feature_out(step, names)
        else:
            names = get_feature_out(estimator, features)
        collected.extend(names)
    return collected

# Wrap the preprocessed matrix in a DataFrame labelled with the recovered feature names.
df_X = pd.DataFrame(X, columns=get_ct_feature_names(preprocessor))

In [7]:
# Sanity check: row count, column count and dtypes of the encoded design matrix.
df_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Columns: 219 entries, Sin_UTC to Has Personal URL_True
dtypes: float64(219)
memory usage: 12.5 MB


In [8]:
# Preview the first rows; with display.max_columns = None every column is printed.
df_X.head()

Unnamed: 0,Sin_UTC,Cos_UTC,Duration,log Num of Followers,log Num of People Following,log Num of Status Updates,log Num of Direct Messages,log Avg Daily Profile Visit Duration in seconds,log Avg Daily Profile Clicks,Profile Cover Image Status_Not set,Profile Cover Image Status_Set,Profile Verification Status_Not verified,Profile Verification Status_Pending,Profile Verification Status_Verified,Profile Text Color_other,Profile Text Color_text blue,Profile Text Color_text green,Profile Text Color_text red,Profile Page Color_other,Profile Page Color_page blue,Profile Page Color_page green,Profile Page Color_page red,Profile Theme Color_other,Profile Theme Color_theme blue,Profile Theme Color_theme green,Profile Theme Color_theme red,Is Profile View Size Customized?_False,Is Profile View Size Customized?_True,Location Public Visibility_??,Location Public Visibility_disabled,Location Public Visibility_enabled,User Language_ar,User Language_de,User Language_en,User Language_en-gb,User Language_es,User Language_fr,User Language_id,User Language_it,User Language_ja,User Language_ko,User Language_nl,User Language_other,User Language_pl,User Language_pt,User Language_ru,User Language_tr,User Time Zone_Abu Dhabi,User Time Zone_Adelaide,User Time Zone_Africa/Johannesburg,User Time Zone_Alaska,User Time Zone_Almaty,User Time Zone_America/Argentina/Buenos_Aires,User Time Zone_America/Bogota,User Time Zone_America/Boise,User Time Zone_America/Chicago,User Time Zone_America/Denver,User Time Zone_America/Hermosillo,User Time Zone_America/Los_Angeles,User Time Zone_America/Mexico_City,User Time Zone_America/New_York,User Time Zone_America/Panama,User Time Zone_America/Santiago,User Time Zone_America/Sao_Paulo,User Time Zone_Amsterdam,User Time Zone_Arizona,User Time Zone_Asia/Calcutta,User Time Zone_Asia/Colombo,User Time Zone_Asia/Jakarta,User Time Zone_Asia/Karachi,User Time Zone_Asia/Kolkata,User Time Zone_Asia/Qatar,User Time Zone_Asia/Seoul,User Time Zone_Athens,User Time 
Zone_Atlantic Time (Canada),User Time Zone_Auckland,User Time Zone_Australia/Sydney,User Time Zone_Baghdad,User Time Zone_Baku,User Time Zone_Bangkok,User Time Zone_Beijing,User Time Zone_Belgrade,User Time Zone_Berlin,User Time Zone_Bern,User Time Zone_Bogota,User Time Zone_Brasilia,User Time Zone_Bratislava,User Time Zone_Brisbane,User Time Zone_Brussels,User Time Zone_Bucharest,User Time Zone_Budapest,User Time Zone_Buenos Aires,User Time Zone_Cairo,User Time Zone_Caracas,User Time Zone_Casablanca,User Time Zone_Central America,User Time Zone_Central Time (US & Canada),User Time Zone_Chennai,User Time Zone_Chihuahua,User Time Zone_Copenhagen,User Time Zone_Dhaka,User Time Zone_Dublin,User Time Zone_Eastern Time (US & Canada),User Time Zone_Edinburgh,User Time Zone_Europe/Athens,User Time Zone_Europe/Brussels,User Time Zone_Europe/Dublin,User Time Zone_Europe/Istanbul,User Time Zone_Europe/London,User Time Zone_Europe/Minsk,User Time Zone_Europe/Paris,User Time Zone_Fiji,User Time Zone_Georgetown,User Time Zone_Greenland,User Time Zone_Guadalajara,User Time Zone_Hanoi,User Time Zone_Harare,User Time Zone_Hawaii,User Time Zone_Helsinki,User Time Zone_Hong Kong,User Time Zone_Indiana (East),User Time Zone_International Date Line West,User Time Zone_Irkutsk,User Time Zone_Islamabad,User Time Zone_Istanbul,User Time Zone_Jakarta,User Time Zone_Jerusalem,User Time Zone_Kabul,User Time Zone_Karachi,User Time Zone_Kolkata,User Time Zone_Kuala Lumpur,User Time Zone_Kuwait,User Time Zone_Kyiv,User Time Zone_La Paz,User Time Zone_Lima,User Time Zone_Lisbon,User Time Zone_Ljubljana,User Time Zone_London,User Time Zone_Madrid,User Time Zone_Mazatlan,User Time Zone_Melbourne,User Time Zone_Mexico City,User Time Zone_Mid-Atlantic,User Time Zone_Midway Island,User Time Zone_Minsk,User Time Zone_Monrovia,User Time Zone_Monterrey,User Time Zone_Moscow,User Time Zone_Mountain Time (US & Canada),User Time Zone_Mumbai,User Time Zone_Muscat,User Time Zone_Nairobi,User Time Zone_New 
Caledonia,User Time Zone_New Delhi,User Time Zone_Osaka,User Time Zone_Pacific Time (US & Canada),User Time Zone_Pacific/Auckland,User Time Zone_Paris,User Time Zone_Perth,User Time Zone_Port Moresby,User Time Zone_Prague,User Time Zone_Pretoria,User Time Zone_Quito,User Time Zone_Riga,User Time Zone_Riyadh,User Time Zone_Rome,User Time Zone_Samoa,User Time Zone_Santiago,User Time Zone_Sarajevo,User Time Zone_Saskatchewan,User Time Zone_Seoul,User Time Zone_Singapore,User Time Zone_Sofia,User Time Zone_Stockholm,User Time Zone_Sydney,User Time Zone_Taipei,User Time Zone_Tallinn,User Time Zone_Tashkent,User Time Zone_Tbilisi,User Time Zone_Tehran,User Time Zone_Tokyo,User Time Zone_UTC,User Time Zone_Ulaan Bataar,User Time Zone_Urumqi,User Time Zone_Vienna,User Time Zone_Vilnius,User Time Zone_Volgograd,User Time Zone_Warsaw,User Time Zone_Wellington,User Time Zone_West Central Africa,User Time Zone_Yerevan,User Time Zone_Zagreb,Profile Category_business,Profile Category_celebrity,Profile Category_government,Profile Category_unknown,Profile Image Avatar (18)_class 0,Profile Image Avatar (18)_class 1,Profile Image Avatar (18)_class 10,Profile Image Avatar (18)_class 11,Profile Image Avatar (18)_class 12,Profile Image Avatar (18)_class 13,Profile Image Avatar (18)_class 14,Profile Image Avatar (18)_class 15,Profile Image Avatar (18)_class 16,Profile Image Avatar (18)_class 17,Profile Image Avatar (18)_class 2,Profile Image Avatar (18)_class 3,Profile Image Avatar (18)_class 4,Profile Image Avatar (18)_class 5,Profile Image Avatar (18)_class 6,Profile Image Avatar (18)_class 7,Profile Image Avatar (18)_class 8,Profile Image Avatar (18)_class 9,Profile Image Dominant Color_avatar blue,Profile Image Dominant Color_avatar green,Profile Image Dominant Color_avatar red,Has Personal URL_False,Has Personal URL_True
0,-1.282498,2.033501,0.965351,-0.133948,1.001049,0.695624,0.235038,-0.776784,-0.685076,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,-8.211642000000001e-17,0.0,0.331944,0.837235,-0.263673,-0.09169,-0.322581,-2.029941,2.279596,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
2,0.9364191,0.445004,0.492591,-0.940355,0.613012,-0.417813,-0.446287,0.91434,-1.622217,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.8860139,-0.029732,0.836834,0.385486,0.382624,1.019229,1.010769,0.195681,0.584002,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.9364191,0.445004,0.775635,-1.09548,-0.792527,-0.632122,-0.727261,-1.971533,0.544463,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


# Gradient boosting (scikit-learn `GradientBoostingRegressor`)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Base estimator; random_state fixed for repeatability.
gb= GradientBoostingRegressor(random_state=0)

# Search space sampled by RandomizedSearchCV.
# n_estimators is a single large value; n_iter_no_change is searched alongside
# it (scikit-learn's early-stopping setting), so 10000 acts as an upper bound
# on the number of trees.
parameters = {'n_estimators': [10000],
              'n_iter_no_change': [6, 8, 10],
              'learning_rate': [0.06, 0.07],
              'min_samples_split': range(8, 25),
              'min_samples_leaf': range(3, 10),
              'max_depth': range(3, 7)}

# 100 sampled configurations, scored by negative MSE, using all available cores.
reg_gb = RandomizedSearchCV(gb, parameters, scoring='neg_mean_squared_error',
                            n_jobs=-1, random_state=0, n_iter=100, verbose=10)
reg_gb.fit(X, y)

# y is log1p(likes), so the square root of the (negated) best score is the RMSLE.
print(f'RMSLE: {np.sqrt(-reg_gb.best_score_):.3f}',
      f'\nbest parameters: { reg_gb.best_params_}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   52.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  5.1min


In [None]:
# NOTE(review): bare literal - presumably a score noted by hand from an earlier
# run; confirm and move it into a markdown cell.
1.715

# Submission

In [27]:
def create_submission(estimator, number, data_dir='data',
                      submissions_dir='submissions'):
    """Predict on the test set and write a submission CSV.

    Relies on the notebook-level ``pipe`` and ``preprocessor`` objects having
    been fitted on the training data.

    Parameters
    ----------
    estimator : fitted regressor
        Must expose ``predict`` and have been trained on the log1p target.
    number : int
        Preferred submission number; incremented until the file name
        ``submission<number>.csv`` is unused, so earlier files are never
        overwritten.
    data_dir : str, default 'data'
        Directory containing ``test.csv`` and
        ``test_profile_images_data.csv``.
    submissions_dir : str, default 'submissions'
        Directory the submission file is written to (created if missing).

    Returns
    -------
    None
        Writes the CSV and prints the submission number that was used.
    """
    X_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

    # Attach the image-derived features to the *test* frame. Writing them
    # into the global training frame would leave X_test without these
    # columns and silently corrupt the training data.
    im_df = pd.read_csv(os.path.join(data_dir, 'test_profile_images_data.csv'))
    image_cols = ['Profile Image Avatar (18)', 'Profile Image Dominant Color']
    X_test[image_cols] = im_df[image_cols]

    # Keep the ids before the pipeline drops the 'Id' column.
    Id = X_test['Id']
    X_test = pipe.transform(X_test)
    X_test = preprocessor.transform(X_test)
    y_pred = estimator.predict(X_test)
    # The model predicts log1p(likes); map back to the original scale.
    y_pred_test = pd.Series(np.expm1(y_pred))
    submission = pd.DataFrame({'Id': Id, 'Predicted': y_pred_test})

    if submissions_dir:
        os.makedirs(submissions_dir, exist_ok=True)
    path = os.path.join(submissions_dir, f'submission{number}.csv')
    while os.path.exists(path):
        number += 1
        path = os.path.join(submissions_dir, f'submission{number}.csv')
    submission.to_csv(path, index=False)
    print(f'submission #{number} created')

In [28]:
# NOTE(review): `reg_svr` is not defined in any cell shown here (only `reg_gb`
# is fitted above); confirm the SVR search cell still exists, otherwise this
# call fails on a fresh kernel.
create_submission(reg_svr, 1)

submission #4 created


In [29]:
# NOTE(review): create_submission writes 'submission<N>.csv'; this file name
# was presumably produced by a manual rename - confirm it exists before re-running.
pd.read_csv('submissions/submission4_02.csv').head()

Unnamed: 0,Id,Predicted
0,49I3SOKLI2CMNGP4,2809.866389
1,727IRIR59A3P88LK,3132.848317
2,LN95SD15SRPCEE8F,327.989895
3,TB11I7F0PN033D4T,5138.042429
4,32PSGCK5PATHMR07,271.653679
