In [1]:
%matplotlib inline
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# set seeds
np.random.seed(1)
random.seed(1)

Comments:
* could try to impute np.nan in categorical instead of creating 'missing'
* improve pipelines to have juste one pipeline

# Clean data

## Create and drop features, manage categorical

In [2]:
# Function to prepare data

from sklearn.base import TransformerMixin

def create_new_features(X):
    # transform utc offset:
    X['Sin_UTC'] = np.sin((11 * 3600 + X['UTC Offset']) / (24 * 3600) * 2 * np.pi)
    X['Cos_UTC'] = np.cos((11 * 3600 + X['UTC Offset']) / (24 * 3600) * 2 * np.pi)
    
    # time since creation in days
    duration = (pd.to_datetime('today') - 
                pd.to_datetime(X['Profile Creation Timestamp']).dt.tz_localize(None))
    X['Duration'] = duration.apply(lambda x: x.days)
    
    # convert personal url into True/False (NaNs or unique)
    X['Has Personal URL'] = X['Personal URL'].notna()
    
    # log(x + 1) of numerical features
    for feature in ['Num of Followers', 'Num of People Following',
                    'Num of Status Updates', 'Num of Direct Messages',
                    'Avg Daily Profile Visit Duration in seconds',
                    'Avg Daily Profile Clicks']:
        X[f'log {feature}'] = np.log1p(X[feature])

    return X
    
def clean_features(X):
    # merge categories names with and without cap letter
    X['Location Public Visibility'] = X['Location Public Visibility'].str.lower()
    return X

def drop_features(X, l_features):
    return X.drop(columns=l_features)
    
class RemoveCategories(TransformerMixin):
    def __init__(self, min_obs=10, l_cols=[]):
        self.min_obs = min_obs
        self.l_cats = {}
        self.l_cols = l_cols

    def fit(self, X):
        # assumes all columns of df_cat are strings
        for col in self.l_cols:
            val_counts = X[col].fillna('missing').value_counts()
            self.l_cats[col] = val_counts[val_counts >= self.min_obs].index.tolist()
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        for col in self.l_cols:
            X.loc[~ X[col].isin(self.l_cats[col]), col] = 'other'        
        return X

In [3]:
# train data
df = pd.read_csv('data/train.csv')
y = np.log1p(df['Num of Profile Likes'].values)
X = df.drop(columns='Num of Profile Likes')

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

min_obs_cat = 10
l_cat_small = ['Profile Text Color', 'Profile Page Color',
               'Profile Theme Color', 'User Language', 'Location',
               'User Time Zone']
l_drop = ['Id', 'User Name', 'Profile Image', 'Personal URL', 'UTC Offset', 
          'Profile Creation Timestamp']
l_drop_logs = ['Num of Followers', 'Num of People Following',
               'Num of Status Updates', 'Num of Direct Messages',
               'Avg Daily Profile Visit Duration in seconds', 
               'Avg Daily Profile Clicks']

features_to_drop =dict(l_features = l_drop + l_drop_logs)

pipe = Pipeline([('create new features',
                  FunctionTransformer(create_new_features)),
                 ('clean data', FunctionTransformer(clean_features)),
                 ('remove categories', RemoveCategories(min_obs=min_obs_cat,
                                                        l_cols=l_cat_small)),
                 ('drop variables', FunctionTransformer(drop_features,
                                    kw_args=features_to_drop))
                ])

X = pipe.fit_transform(X)

## Imputation and encoding/scaling

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# numerical features
num_features = X.select_dtypes(exclude=['object', 'bool']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())
    ])

cat_features = X.select_dtypes(include=['object', 'bool']).columns

# categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(sparse=False))])

# preprocessing
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, num_features),
                  ('cat', categorical_transformer, cat_features)])
X = preprocessor.fit_transform(X)

# Linear models

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error

reg = LinearRegression()
scores = cross_val_score(reg, X, y, cv=10, scoring='neg_mean_squared_error',
                         n_jobs=-1)

print(f'mean score: {np.mean(scores)}')
print(f'std score: {np.std(scores)}')

mean score: -3.3635096613105935
std score: 0.21865189471770438


In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

parameters = {'alpha': np.linspace(1, 50, 20), 'normalize':[False, True]}
reg = Ridge()
reg_cv = GridSearchCV(reg, parameters, scoring='neg_mean_squared_error',
                      n_jobs=-1)
reg_cv.fit(X, y)
print(reg_cv.best_score_, reg_cv.best_params_)

-3.260161739761636 {'alpha': 44.8421052631579, 'normalize': False}


In [13]:
from sklearn.linear_model import Lasso
parameters = {'alpha': np.logspace(-2, 2, 10), 'normalize':[False, True]}
reg = Lasso()
reg_cv = GridSearchCV(reg, parameters, scoring='neg_mean_squared_error',
                      n_jobs=-1)
reg_cv.fit(X, y)
print(reg_cv.best_score_, reg_cv.best_params_)

-3.318874968170141 {'alpha': 0.01, 'normalize': False}


In [22]:
from sklearn.linear_model import ElasticNet
parameters = {'alpha': np.logspace(-1, 1, 8),
              'l1_ratio': np.logspace(-1, 1, 8), 
              'normalize':[False, True]}
reg = ElasticNet()
reg_cv = GridSearchCV(reg, parameters, scoring='neg_mean_squared_error',
                      n_jobs=1)
reg_cv.fit(X, y)
print(reg_cv.best_score_, reg_cv.best_params_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


-3.391009805995496 {'alpha': 0.1, 'l1_ratio': 0.1, 'normalize': False}
