In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import r2_score

In [2]:
df_dev = pd.read_csv('../dataset/development.csv')
df_eval = pd.read_csv('../dataset/evaluation.csv')

df = pd.concat([df_dev, df_eval], sort=False)
df.head()

Unnamed: 0,id,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,...,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,data_channel,weekday
0,0,http://mashable.com/2014/09/08/safest-cabbies-...,121.0,12.0,1015.0,0.422018,1.0,0.545031,10.0,6.0,...,-0.160714,-0.5,-0.071429,0.0,0.0,0.5,0.0,2900.0,bus,tuesday
1,1,http://mashable.com/2013/07/25/3d-printed-rifle/,532.0,9.0,503.0,0.569697,1.0,0.737542,9.0,0.0,...,-0.1575,-0.25,-0.1,0.0,0.0,0.5,0.0,1300.0,tech,thursday
2,2,http://mashable.com/2013/10/30/digital-dinosau...,435.0,9.0,232.0,0.646018,1.0,0.748428,12.0,3.0,...,-0.4275,-1.0,-0.1875,0.0,0.0,0.5,0.0,17700.0,lifestyle,wednesday
3,3,http://mashable.com/2014/08/27/homer-simpson-i...,134.0,12.0,171.0,0.722892,1.0,0.867925,9.0,5.0,...,-0.216667,-0.25,-0.166667,0.4,-0.25,0.1,0.25,1500.0,bus,wednesday
4,4,http://mashable.com/2013/01/10/creepy-robotic-...,728.0,11.0,286.0,0.652632,1.0,0.8,5.0,2.0,...,-0.251786,-0.5,-0.1,0.2,-0.1,0.3,0.1,1400.0,tech,thursday


In [3]:
df_dev.shape

(31715, 50)

## Preprocessing

In [4]:
def final_preprocessing(df, reduce_df=True):
    df_preproc = df.copy()

    # one hot encoding
    enc = OneHotEncoder()
    encoded_df = pd.concat([df_preproc['weekday'], df_preproc['data_channel']], axis=1)
    enc.fit(encoded_df)
    encoded_df = enc.transform(encoded_df)
    additional_columns = enc.get_feature_names_out()
    df_preproc[additional_columns] = encoded_df.toarray()
    df_preproc.drop(['weekday', 'data_channel', 'url', 'id'], axis = 1, inplace=True)

    # drop from feature selection
    df_preproc.drop(columns=['n_non_stop_words', 'kw_min_min', 'kw_max_max'], inplace=True)

    if reduce_df:
        # remove n_tokens_content less than 0
        df_preproc = df_preproc.query("n_tokens_content > 0")
        df_preproc['n_tokens_content'] = np.log(df_preproc['n_tokens_content'])
        # Remove outliers from kw_avg_avg (we lost another 9% of the dataset)
        q1 = df_preproc['kw_avg_avg'].describe()['25%']
        q3 = df_preproc['kw_avg_avg'].describe()['75%']
        iqr = q3 - q1
        min_kw_avg_avg = q1 - 1.5*iqr
        max_kw_avg_avg = q3 + 1.5*iqr
        df_preproc = df_preproc[(df_preproc.kw_avg_avg < max_kw_avg_avg) & (df_preproc.kw_avg_avg > min_kw_avg_avg)]
    else:
        df_preproc['n_tokens_content'] = np.log(1 + df_preproc['n_tokens_content'])

    # adjust num_imgs, num_self_hrefs, num_videos, num_hrefs
    df_preproc['num_imgs'].fillna(df_preproc['num_imgs'].mean(), inplace=True)
    df_preproc['num_imgs'] = np.log(1 + df_preproc['num_imgs'])
    df_preproc['num_self_hrefs'].fillna(df_preproc['num_self_hrefs'].mean(), inplace=True)
    df_preproc['num_self_hrefs'] = np.log(1 + df_preproc['num_self_hrefs'])
    df_preproc['num_videos'].fillna(df_preproc['num_videos'].mean(), inplace=True)
    df_preproc['num_videos'] = np.log(1 + df_preproc['num_videos'])
    df_preproc['num_hrefs'] = np.log(1 + df_preproc['num_hrefs'])

    std_scaler = StandardScaler().fit(df_preproc[['n_tokens_title', 'n_tokens_content']])
    scaled_features = std_scaler.transform(df_preproc[['n_tokens_title', 'n_tokens_content']])
    df_preproc[['n_tokens_title', 'n_tokens_content']] = scaled_features

    df_preproc['avg_negative_polarity'] = df_preproc['avg_negative_polarity'].abs()

    # Since this features has a range between [0, 10], we can apply a min max scaling
    df_preproc['num_keywords'] = df.groupby(['data_channel'], sort=False)['num_keywords'].apply(lambda x: x.fillna(x.mean())).reset_index()['num_keywords']
    std_scaler = MinMaxScaler().fit(df_preproc[['num_keywords']])
    scaled_features = std_scaler.transform(df_preproc[['num_keywords']])
    df_preproc[['num_keywords']] = scaled_features

    if 'shares' in df_preproc:
        df_preproc['shares'] = np.log(df_preproc['shares'])

    std_scaler = StandardScaler().fit(df_preproc[['kw_avg_max', 'kw_avg_avg', 'kw_avg_min', 'kw_min_avg', 'kw_max_avg', 'kw_max_min', 'kw_min_max']])
    scaled_features = std_scaler.transform(df_preproc[['kw_avg_max', 'kw_avg_avg', 'kw_avg_min', 'kw_min_avg','kw_max_avg', 'kw_max_min', 'kw_min_max']])
    df_preproc[['kw_avg_max', 'kw_avg_avg', 'kw_avg_min', 'kw_min_avg','kw_max_avg', 'kw_max_min', 'kw_min_max']] = scaled_features

    std_scaler = StandardScaler().fit(df_preproc[['self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess']])
    scaled_features = std_scaler.transform(df_preproc[['self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess']])
    df_preproc[['self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess']] = scaled_features


    is_weekend = []
    for _, row in df_preproc.iterrows():
        if row['weekday_sunday'] == 1 or row['weekday_saturday'] == 1:
            is_weekend.append(1)
        else:
            is_weekend.append(0)
    df_preproc['is_weekend'] = is_weekend

    std_scaler = StandardScaler().fit(df_preproc[['timedelta']])
    scaled_features = std_scaler.transform(df_preproc[['timedelta']])
    df_preproc[['timedelta']] = scaled_features

    return df_preproc

In [5]:
working_df_dev = final_preprocessing(df_dev)
df_working_df_eval = final_preprocessing(df_eval, reduce_df=False)
(len(df_working_df_eval), len(df_eval))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_preproc['n_tokens_content'] = np.log(df_preproc['n_tokens_content'])
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_preproc['num_keywords'] = df.groupby(['data_channel'], sort=False)['num_keywords'].apply(lambda x: x.fillna(x.mean())).reset_index()['num_keywords']
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_preproc['num_keywords'] = df.groupby(['data_channel'], sort=False)['num_keywords'].apply(lambda x: x.fillna(x.mean())).reset_index()['num_keywords']


(7917, 7917)

## Hyperparameter tuning

In [6]:
X = working_df_dev.drop(columns=["shares"]).values
y = working_df_dev["shares"].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=True, random_state=42)

In [23]:
params = {
    "objective": ['reg:squarederror', 'reg:pseudohubererror'],
    "max_depth": [4,7,8,10],
    "min_child_weight": [3,5,7,9,12,15],
    "eta": [0.001, 0.0025, 0.005, 0.01, 0.025, 0.05],
    "eval_metric": [mean_squared_error],
}

In [24]:
def RMSE(y_true,y_pred):
    mse = mean_squared_error(y_true, y_pred, squared=False)
    print('MSE: %2.3f' % mse)
    return mse

def R2(y_true,y_pred):    
     r2 = r2_score(y_true, y_pred)
     adj_r2 = 1-(1-r2)*(len(X_valid) - 1)/(len(X_valid) - X_valid.shape[1] - 1)
     print('R2: %2.3f' % adj_r2)
     return adj_r2

def two_score(y_true,y_pred):    
    RMSE(y_true,y_pred) #set score here and not below if using MSE in GridCV
    score = R2(y_true,y_pred)
    return score

def two_scorer():
    return make_scorer(two_score, greater_is_better=False) # change for false if using MSE

def rmse_scorer():
    return make_scorer(RMSE, greater_is_better=False) # change for false if using MSE

In [25]:
gs = GridSearchCV(XGBRegressor(), param_grid=params, cv=5, scoring='r2')
gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.1639529628590907
{'eta': 0.05, 'eval_metric': <function mean_squared_error at 0x7f29aca60820>, 'max_depth': 7, 'min_child_weight': 12, 'objective': 'reg:squarederror'}


All the features has been used to train xgboost regressor and others. I tried also to decrease the number but the score decreases

In [26]:
rms = mean_squared_error(y_valid, gs.predict(X_valid), squared=False)
print(rms)
r2 = r2_score(y_valid, gs.predict(X_valid))
adj_r2 = 1-(1-r2)*(len(X_valid) - 1)/(len(X_valid) - X_valid.shape[1] - 1)
print(adj_r2)

0.814497689628405
0.15743544269862797


Recall that for each of these models you can check the feature importance, so that remove some features

In [47]:
# feature_names = working_df_dev.drop(columns=["shares"]).columns
# sorted(zip(feature_names, xgb_r.feature_importances_), key=lambda x: x[1], reverse=True)

[('data_channel_tech', 0.1563371),
 ('kw_avg_avg', 0.12027372),
 ('data_channel_socmed', 0.07851338),
 ('data_channel_entertainment', 0.07000306),
 ('is_weekend', 0.056433074),
 ('data_channel_lifestyle', 0.033759877),
 ('kw_min_avg', 0.033008665),
 ('data_channel_world', 0.03147098),
 ('num_hrefs', 0.026406191),
 ('min_positive_polarity', 0.025813013),
 ('global_subjectivity', 0.02474313),
 ('num_videos', 0.0245458),
 ('kw_avg_max', 0.024481896),
 ('num_imgs', 0.022690505),
 ('title_subjectivity', 0.022096708),
 ('title_sentiment_polarity', 0.021878142),
 ('global_sentiment_polarity', 0.021053853),
 ('n_tokens_content', 0.020581577),
 ('average_token_length', 0.020086395),
 ('global_rate_negative_words', 0.020071955),
 ('avg_negative_polarity', 0.02002607),
 ('n_non_stop_unique_tokens', 0.020018425),
 ('rate_negative_words', 0.019788092),
 ('avg_positive_polarity', 0.019588307),
 ('global_rate_positive_words', 0.01914742),
 ('max_positive_polarity', 0.018795239),
 ('num_keywords', 0.0

In [57]:
X = working_df_dev.drop(columns=["shares"]).values
y = working_df_dev["shares"].values

In [14]:
xgbr = XGBRegressor(gs.best_params_)
xgbr.fit(X, y)



XGBoostError: [11:08:02] ../src/objective/objective.cc:26: Unknown objective function: `{'eta': 0.05, 'eval_metric': <function mean_squared_error at 0x7f29aca60820>, 'max_depth': 7, 'min_child_weight': 12}`
Objective candidate: survival:aft
Objective candidate: binary:hinge
Objective candidate: multi:softmax
Objective candidate: multi:softprob
Objective candidate: rank:pairwise
Objective candidate: rank:ndcg
Objective candidate: rank:map
Objective candidate: reg:squarederror
Objective candidate: reg:squaredlogerror
Objective candidate: reg:logistic
Objective candidate: binary:logistic
Objective candidate: binary:logitraw
Objective candidate: reg:linear
Objective candidate: reg:pseudohubererror
Objective candidate: count:poisson
Objective candidate: survival:cox
Objective candidate: reg:gamma
Objective candidate: reg:tweedie
Objective candidate: reg:absoluteerror

Stack trace:
  [bt] (0) /home/gesposito/miniconda3/envs/thesis/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x3708b3) [0x7f299bfeb8b3]
  [bt] (1) /home/gesposito/miniconda3/envs/thesis/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x370f8f) [0x7f299bfebf8f]
  [bt] (2) /home/gesposito/miniconda3/envs/thesis/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x2e543c) [0x7f299bf6043c]
  [bt] (3) /home/gesposito/miniconda3/envs/thesis/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x2ebfd7) [0x7f299bf66fd7]
  [bt] (4) /home/gesposito/miniconda3/envs/thesis/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x2e02f8) [0x7f299bf5b2f8]
  [bt] (5) /home/gesposito/miniconda3/envs/thesis/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x70) [0x7f299bdb75f0]
  [bt] (6) /home/gesposito/miniconda3/envs/thesis/lib/python3.8/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f29ebd57630]
  [bt] (7) /home/gesposito/miniconda3/envs/thesis/lib/python3.8/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7f29ebd56fed]
  [bt] (8) /home/gesposito/miniconda3/envs/thesis/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(+0x1084a) [0x7f29ebd6b84a]



Objective candidate: survival:aft
Objective candidate: binary:hinge
Objective candidate: multi:softmax
Objective candidate: multi:softprob
Objective candidate: rank:pairwise
Objective candidate: rank:ndcg
Objective candidate: rank:map
Objective candidate: reg:squarederror
Objective candidate: reg:squaredlogerror
Objective candidate: reg:logistic
Objective candidate: binary:logistic
Objective candidate: binary:logitraw
Objective candidate: reg:linear
Objective candidate: reg:pseudohubererror
Objective candidate: count:poisson
Objective candidate: survival:cox
Objective candidate: reg:gamma
Objective candidate: reg:tweedie
Objective candidate: reg:absoluteerror

In [51]:
# Make final predictions
y_pred = xgbr.predict(df_working_df_eval.values)
final_preds = np.exp(y_pred)
# Write CSV
id_col = df_eval['id']
new_df = pd.DataFrame(columns=['Id', 'Predicted'])
new_df['Id'] = id_col
new_df['Predicted'] = final_preds
print(new_df.describe())
new_df.to_csv('../output/output_xgboost_submit.csv', columns=['Id','Predicted'], index=False)

## Generate final CSV

In [None]:
# Get the best configuration from the grid search

In [27]:
# Make final predictions
y_pred = gs.predict(df_working_df_eval.values)
final_preds = np.exp(y_pred)
# Write CSV
id_col = df_eval['id']
new_df = pd.DataFrame(columns=['Id', 'Predicted'])
new_df['Id'] = id_col
new_df['Predicted'] = final_preds
print(new_df.describe())
new_df.to_csv('../output/NAME.csv', columns=['Id','Predicted'], index=False)

                 Id    Predicted
count   7917.000000  7917.000000
mean   35679.634584  1788.035156
std     2289.051312   635.961304
min    31715.000000   434.370697
25%    33699.000000  1342.390991
50%    35680.000000  1667.848267
75%    37661.000000  2084.888916
max    39643.000000  9050.632812
