In [41]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.ensemble import RandomForestRegressor

In [49]:
# Read the development and evaluation CSV files into DataFrames.
df_dev, df_eval = (
    pd.read_csv(csv_path)
    for csv_path in ('../dataset/development.csv', '../dataset/evaluation.csv')
)

## Preprocessing

In [50]:
def final_preprocessing(df, reduce_df=True):
    """Turn a raw articles DataFrame into the numeric frame fed to the models.

    The input is never modified; all work happens on a copy.

    Steps, in order:
      1. Fill missing ``num_keywords`` with the mean of the row's
         ``data_channel`` group.
      2. One-hot encode ``weekday`` and ``data_channel`` and drop the raw
         categorical / identifier columns plus the features discarded by
         feature selection.
      3. Optionally drop rows whose ``kw_avg_avg`` lies outside the
         1.5 * IQR fences.
      4. Mean-impute and log-transform the count features.
      5. Standardise / min-max scale the numeric features.
      6. Log-transform ``shares`` when that column is present.
      7. Add an ``is_weekend`` indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain every column referenced below (``weekday``,
        ``data_channel``, ``url``, ``id``, the ``kw_*`` columns, ...).
        ``shares`` is optional.
    reduce_df : bool, default True
        When True, rows that are ``kw_avg_avg`` outliers are removed, so the
        result can have fewer rows than ``df``. Pass False when every input
        row must be kept.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame, indexed like the retained rows of ``df``.

    Notes
    -----
    NOTE(review): the encoder and every scaler are fitted on the frame passed
    in, so calling this separately on two datasets applies different scaling
    parameters (and possibly different one-hot columns) to each. Confirm this
    is acceptable for the models trained downstream.
    """
    df_preproc = df.copy()

    # Impute num_keywords per data_channel. This runs before any row is
    # filtered and before data_channel is dropped; transform() returns a
    # Series indexed like its input, so values stay on their own rows.
    channel_mean = df_preproc.groupby('data_channel', sort=False)['num_keywords'].transform('mean')
    df_preproc['num_keywords'] = df_preproc['num_keywords'].fillna(channel_mean)

    # one hot encoding
    enc = OneHotEncoder()
    encoded = enc.fit_transform(df_preproc[['weekday', 'data_channel']])
    additional_columns = enc.get_feature_names_out()
    df_preproc[additional_columns] = encoded.toarray()

    # Drop the raw categorical / identifier columns and the features
    # discarded by feature selection.
    df_preproc = df_preproc.drop(columns=[
        'weekday', 'data_channel', 'url', 'id',
        'n_non_stop_words', 'kw_min_min', 'kw_max_max',
    ])

    # Compress the range of n_tokens_content with log(1 + x).
    df_preproc['n_tokens_content'] = np.log1p(df_preproc['n_tokens_content'])

    if reduce_df:
        # Remove outliers from kw_avg_avg (we lost another 9% of the dataset)
        q1 = df_preproc['kw_avg_avg'].quantile(0.25)
        q3 = df_preproc['kw_avg_avg'].quantile(0.75)
        iqr = q3 - q1
        min_kw_avg_avg = q1 - 1.5 * iqr
        max_kw_avg_avg = q3 + 1.5 * iqr
        inside_fences = (
            (df_preproc['kw_avg_avg'] > min_kw_avg_avg)
            & (df_preproc['kw_avg_avg'] < max_kw_avg_avg)
        )
        # .copy() so the assignments below write to an independent frame
        # rather than to a slice of the unfiltered one.
        df_preproc = df_preproc[inside_fences].copy()

    # adjust num_imgs, num_self_hrefs, num_videos, num_hrefs:
    # mean-impute the first three, then log(1 + x) all four.
    for col in ['num_imgs', 'num_self_hrefs', 'num_videos']:
        df_preproc[col] = df_preproc[col].fillna(df_preproc[col].mean())
    for col in ['num_imgs', 'num_self_hrefs', 'num_videos', 'num_hrefs']:
        df_preproc[col] = np.log1p(df_preproc[col])

    df_preproc['avg_negative_polarity'] = df_preproc['avg_negative_polarity'].abs()

    # Since this feature has a range between [0, 10], we can apply a min max scaling
    df_preproc[['num_keywords']] = MinMaxScaler().fit_transform(df_preproc[['num_keywords']])

    if 'shares' in df_preproc:
        df_preproc['shares'] = np.log(df_preproc['shares'])

    # StandardScaler works column by column, so all standardised features
    # can share one fit/transform call.
    standardized_cols = [
        'n_tokens_title', 'n_tokens_content',
        'kw_avg_max', 'kw_avg_avg', 'kw_avg_min', 'kw_min_avg',
        'kw_max_avg', 'kw_max_min', 'kw_min_max',
        'self_reference_min_shares', 'self_reference_max_shares',
        'self_reference_avg_sharess',
        'timedelta',
    ]
    df_preproc[standardized_cols] = StandardScaler().fit_transform(df_preproc[standardized_cols])

    # 1 when the article's one-hot weekday is Saturday or Sunday, else 0.
    df_preproc['is_weekend'] = (
        (df_preproc['weekday_sunday'] == 1) | (df_preproc['weekday_saturday'] == 1)
    ).astype(int)

    return df_preproc

In [51]:
# Preprocess the development set with kw_avg_avg outlier removal enabled.
working_df_dev = final_preprocessing(df_dev, reduce_df=True)

In [52]:
# Preprocess the evaluation set without outlier removal.
df_working_df_eval = final_preprocessing(df=df_eval, reduce_df=False)
# Show the row counts after and before preprocessing side by side.
tuple(map(len, (df_working_df_eval, df_eval)))

(7917, 7917)

## RandomForestRegressor model

In [39]:
# Target vector: the "shares" column of the preprocessed development set.
y = working_df_dev["shares"].values
# Feature matrix: every remaining column, as a plain array.
X = working_df_dev.drop("shares", axis=1).values

In [40]:
# Random forest with 100 trees and a fixed seed for reproducibility.
# n_jobs=-1 builds the trees in parallel on all available cores, which
# shortens the fit without affecting the result for a fixed random_state.
reg = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
reg.fit(X, y)


KeyboardInterrupt



In [19]:
# Make final predictions
# Select the evaluation features by name, in the same column order used to
# build X, so a missing or reordered column fails loudly (KeyError) instead
# of silently feeding misaligned features to the model.
feature_cols = working_df_dev.drop(columns=["shares"]).columns
y_pred = reg.predict(df_working_df_eval[feature_cols].values)
# final_preprocessing log-transforms "shares", so exponentiate to get back
# to the original scale.
final_preds = np.exp(y_pred)
# Write CSV
id_col = df_eval['id']
new_df = pd.DataFrame({'Id': id_col, 'Predicted': final_preds})
print(new_df.describe())
new_df.to_csv('../output/output_4_submit.csv', columns=['Id','Predicted'], index=False)

                 Id    Predicted
count   7917.000000  7917.000000
mean   35679.634584  1946.647866
std     2289.051312   726.951028
min    31715.000000   430.352947
25%    33699.000000  1418.882316
50%    35680.000000  1804.181471
75%    37661.000000  2308.108594
max    39643.000000  8278.567419


## GradientBoostingRegressor model

In [20]:
# Rebuild the target vector and feature matrix from the preprocessed
# development set for the gradient boosting model.
y = working_df_dev["shares"].values
X = working_df_dev.drop("shares", axis=1).values

In [21]:
# Hyper-parameters for the gradient boosting regressor.
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
    # Fixed seed so repeated runs give the same model (same seed as the
    # random forest cell).
    "random_state": 42,
}

reg = GradientBoostingRegressor(**params)
reg.fit(X, y)

In [22]:
# Make final predictions
# Select the evaluation features by name, in the same column order used to
# build X, so a missing or reordered column fails loudly (KeyError) instead
# of silently feeding misaligned features to the model.
feature_cols = working_df_dev.drop(columns=["shares"]).columns
y_pred = reg.predict(df_working_df_eval[feature_cols].values)
# final_preprocessing log-transforms "shares", so exponentiate to get back
# to the original scale.
final_preds = np.exp(y_pred)
# Write CSV
id_col = df_eval['id']
new_df = pd.DataFrame({'Id': id_col, 'Predicted': final_preds})
print(new_df.describe())
new_df.to_csv('../output/output_gradboostingregr_submit.csv', columns=['Id','Predicted'], index=False)

                 Id    Predicted
count   7917.000000  7917.000000
mean   35679.634584  1864.520639
std     2289.051312   564.790037
min    31715.000000   865.703629
25%    33699.000000  1447.786002
50%    35680.000000  1770.196672
75%    37661.000000  2160.964434
max    39643.000000  7406.089016


## XGBoost model

In [53]:
# Rebuild the target vector and feature matrix from the preprocessed
# development set for the XGBoost model.
y = working_df_dev["shares"].values
X = working_df_dev.drop("shares", axis=1).values

In [54]:
# Hyper-parameters for the XGBoost regressor.
# NOTE(review): fit() is called without an eval_set, so eval_metric
# presumably is never evaluated here — confirm it is still wanted.
xgb_params = dict(n_estimators=500, learning_rate=0.01, eval_metric='rmsle')
xgbr = XGBRegressor(**xgb_params)
xgbr.fit(X, y)

In [56]:
# Make final predictions
# Select the evaluation features by name, in the same column order used to
# build X, so a missing or reordered column fails loudly (KeyError) instead
# of silently feeding misaligned features to the model.
feature_cols = working_df_dev.drop(columns=["shares"]).columns
y_pred = xgbr.predict(df_working_df_eval[feature_cols].values)
# final_preprocessing log-transforms "shares", so exponentiate to get back
# to the original scale.
final_preds = np.exp(y_pred)
# Write CSV
id_col = df_eval['id']
new_df = pd.DataFrame({'Id': id_col, 'Predicted': final_preds})
print(new_df.describe())
new_df.to_csv('../output/output_xgboost_submit.csv', columns=['Id','Predicted'], index=False)

                 Id    Predicted
count   7917.000000  7917.000000
mean   35679.634584  1787.667847
std     2289.051312   595.940430
min    31715.000000   574.820740
25%    33699.000000  1352.073975
50%    35680.000000  1684.927246
75%    37661.000000  2101.483398
max    39643.000000  6792.044922
