In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [7]:
df_dev = pd.read_csv('../dataset/development.csv')
df_eval = pd.read_csv('../dataset/evaluation.csv')

df = pd.concat([df_dev, df_eval], sort=False)
df.head()

Unnamed: 0,id,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,...,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,data_channel,weekday
0,0,http://mashable.com/2014/09/08/safest-cabbies-...,121.0,12.0,1015.0,0.422018,1.0,0.545031,10.0,6.0,...,-0.160714,-0.5,-0.071429,0.0,0.0,0.5,0.0,2900.0,bus,tuesday
1,1,http://mashable.com/2013/07/25/3d-printed-rifle/,532.0,9.0,503.0,0.569697,1.0,0.737542,9.0,0.0,...,-0.1575,-0.25,-0.1,0.0,0.0,0.5,0.0,1300.0,tech,thursday
2,2,http://mashable.com/2013/10/30/digital-dinosau...,435.0,9.0,232.0,0.646018,1.0,0.748428,12.0,3.0,...,-0.4275,-1.0,-0.1875,0.0,0.0,0.5,0.0,17700.0,lifestyle,wednesday
3,3,http://mashable.com/2014/08/27/homer-simpson-i...,134.0,12.0,171.0,0.722892,1.0,0.867925,9.0,5.0,...,-0.216667,-0.25,-0.166667,0.4,-0.25,0.1,0.25,1500.0,bus,wednesday
4,4,http://mashable.com/2013/01/10/creepy-robotic-...,728.0,11.0,286.0,0.652632,1.0,0.8,5.0,2.0,...,-0.251786,-0.5,-0.1,0.2,-0.1,0.3,0.1,1400.0,tech,thursday


## Preprocessing

In [8]:
def final_preprocessing_eval(df, dev_stats):
    working_df_dev = df.copy()

    for index, row in working_df_dev.iterrows():
        if 'data_channel' in row and not row['num_keywords'] >= 0:
            working_df_dev.at[index, 'num_keywords'] = dev_stats['num_keywords_mean'][row['data_channel']]
            
    enc = OneHotEncoder()
    encoded_df = pd.concat([df['weekday'], df['data_channel']], axis=1)
    enc.fit(encoded_df)
    encoded_df = enc.transform(encoded_df)
    additional_columns = enc.get_feature_names_out()
    working_df_dev[additional_columns] = encoded_df.toarray()
    working_df_dev.drop(['weekday', 'data_channel', 'url', 'id'], axis = 1, inplace=True)



    working_df_dev['n_tokens_content'] = np.log(1 + working_df_dev['n_tokens_content'])

    std_scaler = dev_stats['kw_scaler']
    scaled_features = std_scaler.transform(working_df_dev[['kw_avg_max', 'kw_avg_avg', 'kw_avg_min', 'kw_min_avg','kw_max_avg', 'kw_max_min', 'kw_min_max']])
    working_df_dev[['kw_avg_max', 'kw_avg_avg', 'kw_avg_min', 'kw_min_avg','kw_max_avg', 'kw_max_min', 'kw_min_max']] = scaled_features

    std_scaler = dev_stats['ref_scaler']
    scaled_features = std_scaler.transform(working_df_dev[['self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess']])
    working_df_dev[['self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess']] = scaled_features

    std_scaler = dev_stats['scaler_details']
    scaled_features = std_scaler.transform(working_df_dev[['n_tokens_title', 'n_tokens_content']])
    working_df_dev[['n_tokens_title', 'n_tokens_content']] = scaled_features

    working_df_dev['num_imgs'].fillna(dev_stats['num_imgs_mean'], inplace=True)
    working_df_dev['num_imgs'] = np.log(1 + working_df_dev['num_imgs'])

    working_df_dev['num_self_hrefs'].fillna(dev_stats['num_self_hrefs_mean'], inplace=True)
    working_df_dev['num_self_hrefs'] = np.log(1 + working_df_dev['num_self_hrefs'])

    working_df_dev['num_videos'].fillna(dev_stats['num_videos_mean'], inplace=True)
    working_df_dev['num_videos'] = np.log(1 + working_df_dev['num_videos'])

    is_weekend = []
    for _, row in working_df_dev.iterrows():
        if row['weekday_sunday'] == 1 or row['weekday_saturday'] == 1:
            is_weekend.append(1)
        else:
            is_weekend.append(0)
    working_df_dev['is_weekend'] = is_weekend
    working_df_dev.drop(columns=[x for x in additional_columns if x.startswith('weekday')], inplace=True)

    std_scaler = dev_stats['time_scaler']
    scaled_features = std_scaler.transform(working_df_dev[['timedelta']])
    working_df_dev[['timedelta']] = scaled_features

    return working_df_dev

In [9]:
def final_preprocessing_dev(df):
    working_df_dev = df.copy()
    dev_stats = dict()

    enc = OneHotEncoder()
    encoded_df = pd.concat([df['weekday'], df['data_channel']], axis=1)
    enc.fit(encoded_df)
    encoded_df = enc.transform(encoded_df)
    additional_columns = enc.get_feature_names_out()
    working_df_dev[additional_columns] = encoded_df.toarray()
    working_df_dev.drop(['weekday', 'data_channel', 'url', 'id'], axis = 1, inplace=True)

    working_df_dev['num_keywords'] = df.groupby(['data_channel'], sort=False)['num_keywords'].apply(lambda x: x.fillna(x.mean())).reset_index()['num_keywords']
    dev_stats['num_keywords_mean'] = df.groupby(['data_channel'], sort=False)['num_keywords'].mean()

    working_df_dev['n_tokens_content'] = np.log(1 + working_df_dev['n_tokens_content'])

    working_df_dev['shares'] = np.log(working_df_dev['shares'])

    # Remove outliers from kw_avg_avg (we lost another 9% of the dataset)
    q1 = working_df_dev['kw_avg_avg'].describe()['25%']
    q3 = working_df_dev['kw_avg_avg'].describe()['75%']
    iqr = q3 - q1
    min_kw_avg_avg = q1 - 1.5*iqr
    max_kw_avg_avg = q3 + 1.5*iqr
    working_df_dev = working_df_dev[(df.kw_avg_avg < max_kw_avg_avg) & (df.kw_avg_avg > min_kw_avg_avg)]

    std_scaler = StandardScaler().fit(working_df_dev[['kw_avg_max', 'kw_avg_avg', 'kw_avg_min', 'kw_min_avg', 'kw_max_avg', 'kw_max_min', 'kw_min_max']])
    scaled_features = std_scaler.transform(working_df_dev[['kw_avg_max', 'kw_avg_avg', 'kw_avg_min', 'kw_min_avg','kw_max_avg', 'kw_max_min', 'kw_min_max']])
    working_df_dev[['kw_avg_max', 'kw_avg_avg', 'kw_avg_min', 'kw_min_avg','kw_max_avg', 'kw_max_min', 'kw_min_max']] = scaled_features
    dev_stats['kw_scaler'] = std_scaler

    std_scaler = StandardScaler().fit(working_df_dev[['self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess']])
    scaled_features = std_scaler.transform(working_df_dev[['self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess']])
    working_df_dev[['self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess']] = scaled_features
    dev_stats['ref_scaler'] = std_scaler

    std_scaler = StandardScaler().fit(working_df_dev[['n_tokens_title', 'n_tokens_content']])
    scaled_features = std_scaler.transform(working_df_dev[['n_tokens_title', 'n_tokens_content']])
    working_df_dev[['n_tokens_title', 'n_tokens_content']] = scaled_features
    dev_stats['scaler_details'] = std_scaler

    working_df_dev['num_imgs'].fillna(working_df_dev['num_imgs'].mean(), inplace=True)
    working_df_dev['num_imgs'] = np.log(1 + working_df_dev['num_imgs'])
    dev_stats['num_imgs_mean'] = working_df_dev['num_imgs'].mean()

    working_df_dev['num_self_hrefs'].fillna(working_df_dev['num_self_hrefs'].mean(), inplace=True)
    working_df_dev['num_self_hrefs'] = np.log(1 + working_df_dev['num_self_hrefs'])
    dev_stats['num_self_hrefs_mean'] = working_df_dev['num_self_hrefs'].mean()

    working_df_dev['num_videos'].fillna(working_df_dev['num_videos'].mean(), inplace=True)
    working_df_dev['num_videos'] = np.log(1 + working_df_dev['num_videos'])
    dev_stats['num_videos_mean'] = working_df_dev['num_videos'].mean()

    is_weekend = []
    for _, row in working_df_dev.iterrows():
        if row['weekday_sunday'] == 1 or row['weekday_saturday'] == 1:
            is_weekend.append(1)
        else:
            is_weekend.append(0)
    working_df_dev['is_weekend'] = is_weekend
    working_df_dev.drop(columns=[x for x in additional_columns if x.startswith('weekday')], inplace=True)

    std_scaler = StandardScaler().fit(working_df_dev[['timedelta']])
    scaled_features = std_scaler.transform(working_df_dev[['timedelta']])
    working_df_dev[['timedelta']] = scaled_features
    dev_stats['time_scaler'] = std_scaler

    return working_df_dev, dev_stats

In [10]:
def calcDrop(res):
    # All variables with correlation > cutoff
    all_corr_vars = list(set(res['v1'].tolist() + res['v2'].tolist()))

    # All unique variables in drop column
    poss_drop = list(set(res['drop'].tolist()))

    # Keep any variable not in drop column
    keep = list(set(all_corr_vars).difference(set(poss_drop)))

    # Drop any variables in same row as a keep variable
    p = res[ res['v1'].isin(keep)  | res['v2'].isin(keep) ][['v1', 'v2']]
    q = list(set(p['v1'].tolist() + p['v2'].tolist()))
    drop = (list(set(q).difference(set(keep))))

    # Remove drop variables from possible drop
    poss_drop = list(set(poss_drop).difference(set(drop)))

    # subset res dataframe to include possible drop pairs
    m = res[ res['v1'].isin(poss_drop)  | res['v2'].isin(poss_drop) ][['v1', 'v2','drop']]

    # remove rows that are decided (drop), take set and add to drops
    more_drop = set(list(m[~m['v1'].isin(drop) & ~m['v2'].isin(drop)]['drop']))
    for item in more_drop:
        drop.append(item)

    return drop

def corrX_new(df, cut = 0.9):
    # Get correlation matrix and upper triagle
    corr_mtx = df.corr().abs()
    avg_corr = corr_mtx.mean(axis = 1)
    up = corr_mtx.where(np.triu(np.ones(corr_mtx.shape), k=1).astype(np.bool_))
    dropcols = list()

    res = pd.DataFrame(columns=(['v1', 'v2', 'v1.target',
                                 'v2.target','corr', 'drop' ]))
    for row in range(len(up)-1):
        col_idx = row + 1
        for col in range (col_idx, len(up)):
            if corr_mtx.iloc[row, col] > cut:
                if avg_corr.iloc[row] > avg_corr.iloc[col]:
                    dropcols.append(row)
                    drop = corr_mtx.columns[row]
                else:
                    dropcols.append(col)
                    drop = corr_mtx.columns[col]

                s = pd.Series([ corr_mtx.index[row],
                                up.columns[col],
                                avg_corr[row],
                                avg_corr[col],
                                up.iloc[row,col],
                                drop],
                              index = res.columns)

                res.loc[len(res)] = s.to_numpy()

    dropcols_names = calcDrop(res)

    return dropcols_names

In [11]:
working_df_dev, dev_stats = final_preprocessing_dev(df_dev)
df_working_df_eval = final_preprocessing_eval(df_eval, dev_stats)

drop_new = corrX_new(working_df_dev, cut = 0.7)
working_df_dev.drop(drop_new, axis=1, inplace=True)
df_working_df_eval.drop(drop_new, axis=1, inplace=True)
(len(working_df_dev), len(df_eval))

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  working_df_dev['num_keywords'] = df.groupby(['data_channel'], sort=False)['num_keywords'].apply(lambda x: x.fillna(x.mean())).reset_index()['num_keywords']


(30410, 7917)

In [12]:
X = working_df_dev.drop(columns=["shares"]).values
y = working_df_dev["shares"].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=True, random_state=42)

 ## Hyperparameter tuning

In [6]:
params = {
    "kernel": ['rbf'],
    "gamma": ['scale', 'auto'],
    "tol": [1e-3, 1e-4],
    "C": [0.5, 1, 2, 6],
    "epsilon": [0.3, 0.5, 1, 2, 6],
    "shrinking": [True, False]
}

In [7]:
# prova a non fare la cross validation!
gs = GridSearchCV(SVR(), param_grid=params, cv=5, scoring='r2', verbose=3)
gs.fit(X, y)
print(gs.best_score_)
print(gs.best_params_)

Fitting 5 folds for each of 160 candidates, totalling 800 fits
[CV 1/5] END C=0.5, epsilon=0.3, gamma=scale, kernel=rbf, shrinking=True, tol=0.001;, score=0.134 total time= 1.2min
[CV 2/5] END C=0.5, epsilon=0.3, gamma=scale, kernel=rbf, shrinking=True, tol=0.001;, score=0.126 total time= 1.1min
[CV 3/5] END C=0.5, epsilon=0.3, gamma=scale, kernel=rbf, shrinking=True, tol=0.001;, score=0.139 total time= 1.0min
[CV 4/5] END C=0.5, epsilon=0.3, gamma=scale, kernel=rbf, shrinking=True, tol=0.001;, score=0.152 total time= 1.0min
[CV 5/5] END C=0.5, epsilon=0.3, gamma=scale, kernel=rbf, shrinking=True, tol=0.001;, score=0.132 total time= 1.0min
[CV 1/5] END C=0.5, epsilon=0.3, gamma=scale, kernel=rbf, shrinking=True, tol=0.0001;, score=0.134 total time= 1.0min
[CV 2/5] END C=0.5, epsilon=0.3, gamma=scale, kernel=rbf, shrinking=True, tol=0.0001;, score=0.126 total time= 1.0min
[CV 3/5] END C=0.5, epsilon=0.3, gamma=scale, kernel=rbf, shrinking=True, tol=0.0001;, score=0.139 total time= 1.0mi

## Generate final CSV

In [13]:
best_combination = {'C': 1, 'epsilon': 0.5, 'gamma': 'auto', 'kernel': 'rbf', 'shrinking': False, 'tol': 0.001}

In [14]:
X = working_df_dev.drop(columns=["shares"]).values
y = working_df_dev["shares"].values
svr = SVR(**best_combination)
svr.fit(X,y)

In [15]:
# Make final predictions
y_pred = svr.predict(df_working_df_eval.values)
# Write CSV
id_col = df_eval['id']
new_df = pd.DataFrame(columns=['Id', 'Predicted'])
new_df['Id'] = id_col
new_df['Predicted'] = y_pred
new_df.to_csv('../output/svr_rbf_results.csv', columns=['Id','Predicted'], index=False)