In [340]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import r2_score
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
import seaborn as sns


In [341]:
df_dev = pd.read_csv('../dataset/development.csv')
df_eval = pd.read_csv('../dataset/evaluation.csv')

In [342]:
def calcDrop(res):
    # All variables with correlation > cutoff
    all_corr_vars = list(set(res['v1'].tolist() + res['v2'].tolist()))

    # All unique variables in drop column
    poss_drop = list(set(res['drop'].tolist()))

    # Keep any variable not in drop column
    keep = list(set(all_corr_vars).difference(set(poss_drop)))

    # Drop any variables in same row as a keep variable
    p = res[ res['v1'].isin(keep)  | res['v2'].isin(keep) ][['v1', 'v2']]
    q = list(set(p['v1'].tolist() + p['v2'].tolist()))
    drop = (list(set(q).difference(set(keep))))

    # Remove drop variables from possible drop
    poss_drop = list(set(poss_drop).difference(set(drop)))

    # subset res dataframe to include possible drop pairs
    m = res[ res['v1'].isin(poss_drop)  | res['v2'].isin(poss_drop) ][['v1', 'v2','drop']]

    # remove rows that are decided (drop), take set and add to drops
    more_drop = set(list(m[~m['v1'].isin(drop) & ~m['v2'].isin(drop)]['drop']))
    for item in more_drop:
        drop.append(item)

    return drop

In [343]:
def corrX_new(df, cut = 0.9):
    # Get correlation matrix and upper triagle
    corr_mtx = df.corr().abs()
    avg_corr = corr_mtx.mean(axis = 1)
    up = corr_mtx.where(np.triu(np.ones(corr_mtx.shape), k=1).astype(np.bool_))
    dropcols = list()

    res = pd.DataFrame(columns=(['v1', 'v2', 'v1.target',
                                 'v2.target','corr', 'drop' ]))
    for row in range(len(up)-1):
        col_idx = row + 1
        for col in range (col_idx, len(up)):
            if corr_mtx.iloc[row, col] > cut:
                if avg_corr.iloc[row] > avg_corr.iloc[col]:
                    dropcols.append(row)
                    drop = corr_mtx.columns[row]
                else:
                    dropcols.append(col)
                    drop = corr_mtx.columns[col]

                s = pd.Series([ corr_mtx.index[row],
                                up.columns[col],
                                avg_corr[row],
                                avg_corr[col],
                                up.iloc[row,col],
                                drop],
                              index = res.columns)

                res.loc[len(res)] = s.to_numpy()

    dropcols_names = calcDrop(res)

    return dropcols_names

In [344]:
drop_new = corrX_new(df_dev, cut = 0.70)
print(drop_new)
# working_df_dev.drop(drop_new, axis=1, inplace=True)
# df_working_df_eval.drop(drop_new, axis=1, inplace=True)

  corr_mtx = df.corr().abs()


['rate_positive_words', 'self_reference_avg_sharess', 'kw_max_max', 'avg_positive_polarity', 'kw_avg_min', 'rate_negative_words', 'abs_title_sentiment_polarity', 'avg_negative_polarity', 'kw_avg_avg', 'n_non_stop_unique_tokens', 'n_non_stop_words']


In [345]:
def final_preprocessing(df, scaler = None, dev_stats=None):

    # one hot encoding
    working_df = df.copy()
    enc = OneHotEncoder()
    encoded_df = pd.concat([working_df['weekday'], working_df['data_channel']], axis=1)
    enc.fit(encoded_df)
    encoded_df = enc.transform(encoded_df)
    additional_columns = enc.get_feature_names_out()
    working_df[additional_columns] = encoded_df.toarray()
    # print(working_df.shape)

    # feature selection from correlation analysis
    working_df.drop(['weekday', 'data_channel', 'url', 'id', 'n_tokens_content', 'n_non_stop_words', 'kw_max_min',
                'kw_min_max', 'kw_min_min', 'kw_max_avg', 'title_subjectivity', 'rate_positive_words'], axis = 1, inplace=True)
    # , 'kw_avg_min', 'kw_avg_avg'
    # 'url', 'id','weekday','data_channel'
    working_df.drop(['self_reference_avg_sharess', 'kw_max_max', 'avg_positive_polarity', 'rate_negative_words', 'abs_title_sentiment_polarity', 'avg_negative_polarity', 'n_non_stop_unique_tokens'], inplace=True, axis=1)
    
    # fill missing values
    working_df['num_keywords'].fillna(0, inplace=True)
    working_df['num_imgs'].fillna(0, inplace=True)
    working_df['num_self_hrefs'].fillna(0, inplace=True)
    working_df['num_videos'].fillna(0, inplace=True)
    if dev_stats == None:
        dev_stats = dict()
        kw_avg_min_mean =  working_df['kw_avg_min'][working_df['kw_avg_min']>0].mean()
        kw_min_avg_mean =  working_df['kw_min_avg'][working_df['kw_min_avg']>0].mean()
        working_df['kw_avg_min'] = working_df['kw_avg_min'].apply(lambda x: kw_avg_min_mean if x == -1 else x)
        working_df['kw_min_avg'] = working_df['kw_min_avg'].apply(lambda x: kw_min_avg_mean if x == -1 else x)
        dev_stats['kw_avg_min_mean'] = kw_avg_min_mean
        dev_stats['kw_min_avg_mean'] = kw_min_avg_mean
    else:
        working_df['kw_avg_min'] = working_df['kw_avg_min'].apply(lambda x: dev_stats['kw_avg_min_mean'] if x == -1 else x)
        working_df['kw_min_avg'] = working_df['kw_min_avg'].apply(lambda x: dev_stats['kw_avg_min_mean'] if x == -1 else x)

    
    if scaler == None:
        q1 = working_df['num_hrefs'].describe()['25%']
        q3 = working_df['num_hrefs'].describe()['75%']
        iqr = q3 - q1
        min_num_hrefs = q1 - 1.5*iqr
        max_num_hrefs = q3 + 1.5*iqr
        # print(min_num_hrefs, max_num_hrefs)
        working_df = working_df[(df.num_hrefs < max_num_hrefs) & (df.num_hrefs > min_num_hrefs)]
        # print(working_df.shape)

        q1 = working_df['kw_avg_min'].describe()['25%']
        q3 = working_df['kw_avg_min'].describe()['75%']
        iqr = q3 - q1
        min_kw_avg_min = q1 - 1.5*iqr
        max_kw_avg_min = q3 + 1.5*iqr
        # print(min_kw_avg_min, max_kw_avg_min)
        working_df = working_df[(df.kw_avg_min < max_kw_avg_min) & (df.kw_avg_min > min_kw_avg_min)]
        # print(working_df.shape)

        q1 = working_df['num_self_hrefs'].describe()['25%']
        q3 = working_df['num_self_hrefs'].describe()['75%']
        iqr = q3 - q1
        min_num_self_hrefs = q1 - 1.5*iqr
        max_num_self_hrefs = q3 + 1.5*iqr
        # print(min_num_self_hrefs, max_num_self_hrefs)
        working_df = working_df[(df.num_self_hrefs < max_num_self_hrefs) & (df.num_self_hrefs > min_num_self_hrefs)]
        # print(working_df.shape)

        q1 = working_df['num_imgs'].describe()['25%']
        q3 = working_df['num_imgs'].describe()['75%']
        iqr = q3 - q1
        min_num_imgs = q1 - 1.5*iqr
        max_num_imgs = q3 + 1.5*iqr
        # print(min_num_imgs, max_num_imgs)
        working_df = working_df[(df.num_imgs < max_num_imgs) & (df.num_imgs > min_num_imgs)]
        # print(working_df.shape)

        q1 = working_df['num_videos'].describe()['25%']
        q3 = working_df['num_videos'].describe()['75%']
        iqr = q3 - q1
        min_num_videos = q1 - 1.5*iqr
        max_num_videos = q3 + 1.5*iqr
        # print(min_num_videos, max_num_videos)
        working_df = working_df[(df.num_videos < max_num_videos) & (df.num_videos > min_num_videos)]
        # print(working_df.shape)

        q1 = working_df['kw_avg_avg'].describe()['25%']
        q3 = working_df['kw_avg_avg'].describe()['75%']
        iqr = q3 - q1
        min_kw_avg_avg = q1 - 1.5*iqr
        max_kw_avg_avg = q3 + 1.5*iqr
        # print(min_kw_avg_avg, max_kw_avg_avg)
        working_df = working_df[(df.kw_avg_avg < max_kw_avg_avg) & (df.kw_avg_avg > min_kw_avg_avg)]
        # print(working_df.shape)

    # log scales
    working_df['num_imgs'] = np.log(1 + working_df['num_imgs'])
    working_df['num_self_hrefs'] = np.log(1 + working_df['num_self_hrefs'])
    working_df['kw_avg_avg'] = np.log(1+working_df['kw_avg_avg'])
    working_df['kw_avg_min'] =np.log(1+working_df['kw_avg_min'])
        # avoid log on num_videos because it makes it worse

    y_dev = None
    
    # standard scaler
    if scaler == None:
        print(working_df.shape)
        # Remove outliers from kw_avg_avg (we lost another 9% of the dataset)
        # q1 = working_df['kw_avg_avg'].describe()['25%']
        # q3 = working_df['kw_avg_avg'].describe()['75%']
        # iqr = q3 - q1
        # min_kw_avg_avg = q1 - 1.5*iqr
        # max_kw_avg_avg = q3 + 1.5*iqr
        # working_df = working_df[(df.kw_avg_avg < max_kw_avg_avg) & (df.kw_avg_avg > min_kw_avg_avg)]

        working_df['shares'] = np.log(working_df['shares'])
        y_dev = working_df['shares']
        working_df.drop(columns=['shares'], inplace=True)
        scaler = StandardScaler().fit(working_df)
        scaled_features = scaler.transform(working_df)
        working_df[:] = scaled_features[:]

        trans = RFECV(estimator=XGBRegressor(), step=1, cv=4 ,n_jobs=-1, verbose=2, scoring='neg_root_mean_squared_error')
        trans.fit(working_df, y_dev)
        dev_stats['trans'] = trans
        working_df = trans.transform(working_df)
        
    else:
        scaled_features = scaler.transform(working_df)
        working_df[:] = scaled_features[:]
        trans = dev_stats['trans']
        working_df = trans.transform(working_df)

    # print(scaled_features.shape)
    
   

    return working_df, scaler, y_dev, dev_stats

In [346]:
working_df_dev, std_scaler, y_dev, dev_stats = final_preprocessing(df_dev)
working_df_eval, _, _, _ = final_preprocessing(df_eval, std_scaler, dev_stats)



(12000, 44)


  working_df = working_df[(df.kw_avg_min < max_kw_avg_min) & (df.kw_avg_min > min_kw_avg_min)]
  working_df = working_df[(df.num_self_hrefs < max_num_self_hrefs) & (df.num_self_hrefs > min_num_self_hrefs)]
  working_df = working_df[(df.num_imgs < max_num_imgs) & (df.num_imgs > min_num_imgs)]
  working_df = working_df[(df.num_videos < max_num_videos) & (df.num_videos > min_num_videos)]
  working_df = working_df[(df.kw_avg_avg < max_kw_avg_avg) & (df.kw_avg_avg > min_kw_avg_avg)]


Fitting estimator with 43 features.
Fitting estimator with 43 features.
Fitting estimator with 43 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 42 features.
Fitting estimator with 42 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 41 features.
Fitting estimator with 41 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 40 features.
Fitting estimator with 40 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 39 features.
Fitting estimator with 39 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 38 features.
Fitting estimator with 38 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 37 features.
Fitting estimator with 37 features.
Fitting estimator with 37 fe

In [347]:
X = working_df_dev
y = y_dev

X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=True, random_state=42)

In [348]:
rfreg = GradientBoostingRegressor(**{'learning_rate': 0.05, 'loss': 'squared_error', 'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 200, 'random_state': 42})
rfreg.fit(X_train, y_train)

rms = mean_squared_error(y_valid, rfreg.predict(X_valid), squared=False)
print(rms)
r2 = r2_score(y_valid, rfreg.predict(X_valid))
adj_r2 = 1-(1-r2)*(len(X_valid) - 1)/(len(X_valid) - X_valid.shape[1] - 1)
print(adj_r2)

0.7902571669685947
0.15969339785764247


In [349]:
rfreg = GradientBoostingRegressor(**{'learning_rate': 0.05, 'loss': 'squared_error', 'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 200, 'random_state': 42})
rfreg.fit(X, y)

In [351]:
y_pred = rfreg.predict(working_df_eval)
final_preds = np.exp(y_pred)
# Write CSV
id_col = df_eval['id']
new_df = pd.DataFrame(columns=['Id', 'Predicted'])
new_df['Id'] = id_col
new_df['Predicted'] = final_preds
print(new_df.describe())
new_df.to_csv('../output/rfreg_with_rfecv.csv', columns=['Id','Predicted'], index=False)

                 Id     Predicted
count   7917.000000   7917.000000
mean   35679.634584   2034.727555
std     2289.051312   1114.186086
min    31715.000000    455.196388
25%    33699.000000   1353.546691
50%    35680.000000   1741.818562
75%    37661.000000   2350.443842
max    39643.000000  18090.205014
