In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import r2_score
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df_dev = pd.read_csv('../dataset/development.csv')
df_eval = pd.read_csv('../dataset/evaluation.csv')

In [3]:
def final_preprocessing(df, scaler = None, dev_stats=None):

    # one hot encoding
    working_df = df.copy()
    enc = OneHotEncoder()
    encoded_df = pd.concat([working_df['weekday'], working_df['data_channel']], axis=1)
    enc.fit(encoded_df)
    encoded_df = enc.transform(encoded_df)
    additional_columns = enc.get_feature_names_out()
    working_df[additional_columns] = encoded_df.toarray()
    # print(working_df.shape)

    is_weekend = []
    for _, row in working_df.iterrows():
        if row['weekday_sunday'] == 1 or row['weekday_saturday'] == 1:
            is_weekend.append(1)
        else:
            is_weekend.append(0)
    working_df['is_weekend'] = is_weekend
    working_df.drop(columns=[x for x in additional_columns if x.startswith('weekday')], inplace=True)

    # feature selection from correlation analysis
    working_df.drop(['weekday', 'data_channel', 'url', 'id', 'n_tokens_content', 'n_non_stop_words', 'kw_max_min',
                'kw_min_max', 'kw_min_min', 'kw_max_avg', 'title_subjectivity', 'rate_positive_words'], axis = 1, inplace=True)
    # , 'kw_avg_min', 'kw_avg_avg'
    # 'url', 'id','weekday','data_channel'
    working_df.drop(['self_reference_avg_sharess', 'kw_max_max', 'avg_positive_polarity', 'rate_negative_words', 'abs_title_sentiment_polarity', 'avg_negative_polarity', 'n_non_stop_unique_tokens'], inplace=True, axis=1)
    
    # fill missing values
    working_df['num_keywords'].fillna(0, inplace=True)
    working_df['num_imgs'].fillna(0, inplace=True)
    working_df['num_self_hrefs'].fillna(0, inplace=True)
    working_df['num_videos'].fillna(0, inplace=True)
    if dev_stats == None:
        dev_stats = dict()
        kw_avg_min_mean =  working_df['kw_avg_min'][working_df['kw_avg_min']>0].mean()
        kw_min_avg_mean =  working_df['kw_min_avg'][working_df['kw_min_avg']>0].mean()
        working_df['kw_avg_min'] = working_df['kw_avg_min'].apply(lambda x: kw_avg_min_mean if x == -1 else x)
        working_df['kw_min_avg'] = working_df['kw_min_avg'].apply(lambda x: kw_min_avg_mean if x == -1 else x)
        dev_stats['kw_avg_min_mean'] = kw_avg_min_mean
        dev_stats['kw_min_avg_mean'] = kw_min_avg_mean
    else:
        working_df['kw_avg_min'] = working_df['kw_avg_min'].apply(lambda x: dev_stats['kw_avg_min_mean'] if x == -1 else x)
        working_df['kw_min_avg'] = working_df['kw_min_avg'].apply(lambda x: dev_stats['kw_avg_min_mean'] if x == -1 else x)
    
    
    if scaler == None:
        q1 = working_df['shares'].describe()['25%']
        q3 = working_df['shares'].describe()['75%']
        iqr = q3 - q1
        min_shares = q1 - 1.5*iqr
        max_shares = q3 + 1.5*iqr
        # print(min_shares, max_shares)
        working_df = working_df[(df.shares < max_shares) & (df.shares > min_shares)]
        print(working_df.shape)

        q1 = working_df['global_subjectivity'].describe()['25%']
        q3 = working_df['global_subjectivity'].describe()['75%']
        iqr = q3 - q1
        min_global_subjectivity = q1 - 1.5*iqr
        max_global_subjectivity = q3 + 1.5*iqr
        # print(min_global_subjectivity, max_global_subjectivity)
        working_df = working_df[(df.global_subjectivity < max_global_subjectivity) & (df.global_subjectivity > min_global_subjectivity)]
        print(working_df.shape)

        q1 = working_df['kw_avg_avg'].describe()['25%']
        q3 = working_df['kw_avg_avg'].describe()['75%']
        iqr = q3 - q1
        min_kw_avg_avg = q1 - 1.5*iqr
        max_kw_avg_avg = q3 + 1.5*iqr
        # print(min_kw_avg_avg, max_kw_avg_avg)
        working_df = working_df[(df.kw_avg_avg < max_kw_avg_avg) & (df.kw_avg_avg > min_kw_avg_avg)]
        print(working_df.shape)

        q1 = working_df['self_reference_min_shares'].describe()['25%']
        q3 = working_df['self_reference_min_shares'].describe()['75%']
        iqr = q3 - q1
        min_self_reference_min_shares = q1 - 1.5*iqr
        max_self_reference_min_shares = q3 + 1.5*iqr
        # print(min_self_reference_min_shares, max_self_reference_min_shares)
        working_df = working_df[(df.self_reference_min_shares < max_self_reference_min_shares) & (df.self_reference_min_shares > min_self_reference_min_shares)]
        print(working_df.shape)

        q1 = working_df['self_reference_max_shares'].describe()['25%']
        q3 = working_df['self_reference_max_shares'].describe()['75%']
        iqr = q3 - q1
        min_self_reference_max_shares = q1 - 1.5*iqr
        max_self_reference_max_shares = q3 + 1.5*iqr
        # print(min_self_reference_min_shares, max_self_reference_min_shares)
        working_df = working_df[(df.self_reference_max_shares < max_self_reference_max_shares) & (df.self_reference_max_shares > min_self_reference_max_shares)]
        print(working_df.shape)


        # q1 = working_df['num_self_hrefs'].describe()['25%']
        # q3 = working_df['num_self_hrefs'].describe()['75%']
        # iqr = q3 - q1
        # min_num_self_hrefs = q1 - 1.5*iqr
        # max_num_self_hrefs = q3 + 1.5*iqr
        # # print(min_num_self_hrefs, max_num_self_hrefs)
        # working_df = working_df[(df.num_self_hrefs < max_num_self_hrefs) & (df.num_self_hrefs > min_num_self_hrefs)]
        # print(working_df.shape)

        # q1 = working_df['num_imgs'].describe()['25%']
        # q3 = working_df['num_imgs'].describe()['75%']
        # iqr = q3 - q1
        # min_num_imgs = q1 - 1.5*iqr
        # max_num_imgs = q3 + 1.5*iqr
        # # print(min_num_imgs, max_num_imgs)
        # working_df = working_df[(df.num_imgs < max_num_imgs) & (df.num_imgs > min_num_imgs)]
        # print(working_df.shape)

        # q1 = working_df['num_videos'].describe()['25%']
        # q3 = working_df['num_videos'].describe()['75%']
        # iqr = q3 - q1
        # min_num_videos = q1 - 1.5*iqr
        # max_num_videos = q3 + 1.5*iqr
        # # print(min_num_videos, max_num_videos)
        # working_df = working_df[(df.num_videos < max_num_videos) & (df.num_videos > min_num_videos)]
        # print(working_df.shape)



    # log scales
    working_df['num_imgs'] = np.log(1 + working_df['num_imgs'])
    working_df['num_self_hrefs'] = np.log(1 + working_df['num_self_hrefs'])
    working_df['kw_avg_avg'] = np.log(1+working_df['kw_avg_avg'])
    working_df['kw_avg_min'] =np.log(1+working_df['kw_avg_min'])
    # avoid log on num_videos because it makes it worse

    y_dev = None
    
    # standard scaler
    if scaler == None:
        print(working_df.shape)
        # Remove outliers from kw_avg_avg (we lost another 9% of the dataset)
        # q1 = working_df['kw_avg_avg'].describe()['25%']
        # q3 = working_df['kw_avg_avg'].describe()['75%']
        # iqr = q3 - q1
        # min_kw_avg_avg = q1 - 1.5*iqr
        # max_kw_avg_avg = q3 + 1.5*iqr
        # working_df = working_df[(df.kw_avg_avg < max_kw_avg_avg) & (df.kw_avg_avg > min_kw_avg_avg)]

        working_df['shares'] = np.log(working_df['shares'])
        y_dev = working_df['shares']
        working_df.drop(columns=['shares'], inplace=True)
        scaler = StandardScaler().fit(working_df)
        scaled_features = scaler.transform(working_df)
        working_df[:] = scaled_features[:]

        trans = RFECV(estimator=XGBRegressor(), step=1, cv=4 ,n_jobs=-1, verbose=2, scoring='neg_root_mean_squared_error')
        trans.fit(working_df, y_dev)
        dev_stats['trans'] = trans
        working_df = trans.transform(working_df)
        
    else:
        scaled_features = scaler.transform(working_df)
        working_df[:] = scaled_features[:]
        trans = dev_stats['trans']
        working_df = trans.transform(working_df)

    # print(scaled_features.shape)
    working_df = working_df[:, [ 0,  1,  2,  3, 10, 11, 12, 14, 15, 29, 32, 34]]
   

    return working_df, scaler, y_dev, dev_stats

In [4]:
working_df_dev, std_scaler, y_dev, dev_stats = final_preprocessing(df_dev)

(28081, 38)
(26785, 38)
(25591, 38)
(22562, 38)
(19924, 38)
(19924, 38)


  working_df = working_df[(df.global_subjectivity < max_global_subjectivity) & (df.global_subjectivity > min_global_subjectivity)]
  working_df = working_df[(df.kw_avg_avg < max_kw_avg_avg) & (df.kw_avg_avg > min_kw_avg_avg)]
  working_df = working_df[(df.self_reference_min_shares < max_self_reference_min_shares) & (df.self_reference_min_shares > min_self_reference_min_shares)]
  working_df = working_df[(df.self_reference_max_shares < max_self_reference_max_shares) & (df.self_reference_max_shares > min_self_reference_max_shares)]


Fitting estimator with 37 features.
Fitting estimator with 37 features.
Fitting estimator with 37 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 36 features.
Fitting estimator with 36 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 35 features.
Fitting estimator with 35 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 34 features.
Fitting estimator with 34 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 33 features.
Fitting estimator with 33 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 32 features.
Fitting estimator with 32 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 31 features.
Fitting estimator with 31 features.
Fitting estimator with 30 fe

In [5]:
X = working_df_dev
y = y_dev

X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=True, random_state=42)

In [6]:
dev_stats['trans'].n_features_

35

In [7]:
rfreg = GradientBoostingRegressor(**{'learning_rate': 0.05, 'loss': 'squared_error', 'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 200, 'random_state': 42})
rfreg.fit(X_train, y_train)

rms = mean_squared_error(y_valid, rfreg.predict(X_valid), squared=False)
print(rms)
r2 = r2_score(y_valid, rfreg.predict(X_valid))
adj_r2 = 1-(1-r2)*(len(X_valid) - 1)/(len(X_valid) - X_valid.shape[1] - 1)
print(adj_r2)
'''
0.5588991495408469
0.18894985155235465
'''

0.55439213566643
0.17592776547602917


'\n0.5588991495408469\n0.18894985155235465\n'

In [8]:
rfreg = GradientBoostingRegressor(**{'learning_rate': 0.05, 'loss': 'squared_error', 'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 200, 'random_state': 42})
rfreg.fit(X, y)

In [10]:
np.where(rfreg.feature_importances_ > 0.03)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),)

In [11]:
working_df_eval, _, _, _= final_preprocessing(df_eval, std_scaler, dev_stats)

In [12]:
y_pred = rfreg.predict(working_df_eval)
final_preds = np.exp(y_pred)
# Write CSV
id_col = df_eval['id']
new_df = pd.DataFrame(columns=['Id', 'Predicted'])
new_df['Id'] = id_col
new_df['Predicted'] = final_preds
print(new_df.describe())
new_df.to_csv('../output/gboost_with_rfecv.csv', columns=['Id','Predicted'], index=False)

                 Id    Predicted
count   7917.000000  7917.000000
mean   35679.634584  1454.654358
std     2289.051312   398.001047
min    31715.000000   508.912630
25%    33699.000000  1170.762520
50%    35680.000000  1396.987923
75%    37661.000000  1673.464539
max    39643.000000  3878.277454
