In [30]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
import xgboost as xg


from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt

In [13]:
df_dev = pd.read_csv('../dataset/development.csv')
df_eval = pd.read_csv('../dataset/evaluation.csv')

In [14]:
def final_preprocessing(df, reduce_df=True):
    df_preproc = df.copy()

    # one hot encoding
    enc = OneHotEncoder()
    encoded_df = pd.concat([df_preproc['weekday'], df_preproc['data_channel']], axis=1)
    enc.fit(encoded_df)
    encoded_df = enc.transform(encoded_df)
    additional_columns = enc.get_feature_names_out()
    print(encoded_df.toarray().shape)
    df_preproc[additional_columns] = encoded_df.toarray()
    df_preproc.drop(['weekday', 'data_channel', 'url', 'id'], axis = 1, inplace=True)

    # drop from feature selection
    df_preproc.drop(columns=['n_unique_tokens','n_non_stop_words','kw_max_min','kw_min_max','kw_max_avg','abs_title_sentiment_polarity',
                     'abs_title_subjectivity','rate_positive_words','timedelta','max_negative_polarity','min_negative_polarity',
                     'kw_min_min','kw_max_max','num_self_hrefs','data_channel_bus','LDA_00'], inplace=True)
    # reduce df
    if reduce_df:
        df_preproc = df_preproc.query("n_tokens_content > 0")
        # Remove outliers from kw_avg_avg (we lost another 9% of the dataset)
        q1 = df_preproc['kw_avg_avg'].describe()['25%']
        q3 = df_preproc['kw_avg_avg'].describe()['75%']
        iqr = q3 - q1
        min_kw_avg_avg = q1 - 1.5*iqr
        max_kw_avg_avg = q3 + 1.5*iqr
        df_preproc = df_preproc[(df_preproc.kw_avg_avg < max_kw_avg_avg) & (df_preproc.kw_avg_avg > min_kw_avg_avg)]
    
    std_scaler = StandardScaler().fit(df_preproc[['n_tokens_content']])
    scaled_features = std_scaler.transform(df_preproc[['n_tokens_content']])
    df_preproc[['n_tokens_content']] = scaled_features
    
    df_preproc['num_imgs'].fillna(df_preproc['num_imgs'].mean(), inplace=True)
    # df_preproc['num_imgs'] = np.log(1+df_preproc['num_imgs'])
    std_scaler = StandardScaler().fit(df_preproc[['num_imgs']])
    scaled_features = std_scaler.transform(df_preproc[['num_imgs']])
    df_preproc[['num_imgs']] = scaled_features

    df_preproc['num_videos'].fillna(df_preproc['num_videos'].mean(), inplace=True)
    std_scaler = StandardScaler().fit(df_preproc[['num_videos']])
    scaled_features = std_scaler.transform(df_preproc[['num_videos']])
    df_preproc[['num_videos']] = scaled_features
    
    std_scaler = StandardScaler().fit(df_preproc[['n_tokens_title']])
    scaled_features = std_scaler.transform(df_preproc[['n_tokens_title']])
    df_preproc[['n_tokens_title']] = scaled_features

    if 'shares' in df_preproc.columns:
        df_preproc['shares'] = np.log(df_preproc['shares'])
    
    df_preproc.drop(columns=['self_reference_min_shares','self_reference_max_shares', 'self_reference_avg_sharess'], inplace = True)

    df_preproc.drop(columns=['LDA_01', 'LDA_02', 'LDA_03', 'LDA_04'], inplace=True)

    is_weekend = []
    for _, row in df_preproc.iterrows():
        if row['weekday_sunday'] == 1 or row['weekday_saturday'] == 1:
            is_weekend.append(1)
        else:
            is_weekend.append(0)
    df_preproc['is_weekend'] = is_weekend


    df_preproc.drop(columns=[x for x in additional_columns if x.startswith('weekday')], inplace=True)

    new_df_keywords = df.copy()
    df_preproc['num_keywords'] = new_df_keywords.groupby(['data_channel'], sort=False)['num_keywords'].apply(lambda x: x.fillna(x.mean())).reset_index()['num_keywords']

    std_scaler = StandardScaler().fit(df_preproc[['num_keywords']])
    scaled_features = std_scaler.transform(df_preproc[['num_keywords']])
    df_preproc[['num_keywords']] = scaled_features

    df_preproc.drop(columns=['kw_avg_min'], inplace=True)
    std_scaler = StandardScaler().fit(df_preproc[['kw_avg_max', 'kw_min_avg', 'kw_avg_avg']])
    scaled_features = std_scaler.transform(df_preproc[['kw_avg_max', 'kw_min_avg', 'kw_avg_avg']])
    df_preproc[['kw_avg_max', 'kw_min_avg', 'kw_avg_avg']] = scaled_features


    # df_preproc['avg_negative_polarity'] = df_preproc['avg_negative_polarity'].abs()

    return df_preproc


In [15]:
working_df_dev = final_preprocessing(df_dev)
working_df_dev

(31715, 13)


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_preproc['num_keywords'] = new_df_keywords.groupby(['data_channel'], sort=False)['num_keywords'].apply(lambda x: x.fillna(x.mean())).reset_index()['num_keywords']


Unnamed: 0,n_tokens_title,n_tokens_content,n_non_stop_unique_tokens,num_hrefs,num_imgs,num_videos,average_token_length,num_keywords,kw_avg_max,kw_min_avg,...,avg_negative_polarity,title_subjectivity,title_sentiment_polarity,shares,data_channel_entertainment,data_channel_lifestyle,data_channel_socmed,data_channel_tech,data_channel_world,is_weekend
0,0.766514,0.965314,0.545031,10.0,3.902772,-0.066116,4.656158,-1.845251,1.129960,0.698238,...,-0.160714,0.000000,0.000000,7.972466,0.0,0.0,0.0,0.0,0.0,0
1,-0.649809,-0.133735,0.737542,9.0,0.000000,-0.066116,4.576541,1.618213,-1.027350,-0.965816,...,-0.157500,0.000000,0.000000,7.170120,0.0,0.0,0.0,1.0,0.0,0
2,-0.649809,-0.715458,0.748428,12.0,-0.046128,-0.066116,4.935345,-0.690763,2.480468,-0.087832,...,-0.427500,0.000000,0.000000,9.781320,0.0,1.0,0.0,0.0,0.0,0
3,0.766514,-0.846399,0.867925,9.0,-0.590804,0.000000,4.970760,-0.690763,0.458149,0.944244,...,-0.216667,0.400000,-0.250000,7.313220,0.0,0.0,0.0,0.0,0.0,0
4,0.294406,-0.599543,0.800000,5.0,0.000000,-0.337313,5.006993,0.338227,-1.882379,-0.965816,...,-0.251786,0.200000,-0.100000,7.244228,0.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31709,0.294406,-0.101536,0.703008,9.0,-0.454635,-0.066116,4.372587,0.266477,-0.231487,2.207003,...,-0.386310,0.288889,-0.155556,6.527958,0.0,1.0,0.0,0.0,0.0,0
31710,0.294406,-0.268969,0.718978,10.0,0.000000,0.000000,4.784091,-1.268007,0.038879,0.502769,...,-0.209167,0.000000,0.000000,6.907755,0.0,0.0,0.0,0.0,1.0,0
31712,-0.649809,0.866572,0.710623,6.0,-0.318466,-0.066116,4.594427,0.463725,-0.312466,-0.965816,...,-0.400000,0.000000,0.000000,7.783224,0.0,0.0,0.0,1.0,0.0,0
31713,0.294406,3.028178,0.621080,21.0,-0.454635,0.476277,4.353239,1.618213,0.187219,1.580256,...,-0.323413,0.700000,-0.400000,8.699515,0.0,1.0,0.0,0.0,0.0,0


In [16]:
X = working_df_dev.drop(columns=["shares"]).values
y = working_df_dev["shares"].values

In [17]:
params = {
    "n_estimators": 1000,
    "max_depth": 3,
    "min_samples_split": 3,
    "learning_rate": 0.01,
    "loss": "squared_error",
}

In [18]:
reg = GradientBoostingRegressor(**params).fit(X,y)

KeyboardInterrupt: 

In [None]:
working_df_eval = final_preprocessing(df_eval, reduce_df=False)
X_test = working_df_eval.values

y_pred = reg.predict(X_test)
final_preds = np.exp(y_pred)

(7917, 13)


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_preproc['num_keywords'] = new_df_keywords.groupby(['data_channel'], sort=False)['num_keywords'].apply(lambda x: x.fillna(x.mean())).reset_index()['num_keywords']


In [9]:
id_col = df_eval['id']
new_df = pd.DataFrame(columns=['Id', 'Predicted'])
new_df['Id'] = id_col
new_df['Predicted'] = final_preds
new_df.to_csv('../output.csv', columns=['Id','Predicted'], index=False)

In [40]:
# xgbr = XGBRegressor(n_estimators=500, learning_rate=0.01, eval_metric = 'rmsle', alpha=0.5)
# xgb_r = xgbr.fit(X, y)

In [41]:
working_df_eval = final_preprocessing(df_eval, reduce_df=False)
X_test = working_df_eval.values

# y_pred = xgb_r.predict(X_test)
final_preds = np.exp(y_pred)

(7917, 13)


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_preproc['num_keywords'] = new_df_keywords.groupby(['data_channel'], sort=False)['num_keywords'].apply(lambda x: x.fillna(x.mean())).reset_index()['num_keywords']


In [42]:
id_col = df_eval['id']
new_df = pd.DataFrame(columns=['Id', 'Predicted'])
new_df['Id'] = id_col
new_df['Predicted'] = final_preds
new_df.to_csv('../output.csv', columns=['Id','Predicted'], index=False)