In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [13]:
df_dev = pd.read_csv('dataset/development.csv')
df_eval = pd.read_csv('dataset/evaluation.csv')

## Preprocessing

In [16]:
def final_preprocessing(df):
    df_preproc = df.copy()
    enc = OneHotEncoder()
    encoded_df = pd.concat([df_preproc['weekday'], df_preproc['data_channel']], axis=1)
    enc.fit(encoded_df)
    encoded_df = enc.transform(encoded_df)
    additional_columns = enc.get_feature_names_out()
    df_preproc[additional_columns] = encoded_df.toarray()
    df_preproc.drop(['weekday', 'data_channel', 'url', 'id'], axis = 1, inplace=True)
    df_preproc.drop(columns=['n_unique_tokens','n_non_stop_words','kw_max_min','kw_min_max','kw_max_avg','abs_title_sentiment_polarity',
                                 'abs_title_subjectivity','rate_positive_words','timedelta','max_negative_polarity','min_negative_polarity',
                                 'kw_min_min','kw_max_max','num_self_hrefs','data_channel_bus','LDA_00'], inplace=True)
    df_preproc = df_preproc.query("n_tokens_content > 0")
    df_preproc['n_tokens_content'] = np.log(df_preproc['n_tokens_content'])
    df_preproc['num_imgs'].fillna(0, inplace=True)
    df_preproc['num_imgs'] = np.log(1+df_preproc['num_imgs'])
    df_preproc['num_videos'].fillna(0, inplace=True)
    df_preproc['num_videos'] = np.log(1+df_preproc['num_videos'])
    std_scaler = StandardScaler().fit(df_preproc[['n_tokens_title', 'n_tokens_content']])
    scaled_features = std_scaler.transform(df_preproc[['n_tokens_title', 'n_tokens_content']])
    df_preproc[['n_tokens_title', 'n_tokens_content']] = scaled_features
    df_preproc['avg_negative_polarity'] = df_preproc['avg_negative_polarity'].abs()
    df_preproc['num_keywords'].fillna(0, inplace=True)
    if 'shares' in df_preproc.columns:
        df_preproc['shares'] = np.log(df_preproc['shares'])
    std_scaler = StandardScaler().fit(df_preproc[['kw_avg_min', 'kw_avg_max', 'kw_min_avg', 'kw_avg_avg']])
    scaled_features = std_scaler.transform(df_preproc[['kw_avg_min', 'kw_avg_max', 'kw_min_avg', 'kw_avg_avg']])
    df_preproc[['kw_avg_min', 'kw_avg_max', 'kw_min_avg', 'kw_avg_avg']] = scaled_features
    std_scaler = StandardScaler().fit(df_preproc[['self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess']])
    scaled_features = std_scaler.transform(df_preproc[['self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess']])
    df_preproc[['self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess']] = scaled_features
    return df_preproc

In [15]:
working_df_dev = final_preprocessing(df_dev)
working_df_dev

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_preproc['n_tokens_content'] = np.log(df_preproc['n_tokens_content'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_preproc['num_imgs'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_preproc['num_imgs'] = np.log(1+df_preproc['num_imgs'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in

Unnamed: 0,n_tokens_title,n_tokens_content,n_non_stop_unique_tokens,num_hrefs,num_imgs,num_videos,average_token_length,num_keywords,kw_avg_min,kw_avg_max,...,weekday_saturday,weekday_sunday,weekday_thursday,weekday_tuesday,weekday_wednesday,data_channel_entertainment,data_channel_lifestyle,data_channel_socmed,data_channel_tech,data_channel_world
0,0.768992,1.182360,0.545031,10.0,3.526361,0.693147,4.656158,4.0,-0.322607,1.085158,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.649891,0.213789,0.737542,9.0,0.000000,0.693147,4.576541,10.0,0.340016,-1.040296,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.649891,-0.853837,0.748428,12.0,1.609438,0.693147,4.935345,6.0,-0.181578,2.415724,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.768992,-1.274725,0.867925,9.0,0.000000,0.000000,4.970760,6.0,0.211117,0.423268,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.296031,-0.565145,0.800000,5.0,0.000000,0.000000,5.006993,0.0,0.070408,-1.882699,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31709,0.296031,0.254330,0.703008,9.0,0.693147,0.693147,4.372587,0.0,0.045028,-0.256184,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
31710,0.296031,0.029174,0.718978,10.0,0.000000,0.000000,4.784091,5.0,-0.142975,0.010189,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
31712,-0.649891,1.118374,0.710623,6.0,1.098612,0.693147,4.594427,8.0,-0.184777,-0.335967,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
31713,0.296031,2.101447,0.621080,21.0,0.693147,1.386294,4.353239,10.0,-0.220395,0.156338,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Model

In [18]:
X = working_df_dev.drop(columns=["shares"]).values
y = working_df_dev["shares"].values

In [None]:
reg = RandomForestRegressor(100, random_state=42)
reg.fit(X , y)

In [None]:
working_df_eval = final_preprocessing(df_eval)
X_test = working_df_eval.values

y_pred = reg.predict(X_test)

In [None]:
#pd.DataFrame(y_pred, colu)