In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
#import pingouin as pg
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer

def removingOutlierColumn(col,df,fact = 1.5):
  """Drop, in place, the rows of ``df`` whose ``col`` value is an IQR outlier.

  A row is an outlier when its value lies below ``q1 - fact*IQR`` or above
  ``q3 + fact*IQR``, where ``q1``/``q3`` are the 25th/75th percentiles of
  ``df[col]``. Rows whose value is NaN compare False against both limits
  and are therefore kept.

  Parameters
  ----------
  col : column label of ``df`` to inspect.
  df : pandas.DataFrame, modified in place (outlier rows are removed).
  fact : multiplier applied to the IQR; 1.5 is the usual choice.

  Returns
  -------
  None. The outlier count, lower limit, upper limit and IQR are printed.
  """
  q1 = df[col].quantile(0.25)    # First Quartile
  q3 = df[col].quantile(0.75)    # Third Quartile
  IQR = q3 - q1                  # Inter Quartile Range

  llimit = q1 - fact*IQR         # Lower Limit
  ulimit = q3 + fact*IQR         # Upper Limit

  outliers = df[(df[col] < llimit) | (df[col] > ulimit)]

  # Rows are removed by index label, directly on the caller's frame.
  df.drop(outliers.index, axis = 0, inplace = True)

  # f-string instead of '+' concatenation: closes the quote around the column
  # name and no longer raises TypeError (after the drop already happened)
  # when the column label is not a string.
  print(f'Number of outliers in "{col}" : {len(outliers)}')
  print(llimit)
  print(ulimit)
  print(IQR)

def findImportance(df):
  """Rank the feature columns of ``df`` by random-forest importance.

  A 100-tree ``RandomForestRegressor`` (``random_state=42``) is fitted to
  predict the ``shares`` column from every other column of ``df``.

  Parameters
  ----------
  df : pandas.DataFrame containing a ``shares`` column plus the feature
       columns. The frame is not modified: the features are taken from a
       copy without ``shares`` rather than by dropping and re-adding the
       column on the caller's object.

  Returns
  -------
  (feature_dict, feature_list)
    feature_dict : dict mapping feature name -> importance, in descending
                   order of importance.
    feature_list : the same feature names, in the same order.
  """
  X = df.drop(columns='shares')
  y = df['shares']

  reg = RandomForestRegressor(n_estimators=100, random_state=42)
  reg.fit(X, y)

  # Pair importances with the columns the model was actually fitted on.
  ranked = sorted(zip(X.columns, reg.feature_importances_), key=lambda pair: pair[1], reverse=True)
  feature_dict = dict(ranked)
  feature_list = list(feature_dict)
  return feature_dict,feature_list

In [None]:
## Initializing the dataset
path_dev = "./development.csv"
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

##  Load the data and discard the columns that are not used as features
df = pd.read_csv(path_dev).drop(columns=['url','id','timedelta'])

#   Remove the rows where both average_token_length and n_tokens_content are 0
mask = (df['average_token_length'] == 0) & (df['n_tokens_content'] == 0)
df = df.drop(index=df[mask].index)


##  Filling NaN values in num_imgs and num_videos using zero.
#   The previous version looped over the data_channel groups (computing an
#   unused per-group mean) only to fill with 0; a direct fillna does the same
#   and additionally covers rows whose data_channel is itself missing.
NaN_columns = ['num_imgs','num_videos']
df[NaN_columns] = df[NaN_columns].fillna(0)

##  Filling NaN values in num_keywords with the mean of the row's data_channel.
#   Rows with a missing data_channel map to NaN and are left unfilled.
mean_values = df.groupby('data_channel')["num_keywords"].mean()
df["num_keywords"] = df["num_keywords"].fillna(df['data_channel'].map(mean_values))

##  Reshaping several skewed features with a logarithmic transformation.
#   NOTE(review): the 1.001 offset keeps the log argument positive for values
#   down to -1; presumably some of these columns can hold -1 — confirm.
logTransformation = ['n_tokens_content','num_hrefs','num_self_hrefs','num_imgs','num_videos','kw_max_min','kw_avg_min','kw_min_max','kw_min_avg','kw_max_max','kw_max_avg','kw_avg_avg','self_reference_min_shares','self_reference_max_shares']
df[logTransformation] = np.log(1.001 + df[logTransformation])


##  Z-score normalization of our data (currently disabled; df_eval is not
##  defined in this cell)
# for el in df.columns:
#   if(el != "shares" and el != "data_channel" and el != "weekday"):
#     df[el] = (df[el] - df[el].mean()) / df[el].std()
#
# for el in df_eval.columns:
#   if(el != "shares" and el != "data_channel" and el != "weekday"):
#     df_eval[el] = (df_eval[el] - df[el].mean()) / df[el].std()


##  One-Hot Encoding
df = pd.get_dummies(df, columns=['data_channel','weekday'])


In [None]:
# Hold out 20% of the rows and score a polynomial-kernel SVR by RMSE.
# The feature matrix is a new frame: df keeps its "shares" column, so the
# cell can be re-run (or interrupted mid-fit) without leaving df without its
# target, and X never silently regains the target column.
y = df["shares"]
X = df.drop(columns="shares")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): despite its name, rf_regressor holds an SVR here; the name is
# kept in case other cells refer to it. SVR is sensitive to feature scale and
# the features are not standardized at this point — consider scaling first.
rf_regressor = SVR(kernel = 'poly')
rf_regressor.fit(X_train, y_train)

# Predict on the testing data
y_pred = rf_regressor.predict(X_test)
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 9653.840474739272


In [None]:
# Hold out 20% of the rows and score an RBF-kernel SVR by RMSE.
# The feature matrix is a new frame: df keeps its "shares" column, so the
# cell can be re-run (or interrupted mid-fit) without leaving df without its
# target, and X never silently regains the target column.
y = df["shares"]
X = df.drop(columns="shares")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): despite its name, rf_regressor holds an SVR here; the name is
# kept in case other cells refer to it. SVR is sensitive to feature scale and
# the features are not standardized at this point — consider scaling first.
rf_regressor = SVR(kernel = 'rbf')
rf_regressor.fit(X_train, y_train)

# Predict on the testing data
y_pred = rf_regressor.predict(X_test)
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 9651.916647480688


In [None]:
# Hold out 20% of the rows and score a gradient-boosting regressor by RMSE.
# The feature matrix is a new frame: df keeps its "shares" column, so the
# cell can be re-run (or interrupted mid-fit) without leaving df without its
# target, and X never silently regains the target column.
y = df["shares"]
X = df.drop(columns="shares")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): despite its name, rf_regressor holds a
# GradientBoostingRegressor here; the name is kept in case other cells
# refer to it.
rf_regressor = GradientBoostingRegressor(random_state = 42)
rf_regressor.fit(X_train, y_train)

# Predict on the testing data
y_pred = rf_regressor.predict(X_test)
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 10275.217609294596


In [None]:
# Hold out 20% of the rows and score a random forest (300 trees, depth <= 35)
# by RMSE.
# The feature matrix is a new frame: df keeps its "shares" column, so the
# cell can be re-run (or interrupted mid-fit) without leaving df without its
# target, and X never silently regains the target column.
y = df["shares"]
X = df.drop(columns="shares")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_regressor = RandomForestRegressor(n_estimators = 300, max_depth = 35, random_state = 42)
rf_regressor.fit(X_train, y_train)

# Predict on the testing data
y_pred = rf_regressor.predict(X_test)
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 10258.642126191944


In [None]:
# Hold out 20% of the rows and score an ordinary least-squares linear
# regression by RMSE.
# The feature matrix is a new frame: df keeps its "shares" column, so the
# cell can be re-run (or interrupted mid-fit) without leaving df without its
# target, and X never silently regains the target column.
y = df["shares"]
X = df.drop(columns="shares")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): despite its name, rf_regressor holds a LinearRegression here;
# the name is kept in case other cells refer to it.
rf_regressor = LinearRegression()
rf_regressor.fit(X_train, y_train)

# Predict on the testing data
y_pred = rf_regressor.predict(X_test)
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 9413.74627385229
