In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

def removingOutlierColumn(col,df,fact = 1.5):
    """Drop, in place, the rows of ``df`` whose ``col`` value is an IQR outlier.

    A value is treated as an outlier when it lies strictly outside the
    interval ``[Q1 - fact * IQR, Q3 + fact * IQR]``, where Q1 and Q3 are the
    25% and 75% quantiles of ``df[col]`` and ``IQR = Q3 - Q1``.  Rows whose
    value is NaN fail both comparisons and are therefore kept.

    Args:
        col: Name of the column the quantiles are computed on.
        df: DataFrame to filter; it is modified in place (rows are dropped
            by index label).
        fact: Multiplier applied to the IQR; 1.5 is the usual choice, larger
            values keep more rows.

    Returns:
        None.  The number of removed rows, the lower limit, the upper limit
        and the IQR are printed, one per line.
    """
    q1 = df[col].quantile(0.25)    # First quartile
    q3 = df[col].quantile(0.75)    # Third quartile
    IQR = q3 - q1                  # Inter-quartile range

    llimit = q1 - fact * IQR       # Lower limit
    ulimit = q3 + fact * IQR       # Upper limit

    # Only the index labels are needed, so select them directly instead of
    # building a temporary frame holding every outlier row.
    is_outlier = (df[col] < llimit) | (df[col] > ulimit)
    outlier_index = df.index[is_outlier]

    df.drop(outlier_index, axis=0, inplace=True)

    # The column name is now enclosed in a matching pair of double quotes.
    print(f'Number of outliers in "{col}" : {len(outlier_index)}')
    print(llimit)
    print(ulimit)
    print(IQR)

## Loading the development and evaluation datasets
path_dev = "development.csv"
path_eva = "./evaluation.csv"
df = pd.read_csv(path_dev)
df_eval = pd.read_csv(path_eva)

# Raise pandas' display limits to 500 rows / 500 columns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Keep the evaluation ids aside before the 'id' column is dropped below
id_col = df_eval['id']


##  Discarding the columns that are not kept as features
for frame in (df, df_eval):
    frame.drop(columns=['url', 'id', 'timedelta'], inplace=True)


#   Removing the development rows where both the content token count
#   and the average token length are 0
mask = (df['n_tokens_content'] == 0) & (df['average_token_length'] == 0)
df.drop(index=df.index[mask], inplace=True)


##  Filling NaN values in num_imgs and num_videos with zero.
##  Only rows whose data_channel is one of the channels present in the
##  development set are filled; rows with a missing or unseen channel are
##  left untouched.  No per-channel mean is needed here because the fill
##  value is always 0.
NaN_columns = ['num_imgs','num_videos']
known_channels = df['data_channel'].dropna().unique()
for frame in (df, df_eval):
    in_known_channel = frame['data_channel'].isin(known_channels)
    frame.loc[in_known_channel, NaN_columns] = frame.loc[in_known_channel, NaN_columns].fillna(0)

##  Filling NaN values in num_keywords with the mean of the corresponding
##  'data_channel'.  The means are computed on the development set only and
##  reused for the evaluation set.
mean_values = df.groupby('data_channel')["num_keywords"].mean()
for frame in (df, df_eval):
    # Rows whose channel has no development mean map to NaN and stay unfilled.
    channel_mean = frame['data_channel'].map(mean_values)
    frame["num_keywords"] = frame["num_keywords"].fillna(channel_mean)

##  Transforming several features towards a normal distribution shape using a logarithmic transformation.
##  The 1.001 offset keeps the logarithm's argument positive for any value greater than -1.001.
logTransformation = ['n_tokens_content','num_hrefs','num_self_hrefs','num_imgs','num_videos','kw_max_min','kw_avg_min','kw_min_max','kw_min_avg','kw_max_max','kw_max_avg','kw_avg_avg','self_reference_min_shares','self_reference_max_shares']
df[logTransformation] = np.log(1.001 + df[logTransformation])
df_eval[logTransformation] = np.log(1.001 + df_eval[logTransformation])

##  One-Hot Encoding
df = pd.get_dummies(df, columns=['data_channel','weekday'])
df_eval = pd.get_dummies(df_eval, columns=['data_channel','weekday'])

# The two frames are encoded independently, so a category that occurs in only
# one of them would produce different dummy columns.  Align the evaluation
# frame to the development feature columns (everything except the 'shares'
# target): dummies missing from the evaluation set are added as all-zero
# columns, and dummies unseen in development are discarded.
feature_columns = [name for name in df.columns if name != 'shares']
df_eval = df_eval.reindex(columns=feature_columns, fill_value=0)

##  Dropping redundant columns identified in a previous correlation-matrix analysis
toRemove = ['n_non_stop_words', 'n_unique_tokens','kw_max_max','kw_max_min','kw_min_max','kw_max_avg','self_reference_min_shares','self_reference_max_shares','global_sentiment_polarity','global_rate_negative_words','avg_negative_polarity','abs_title_sentiment_polarity']
df.drop(toRemove,axis=1,inplace=True)
df_eval.drop(toRemove,axis=1,inplace=True)

##  Removing outliers using quartiles: rows with values outside the interval
##  between Q1 - fact * IQR and Q3 + fact * IQR are dropped from df
removingOutlierColumn('shares',df, fact = 8)
removingOutlierColumn('kw_avg_avg',df,1.5)
# NOTE(review): the 'sharess' spelling presumably matches the dataset's column name - confirm
removingOutlierColumn("self_reference_avg_sharess",df,4)
removingOutlierColumn('kw_avg_min',df,fact = 5)

##  Separating the target from the features.
##  X_train is an independent copy of df: 'shares' is put back into df further
##  down, and with a plain alias that would also add the target to X_train.
y_train = df.pop("shares")
X_train = df.copy()
X_test = df_eval


##  Initializing the model with the best parameters found in training
rf_regressor = RandomForestRegressor(n_estimators = 300, max_depth = 35, random_state = 42)

rf_regressor.fit(X_train,y_train)

y_pred = rf_regressor.predict(X_test)

# Restore the target so df again holds features and target together
df["shares"] = y_train

##  Creating the CSV file for submitting the results
csv_df = pd.DataFrame({'Id': id_col, 'Predicted': y_pred})
csv_df.to_csv('./output.csv', columns=['Id','Predicted'], index=False)

Number of outliers in "shares : 825
-13095.0
16740.0
1755.0
Number of outliers in "kw_avg_avg : 673
7.170798012573379
8.760549506070774
0.3974378733743489
Number of outliers in "self_reference_avg_sharess : 1376
-15006.458333337501
21185.16666667
4021.2916666675
Number of outliers in "kw_avg_min : 635
0.3828502486341421
10.46816040955948
0.91684637826594
