# Reverse OHE on categorical features and apply Mean Encoding

In [45]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

raw = pd.read_csv('OnlineNewsPopularity/OnlineNewsPopularity.csv')
raw.columns = [c.strip() for c in raw.columns]
raw.drop(columns=['url', 'timedelta'], inplace=True)
raw['popular'] = raw['shares'].apply(lambda row: 0 if row < 1400 else 1)
raw.drop(columns=['shares'], inplace=True)

In [46]:
X = raw.drop(columns=['popular'])
y = raw['popular']

In [47]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [48]:
channel_vars = ['data_channel_is_lifestyle', 'data_channel_is_bus',
               'data_channel_is_entertainment', 'data_channel_is_socmed',
               'data_channel_is_tech', 'data_channel_is_world']

day_vars = ['weekday_is_monday', 'weekday_is_tuesday', 'weekday_is_wednesday',
            'weekday_is_thursday','weekday_is_friday', 'weekday_is_saturday', 
            'weekday_is_sunday', 'is_weekend']

cat_vars = channel_vars + day_vars

In [49]:
X['channel'] = pd.get_dummies(X[channel_vars]).idxmax(1)
X['day'] = pd.get_dummies(X[day_vars]).idxmax(1)

X = X.drop(columns=cat_vars)

In [50]:
X.shape

(39644, 46)

## Try Mean Encoding on the categorical features

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)

In [52]:
# Mean Encoding Training data
# Reference: https://www.geeksforgeeks.org/mean-encoding-machine-learning/
X_train_mean_encoded = X_train[['channel', 'day']]
y_train_mean_encoded = y_train.copy()
X_train_mean_encoded['popular'] = y_train_mean_encoded
for col in ['channel', 'day']:
    # calclate mean according to positive class value
    mean_enc = X_train_mean_encoded.groupby(col)['popular'].mean().to_dict()
    X_train_mean_encoded[col] = X_train_mean_encoded[col].map(mean_enc)
X_train_mean_encoded = X_train_mean_encoded.drop('popular', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_mean_encoded['popular'] = y_train_mean_encoded
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_mean_encoded[col] = X_train_mean_encoded[col].map(mean_enc)


In [53]:
X_train_mean_encoded.shape

(27750, 2)

In [54]:
X_train_mean_encoded = pd.concat([X_train_mean_encoded, X_train.drop(columns=['channel', 'day'])], axis=1)

In [55]:
X_train_mean_encoded.shape

(27750, 46)

In [56]:
# Mean Encoding the Testing data
X_test_mean_encoded = X_test[['channel', 'day']]
y_test_mean_encoded = y_test.copy()
X_test_mean_encoded['popular'] = y_test_mean_encoded
for col in ['channel', 'day']:
    # calclate mean according to positive class value
    mean_enc = X_test_mean_encoded.groupby(col)['popular'].mean().to_dict()
    X_test_mean_encoded[col] = X_test_mean_encoded[col].map(mean_enc)
X_test_mean_encoded = X_test_mean_encoded.drop('popular', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_mean_encoded['popular'] = y_test_mean_encoded
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_mean_encoded[col] = X_test_mean_encoded[col].map(mean_enc)


In [57]:
X_test_mean_encoded.shape

(11894, 2)

In [58]:
X_test_mean_encoded = pd.concat([X_test_mean_encoded, X_test.drop(columns=['channel', 'day'])], axis=1)

In [59]:
X_test_mean_encoded.shape

(11894, 46)

In [60]:
rf = RandomForestClassifier(n_estimators=750, max_depth=30, min_samples_leaf=5)
rf.fit(X_train_mean_encoded, y_train)
y_pred = rf.predict(X_test_mean_encoded)
accuracy_score(y_test, y_pred)

0.6657138052799731

Create function to mean encode data.

In [61]:
def mean_encode(data, target, features):
    data_mean_enc = data[features]
    data_mean_enc['target'] = target.copy()
    for col in features:
        mean_enc = data_mean_enc.groupby(col)['target'].mean().to_dict()
        data_mean_enc[col] = data_mean_enc[col].map(mean_enc)
    data_mean_enc = data_mean_enc.drop('target', axis=1)
    data_mean_enc = pd.concat([data_mean_enc, data.drop(columns=features)], axis=1)
    return data_mean_enc


## Run moving window with mean encoded categorical features

In [23]:
index_min = 0
index_max = 10000
rf_mean_enc = RandomForestClassifier(n_estimators=750, max_depth=30, min_samples_leaf=5)
rf_predictions_mean_enc = pd.DataFrame(columns=['y_actual', 'y_predict'], dtype=int)
acc_training_mean_enc = []

while index_max <= 39000:
    X_train_rf = X.iloc[index_min : index_max]
    y_train_rf = y.iloc[index_min : index_max]
    X_train_rf_mean_enc = mean_encode(X_train_rf, y_train_rf, ['channel', 'day'])
    
    X_test_rf = X.iloc[index_max : (index_max + 1000)]
    y_test_rf = y.iloc[index_max : (index_max + 1000)]
    X_test_rf_mean_enc = mean_encode(X_test_rf, y_test_rf, ['channel', 'day'])
    
    rf_mean_enc.fit(X_train_rf_mean_enc, y_train_rf)
    acc_training_mean_enc.append(rf_mean_enc.score(X_train_rf_mean_enc, y_train_rf))
    y_predictions_rf = rf_mean_enc.predict(X_test_rf_mean_enc)
    y_actual_and_predict_rf = pd.DataFrame({'y_actual': y_test_rf, 'y_predict': y_predictions_rf})
    rf_predictions_mean_enc = rf_predictions_mean_enc.append(y_actual_and_predict_rf, ignore_index=False)
    index_min += 1000
    index_max += 1000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_mean_enc['target'] = target.copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_mean_enc[col] = data_mean_enc[col].map(mean_enc)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_mean_enc['target'] = target.copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_mean_enc['target'] = target.copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_mean_enc[col] = data_mean_enc[col].map(mean_enc)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_mean_enc['target'] = target.copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_mean_enc['target'] = target.copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_mean_enc[col] = data_mean_enc[col].map(mean_enc)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_mean_enc['target'] = target.copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try 

In [24]:
print(acc_training_mean_enc)
accuracy_score(rf_predictions_mean_enc['y_actual'].values, rf_predictions_mean_enc['y_predict'].values)

[0.9723, 0.9748, 0.9705, 0.9692, 0.9717, 0.9697, 0.9685, 0.9644, 0.9658, 0.9668, 0.9646, 0.9674, 0.9702, 0.9691, 0.9677, 0.9643, 0.9609, 0.9587, 0.9592, 0.9581, 0.9586, 0.956, 0.9551, 0.9545, 0.9514, 0.952, 0.9488, 0.9503, 0.9501, 0.9506]


0.6726487653488058

### Accuracy of 67% replicated with lower complexity.

#### Consider removing more features

In [75]:
important_features = pd.Series(data=rf_mean_enc.feature_importances_,index=X.columns)
important_features.sort_values(ascending=False,inplace=True)

ValueError: Length of passed values is 46, index implies 23.

In [63]:
drop_cols = important_features.index[23:]

In [64]:
X = X.drop(columns=drop_cols)

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [71]:
rf = RandomForestClassifier(n_estimators=750, max_depth=30, min_samples_leaf=5)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.6544476206490668

In [72]:
index_min = 0
index_max = 10000
rf_less_feat = RandomForestClassifier(n_estimators=750, max_depth=30, min_samples_leaf=5)
rf_predictions_less_feat = pd.DataFrame(columns=['y_actual', 'y_predict'], dtype=int)
acc_training_less_feat = []

while index_max <= 39000:
    X_train_rf_less_feat = X.iloc[index_min : index_max]
    y_train_rf_less_feat = y.iloc[index_min : index_max]
    
    X_test_rf_less_feat = X.iloc[index_max : (index_max + 1000)]
    y_test_rf_less_feat = y.iloc[index_max : (index_max + 1000)]
    
    rf_less_feat.fit(X_train_rf_less_feat, y_train_rf_less_feat)
    acc_training_less_feat.append(rf_less_feat.score(X_train_rf_less_feat, y_train_rf_less_feat))
    y_predictions_rf = rf_less_feat.predict(X_test_rf_less_feat)
    y_actual_and_predict_rf = pd.DataFrame({'y_actual': y_test_rf_less_feat, 'y_predict': y_predictions_rf})
    rf_predictions_less_feat = rf_predictions_less_feat.append(y_actual_and_predict_rf, ignore_index=False)
    index_min += 1000
    index_max += 1000

In [73]:
print(acc_training_less_feat)
accuracy_score(rf_predictions_less_feat['y_actual'].values, rf_predictions_less_feat['y_predict'].values)

[0.9643, 0.9625, 0.9581, 0.9558, 0.9582, 0.956, 0.9561, 0.95, 0.9512, 0.9522, 0.9505, 0.9504, 0.9533, 0.9534, 0.9509, 0.9461, 0.9453, 0.9429, 0.9419, 0.9422, 0.945, 0.9408, 0.9416, 0.9378, 0.9335, 0.9337, 0.9341, 0.9336, 0.9329, 0.932]


0.6533868573741736