# Find important features and use the top 20 for model

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

raw = pd.read_csv('OnlineNewsPopularity/OnlineNewsPopularity.csv')
raw.columns = [c.strip() for c in raw.columns]
raw.drop(columns=['url', 'timedelta'], inplace=True)
raw['popular'] = raw['shares'].apply(lambda row: 0 if row < 1400 else 1)
raw.drop(columns=['shares'], inplace=True)

In [2]:
X = raw.drop(columns=['popular'])
y = raw['popular']

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)

In [4]:
cat_vars = ['data_channel_is_lifestyle', 'data_channel_is_bus',
               'data_channel_is_entertainment', 'data_channel_is_socmed',
               'data_channel_is_tech', 'data_channel_is_world', 'weekday_is_monday', 
               'weekday_is_tuesday', 'weekday_is_wednesday', 'weekday_is_thursday',
               'weekday_is_friday', 'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend']

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

def sep_scale_combine(X, cat_vars):
    X_cat = X[cat_vars]
    X_cont = X.drop(columns=cat_vars, axis=1)

    X_cont[X_cont.columns] = scaler.fit_transform(X_cont)
    X_scaled = pd.concat([X_cat, X_cont], axis=1)
    return X_scaled

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [7]:
# Original Random Forest Model with 'improved' hyperparameters
rf = RandomForestClassifier(n_estimators=750, max_depth=30, min_samples_leaf=5)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [8]:
print('Accuracy: ', accuracy_score(y_test, y_pred))

important_features = pd.Series(data=rf.feature_importances_,index=X.columns)
important_features.sort_values(ascending=False,inplace=True)
important_features

Accuracy:  0.6636119051622666


kw_avg_avg                       0.050863
kw_max_avg                       0.043859
LDA_02                           0.035849
self_reference_min_shares        0.035110
self_reference_avg_sharess       0.031961
kw_avg_min                       0.029493
LDA_01                           0.029234
LDA_04                           0.028870
kw_avg_max                       0.028844
LDA_00                           0.027958
n_unique_tokens                  0.027381
n_non_stop_unique_tokens         0.027017
kw_min_avg                       0.026124
self_reference_max_shares        0.025981
global_subjectivity              0.025600
n_tokens_content                 0.025429
average_token_length             0.025200
LDA_03                           0.024998
global_rate_positive_words       0.024735
global_sentiment_polarity        0.024122
kw_max_min                       0.024017
avg_positive_polarity            0.023659
avg_negative_polarity            0.022508
global_rate_negative_words       0

In [9]:
drop_cols = important_features.index[20:]

In [10]:
X = X.drop(columns=drop_cols)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)
rf = RandomForestClassifier(n_estimators=750, max_depth=30, min_samples_leaf=5)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred))


Accuracy:  0.651589036488986


In [16]:
index_min = 0
index_max = 10000
rf = RandomForestClassifier(n_estimators=750, max_depth=20, min_samples_leaf=10)
rf_predictions = pd.DataFrame(columns=['y_actual', 'y_predict'], dtype=int)
acc_training = []

while index_max <= 39000:
    X_train_rf = X.iloc[index_min : index_max]
    y_train_rf = y.iloc[index_min : index_max]
    
    X_test_rf = X.iloc[index_max : (index_max + 1000)]
    y_test_rf = y.iloc[index_max : (index_max + 1000)]
    
    rf.fit(X_train_rf, y_train_rf)
    acc_training.append(rf.score(X_train_rf, y_train_rf))
    y_predictions_rf = rf.predict(X_test_rf)
    y_actual_and_predict_rf = pd.DataFrame({'y_actual': y_test_rf, 'y_predict': y_predictions_rf})
    rf_predictions = rf_predictions.append(y_actual_and_predict_rf, ignore_index=False)
    index_min += 1000
    index_max += 1000

In [17]:
print(acc_training)
accuracy_score(rf_predictions['y_actual'].values, rf_predictions['y_predict'].values)

[0.8757, 0.8765, 0.8742, 0.8757, 0.8762, 0.8731, 0.8713, 0.8629, 0.8564, 0.8576, 0.8586, 0.8631, 0.8635, 0.8618, 0.8618, 0.8569, 0.8547, 0.8526, 0.8519, 0.8506, 0.8506, 0.8445, 0.8475, 0.8492, 0.8465, 0.8443, 0.8435, 0.8415, 0.843, 0.8447]


0.6544326001889084

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)

rf = RandomForestClassifier()
rf_params = {'n_estimators': [200, 400, 750, 1000], 'max_depth' : [15, 20, 30], 'min_samples_leaf' : [1, 5, 10]}
gsRf = GridSearchCV(estimator=rf, param_grid=rf_params)

gsRf.fit(X_train, y_train)
print(gsRf.best_params_ )
gsRf.predict(X_test)
print(gsRf.score(X_test, y_test)) # Accuracy

{'max_depth': 20, 'min_samples_leaf': 10, 'n_estimators': 750}
0.6491508323524466


## Repeat window with scaled data and hyperparameters

In [20]:
# Scale and optimize Random Forest hyperparameters. 
X_scaled = X.copy()
X_scaled[X_scaled.columns] = scaler.fit_transform(X_scaled)
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_scaled, y, test_size=0.3, random_state=15)

rf_scaled = RandomForestClassifier()
rf_params_scaled = {'n_estimators': [200, 400, 750, 1000], 'max_depth' : [15, 20, 30], 'min_samples_leaf' : [1, 5, 10]}
gsRf_scaled = GridSearchCV(estimator=rf_scaled, param_grid=rf_params_scaled)

gsRf_scaled.fit(X_train_scaled, y_train_scaled)
print(gsRf_scaled.best_params_ )
gsRf_scaled.predict(X_test_scaled)
print(gsRf_scaled.score(X_test_scaled, y_test_scaled)) # Accuracy

{'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 1000}
0.6615100050445603


In [22]:
index_min = 0
index_max = 10000
rf_scaled = RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5)
rf_predictions_scaled = pd.DataFrame(columns=['y_actual', 'y_predict'], dtype=int)
acc_training_scaled = []

while index_max <= 39000:
    X_train_rf = X.iloc[index_min : index_max]
    X_train_rf_scaled = scaler.fit_transform(X_train_rf)
    y_train_rf = y.iloc[index_min : index_max]
    
    X_test_rf = X.iloc[index_max : (index_max + 1000)]
    X_test_rf_scaled = scaler.fit_transform(X_test_rf)
    y_test_rf = y.iloc[index_max : (index_max + 1000)]
    
    rf_scaled.fit(X_train_rf_scaled, y_train_rf)
    acc_training_scaled.append(rf.score(X_train_rf_scaled, y_train_rf))
    y_predictions_rf = rf_scaled.predict(X_test_rf_scaled)
    y_actual_and_predict_rf = pd.DataFrame({'y_actual': y_test_rf, 'y_predict': y_predictions_rf})
    rf_predictions_scaled = rf_predictions_scaled.append(y_actual_and_predict_rf, ignore_index=False)
    index_min += 1000
    index_max += 1000

In [23]:
print(acc_training_scaled)
accuracy_score(rf_predictions_scaled['y_actual'].values, rf_predictions_scaled['y_predict'].values)

[0.4612, 0.4676, 0.4787, 0.4846, 0.4992, 0.5136, 0.5266, 0.5303, 0.5281, 0.5175, 0.5072, 0.5006, 0.4875, 0.4853, 0.4769, 0.4715, 0.4666, 0.4693, 0.4709, 0.4814, 0.4815, 0.4822, 0.5008, 0.5067, 0.5141, 0.5215, 0.5329, 0.536, 0.5367, 0.5397]


0.6408042099581703

Looking at the training set accuracy, even when changed significantly (by 0.5), the testing set acccuracy is consistent. Small difference of .02 can be accounted for by randomness.