# Feature Scaling

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

raw = pd.read_csv('OnlineNewsPopularity/OnlineNewsPopularity.csv')
raw.columns = [c.strip() for c in raw.columns]
raw.drop(columns=['url', 'timedelta'], inplace=True)
raw['popular'] = raw['shares'].apply(lambda row: 0 if row < 1400 else 1)
raw.drop(columns=['shares'], inplace=True)

In [2]:
X = raw.drop(columns=['popular'])
y = raw['popular']

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)

In [4]:
cat_vars = ['data_channel_is_lifestyle', 'data_channel_is_bus',
               'data_channel_is_entertainment', 'data_channel_is_socmed',
               'data_channel_is_tech', 'data_channel_is_world', 'weekday_is_monday', 
               'weekday_is_tuesday', 'weekday_is_wednesday', 'weekday_is_thursday',
               'weekday_is_friday', 'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend']

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
scaler = StandardScaler()
X_train_cat = X_train[cat_vars]
X_train_cont = X_train.drop(columns=cat_vars, axis=1)
X_train_cont[X_train_cont.columns] = scaler.fit_transform(X_train_cont)
X_train_scaled = pd.concat([X_train_cat, X_train_cont], axis=1)

In [7]:
X_test_cat = X_test[cat_vars]
X_test_cont = X_test.drop(columns=cat_vars, axis=1)
X_test_cont[X_test_cont.columns] = scaler.fit_transform(X_test_cont)
X_test_scaled = pd.concat([X_test_cat, X_test_cont], axis=1)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [9]:
rf = RandomForestClassifier(n_estimators=400)
rf.fit(X_train_scaled, y_train)
y_pred = rf.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

0.6548680006726081

Create function to separate categorical and continuous data, scale continuous data, and return joined data

In [10]:
def sep_scale_combine(X, cat_vars):
    X_cat = X[cat_vars]
    X_cont = X.drop(columns=cat_vars, axis=1)

    X_cont[X_cont.columns] = scaler.fit_transform(X_cont)
    X_scaled = pd.concat([X_cat, X_cont], axis=1)
    return X_scaled

Moving window run with previously optimized hyperparameters and then again with newly optimized hyperparameters.

In [12]:
index_min = 0
index_max = 10000
rf = RandomForestClassifier(n_estimators=1000, max_depth=30, min_samples_leaf=1)
rf_predictions = pd.DataFrame(columns=['y_actual', 'y_predict'], dtype=int)

while index_max <= 39000:
    X_train_rf = X.iloc[index_min : index_max]
    X_train_rf_scaled = sep_scale_combine(X_train_rf, cat_vars)
    y_train_rf = y.iloc[index_min : index_max]
    
    X_test_rf = X.iloc[index_max : (index_max + 1000)]
    X_test_rf_scaled = sep_scale_combine(X_test_rf, cat_vars)
    y_test_rf = y.iloc[index_max : (index_max + 1000)]
    
    rf.fit(X_train_rf_scaled, y_train_rf)
    y_predictions_rf = rf.predict(X_test_rf_scaled)
    y_actual_and_predict_rf = pd.DataFrame({'y_actual': y_test_rf, 'y_predict': y_predictions_rf})
    rf_predictions = rf_predictions.append(y_actual_and_predict_rf, ignore_index=False)
    index_min += 1000
    index_max += 1000

In [13]:
accuracy_score(rf_predictions['y_actual'].values, rf_predictions['y_predict'].values)

0.6540952637970584

In [11]:
X_scaled = sep_scale_combine(X, cat_vars)
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_scaled, y, test_size=0.3, random_state=15)

rf_scaled = RandomForestClassifier()
rf_params_scaled = {'n_estimators': [200, 400, 750, 1000], 'max_depth' : [15, 20, 30], 'min_samples_leaf' : [1, 5, 10]}
gsRf_scaled = GridSearchCV(estimator=rf_scaled, param_grid=rf_params_scaled)

gsRf_scaled.fit(X_train_scaled, y_train_scaled)
print(gsRf_scaled.best_params_ )
gsRf_scaled.predict(X_test_scaled)
print(gsRf_scaled.score(X_test_scaled, y_test_scaled)) # Accuracy

{'max_depth': 30, 'min_samples_leaf': 1, 'n_estimators': 1000}
0.6787455860097528


Training data has an accuracy of 68% while testing data has an accuracy of 65%. Does not generalize well.