In [29]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

In [30]:
head = ['Page Popularity','Page Checkins','Page talking about','Page Category'] \
       + ['extra_{}'.format(i - 4) for i in range(4, 29)] \
       + ['CC1','CC2','CC3','CC4','CC5','Base Time','Post Length','Post Share Count','Post Promotion Status','H Local'] \
       + ['published_weekday_{}'.format(i - 39) for i in range(39, 46)] \
       + ['base_weekday_{}'.format(i-45) for i in range(45, 52)] + ['Target']

df = pd.read_csv('./dataset2.csv', names=head)
df.drop('Post Promotion Status', axis=1, inplace=True)

In [31]:
def is_binary(x):
    return x.unique().shape[0] == 2

def normalize(x):
    return (x - x.mean())/x.std()


bin_free = df.columns[~df.apply(is_binary)]
df[bin_free] = df[bin_free].apply(normalize, axis=0)
df['w0_reg_constant'] = 1

In [32]:
df = shuffle(df)

In [33]:
def gradient_descent(X, y, theta=1e-3, epsilon=1e-5):
    w = np.random.normal(size=X.shape[1])    
    w = np.ones(X.shape[1])
    new_w = 1000*w
    N = X.shape[0]
    iter_num = 0

    while np.linalg.norm(w-new_w) >= epsilon:
        new_w=w
        dw = (2 / N) * (X.T.dot( X.dot (w)) - X.T.dot(y))
        w = w - theta*dw

    return w


def R2(x, y):
    return 1 - np.sum(np.power(y - x, 2)) / np.sum(np.power(y - y.mean(), 2))


def reg_prediction(X, w):
    return X.dot(w)


def RMSE(x, y):
    return np.sqrt(np.sum(np.power(y - x, 2)) / y.shape[0])

In [34]:
folds_index = 5
fold_size = round(df.shape[0] / folds_index)

features = pd.DataFrame()
RMSE_test = []
RMSE_train = []
R2_test = []
R2_train = []

features2 = pd.DataFrame()
RMSE_test2 = []
RMSE_train2 = []
R2_test2 = []
R2_train2 = []

for i in range(folds_index):
    test = df[i * fold_size:(i + 1) * fold_size]
    if i == 0:
        train = df[(i + 1) * fold_size:]
    else:
        train = df[:i * fold_size]
        if i != 4:
            train = train.append(df[(i + 1) * fold_size:], ignore_index=False)

    Features = train.drop('Target', axis=1)
    Target = train['Target']
    w = gradient_descent(Features, Target, 1e-3, 1e-4)
    features = features.append(w, ignore_index=True)

    train_pred = reg_prediction(train.drop('Target', axis=1), w)
    R2_train.append(R2(train_pred, train['Target']))
    RMSE_train.append(RMSE(train_pred, train['Target']))

    test_pred = reg_prediction(test.drop('Target', axis=1), w)
    R2_test.append(R2(test_pred, test['Target']))
    RMSE_test.append(RMSE(test_pred, test['Target']))

res_df = pd.DataFrame(np.vstack([R2_test, R2_train, RMSE_test, RMSE_train]),
                      index=['R2_test', 'R2_train', 'RMSE_test', 'RMSE_train'])
res_df = res_df.append(features.T)
res_df.columns = ['T1', 'T2', 'T3', 'T4', 'T5']
res_df = pd.concat([res_df, res_df.mean(axis=1).rename('E(mean)'), res_df.std(axis=1).rename('STD')], axis=1)

res_df

Unnamed: 0,T1,T2,T3,T4,T5,E(mean),STD
R2_test,0.269858,0.212673,0.139642,0.299444,0.291423,0.242608,0.066809
R2_train,0.28846,0.278979,0.300297,0.281217,0.284274,0.286645,0.008419
RMSE_test,0.904996,1.076581,0.879859,0.903457,0.77926,0.90883,0.107018
RMSE_train,0.830581,0.853796,0.846839,0.830101,0.860981,0.84446,0.013826
Base Time,-0.102532,-0.100804,-0.100255,-0.099668,-0.101052,-0.100862,0.001075
CC1,-0.063099,-0.048179,-0.067186,-0.050991,-0.056358,-0.057163,0.007987
CC2,0.003755,0.005986,0.015036,0.002123,0.012208,0.007822,0.00556
CC3,0.448977,0.438584,0.448054,0.444382,0.447372,0.445474,0.004219
CC4,-0.076401,-0.062279,-0.084641,-0.074664,-0.076829,-0.074963,0.008065
CC5,0.600132,0.609801,0.610064,0.602263,0.608264,0.606105,0.004594
