In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

In [2]:
# Column names for the headerless CSV, in file order (54 names in total).
head = [
    'Page Popularity', 'Page Checkins', 'Page talking about', 'Page Category',
    *('extra_{}'.format(n) for n in range(25)),
    'CC1', 'CC2', 'CC3', 'CC4', 'CC5',
    'Base Time', 'Post Length', 'Post Share Count', 'Post Promotion Status', 'H Local',
    *('published_weekday_{}'.format(day) for day in range(7)),
    *('base_weekday_{}'.format(day) for day in range(7)),
    'Target',
]

# Read the headerless CSV with the column names from `head`, then discard the
# 'Post Promotion Status' column.
df = pd.read_csv('./dataset2.csv', names=head)
df = df.drop('Post Promotion Status', axis=1)

In [3]:
def is_binary(x):
    """Tell whether `x` contains exactly two distinct values.

    `x` must provide a `unique()` method (e.g. a pandas Series).
    """
    distinct_values = x.unique()
    return len(distinct_values) == 2

def normalize(x):
    """Standardize `x` to zero mean and unit standard deviation.

    Parameters
    ----------
    x : object providing `mean()` and `std()` and supporting arithmetic
        (e.g. a pandas Series, as passed by `DataFrame.apply`).

    Returns
    -------
    `(x - x.mean()) / x.std()`. When the standard deviation is exactly zero
    (a constant column) the centred values are returned unscaled, i.e. all
    zeros, instead of the NaNs a 0/0 division would produce.
    """
    centered = x - x.mean()
    spread = x.std()
    # Only guard the scalar case; a non-scalar spread (e.g. a whole DataFrame
    # was passed) keeps the plain element-wise division.
    if np.isscalar(spread) and spread == 0:
        return centered
    return centered / spread


# Take the target out of the frame so it is not standardized with the features.
T = df.pop('Target')

# Standardize every column that is not two-valued; two-valued columns keep
# their raw values.
bin_free = df.columns[df.apply(lambda column: not is_binary(column))]
df[bin_free] = df[bin_free].apply(normalize, axis=0)

# Add a constant column of ones, then put the target back as the last column.
df['w0_reg_constant'] = 1
df['Target'] = T

In [4]:
# Shuffle the rows once before they are cut into folds. The fixed random_state
# makes the permutation (and therefore every fold and metric derived from it)
# identical on each Restart & Run All.
df = shuffle(df, random_state=42)

In [5]:
def gradient_descent(X, y, theta=1e-3, epsilon=1e-5, max_iter=10000):
    """Fit least-squares weights by batch gradient descent.

    Minimizes the mean squared error of `X.dot(w)` against `y`, starting from
    a weight vector of ones.

    Parameters
    ----------
    X : 2-D object exposing `.shape`, `.T` and `.dot` (NumPy array or pandas
        DataFrame); one row per sample, one column per weight.
    y : 1-D targets, one per row of `X`.
    theta : float
        Step size (learning rate).
    epsilon : float
        Convergence threshold: iteration stops once the Euclidean norm of the
        last weight update drops below it.
    max_iter : int
        Safety cap on the iteration counter. The loop stops as soon as the
        counter exceeds it, i.e. after at most `max_iter + 2` updates; the
        default reproduces the previously hard-coded limit.

    Returns
    -------
    The final weight vector. It is a NumPy array for NumPy inputs; with pandas
    inputs it is whatever `w - theta * gradient` evaluates to (in this
    notebook a Series labelled by the columns of `X`).
    """
    n_samples = X.shape[0]
    w = np.ones(X.shape[1])
    # Sentinel far away from the start so the first convergence check passes.
    prev_w = 1000 * w
    # Loop-invariant pieces of the gradient, computed once.
    X_t = X.T
    Xt_y = X_t.dot(y)
    iter_num = 0

    while np.linalg.norm(w - prev_w) >= epsilon:
        prev_w = w
        # Gradient of the mean squared error: (2 / N) * X^T (X w - y).
        gradient = (2 / n_samples) * (X_t.dot(X.dot(w)) - Xt_y)
        w = w - theta * gradient

        if iter_num > max_iter:
            break
        iter_num += 1

    return w


def R2(x, y):
    """Coefficient of determination of predictions `x` against observations `y`.

    Computed as one minus the ratio of the residual sum of squares to the
    sum of squared deviations of `y` from its mean.
    """
    residual_ss = np.sum(np.power(y - x, 2))
    total_ss = np.sum(np.power(y - y.mean(), 2))
    return 1 - residual_ss / total_ss


def reg_prediction(X, w):
    """Return the linear-model predictions `X.dot(w)` for features `X` and weights `w`."""
    predictions = X.dot(w)
    return predictions


def RMSE(x, y):
    """Root of the mean squared difference between predictions `x` and observations `y`.

    The squared differences are summed and divided by `y.shape[0]` before
    taking the square root.
    """
    squared_error_sum = np.sum(np.power(y - x, 2))
    n_observations = y.shape[0]
    return np.sqrt(squared_error_sum / n_observations)

In [6]:
# K-fold cross-validation of the gradient-descent linear regression.
folds_index = 5  # number of folds
fold_size = round(df.shape[0] / folds_index)

RMSE_test = []
RMSE_train = []
R2_test = []
R2_train = []

# Containers kept from the original cell; nothing in this cell fills them.
features2 = pd.DataFrame()
RMSE_test2 = []
RMSE_train2 = []
R2_test2 = []
R2_train2 = []

fold_weights = []  # one fitted weight vector per fold
row_positions = np.arange(df.shape[0])

for i in range(folds_index):
    start = i * fold_size
    # The last fold runs to the end of the frame so that rows lost to the
    # rounding of fold_size still land in a test fold.
    stop = df.shape[0] if i == folds_index - 1 else (i + 1) * fold_size

    # Positional split: rows [start, stop) are held out, the rest is training data.
    in_test = (row_positions >= start) & (row_positions < stop)
    test = df[in_test]
    train = df[~in_test]

    Features = train.drop('Target', axis=1)
    Target = train['Target']
    # Step size 1e-3, convergence threshold 1e-4.
    w = gradient_descent(Features, Target, 1e-3, 1e-4)
    fold_weights.append(w)

    train_pred = reg_prediction(Features, w)
    R2_train.append(R2(train_pred, Target))
    RMSE_train.append(RMSE(train_pred, Target))

    test_pred = reg_prediction(test.drop('Target', axis=1), w)
    R2_test.append(R2(test_pred, test['Target']))
    RMSE_test.append(RMSE(test_pred, test['Target']))

# One row per fold, one column per feature weight. Columns are sorted by name
# to keep the alphabetical row order of the results table shown below.
features = pd.DataFrame(fold_weights).sort_index(axis=1)

res_df = pd.DataFrame(np.vstack([R2_test, R2_train, RMSE_test, RMSE_train]),
                      index=['R2_test', 'R2_train', 'RMSE_test', 'RMSE_train'])
res_df = pd.concat([res_df, features.T])
res_df.columns = ['T{}'.format(fold + 1) for fold in range(folds_index)]
# Mean and standard deviation are taken over the per-fold columns only.
res_df = pd.concat([res_df, res_df.mean(axis=1).rename('E(mean)'), res_df.std(axis=1).rename('STD')], axis=1)

res_df

Unnamed: 0,T1,T2,T3,T4,T5,E(mean),STD
R2_test,0.304713,0.189091,0.311218,0.361785,0.320702,0.297502,0.064544
R2_train,0.323026,0.337489,0.322988,0.309482,0.3207,0.322737,0.00997
RMSE_test,31.991725,29.81963,30.033019,28.174383,28.273922,29.658536,1.559633
RMSE_train,28.581425,29.354341,29.060449,29.541629,29.493894,29.206347,0.396489
Base Time,-3.825447,-3.975015,-4.111289,-4.19424,-4.171322,-4.055463,0.154254
CC1,-0.407737,-1.101275,-0.536146,-0.649251,-0.712987,-0.681479,0.261876
CC2,10.921531,10.79832,10.352196,10.269298,10.34646,10.537561,0.29928
CC3,3.443956,3.344832,3.398733,3.361357,3.379158,3.385607,0.038305
CC4,-1.188415,-1.501864,-1.652159,-1.320683,-1.262999,-1.385224,0.188877
CC5,7.27876,7.252679,6.846766,6.807098,6.856839,7.008428,0.235789
