In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

In [2]:
head = ['Page Popularity','Page Checkins','Page talking about','Page Category'] \
       + ['extra_{}'.format(i - 4) for i in range(4, 29)] \
       + ['CC1','CC2','CC3','CC4','CC5','Base Time','Post Length','Post Share Count','Post Promotion Status','H Local'] \
       + ['published_weekday_{}'.format(i - 39) for i in range(39, 46)] \
       + ['base_weekday_{}'.format(i-45) for i in range(45, 52)] + ['Target']

df = pd.read_csv('./dataset2.csv', names=head)
df.drop('Post Promotion Status', axis=1, inplace=True)

In [3]:
def is_binary(x):
    return x.unique().shape[0] == 2

def normalize(x):
    return (x - x.mean())/x.std()


T = df['Target']
df.drop('Target', axis=1, inplace=True)
bin_free = df.columns[~df.apply(is_binary)]
df[bin_free] = df[bin_free].apply(normalize, axis=0)
df['w0_reg_constant'] = 1
df['Target'] = T

In [4]:
df = shuffle(df)

In [5]:
def gradient_descent(X, y, theta=1e-3, epsilon=1e-5):
    w = np.random.normal(size=X.shape[1])    
    w = np.ones(X.shape[1])
    new_w = 1000*w
    N = X.shape[0]
    iter_num = 0

    while np.linalg.norm(w-new_w) >= epsilon:
        new_w=w
        dw = (2 / N) * (X.T.dot( X.dot (w)) - X.T.dot(y))
        w = w - theta*dw
        
        if iter_num > 10000:
            return w
        iter_num += 1

    return w


def R2(x, y):
    return 1 - np.sum(np.power(y - x, 2)) / np.sum(np.power(y - y.mean(), 2))


def reg_prediction(X, w):
    return X.dot(w)


def RMSE(x, y):
    return np.sqrt(np.sum(np.power(y - x, 2)) / y.shape[0])

In [7]:
folds_index = 5
fold_size = round(df.shape[0] / folds_index)

features = pd.DataFrame()
RMSE_test = []
RMSE_train = []
R2_test = []
R2_train = []

features2 = pd.DataFrame()
RMSE_test2 = []
RMSE_train2 = []
R2_test2 = []
R2_train2 = []

for i in range(folds_index):
    test = df[i * fold_size:(i + 1) * fold_size]
    if i == 0:
        train = df[(i + 1) * fold_size:]
    else:
        train = df[:i * fold_size]
        if i != 4:
            train = train.append(df[(i + 1) * fold_size:], ignore_index=False)

    Features = train.drop('Target', axis=1)
    Target = train['Target']
    w = gradient_descent(Features, Target, 1e-2, 1e-3)
    features = features.append(w, ignore_index=True)

    train_pred = reg_prediction(train.drop('Target', axis=1), w)
    R2_train.append(R2(train_pred, train['Target']))
    RMSE_train.append(RMSE(train_pred, train['Target']))

    test_pred = reg_prediction(test.drop('Target', axis=1), w)
    R2_test.append(R2(test_pred, test['Target']))
    RMSE_test.append(RMSE(test_pred, test['Target']))

res_df = pd.DataFrame(np.vstack([R2_test, R2_train, RMSE_test, RMSE_train]),
                      index=['R2_test', 'R2_train', 'RMSE_test', 'RMSE_train'])
res_df = res_df.append(features.T)
res_df.columns = ['T1', 'T2', 'T3', 'T4', 'T5']
res_df = pd.concat([res_df, res_df.mean(axis=1).rename('E(mean)'), res_df.std(axis=1).rename('STD')], axis=1)

res_df

Unnamed: 0,T1,T2,T3,T4,T5,E(mean),STD
R2_test,0.309981,0.188326,0.311438,0.361894,0.321097,0.298547,0.065129
R2_train,0.325274,0.339296,0.325166,0.311664,0.322766,0.324833,0.009839
RMSE_test,31.87029,29.833685,30.028213,28.171987,28.2657,29.633975,1.517069
RMSE_train,28.533923,29.314273,29.013666,29.494918,29.449012,29.161158,0.397768
Base Time,-3.777395,-3.902124,-4.051989,-4.137681,-4.110259,-3.995889,0.152368
CC1,3.73419,1.682253,4.787892,3.273209,2.300909,3.155691,1.215959
CC2,11.39113,11.235403,10.901703,10.680289,10.757265,10.993158,0.307799
CC3,3.834616,3.722808,3.830128,3.685877,3.734185,3.761523,0.067115
CC4,-6.01261,-4.913608,-7.740024,-5.819654,-4.85618,-5.868415,1.168585
CC5,7.368459,7.325327,6.97115,6.898719,6.925289,7.097789,0.229379
