In [1]:
import pandas as pd
import numpy as np
import copy as cp


def get_weights(X, y):
    """Return the ordinary least-squares weights for ``X @ w ~ y``.

    The weights minimise the squared error ``||X w - y||^2``. They are
    obtained with ``np.linalg.lstsq`` rather than by solving the normal
    equations ``(X^T X) w = X^T y`` directly: forming ``X^T X`` squares the
    condition number, and ``np.linalg.solve`` raises ``LinAlgError`` as soon
    as two feature columns are exactly collinear.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Design matrix (a DataFrame or an ndarray).
    y : array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    numpy.ndarray
        Weight vector with one entry per column of ``X``. When ``X`` is
        rank-deficient this is the minimum-norm least-squares solution.
    """
    # rcond=None selects the machine-precision based cutoff for small
    # singular values and avoids the FutureWarning of older NumPy releases.
    weights = np.linalg.lstsq(X, y, rcond=None)[0]
    return weights


def gradient_descent(X, y, theta=1e-2, epsilon=1e-5, max_iter=None):
    """Refine linear-regression weights with single-sample gradient steps.

    Starting from the weights returned by ``get_weights``, one row index is
    drawn uniformly at random per iteration and ``gradient_step`` is applied
    to that row. Iteration stops as soon as the Euclidean norm of the change
    in the weights falls below ``epsilon``.

    Parameters
    ----------
    X : feature table whose rows are indexed positionally by ``gradient_step``.
    y : target values, aligned positionally with the rows of ``X``.
    theta : float
        Step size forwarded to ``gradient_step``.
    epsilon : float
        Convergence threshold on the norm of the weight change.
    max_iter : int or None
        Optional upper bound on the number of steps. ``None`` (the default)
        keeps the loop unbounded, as before.

    Returns
    -------
    The weights after the last step taken.
    """
    # NOTE(review): the starting point is already the least-squares solution,
    # so the loop only perturbs it -- confirm this initialisation is intended.
    w = get_weights(X, y)

    # A private generator yields the same index sequence as seeding the global
    # NumPy RNG with 42, but does not reset random state used by other code.
    rng = np.random.RandomState(42)

    dw = np.inf
    steps = 0
    while dw >= epsilon and (max_iter is None or steps < max_iter):
        rand_ind = rng.randint(X.shape[0])
        new_w = gradient_step(X, y, w, rand_ind, theta)
        dw = np.linalg.norm(w - new_w)
        w = new_w
        steps += 1
    return w


def gradient_step(X, y, w, train_ind, theta=0.01):
    """Take one gradient step of the squared error on a single row.

    Parameters
    ----------
    X : table of features; the row at position ``train_ind`` is read via ``.iloc``.
    y : targets; the entry at position ``train_ind`` is read via ``.iloc``.
    w : current weights.
    train_ind : int
        Position of the row used for this step.
    theta : float
        Step size.

    Returns
    -------
    ``w`` moved against the gradient of that row's squared residual, with the
    step scaled by ``2 * theta / n_rows``.
    """
    sample = X.iloc[train_ind, :]
    # Signed error of the current weights on the chosen row.
    residual = reg_prediction(sample, w) - y.iloc[train_ind]
    scale = 2 * theta / X.shape[0]
    return w - scale * sample * residual


def is_binary(x):
    """Tell whether ``x`` contains exactly two distinct values.

    ``x`` must provide a pandas-style ``unique()`` method. Whatever that
    method reports as distinct is counted, so a missing value counts as a
    value of its own.
    """
    distinct_values = x.unique()
    return len(distinct_values) == 2


def normalize(x):
    """Standardize ``x``: subtract its mean, then divide by its standard deviation.

    Both statistics come from ``x.mean()`` and ``x.std()``, so the result has
    the same type and labels as ``x``.
    """
    centered = x - x.mean()
    spread = x.std()
    return centered / spread


def R2(x, y):
    """Coefficient of determination of ``x`` with respect to the reference ``y``.

    Computed as ``1 - SS_res / SS_tot`` where ``SS_res`` is the sum of squared
    differences between ``y`` and ``x`` and ``SS_tot`` is the sum of squared
    deviations of ``y`` from its own mean.
    """
    residual_ss = np.sum(np.power(y - x, 2))
    total_ss = np.sum(np.power(y - y.mean(), 2))
    return 1 - residual_ss / total_ss


def reg_prediction(X, w):
    """Linear-model prediction: the dot product of features ``X`` with weights ``w``.

    ``X`` may be a single row (giving a scalar) or a table of rows (giving
    one prediction per row).
    """
    predictions = np.dot(X, w)
    return predictions


def RMSE(x, y):
    """Root-mean-square error between ``x`` and ``y``.

    The squared differences are summed, divided by the length of ``y``
    (``y.shape[0]``) and square-rooted.
    """
    squared_error_sum = np.sum(np.power(y - x, 2))
    mean_squared_error = squared_error_sum / y.shape[0]
    return np.sqrt(mean_squared_error)


### load
# Read the raw table, then discard its first column and the
# 'Post Promotion Status' column before any modelling.
df = pd.read_csv('./dataset.csv')
df = df.drop([df.columns[0], 'Post Promotion Status'], axis=1)

### normalize
# Standardize every column that is not two-valued (this includes 'Target'
# unless it is two-valued itself); two-valued columns keep their raw coding.
# NOTE(review): the statistics are computed on the full table, i.e. before
# the train/test split, so test rows influence the scaling -- confirm this
# is acceptable.
bin_free = df.columns[~df.apply(is_binary)]
df[bin_free] = df[bin_free].apply(normalize, axis=0)
# Column of ones so the intercept is learned as an ordinary weight.
df['w0_reg_constant'] = 1

### shuffle
# A fixed seed keeps the row order -- and therefore the fold assignment and
# the reported table -- reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

### train
# K-fold cross-validation: every fold serves once as the test set while all
# remaining rows are used to fit the weights.
fold_weights = []
RMSE_test = []
RMSE_train = []
R2_test = []
R2_train = []
folds_index = 5
n_rows = df.shape[0]
fold_size = round(n_rows / folds_index)
row_positions = np.arange(n_rows)

for i in range(folds_index):
    start = i * fold_size
    # The last fold runs to the end of the table so that rows left over by
    # the rounding of fold_size are still evaluated exactly once.
    stop = n_rows if i == folds_index - 1 else (i + 1) * fold_size
    in_test = (row_positions >= start) & (row_positions < stop)
    test = df[in_test]
    train = df[~in_test]

    Features = train.drop('Target', axis=1)
    Target = train['Target']
    w = gradient_descent(Features, Target)
    fold_weights.append(w)

    train_pred = reg_prediction(Features, w)
    R2_train.append(R2(train_pred, Target))
    RMSE_train.append(RMSE(train_pred, Target))

    test_pred = reg_prediction(test.drop('Target', axis=1), w)
    R2_test.append(R2(test_pred, test['Target']))
    RMSE_test.append(RMSE(test_pred, test['Target']))

# One row of weights per fold, columns named after the features. Built once
# from the collected list (DataFrame.append no longer exists in pandas 2.x).
features = pd.DataFrame(fold_weights).reset_index(drop=True)

### results
# Metric rows first, then one row per feature weight; one column per fold.
metric_rows = pd.DataFrame(
    np.vstack([R2_test, R2_train, RMSE_test, RMSE_train]),
    index=['R2_test', 'R2_train', 'RMSE_test', 'RMSE_train'])
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
res_df = pd.concat([metric_rows, features.T])
# Label the folds T1..Tk from the actual column count instead of assuming five.
res_df.columns = ['T{}'.format(k) for k in range(1, res_df.shape[1] + 1)]
# Mean and standard deviation are taken over the fold columns only, then
# attached as two extra columns.
res_df = pd.concat([res_df, res_df.mean(axis=1).rename('E(mean)'), res_df.std(axis=1).rename('STD')], axis=1)

# To export the table: res_df.to_csv('out.csv', sep='\t', encoding='utf-8')
res_df

Unnamed: 0,T1,T2,T3,T4,T5,E(mean),STD
R2_test,0.283498,0.285561,0.291685,0.239351,0.440405,0.3081,0.076834
R2_train,0.335923,0.332868,0.319477,0.353461,0.297468,0.32784,0.020853
RMSE_test,0.917296,0.800015,0.739138,0.988589,0.699055,0.828818,0.121517
RMSE_train,0.796922,0.827332,0.848172,0.774901,0.85133,0.819732,0.033148
Base Time,-0.11244,-0.111504,-0.114815,-0.108226,-0.118736,-0.113144,0.003919
CC1,-0.027008,0.234791,0.149849,0.222127,0.248171,0.165586,0.114178
CC2,4.577482,-55.750623,2.245735,6.796252,-1.158503,-8.657931,26.489908
CC3,-3.841726,51.791301,-1.688769,-5.859552,1.462384,8.372728,24.422887
CC4,-0.016272,-0.323463,-0.253151,-0.29482,-0.330675,-0.243676,0.130722
CC5,-5.021324,68.696083,-2.174935,-7.70394,1.989746,11.157126,32.365098
