In [5]:
import pandas as pd
import numpy as np
import copy as cp


def get_weights(X, y):
    A = np.dot(X.T, X)
    b = np.dot(X.T, y)
    return np.linalg.solve(A, b)


def gradient_descent(X, y, theta=1e-2, epsilon=1e-5):
    dw = np.inf
    w = get_weights(X, y)
    np.random.seed(42)

    while dw >= epsilon:
        rand_ind = np.random.randint(X.shape[0])
        new_w = gradient_step(X, y, w, rand_ind, theta)
        dw = np.linalg.norm(w - new_w)
        w = new_w
    return w


def gradient_step(X, y, w, train_ind, theta=0.01):
    N = X.shape[0]
    x = X.iloc[train_ind, :]
    y_pred = reg_prediction(x, w)
    rs = (y_pred - y.iloc[train_ind])
    # print(N, '\n', x, '\n', y_pred, '\n', rs, '\n')
    return w - 2 * theta / N * x * rs


def is_binary(x):
    return x.unique().shape[0] == 2


def normalize(x):
    return (x - x.mean()) / x.std()


def R2(x, y):
    return 1 - np.sum(np.power(y - x, 2)) / np.sum(np.power(y - y.mean(), 2))


def reg_prediction(X, w):
    return np.dot(X, w)


def RMSE(x, y):
    return np.sqrt(np.sum(np.power(y - x, 2)) / y.shape[0])


### ввод
df = pd.read_csv('./dataset.csv')
df.drop(df.columns[0], axis=1, inplace=True)
df.drop('Post Promotion Status', axis=1, inplace=True)

### нормализуем
bin_free = df.columns[~df.apply(is_binary)]
df[bin_free] = df[bin_free].apply(normalize, axis=0)
df['w0_reg_constant'] = 1

### перемешиваем
df = df.sample(frac=1).reset_index(drop=True)
folds_index = 5
fold_size = round(df.shape[0] / folds_index)

### обучаем
features = pd.DataFrame()
RMSE_test = []
RMSE_train = []
R2_test = []
R2_train = []

for i in range(folds_index):
    test = df[i * fold_size:(i + 1) * fold_size]
    if i == 0:
        train = df[(i + 1) * fold_size:]
    else:
        train = df[:i * fold_size]
        if i != 4:
            train = train.append(df[(i + 1) * fold_size:], ignore_index=False)

    Features = train.drop('Target', axis=1)
    Target = train['Target']
    w = gradient_descent(Features, Target)
    features = features.append(w, ignore_index=True)

    train_pred = reg_prediction(train.drop('Target', axis=1), w)
    R2_train.append(R2(train_pred, train['Target']))
    RMSE_train.append(RMSE(train_pred, train['Target']))

    test_pred = reg_prediction(test.drop('Target', axis=1), w)
    R2_test.append(R2(test_pred, test['Target']))
    RMSE_test.append(RMSE(test_pred, test['Target']))

res_df = pd.DataFrame(np.vstack([R2_test, R2_train, RMSE_test, RMSE_train]),
                      index=['R2_test', 'R2_train', 'RMSE_test', 'RMSE_train'])
res_df = res_df.append(features.T)
res_df.columns = ['T1', 'T2', 'T3', 'T4', 'T5']
res_df = pd.concat([res_df, res_df.mean(axis=1).rename('E(mean)'), res_df.std(axis=1).rename('STD')], axis=1)

#print(res_df)
# res_df.to_csv('out.csv', sep='\t', encoding='utf-8')
res_df

Unnamed: 0,T1,T2,T3,T4,T5,E(mean),STD
R2_test,0.367573,0.361563,0.242632,0.345268,0.035011,0.270409,0.141012
R2_train,0.315509,0.317009,0.343992,0.314025,0.35226,0.328559,0.01813
RMSE_test,0.732697,0.728549,0.860359,0.86036,1.080974,0.852588,0.143214
RMSE_train,0.842802,0.843651,0.81221,0.814592,0.783233,0.819297,0.025092
Base Time,-0.114948,-0.117137,-0.11115,-0.116866,-0.104993,-0.113019,0.005084
CC1,0.069873,0.099486,0.253748,0.181062,0.175199,0.155873,0.072738
CC2,4.223119,3.78679,1.907455,-5.545461,3.75659,1.625699,4.106779
CC3,-3.485798,-3.103965,-1.347151,5.485196,-3.076908,-1.105725,3.776301
CC4,-0.154192,-0.169066,-0.329744,-0.249227,-0.288057,-0.238057,0.075539
CC5,-4.568503,-4.049383,-1.722446,7.328691,-4.013169,-1.404962,5.004528
