In [3]:
import numpy as np
from sklearn.utils import shuffle

training_data = np.load('training_data.npy')
prices = np.load('prices.npy')

print(training_data[:4])
print(prices[:4])

training_data, prices = shuffle(training_data, prices, random_state=0)

print(training_data[:4])
print(prices[:4])

[[2.0150e+03 4.1000e+04 1.9670e+01 1.5820e+03 1.2620e+02 5.0000e+00
  1.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  1.0000e+00 0.0000e+00]
 [2.0110e+03 4.6000e+04 1.8200e+01 1.1990e+03 8.8700e+01 5.0000e+00
  1.0000e+00 0.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00
  1.0000e+00 0.0000e+00]
 [2.0120e+03 8.7000e+04 2.0770e+01 1.2480e+03 8.8760e+01 7.0000e+00
  1.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  1.0000e+00 0.0000e+00]
 [2.0130e+03 8.6999e+04 2.3080e+01 1.4610e+03 6.3100e+01 5.0000e+00
  1.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  1.0000e+00 0.0000e+00]]
[12.5  4.5  6.   3.5]
[[2.0170e+03 1.8351e+04 2.1900e+01 6.2400e+02 3.7480e+01 4.0000e+00
  1.0000e+00 0.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 1.0000e+00]
 [2.0150e+03 9.1000e+04 2.1100e+01 8.1400e+02 5.5200e+01 5.0000e+00
  1.0000e+00 0.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00
  1.0000e+00 0.0000e+00]
 [2.0180e+0

In [4]:
import sklearn.preprocessing as preprocessing

def normalize(train_data, test_data, type=None):
    if type is None:
        return train_data, test_data
    elif type == 'standard':
        scaler = preprocessing.StandardScaler()
        scaler.fit(train_data)
        return scaler.transform(train_data), scaler.transform(test_data)
    elif type == 'min-max':
        scaler = preprocessing.MinMaxScaler()
        scaler.fit(train_data)
        return scaler.transform(train_data), scaler.transform(test_data)
    elif type == 'l1' or type == 'l2':
        scaler = preprocessing.Normalizer(type)
        scaler.fit(train_data)
        return scaler.transform(train_data), scaler.transform(test_data)

In [5]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso


def train_and_validate(train_data, train_labels, n_folds, model):
    kf = KFold(n_splits=n_folds)

    kf.get_n_splits(train_data)

    mses = []
    maes = []

    for train_index, test_index in kf.split(train_data):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = train_data[train_index], train_data[test_index]
        y_train, y_test = train_labels[train_index], train_labels[test_index]

        X_train_scaled, X_test_scaled = normalize(X_train, X_test, 'standard')
        
        model.fit(X_train_scaled, y_train)

        predictions = model.predict(X_test_scaled)
        # calculam MSE, MAE
        mse_value = mean_squared_error(y_test, predictions)
        mae_value = mean_absolute_error(y_test, predictions)

        mses.append(mse_value)
        maes.append(mae_value)

    return np.mean(maes), np.mean(mses)

train_and_validate(training_data, prices, 3, LinearRegression())
# train_and_validate(training_data, prices, 3, Ridge(alpha=128))

(1.3203744, 3.169078)

In [6]:
for alpha in [1, 10, 100, 1000]:
    mae, mse = train_and_validate(training_data, prices, 3, Ridge(alpha=alpha))
    print(f'MAE for alpha={alpha} is {mae}')
    print(f'MSE for alpha={alpha} is {mse}')
    print('------------------------------------')

MAE for alpha=1 is 1.3195840120315552
MSE for alpha=1 is 3.167421340942383
------------------------------------
MAE for alpha=10 is 1.3193808794021606
MSE for alpha=10 is 3.1672849655151367
------------------------------------
MAE for alpha=100 is 1.318595051765442
MSE for alpha=100 is 3.1722867488861084
------------------------------------
MAE for alpha=1000 is 1.366579532623291
MSE for alpha=1000 is 3.4331789016723633
------------------------------------


In [9]:
alpha = 100

X_train_scaled, _ = normalize(training_data, training_data, 'standard')
y_train = prices

model = Ridge(alpha=alpha)
model.fit(X_train_scaled, y_train)
weights = model.coef_

In [10]:
weights

array([ 1.6153115 , -0.16623873, -0.41921714,  0.46560568,  1.2771597 ,
        0.13427809, -0.09638961,  0.        ,  0.35462627, -0.35463724,
        0.        ,  0.        , -0.2362309 ,  0.23623155], dtype=float32)

In [11]:
idxes = np.argsort(weights)

print('cel mai importnant', idxes[-1] + 1)
print('urmatorul', idxes[-2] + 1)
print('cel mai putin importnant', idxes[0] + 1)

cel mai importnant 1
urmatorul 5
cel mai putin importnant 3
