In [1]:
import numpy as np

from scipy.sparse import load_npz
from sklearn import metrics
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import MultinomialNB

### Features

- 0
  - skupina bodového hodnotenia
  - ordinálny atribút
  - Z+
- 1
  - hodnotenie
  - numericky spojitý atribút
  - <0, 1>
- 2
  - cena
  - numericky spojitý atribút
  - <0, 1>
- 3
  - hodnotenie ^ 2
  - numericky spojitý atribút
  - <0, 1>
- 4
  - cena ^ 2
  - numericky spojitý atribút
  - <0, 1>
- 5
  - hodnotenie * cena
  - numericky spojitý atribút
  - <0, 1>
- 6 - 46 (41 atribútov)
  - krajina pôvodu
  - nominálny atribút
  - {0, 1}
- 47 - 65 (19 atribútov)
  - priateľ vína
  - nominálny atribút
  - {0, 1}
- 66 - n-1
  - recenzia
  - tf-idf (spojitý)
  - <0, 1>

##### Disclaimer
Niektoré atribúty sú tie isté alebo navzájom závislé, a preto je potrebné zvážiť ich výber a použitie.

In [2]:
def sparse_delete_columns(mat, indices):
    """Return ``mat`` with the columns listed in ``indices`` removed.

    Parameters
    ----------
    mat : 2-D matrix
        Anything exposing ``shape`` and supporting ``mat[:, bool_mask]``
        indexing.
    indices : iterable of int
        Positions of the columns to drop.

    Returns
    -------
    The selection ``mat[:, keep]`` where ``keep`` is True for every column
    that is not listed in ``indices``; remaining columns keep their order.
    """
    # Materialise first so that ranges/generators can be used as an index.
    drop = list(indices)
    n_cols = mat.shape[1]
    keep = np.ones(n_cols, dtype=bool)
    keep[drop] = False
    return mat[:, keep]


def select_for_points_regression(data):
    """Split ``data`` into features and the continuous ``points`` target.

    Columns 0, 3 and 5 (points_group, points^2, points*price) are dropped
    with ``sparse_delete_columns``. Of what remains, the first column is
    returned as a dense 1-D target and every other column as the features.

    Returns
    -------
    (X, y) : feature matrix and flattened target array.
    """
    derived_from_target = [0, 3, 5]  # points_group, points^2, points*price
    reduced = sparse_delete_columns(data, derived_from_target)
    target_column, features = reduced[:, 0], reduced[:, 1:]  # points | rest
    return features, target_column.toarray().flatten()


def select_for_points_classification(data):
    """Split ``data`` into features and the ``points_group`` class target.

    Columns 1, 3 and 5 (points, points^2, points*price) are dropped with
    ``sparse_delete_columns``. Of what remains, the first column is returned
    as a dense 1-D target and every other column as the features.

    Returns
    -------
    (X, y) : feature matrix and flattened target array.
    """
    derived_from_points = [1, 3, 5]  # points, points^2, points*price
    reduced = sparse_delete_columns(data, derived_from_points)
    target_column, features = reduced[:, 0], reduced[:, 1:]  # points_group | rest
    return features, target_column.toarray().flatten()


def select_for_country_classification(data, decode_labels=False):
    """Split ``data`` into features and the country-of-origin target.

    Parameters
    ----------
    data : sparse matrix
        Full feature matrix; columns 6..46 hold the country one-hot flags.
    decode_labels : bool, default False
        When False (the previous behaviour) ``y`` is the raw one-hot block
        ``data[:, 6:47]``. When True the one-hot block is reverted to a
        1-D array of integer labels (position of the largest entry in each
        row), which is the form multiclass estimators expect.

    Returns
    -------
    (X, y) : ``X`` is ``data`` without column 0 (points_group) and without
    the country columns; ``y`` is described above.
    """
    y = data[:, 6:47]  # country
    if decode_labels:
        # Row-wise argmax turns each one-hot row into the index of its flag.
        # NOTE(review): a row with no flag set would decode to 0 as well —
        # confirm every row has exactly one country flag.
        y = np.asarray(y.argmax(axis=1)).flatten()
    cols = [0] + list(range(6, 47))  # points_group, country
    X = sparse_delete_columns(data, cols)
    return X, y

In [3]:
def evaluate_reg(reg, X, y):
    """Print regression metrics for ``reg`` evaluated on ``X`` against ``y``.

    Calls ``reg.predict(X)`` once and prints, in this order, the mean squared
    error, the mean absolute error and the R^2 score, each as a heading line,
    the value, and a blank line. Nothing is returned.
    """
    y_pred = reg.predict(X)

    report = (
        ('Mean squared error:', metrics.mean_squared_error),
        ('Mean absolute error:', metrics.mean_absolute_error),
        ('R^2 score function:', metrics.r2_score),
    )
    for heading, score_fn in report:
        print(heading)
        print(score_fn(y, y_pred))
        print()


def evaluate_clf(clf, X, y, labels=None):
    """Print classification metrics for ``clf`` evaluated on ``X`` against ``y``.

    Calls ``clf.predict(X)`` once and prints the confusion matrix, the
    accuracy, and the per-class precision, recall and F1 (``average=None``).
    Nothing is returned.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : feature matrix passed to ``clf.predict``.
    y : true labels.
    labels : optional list of labels forwarded to the metric functions to
        select/order the reported classes.
    """
    predicted = clf.predict(X)

    # A class that is never predicted (or never present) has an undefined
    # precision/recall/F1. zero_division=0 reports it as 0.0 — the same value
    # as before — without emitting an UndefinedMetricWarning into the output.
    per_class = dict(average=None, labels=labels, zero_division=0)

    print('Confusion matrix:')
    print(metrics.confusion_matrix(y, predicted, labels=labels))
    print()

    print('Accuracy:')
    print(metrics.accuracy_score(y, predicted))
    print()

    print('Precision per class:')
    print(metrics.precision_score(y, predicted, **per_class))
    print()

    print('Recall per class:')
    print(metrics.recall_score(y, predicted, **per_class))
    print()

    print('F1 per class:')
    print(metrics.f1_score(y, predicted, **per_class))
    print()

In [4]:
# Load the preprocessed 1-gram train and test matrices from disk.
train, test = (
    load_npz('../data/preprocessed/1-gram/train.npz'),
    load_npz('../data/preprocessed/1-gram/test.npz'),
)

In [5]:
# Build feature/target pairs for the points regression task on both splits.
(X_train, y_train), (X_test, y_test) = (
    select_for_points_regression(train),
    select_for_points_regression(test),
)

In [6]:
# Baseline: a DummyRegressor with default settings, scored on the test split.
dummy_reg = DummyRegressor().fit(X_train, y_train)
evaluate_reg(dummy_reg, X_test, y_test)

Mean squared error:
0.021965083977195234

Mean absolute error:
0.1215780835068947

R^2 score function:
-0.00012332037438933519



In [7]:
# Ordinary least squares on the same features, scored on the test split.
linear_reg = LinearRegression(n_jobs=-1).fit(X_train, y_train)
evaluate_reg(linear_reg, X_test, y_test)

Mean squared error:
0.0038012523643798225

Mean absolute error:
0.047789441308890505

R^2 score function:
0.82691979961508



In [8]:
# Rebuild feature/target pairs, this time for points_group classification.
(X_train, y_train), (X_test, y_test) = (
    select_for_points_classification(train),
    select_for_points_classification(test),
)

In [9]:
# Random baseline for the classifiers below.
# The strategy is pinned because the library default is not the same in every
# scikit-learn release; the output recorded for this cell corresponds to
# class-proportional random guessing. random_state makes those random
# predictions repeatable on Restart & Run All.
dummy_clf = DummyClassifier(strategy='stratified', random_state=42)
dummy_clf.fit(X_train, y_train)
evaluate_clf(dummy_clf, X_test, y_test)

Confusion matrix:
[[   6   78  106  104   13    0]
 [  71  928 1481 1437  180    0]
 [ 118 1459 2232 2172  254    4]
 [ 126 1410 2160 2092  289    1]
 [  15  165  269  266   44    0]
 [   0    2    0    1    1    0]]

Accuracy:
0.3032486845115534

Precision per class:
[0.01785714 0.22958931 0.35723431 0.34453228 0.05633803 0.        ]

Recall per class:
[0.01954397 0.2265072  0.35774964 0.34419217 0.05797101 0.        ]

F1 per class:
[0.01866252 0.22803784 0.35749179 0.34436214 0.05714286 0.        ]





In [10]:
# Logistic regression on the classification features, scored on the test split.
linear_clf = LogisticRegression(
    random_state=42,
    verbose=1,
    n_jobs=-1,
).fit(X_train, y_train)
evaluate_clf(linear_clf, X_test, y_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    6.2s finished
  _warn_prf(average, modifier, msg_start, len(result))


Confusion matrix:
[[  72  223   10    2    0    0]
 [  24 2705 1273   93    2    0]
 [   4  834 4240 1157    4    0]
 [   0   52 1273 4651  102    0]
 [   0    0    3  577  179    0]
 [   0    0    0    1    3    0]]

Accuracy:
0.6775909402882636

Precision per class:
[0.72       0.70922916 0.62362112 0.71763617 0.61724138 0.        ]

Recall per class:
[0.23452769 0.6602392  0.67959609 0.76521882 0.23583663 0.        ]

F1 per class:
[0.35380835 0.68385792 0.6504065  0.74066407 0.34127741 0.        ]



In [11]:
# Multinomial naive Bayes on the same features, scored on the test split.
nb_clf = MultinomialNB().fit(X_train, y_train)
evaluate_clf(nb_clf, X_test, y_test)

Confusion matrix:
[[  40  226   26   15    0    0]
 [  19 2096 1490  492    0    0]
 [   2 1237 2986 2014    0    0]
 [   0  343 1251 4473   11    0]
 [   0    4   23  702   30    0]
 [   0    0    0    3    1    0]]

Accuracy:
0.5505033173186914

Precision per class:
[0.6557377  0.53661034 0.51696676 0.58098454 0.71428571 0.        ]

Recall per class:
[0.13029316 0.51159385 0.47860234 0.73593287 0.03952569 0.        ]

F1 per class:
[0.2173913  0.52380357 0.49704536 0.64934311 0.07490637 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))
