In [1]:
import numpy as np

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB



In [2]:
dataset = datasets.load_wine()
X = dataset.data
y = dataset.target

print("\nX.shape =", X.shape)
print("\ny.shape =", y.shape)
print("\nwine categories:\n", dataset['target'])
(category, counts) = np.unique(np.array(y), return_counts=True)
n_categories = len(category)
print("\nnumber of wine categories: {}\n".format(n_categories), np.asarray((category, counts)).T)
print("\nfeatures names:\n", dataset['feature_names'])


X.shape = (178, 13)

y.shape = (178,)

wine categories:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]

number of wine categories: 3
 [[ 0 59]
 [ 1 71]
 [ 2 48]]

features names:
 ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


### 1 Train-test split

In [18]:
# train-test splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# categories in training
print("\nwine categories in test set:\n", y_train)


wine categories in test set:
 [2 2 0 1 1 0 2 2 0 1 1 1 1 1 0 1 1 2 0 0 2 1 1 0 2 0 2 0 1 1 0 0 2 0 1 2 1
 0 0 1 1 0 0 1 0 0 2 0 2 0 0 0 1 2 2 2 1 0 0 1 1 2 1 2 0 0 2 1 1 2 0 1 2 1
 1 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 0 2 0 1 2 1 1 1 2 1 2 0 1 1 2 0 2 0 2 0 2
 1 0 0 2 1 0 2 2 1 0 1 1 0]


In [19]:
# check categories
def check_categories(y, number_in_whole_dataset=n_categories):
    '''checks all categoriest are present in y target set'''
    
    return len(np.unique(np.array(y), return_counts=False)) == n_categories

print("number of target categories in train set is equal to that of the whole dataset: {}".format(check_categories(y_train)))
    

number of target categories in train set is equal to that of the whole dataset: True


### 2 Logistic regression model

In [22]:
# model
model_LR = LogisticRegression()

In [25]:
# fitting to training se
model_LR.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [27]:
# predictions 
y_predicted_LR = model_LR.predict(X_test)

# checking number of predudicted categories
print("number of predicted categories from test set X_test is equal to that of the whole dataset: {}".format(check_categories(y_predicted)))

number of predicted categories from test set X_test is equal to that of the whole dataset: True


In [29]:
# define names for wine categories
target_names = ['category 0', 'category 1', 'category 2']

# classification report
print(classification_report(y_test, y_predicted, target_names=target_names))

              precision    recall  f1-score   support

  category 0       1.00      0.87      0.93        15
  category 1       0.88      0.95      0.91        22
  category 2       0.94      0.94      0.94        17

    accuracy                           0.93        54
   macro avg       0.94      0.92      0.93        54
weighted avg       0.93      0.93      0.93        54



### 3 Gaussian Naive Bayes model

In [32]:
### define model
model_NB = GaussianNB()

In [33]:
# fitting to training set
model_NB.fit(X_train, y_train)

GaussianNB()

In [34]:
# predictions 
y_predicted_NB = model_NB.predict(X_test)

# checking number of predudicted categories
print("number of predicted categories from test set X_test is equal to that of the whole dataset: {}".format(check_categories(y_predicted_NB)))

number of predicted categories from test set X_test is equal to that of the whole dataset: True


In [35]:
# classification report
print(classification_report(y_test, y_predicted_NB, target_names=target_names))

              precision    recall  f1-score   support

  category 0       1.00      0.93      0.97        15
  category 1       0.95      0.91      0.93        22
  category 2       0.89      1.00      0.94        17

    accuracy                           0.94        54
   macro avg       0.95      0.95      0.95        54
weighted avg       0.95      0.94      0.94        54

