
# Проект классификации качества "Wine Quality"

## Цель проекта

Предсказать класс качества вина.

## Описание

Датасет содержит физико-химические характеристики и органолептические (выходные) переменные красного и белого португальского вина Vinho Verde.

Набор данных был загружен из репозитория машинного обучения [UCI](https://archive.ics.uci.edu/ml/datasets/wine+quality).

Классы упорядочены и не сбалансированы.


Описание от [Kaggle](https://www.kaggle.com/datasets/rajyellow46/wine-quality?resource=download)

## Начало работы с данными (изучение)

### Загрузка необходимых библиотек

In [None]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeCV


# Modelling Helpers
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer as Imputer
from sklearn.preprocessing import  Normalizer , scale
from sklearn.model_selection import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score

### Загрузка данных

Описание переменных:

0 - тип (белое или красное)

1 - fixed acidity (фиксированная кислотность)

2 - volatile acidity (летучая кислотность)

3 - citric acid (лимонная кислота)

4 - residual sugar (остаточный сахар)

5 - chlorides (хлориды)

6 - free sulfur dioxide (свободный диоксид серы)

7 - total sulfur dioxide (общий диоксид серы)

8 - density (плотность)

9 - pH

10 - sulphates (сульфаты)

11 - alcohol (содержание спирта)

Выходные данные:

12 - quality (score between 0 and 10) - качество от 0 до 10


In [None]:
# Download the dataset CSV from the project's GitHub repository into the working directory.
!wget https://raw.githubusercontent.com/lteplova/wine-project/main/data/winequalityN.csv

--2023-05-13 09:49:01--  https://raw.githubusercontent.com/lteplova/wine-project/main/data/winequalityN.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 390376 (381K) [text/plain]
Saving to: ‘winequalityN.csv’


2023-05-13 09:49:01 (31.8 MB/s) - ‘winequalityN.csv’ saved [390376/390376]



In [None]:
# `df` holds the source data read from the downloaded CSV file.
df = pd.read_csv('winequalityN.csv')

In [None]:
# Display the loaded table for a first look at the data.
df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


## Baseline классификация на необработанных данных

In [None]:
# Print column dtypes and non-null counts for the loaded data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   type                  6497 non-null   object 
 1   fixed acidity         6487 non-null   float64
 2   volatile acidity      6489 non-null   float64
 3   citric acid           6494 non-null   float64
 4   residual sugar        6495 non-null   float64
 5   chlorides             6495 non-null   float64
 6   free sulfur dioxide   6497 non-null   float64
 7   total sulfur dioxide  6497 non-null   float64
 8   density               6497 non-null   float64
 9   pH                    6488 non-null   float64
 10  sulphates             6493 non-null   float64
 11  alcohol               6497 non-null   float64
 12  quality               6497 non-null   int64  
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB


In [None]:
# List the column names available in the dataset.
df.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [None]:
# Remove every row that contains at least one missing value.
df = df.dropna(axis='index', how='any')

In [None]:
# Re-check dtypes and non-null counts after the rows with missing values were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6463 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   type                  6463 non-null   object 
 1   fixed acidity         6463 non-null   float64
 2   volatile acidity      6463 non-null   float64
 3   citric acid           6463 non-null   float64
 4   residual sugar        6463 non-null   float64
 5   chlorides             6463 non-null   float64
 6   free sulfur dioxide   6463 non-null   float64
 7   total sulfur dioxide  6463 non-null   float64
 8   density               6463 non-null   float64
 9   pH                    6463 non-null   float64
 10  sulphates             6463 non-null   float64
 11  alcohol               6463 non-null   float64
 12  quality               6463 non-null   int64  
dtypes: float64(11), int64(1), object(1)
memory usage: 706.9+ KB


In [None]:
# Training data: the twelve feature columns (X) and the quality target (y).
# X is taken as an explicit copy because its 'type' column is overwritten
# later; assigning into a bare slice of `df` triggers pandas'
# SettingWithCopyWarning.
X = df[[
    'type',
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]].copy()
y = df['quality']

In [None]:
# Handle the categorical 'type' feature: fit a label encoder on its values
# so they can be mapped to integer codes.
le = LabelEncoder()
le.fit(X['type'])

In [None]:
# Replace the textual wine type with the integer codes from the fitted encoder,
# then display the resulting feature table.
X['type'] = le.transform(X['type'])
X

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,1,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8
1,1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5
2,1,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1
3,1,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
4,1,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
...,...,...,...,...,...,...,...,...,...,...,...,...
6491,0,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5
6492,0,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
6494,0,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
6495,0,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### Модель SVC()

In [None]:
# Empty table that collects the resulting metrics, one row per model.
df_metrics = pd.DataFrame(
    data=None,
    columns=['model', 'f1_score', 'accuracy'],
)
df_metrics

Unnamed: 0,model,f1_score,accuracy


In [None]:
# Training: standardize the features, then fit a support vector classifier.
# The step names match the ones make_pipeline would generate.
clf = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
    ('svc', SVC(gamma='auto')),
])
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

In [None]:
# Evaluation: predict on the held-out set and print per-class metrics.
predicted = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predicted))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00        48
           5       0.63      0.61      0.62       430
           6       0.53      0.76      0.62       540
           7       0.63      0.27      0.38       228
           8       0.00      0.00      0.00        42
           9       0.00      0.00      0.00         1

    accuracy                           0.57      1293
   macro avg       0.26      0.23      0.23      1293
weighted avg       0.54      0.57      0.53      1293



In [None]:
# Record the SVC results. DataFrame.append was removed in pandas 2.0, so the
# new row is built as a one-row frame and concatenated onto the table.
new_row = pd.DataFrame([{
    'model': 'SVC',
    'f1_score': f1_score(y_test, predicted, average='weighted'),
    'accuracy': accuracy_score(y_test, predicted),
}])
df_metrics = pd.concat([df_metrics, new_row], ignore_index=True)

### Модель LogisticRegression()

In [None]:
%time

param_grid = {'penalty':['l2', 'l1', 'elasticnet'],
              'max_iter': [100, 500, 1000, 1500, 2000],
            #   'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
              'multi_class': ['auto', 'ovr', 'multinomial']
}
model = LogisticRegression()
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

grid.fit(X,y)

In [None]:
# Report the best cross-validated score and the estimator that achieved it.
print(grid.best_score_, grid.best_estimator_, sep='\n')

In [None]:
# Training: standardized features feeding a one-vs-rest logistic regression,
# followed by per-class metrics on the held-out set.
# The step names match the ones make_pipeline would generate.
clf = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
    ('logisticregression', LogisticRegression(max_iter=1500, multi_class='ovr')),
])
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predicted))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       1.00      0.04      0.08        48
           5       0.60      0.59      0.59       430
           6       0.50      0.74      0.59       540
           7       0.58      0.17      0.26       228
           8       0.00      0.00      0.00        42
           9       0.00      0.00      0.00         1

    accuracy                           0.53      1293
   macro avg       0.38      0.22      0.22      1293
weighted avg       0.55      0.53      0.49      1293



In [None]:
# Record the LogisticRegression results. DataFrame.append was removed in
# pandas 2.0, so the new row is built as a one-row frame and concatenated.
new_row = pd.DataFrame([{
    'model': 'LogisticRegression',
    'f1_score': f1_score(y_test, predicted, average='weighted'),
    'accuracy': accuracy_score(y_test, predicted),
}])
df_metrics = pd.concat([df_metrics, new_row], ignore_index=True)

### Модель LinearRegression()

In [None]:
# Model setup: standardized features feeding an ordinary least-squares regressor.
# The step names match the ones make_pipeline would generate.
clf = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
    ('linearregression', LinearRegression()),
])

# Fit on the training split, then predict (continuous values) for the test split.
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

In [None]:
# Compute RMSE for the regression predictions.
# The predictions are stored in `predicted`; `predict` is not defined in this notebook.
rmse_start = np.sqrt(mean_squared_error(y_test, predicted))
rmse_start

0.7196084743645155

In [None]:
# Compute the R2 (coefficient of determination) for the regression predictions.
# The predictions are stored in `predicted`; `predict` is not defined in this notebook.
r2 = r2_score(y_test, predicted)
r2

0.34647498390395914

In [None]:
# accuracy_score requires discrete labels, but the regressor outputs
# continuous values, so round each prediction to the nearest integer
# quality score before comparing with the true classes.
accuracy_score(y_test, np.rint(predicted).astype(int))

0.534416086620263

###  Модель DecisionTreeClassifier()

In [None]:
# Depth-limited decision tree on standardized features; the seed fixes tie-breaking.
# The step names match the ones make_pipeline would generate.
clf = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
    ('decisiontreeclassifier', DecisionTreeClassifier(max_depth=5, random_state=123)),
])
clf.fit(X_train, y_train)

In [None]:
# Metrics: predict on the held-out set and print the per-class report.
predicted = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predicted))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00        48
           5       0.62      0.58      0.60       430
           6       0.50      0.72      0.59       540
           7       0.38      0.18      0.24       228
           8       0.00      0.00      0.00        42
           9       0.00      0.00      0.00         1

    accuracy                           0.53      1293
   macro avg       0.22      0.21      0.20      1293
weighted avg       0.48      0.53      0.49      1293



In [None]:
# Record the DecisionTreeClassifier results. DataFrame.append was removed in
# pandas 2.0, so the new row is built as a one-row frame and concatenated.
new_row = pd.DataFrame([{
    'model': 'DecisionTreeClassifier',
    'f1_score': f1_score(y_test, predicted, average='weighted'),
    'accuracy': accuracy_score(y_test, predicted),
}])
df_metrics = pd.concat([df_metrics, new_row], ignore_index=True)

### Модель RandomForestClassifier()

In [None]:
# Random forest of depth-limited trees on standardized features, with a fixed seed.
# The step names match the ones make_pipeline would generate.
clf = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
    ('randomforestclassifier', RandomForestClassifier(max_depth=5, random_state=123)),
])
clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out set and print the per-class report.
predicted = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predicted))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00        48
           5       0.65      0.60      0.62       430
           6       0.51      0.78      0.62       540
           7       0.59      0.18      0.27       228
           8       0.00      0.00      0.00        42
           9       0.00      0.00      0.00         1

    accuracy                           0.56      1293
   macro avg       0.25      0.22      0.22      1293
weighted avg       0.53      0.56      0.51      1293



In [None]:
# Record the RandomForestClassifier results. DataFrame.append was removed in
# pandas 2.0, so the new row is built as a one-row frame and concatenated.
new_row = pd.DataFrame([{
    'model': 'RandomForestClassifier',
    'f1_score': f1_score(y_test, predicted, average='weighted'),
    'accuracy': accuracy_score(y_test, predicted),
}])
df_metrics = pd.concat([df_metrics, new_row], ignore_index=True)


### Модель BaggingClassifier()

In [None]:
# Bagging of 10 decision trees, each trained on half of the samples and half
# of the features, on standardized inputs.
# The step names match the ones make_pipeline would generate.
clf = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
    ('baggingclassifier', BaggingClassifier(
        DecisionTreeClassifier(),
        random_state=10,
        max_features=0.5,
        max_samples=0.5,
        n_estimators=10,
    )),
])
clf.fit(X_train, y_train)

In [None]:
# Predict with the bagging pipeline fitted in the previous cell. It is bound
# to `clf`; the name `bagging` is never defined in this notebook.
predicted = clf.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.27      0.08      0.13        48
           5       0.65      0.71      0.68       430
           6       0.59      0.69      0.64       540
           7       0.64      0.46      0.53       228
           8       0.75      0.14      0.24        42
           9       0.00      0.00      0.00         1

    accuracy                           0.61      1293
   macro avg       0.41      0.30      0.32      1293
weighted avg       0.61      0.61      0.60      1293



In [None]:
# Record the BaggingClassifier results. DataFrame.append was removed in
# pandas 2.0, so the new row is built as a one-row frame and concatenated.
new_row = pd.DataFrame([{
    'model': 'BaggingClassifier',
    'f1_score': f1_score(y_test, predicted, average='weighted'),
    'accuracy': accuracy_score(y_test, predicted),
}])
df_metrics = pd.concat([df_metrics, new_row], ignore_index=True)


### Модель StackingClassifier()

In [None]:
# Build the stacking ensemble and train it on the training split.
# `model_tree` and `model_forest` are never defined in this notebook, so the
# base estimators are created inline.
# NOTE(review): their hyperparameters mirror the standalone tree / forest
# cells above -- confirm this matches the intended base models.
# Features are standardized like for every other model here, which also
# helps LinearSVC converge.
clf = make_pipeline(
    StandardScaler(),
    StackingClassifier(
        [
            ('DecisionTreeClassifier', DecisionTreeClassifier(max_depth=5, random_state=123)),
            ('RandomForestClassifier', RandomForestClassifier(max_depth=5, random_state=123)),
            ('LinearSVC', LinearSVC()),
        ]
    ),
)
clf.fit(X_train, y_train)

In [None]:
# Predict with the stacking model fitted in the previous cell (bound to `clf`)
# and store the result in `predicted`. The original called an undefined
# `stacking` object and overwrote `clf`, so the report and the recorded
# metrics would have reused the previous model's stale predictions.
predicted = clf.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.27      0.08      0.13        48
           5       0.65      0.71      0.68       430
           6       0.59      0.69      0.64       540
           7       0.64      0.46      0.53       228
           8       0.75      0.14      0.24        42
           9       0.00      0.00      0.00         1

    accuracy                           0.61      1293
   macro avg       0.41      0.30      0.32      1293
weighted avg       0.61      0.61      0.60      1293



In [None]:
# Record the StackingClassifier results. DataFrame.append was removed in
# pandas 2.0, so the new row is built as a one-row frame and concatenated.
new_row = pd.DataFrame([{
    'model': 'StackingClassifier',
    'f1_score': f1_score(y_test, predicted, average='weighted'),
    'accuracy': accuracy_score(y_test, predicted),
}])
df_metrics = pd.concat([df_metrics, new_row], ignore_index=True)

### Итоговый датасет с результатами по метрикам

In [None]:
# Rank the models from highest to lowest accuracy and renumber the rows from zero.
df_metrics.sort_values(by='accuracy', ascending=False).reset_index(drop=True)

Unnamed: 0,model,f1_score,accuracy
0,BaggingClassifier,0.597771,0.614076
1,StackingClassifier,0.597771,0.614076
2,SVC,0.532912,0.568445
3,RandomForestClassifier,0.511943,0.556071
4,LogisticRegression,0.493592,0.534416
5,DecisionTreeClassifier,0.4891,0.525909
