In [16]:
# data normalisation with sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


In [19]:
# Load the white-wine quality table from the UCI repository; the file uses
# ';' as its field separator rather than ','.
WHITE_WINE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
white_wine = pd.read_csv(WHITE_WINE_URL, sep=";")

In [20]:
# Bucket the numeric quality score into three classes:
#   score <= 4  -> 'low'     (scores 3 and 4 in this data set)
#   score <= 6  -> 'medium'  (scores 5 and 6)
#   otherwise   -> 'high'    (scores 7, 8 and 9)
# np.select checks the conditions in order and the first match wins, so the
# second condition only ever applies to scores above 4. This is the vectorised
# equivalent of a nested if/else applied row by row.
quality_score = white_wine['quality']
quality_bucket = np.select(
    [quality_score <= 4, quality_score <= 6],
    ['low', 'medium'],
    default='high',
)

# Store the labels as a categorical with an explicit category order, so the
# integer codes follow low=0, medium=1, high=2 instead of alphabetical order.
white_wine['quality_label'] = pd.Categorical(quality_bucket,
                                             categories=['low', 'medium', 'high'])

white_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_label
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,medium
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,medium
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,medium
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,medium
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,medium
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,medium
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,medium
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,high


In [21]:
# Skewness quantifies how asymmetric a distribution is; a positive value
# means the right-hand tail is the longer one.
residual_sugar = white_wine['residual sugar'].astype(float)
print(residual_sugar.skew())

1.0770937564240868


In [22]:
# Kurtosis describes how heavy the tails of a distribution are: a high value
# (heavy tails) hints at possible outliers, a low value (light tails) at few.
# pandas reports excess kurtosis (Fisher's definition), so a normal
# distribution scores 0.
sugar_as_float = white_wine['residual sugar'].astype(float)
print(sugar_as_float.kurt())

3.4698201025634265


In [23]:
# Distinct quality scores present in the data, in order of first appearance.
pd.unique(white_wine["quality"])

array([6, 5, 7, 8, 4, 3, 9])

In [24]:
from sklearn.linear_model import LinearRegression
# Unfitted linear-regression estimator created with default settings.
# NOTE(review): the target selected later in this notebook (y_data) is the
# categorical 'quality_label' column; a regressor presumably needs a numeric
# target — confirm whether the numeric 'quality' column or a classifier is
# what is intended here.
model = LinearRegression()

In [25]:
# x holds all the feature columns I'm keeping.
# y is the target: quality, under its new name (quality_label).

In [26]:
# Reminder: DataFrame.drop returns a new frame, so either reassign the result
# or pass inplace=True, e.g. X.drop(['SalePrice'], axis=1, inplace=True)

In [27]:
# Class balance of the three quality buckets. value_counts() already lists
# every label that occurs, so the separate unique() call was dropped: a
# notebook cell only displays its last expression, which made that call a
# silent no-op.
white_wine["quality_label"].value_counts()

medium    3655
high      1060
low        183
Name: quality_label, dtype: int64

In [28]:
# Label encoder instance created with default settings.
# NOTE(review): none of the cells visible here call it (the integer codes are
# taken from the pandas categorical instead) — confirm it is used further down
# or remove it.
le = LabelEncoder()

In [34]:
# Build a small frame with the quality label and its integer category code.
# The codes follow the categorical's category order (low=0, medium=1, high=2
# for the labels created earlier in this notebook).
new_wine = (
    pd.DataFrame(white_wine, columns=['quality_label'])
    .astype({'quality_label': 'category'})
    .assign(quality_label_Cat=lambda frame: frame['quality_label'].cat.codes)
)

new_wine

Unnamed: 0,quality_label,quality_label_Cat
0,medium,1
1,medium,1
2,medium,1
3,medium,1
4,medium,1
...,...,...
4893,medium,1
4894,medium,1
4895,medium,1
4896,high,2


In [36]:
# Inspect the last 20 rows to spot-check the label -> code mapping.
new_wine.iloc[-20:]

Unnamed: 0,quality_label,quality_label_Cat
4878,low,0
4879,medium,1
4880,medium,1
4881,medium,1
4882,medium,1
4883,medium,1
4884,medium,1
4885,medium,1
4886,high,2
4887,high,2


In [42]:
# Model inputs: the eleven physicochemical measurements, i.e. every column
# except the quality score and the label derived from it.
feature_columns = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]
x_data = white_wine[feature_columns]

# Target: the three-level quality label.
y_data = white_wine["quality_label"]


In [45]:
# Plain Python list holding the two DataFrames; nothing is merged or
# concatenated at this point.
# NOTE(review): the name suggests a single combined frame was intended (for
# example via pd.concat) — confirm before using it downstream.
new_frame = [white_wine, new_wine]


In [59]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. With shuffle=False the rows keep their
# original order, so the training set is the leading part of the table and the
# test set is the trailing part.
# NOTE(review): the split is neither shuffled nor stratified, and the label
# counts shown earlier are heavily imbalanced — confirm this is intended.
split_parts = train_test_split(x_data, y_data, test_size=0.3, shuffle=False)
x_train, x_test, y_train, y_test = split_parts
x_train, x_test, y_train, y_test


(      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
 0               7.0              0.27         0.36           20.70      0.045   
 1               6.3              0.30         0.34            1.60      0.049   
 2               8.1              0.28         0.40            6.90      0.050   
 3               7.2              0.23         0.32            8.50      0.058   
 4               7.2              0.23         0.32            8.50      0.058   
 ...             ...               ...          ...             ...        ...   
 3423            7.1              0.18         0.39           14.50      0.051   
 3424            6.4              0.32         0.27            4.90      0.034   
 3425            7.1              0.17         0.40           14.55      0.047   
 3426            7.1              0.17         0.40           14.55      0.047   
 3427            5.8              0.24         0.26           10.05      0.039   
 
       free su

In [61]:
# data normalisation with sklearn
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training rows only, so no information
# from the test rows leaks into the scaling.
norm = MinMaxScaler()
norm.fit(x_train)

# Apply the training-derived scaling to both splits (training first, then test).
X_train_norm, X_test_norm = (norm.transform(split) for split in (x_train, x_test))