In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('/content/winequality-red.csv')

In [3]:
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [5]:
data['quality'].unique()

array([5, 6, 7, 4, 8, 3])

##Separate Features and Target

In [6]:
x = data.iloc[:,:-1].values
y = data.iloc[:, -1].values

##Splitting Data into Train and Test sets

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

##Standardization

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
dtrain = y_train.reshape(len(y_train), 1)
dtest = y_test.reshape(len(y_test), 1)

In [11]:
x_sc = StandardScaler()
y_sc = StandardScaler()

In [12]:
xd_train = x_sc.fit_transform(x_train)
yd_train = y_sc.fit_transform(dtrain)

In [13]:
xd_test = x_sc.transform(x_test)
yd_test = y_sc.transform(dtest)

In [14]:
xd_train

array([[ 1.10718145, -0.94501236,  1.90607936, ..., -0.52235889,
         0.01009351,  1.92271655],
       [-0.80982259,  0.58865691, -0.51280532, ...,  0.58053967,
        -0.3897614 , -0.12981832],
       [-0.75173156,  0.89539076, -1.02746164, ...,  0.4507869 ,
        -0.33263927,  0.80315207],
       ...,
       [ 2.90800342, -0.55462382,  1.59728557, ..., -1.62525745,
        -0.96098269, -1.34267983],
       [ 0.35199804, -0.16423528, -0.3069428 , ...,  0.12640497,
         0.98116971,  0.61655799],
       [-1.565006  ,  0.42134753, -1.1303929 , ...,  1.61856184,
        -0.61824991,  2.2959047 ]])

In [15]:
yd_train

array([[ 0.43720767],
       [ 0.43720767],
       [ 0.43720767],
       ...,
       [-0.80267838],
       [ 0.43720767],
       [-0.80267838]])

##Train on different algorithms


In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures

In [17]:
r_reg = RandomForestRegressor()
l_reg = LinearRegression()
d_reg = DecisionTreeRegressor()
s_reg = SVR()
p_reg = LinearRegression()

In [18]:
x_pol = PolynomialFeatures(degree=2)
x_pol = x_pol.fit_transform(xd_train)

In [19]:
temp = PolynomialFeatures(degree=2)
temp = temp.fit_transform(xd_test)

In [20]:
r_reg.fit(xd_train, yd_train)
l_reg.fit(xd_train, yd_train)
d_reg.fit(xd_train, yd_train)
s_reg.fit(xd_train, yd_train)
p_reg.fit(x_pol, yd_train)

  """Entry point for launching an IPython kernel.
  y = column_or_1d(y, warn=True)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
r_pred = r_reg.predict(xd_test)
l_pred = l_reg.predict(xd_test)
d_pred = d_reg.predict(xd_test)
s_pred = s_reg.predict(xd_test)
p_pred = p_reg.predict(temp)

In [22]:
from sklearn.metrics import r2_score

In [23]:
r = r2_score(yd_test, r_pred)
l = r2_score(yd_test, l_pred)
d = r2_score(yd_test, d_pred)
s = r2_score(yd_test, s_pred)
p = r2_score(yd_test, p_pred)

In [24]:
print('Random Forest: ' + str(r) + '\nLinear Regression: ' + str(l) + '\nDecision Tree: ' + str(d) + '\nSVR: ' + str(s)  + '\nPolynomial Features: ' + str(p))

Random Forest: 0.41706210540455824
Linear Regression: 0.34823931841758815
Decision Tree: 0.07819519694323795
SVR: 0.3926658368713908
Polynomial Features: 0.3278968861965941


In [25]:
cond = [(data['quality']>=7), (data['quality']<7)]
gb = ['Good', 'Bad']
data['review'] = np.select(cond, gb)

In [26]:
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,review
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,Bad
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,Bad
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,Bad
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,Bad
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,Bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,Bad
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,Bad
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,Bad
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,Bad
