# Red Wine Quality Prediction

## Importing the libraries

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys
np.set_printoptions(threshold=sys.maxsize)
pd.options.display.max_columns = None

## Importing the Dataset

In [0]:
dataset = pd.read_csv('RedWineQuality_data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [3]:
dataset.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [4]:
dataset.iloc[:, :-1].head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [5]:
# checking for missing data
dataset.isnull().values.any()

False

## Splitting data into training and test set

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Feature Scaling

In [0]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Getting the max accuracy

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
high = 0
for i in range(100, 300):
    classifier = RandomForestClassifier(n_estimators=i, random_state=0)
    classifier.fit(X_train, y_train)
    num = accuracy_score(y_test, y_pred = classifier.predict(X_test))
    if high<num:
        high = num
        index = i
high

0.7375

In [10]:
index

251

## Training Random Forest Model

In [11]:
classifier = RandomForestClassifier(n_estimators=index, random_state=0)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=251,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

## Predicting Results of the test set

In [0]:
y_pred = classifier.predict(X_test)

In [13]:
print(y_pred.shape)
print(y_pred[:10])

(320,)
[6 5 7 5 5 6 5 6 5 5]


In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
accuracy_score(y_test, y_pred)

[[  0   0   0   2   0   0]
 [  0   0   7   4   0   0]
 [  0   0 113  20   2   0]
 [  0   0  26 107   9   0]
 [  0   0   1   8  16   2]
 [  0   0   0   1   2   0]]


0.7375