# Decision Tree Implementation - 5 Nov 2022 
# Random Forest Implementation - 13 Nov 2022

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Red-wine quality CSV hosted on GitHub; the file uses ';' as its field separator.
DATA_URL = "https://raw.githubusercontent.com/shrikant-temburwar/Wine-Quality-Dataset/master/winequality-red.csv"
df = pd.read_csv(DATA_URL, sep=";")

In [3]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [5]:
# Number of distinct values in the `quality` target column.
df['quality'].nunique()

6

In [6]:
# Row count per `quality` value (class distribution of the target).
df['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for each column.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [8]:
# Count the rows flagged as duplicates.
df.duplicated().sum()

240

In [9]:
# Discard the duplicated rows; rebinding the name avoids mutating the frame in place.
df = df.drop_duplicates()

In [10]:
# Missing-value count per column.
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [11]:
# Separate the feature matrix from the `quality` target column.
X = df.drop(columns=['quality'])
y = df['quality']

In [12]:
# Hold out 20% of the rows for testing.
# random_state fixes the shuffle so the split (and every score computed from it)
# is reproducible across kernel restarts; stratify=y keeps the proportions of the
# imbalanced `quality` classes the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Decision Tree

In [13]:
# Baseline: decision tree with default (unconstrained) hyperparameters.
# random_state pins the estimator's internal randomness so reruns build the same tree.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [14]:
# Accuracy of the baseline tree on the held-out test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.46691176470588236

In [15]:
# Accuracy of the baseline tree on the data it was trained on.
train_pred = model.predict(X_train)
accuracy_score(y_train, train_pred)

1.0

In [16]:
# The test score is too low and the train score is too high (the tree is overfitting), so we need to do hyperparameter tuning

In [17]:
# Hyperparameter search space for the decision tree:
# 2 criteria x 30 depths x 9 leaf sizes x 8 split sizes x 2 splitters.
grid_param = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(2, 32),
    min_samples_leaf=range(1, 10),
    min_samples_split=range(2, 10),
    splitter=['best', 'random'],
)

In [18]:
# Exhaustive 5-fold cross-validated search over grid_param.
# n_jobs=-1 spreads the candidate fits across all available CPU cores; the
# selected parameters are the same as a serial run, only the wall-clock time changes.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid_param,
    cv=5,
    n_jobs=-1,
)

In [19]:
# Run the cross-validated search using only the training split.
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Hyperparameter combination selected by the search.
grid_search.best_params_

In [None]:
# Hard-coded hyperparameter values for the tuned tree.
# NOTE(review): presumably copied from an earlier grid_search.best_params_ output — confirm.
best_tree_params = {
    'criterion': 'entropy',
    'max_depth': 7,
    'min_samples_leaf': 5,
    'min_samples_split': 8,
    'splitter': 'random',
}
model_with_best_params = DecisionTreeClassifier(**best_tree_params)

In [None]:
# Train the tuned tree on the training split.
model_with_best_params.fit(X_train,y_train)

In [None]:
# Predict the held-out test split with the tuned tree (rebinds y_pred).
y_pred = model_with_best_params.predict(X_test)

In [None]:
# Test accuracy of the predictions currently held in y_pred.
accuracy_score(y_test, y_pred)

## Random Forest

In [23]:
# Baseline random forest with default hyperparameters.
# random_state pins the forest's internal randomness so the scores below are reproducible.
# (RandomForestClassifier is imported in the notebook's import cell.)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

In [25]:
# Accuracy of the baseline forest on the data it was trained on.
rf_train_pred = rf_model.predict(X_train)
accuracy_score(y_train, rf_train_pred)

1.0

In [24]:
# Accuracy of the baseline forest on the held-out test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6102941176470589

In [27]:
# Hyperparameter search space for the random forest.
# 'auto' was deprecated for max_features in scikit-learn 1.1 and removed in 1.3;
# for classifiers it meant sqrt(n_features), so 'sqrt' is the equivalent value
# that works on both old and current versions.
# NOTE: this grid has 3 * 2 * 18 * 9 * 8 * 2 = 15,552 candidates, so an
# exhaustive 5-fold search over it is expensive.
grid_param = {
    'n_estimators' : [70,120,150],
    'criterion' : ['gini', 'entropy'],
    'max_depth' : range(2,20,1),
    'min_samples_leaf':range(1,10,1),
    'min_samples_split' : range(2,10,1),
    'max_features' : ['sqrt', 'log2']
    }

In [28]:
# Exhaustive 5-fold cross-validated search over the random-forest grid_param.
# n_jobs=-1 runs the candidate fits on all available CPU cores; a serial run of
# a grid this size would take far longer without changing which parameters win.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=grid_param,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated search using only the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Hyperparameter combination selected by the search.
grid_search.best_params_

In [None]:
rf_model_with_best_params = RandomForestClassifier(**grid_search.best_params_)

In [None]:
y_pred = model_with_best_params.predict(X_test)

In [None]:
# Test accuracy of the predictions currently held in y_pred.
accuracy_score(y_test, y_pred)