In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# 1. Load the dataset wineQualityReds (consider the first column as the index)

In [4]:
# The first CSV column is used as the row index rather than as a feature.
dataset = pd.read_csv(
    "wineQualityReds.csv",
    index_col=0,
)

In [5]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# the notebook render it with its rich display instead of plain printed text.
dataset.head()

   fixed.acidity  volatile.acidity  citric.acid  residual.sugar  chlorides  \
1            7.4              0.70         0.00             1.9      0.076   
2            7.8              0.88         0.00             2.6      0.098   
3            7.8              0.76         0.04             2.3      0.092   
4           11.2              0.28         0.56             1.9      0.075   
5            7.4              0.70         0.00             1.9      0.076   

   free.sulfur.dioxide  total.sulfur.dioxide  density    pH  sulphates  \
1                 11.0                  34.0   0.9978  3.51       0.56   
2                 25.0                  67.0   0.9968  3.20       0.68   
3                 15.0                  54.0   0.9970  3.26       0.65   
4                 17.0                  60.0   0.9980  3.16       0.58   
5                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
1      9.4        5  
2      9.8        5  
3      9.8        5 

# 2. Split the dataset into train and test sets, using the 'quality' column as the target

In [6]:
# Select the target by its column name instead of by position, so the split
# stays correct even if the CSV's columns are reordered or new ones are added.
# Every column other than 'quality' is treated as a feature.
X = dataset.drop(columns="quality").values
y = dataset["quality"].values

In [7]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.25,
)

# 3. Normalize train and test data

In [8]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted transformation to both the training and the test features.
feature_scaler = StandardScaler().fit(X_train)
X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)

# 4. Create a RandomForest classifier with 300 estimators

In [9]:
# Random forest with 300 trees; the fixed seed makes repeated runs comparable.
classifier = RandomForestClassifier(
    random_state=0,
    n_estimators=300,
)

# 5. Get the accuracy of the model using cross-validation (with K=5)

In [10]:

# One score per fold: the classifier is evaluated on 5 splits of the training data.
all_accuracies = cross_val_score(classifier, X_train, y_train, cv=5)

In [11]:
# Per-fold scores returned by the 5-fold cross-validation above.
print(all_accuracies)

[0.6375     0.64166667 0.6875     0.6875     0.69037657]


In [12]:
# Average of the per-fold scores.
print(np.mean(all_accuracies))

0.6689086471408647


In [13]:
# Spread of the per-fold scores (standard deviation across folds).
print(np.std(all_accuracies))

0.0240032376491525


# 6. Run a cross-validated GridSearch over the following parameter values:
- 'n_estimators': [100, 300, 500, 800, 1000],
- 'criterion': ['gini', 'entropy'],
- 'bootstrap': [True, False]

In [14]:
# Candidate hyper-parameter values for the grid search; every combination
# of the three lists below is one configuration to evaluate.
grid_param = dict(
    n_estimators=[100, 300, 500, 800, 1000],
    criterion=['gini', 'entropy'],
    bootstrap=[True, False],
)

In [15]:

# Exhaustive search over grid_param, ranking configurations by 5-fold accuracy.
# n_jobs=-1 asks for all available CPU cores to run the fits in parallel.
gd_sr = GridSearchCV(
    classifier,
    grid_param,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [16]:
# Run the search on the training split: 5 * 2 * 2 = 20 parameter combinations,
# each scored with 5-fold cross-validation, so this cell can take a while.
gd_sr.fit(X=X_train, y=y_train)

# 7. Indicate the configuration that generates the best accuracy

In [17]:
# Parameter combination that achieved the highest mean cross-validated accuracy.
best_parameters = gd_sr.best_params_
print(best_parameters)

{'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 1000}


In [18]:
# Mean cross-validated accuracy obtained by the best parameter combination.
best_result = gd_sr.best_score_
print(best_result)

0.6739016736401673
