Hyperparameters to look at for Random Forest: 
min samples leaf, followed by max features per split and number of estimators

For XGBoost
For learning rate, higher values of 0.2 and above tended to perform better.
For subsample, the density was increased above 0.8.
For min child weight, there is increased density at lower values. A setting of 0 means no minimum weight is required on the sum of the instance weight in a child, therefore no regularization. This means the algorithm can be much slower, and thus setting this parameter to 1 (which is the default value) represents a good trade off between performance and training time.

  - n_estimators: represents the number of decision trees in the "forest". A higher number will usually give a better response, but will require a lot more processing power. 
  - min sample leaf: The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regressi
  - max_features:  the maximum number of features to be considered for a split in a tree. Usually the ideal number lies somewhere around the square root of the number of features present, so this function will accept either a number, or a string value that defines the calculation that should be made for a number.


In [16]:
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from scipy.stats import shapiro 
from scipy.stats import lognorm
from scipy.stats import kstest
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.metrics import cohen_kappa_score
import imblearn
from sklearn.metrics import balanced_accuracy_score
from imblearn.pipeline import make_pipeline
from statistics import mean, stdev
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn import linear_model
from sklearn import datasets
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import pandas as pd

In [17]:
red_wines = pd.read_csv("winequality-red.csv", sep = ";")
white_wines = pd.read_csv("winequality-white.csv", sep =";")

# formatting
red_wines.columns= red_wines.columns.str.replace(' ','_')
white_wines.columns = white_wines.columns.str.replace(' ','_')

#Quality categories
red_wines ['quality_label'] = red_wines['quality'].apply(lambda value: 'low' if value <= 5 
                                                        else 'medium' if value <= 7 
                                                        else 'high')

red_wines['quality_label'] = pd.Categorical(red_wines['quality_label'],
categories=['low', 'medium', 'high'])

white_wines ['quality_label'] = white_wines['quality'].apply(lambda value: 'low' if value <= 5 
                                                        else 'medium' if value <= 7 
                                                        else 'high')

white_wines["quality_label"] = pd.Categorical(white_wines["quality_label"], categories = ["low","medium","high"])

# Type of wine categories
red_wines ["type"] = 'Red Wine'
red_wines['type'] = pd.Categorical(red_wines['type'],
categories=["Red Wine","White Wine"])

white_wines ["type"] = 'White Wine'
white_wines['type'] = pd.Categorical(white_wines['type'],
categories=["Red Wine","White Wine"])


In [18]:
# filtered outliers
white_wines_clean= white_wines.drop("quality", axis=1)
numeric_columns_white = white_wines_clean.select_dtypes(include=['float64', 'int64'])

white_wine_filtered= white_wines.copy()
for column in numeric_columns_white.columns:
        q1 = np.quantile(numeric_columns_white[column], 0.25)
        q3 = np.quantile(numeric_columns_white[column], 0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        # Filter rows based on the column's outlier range
        white_wine_filtered = white_wine_filtered[(white_wine_filtered[column] >= lower) & (white_wine_filtered[column] <= upper)]
print("white wine shape; ", white_wines.shape, "\nfiltered white wine shape: ", white_wine_filtered.shape)

red_wines_clean= red_wines.drop("quality", axis=1)
numeric_columns_red = red_wines_clean.select_dtypes(include=['float64', 'int64'])

red_wine_filtered= red_wines.copy()
for column in numeric_columns_red.columns: 
        q1 = np.quantile(numeric_columns_red[column], 0.25)
        q3 = np.quantile(numeric_columns_red[column], 0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        # Filter rows based on the column's outlier range
        red_wine_filtered = red_wine_filtered[(red_wine_filtered[column] >= lower) & (red_wine_filtered[column] <= upper)]
print("red wine shape; ", red_wines.shape, "\nfiltered red wine shape: ", red_wine_filtered.shape)

white wine shape;  (4898, 14) 
filtered white wine shape:  (4015, 14)
red wine shape;  (1599, 14) 
filtered red wine shape:  (1194, 14)


In [None]:
# merge datasets
white_and_red_filtered = pd.merge(red_wine_filtered, white_wine_filtered, how = "outer")

# Encoding
enc = OrdinalEncoder(categories=[['low', 'medium', 'high']])
white_and_red_filtered['quality_label_encoded'] = enc.fit_transform(white_and_red_filtered[['quality_label']])

#Feauture / Target split
X=white_and_red_filtered.drop(["type","quality_label","quality","quality_label_encoded"], axis=1)
y= white_and_red_filtered["quality_label_encoded"]

#Smote
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-test split
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)
## standarization
norm = MinMaxScaler().fit(X_train_smote)

# transform training data
X_train_smote_norm = norm.transform(X_train_smote)

# transform testing data
X_test_smote_norm = norm.transform(X_test_smote)

# define model
RF_clf = RandomForestClassifier(random_state=42)
# fit model

RF_clf.fit(X_train_smote, y_train_smote)

RF_preds = RF_clf.predict(X_test_smote)

RF_acc = accuracy_score(y_test_smote, RF_preds)


In [20]:
n_estimators = np.arange(100,210,10)
min_samples_leaf = np.arange(2, 12, 2)
max_features = ["sqrt", "log2", None]

param_grid = {
  'n_estimators': n_estimators,
  'min_samples_leaf': min_samples_leaf,
  'max_features': max_features,
}

param_grid

{'n_estimators': array([100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200]),
 'min_samples_leaf': array([ 2,  4,  6,  8, 10]),
 'max_features': ['sqrt', 'log2', None]}

# Randomized Search CV

In [None]:
RF_clf = RandomForestClassifier(random_state=42)
RS_grid = RandomizedSearchCV(estimator=RF_clf, param_distributions=param_grid, n_iter=10)
RS_grid

In [29]:
RS_grid.fit(X_train_smote_norm, y_train_smote)


In [27]:
print(
  'best score: ', RS_grid.best_score_,
  '\nparams: ', RS_grid.best_params_
)

best score:  0.8710493046776232 
params:  {'n_estimators': np.int64(110), 'min_samples_leaf': np.int64(2), 'max_features': 'sqrt'}
