## Importing Packages

In [83]:
import pandas as pd
from IPython.display import display
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.neural_network import MLPClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn import tree

from xgboost import XGBClassifier, plot_importance

# To ignore warnings in the notebook
import warnings
warnings.filterwarnings("ignore")


## Importing Dataset

In [3]:
df = pd.read_csv('XWines_Full_100K_wines.csv')

## Data Cleaning

### Grapes

In [4]:
# creating a function to remove square brackets and quotation marks
# to be used on Grapes, Harmonize and Vintages
# inputs: data is the dataframe, column_names is the name of the column (string) or list
def clean_column(data, column_names):
    
    for column_name in column_names:

        if column_name in ['Grapes', 'Harmonize']:
            # extracting all words inside 
            data[column_name] = data[column_name].apply(lambda x: re.findall(r"'(.*?)'", x))

            # convert the list of words back to a string
            data[column_name] = data[column_name].apply(lambda x: ', '.join(x))
        
        else: 
            # removing the square brackets
            data[column_name] = data[column_name].apply(lambda x: str(x).strip('[]'))

    return data


df = clean_column(df, ['Grapes', 'Harmonize', 'Vintages'])

In [5]:
# creating function to get the counts
# inputs: data is the dataframe, columns_name are the list of columns to get the counts

def get_counts(data, column_names):
    for column_name in column_names:
        data[column_name] = data[column_name].apply(lambda x: len(x.split(', ')))

    return data

df = get_counts(df, ['Grapes'])

### Harmonize

In [6]:
unique_foods = df['Harmonize'].str.split(', ', expand=True).stack().unique()

unique_foods

array(['Pork', 'Rich Fish', 'Shellfish', 'Beef', 'Barbecue', 'Codfish',
       'Pasta', 'Pizza', 'Cheese', 'Lamb', 'Poultry', 'Sweet Dessert',
       'Game Meat', 'Veal', 'Spicy Food', 'Maturated Cheese',
       'Hard Cheese', 'Vegetarian', 'Appetizer', 'Snack', 'Fish',
       'Citric Dessert', 'Mushrooms', 'Soft Cheese', 'Grilled', 'Soufflé',
       'Cured Meat', 'Seafood', 'Risotto', 'Medium-cured Cheese', 'Salad',
       'Lean Fish', 'Fruit Dessert', 'Dessert', 'Meat', 'Fruit',
       'Blue Cheese', 'Chicken', 'Mild Cheese', 'Chocolate', 'Ham',
       'Cake', 'Yakissoba', 'Sashimi', 'Curry Chicken', 'Sushi',
       'Baked Potato', 'Goat Cheese', 'Beans', 'Lasagna',
       'Eggplant Parmigiana', 'Aperitif', 'Chestnut', 'French Fries',
       'Tomato Dishes', 'Dried Fruits', 'Spiced Fruit Cake',
       'Light Stews', 'Duck', 'Asian Food', 'Cream', 'Cookies',
       'Cold Cuts', 'Roast', 'Tagliatelle', 'Paella'], dtype=object)

In [7]:
# re-classifying similar types of foods with the similar names into same categories
red_meat = ['Beef', 'Pork', 'Lamb', 'Veal', 'Meat', 'Ham', 'Red Meat']
white_meat = ['Chicken', 'Poultry', 'Duck', 'Cold Cuts']
cheese = ['Mild Cheese', 'Medium-cured Cheese', 'Cheese', 'Soft Cheese', 'Maturated Cheese', 'Hard Cheese', 'Goat Cheese', 'Blue Cheese']
seafood = ['Shellfish', 'Rich Fish', 'Lean Fish', 'Fish', 'Codfish', 'Seafood']
italian = ['Pasta', 'Risotto', 'Tagliatelle', 'Lasagna', 'Eggplant Parmigiana', 'Pizza']
dessert = ['Sweet Dessert', 'Fruit Dessert', 'Dessert', 'Citric Dessert', 'Cake', 'Soufflé', 'Chocolate', 'Spiced Fruit Cake']
vegetarian = ['Vegetarian', 'Mushrooms', 'Salad', 'Beans', 'Baked Potato', 'Chestnut']
snacks = ['Snack', 'French Fries', 'Fruit', 'Cookies']
others = ['Sushi', 'Sashimi', 'Yakissoba', 'Asian Food', 'Roast', 'Tomato Dishes', 'Cream', 'Curry Chicken', 'Barbeque', 'Light Stews', 'Paella', 'Grilled', 'Dried Fruits']
appetizer = ['Appetizer', 'Aperitif']

# checking if all other categories except Game Meat, Cured Meat and Spicy Food are classified
# total 67 (64 + 3)
# len(red_meat) + len(white_meat) + len(cheese) + len(seafood) + len(italian) + len(dessert) + len(vegetarian) + len(snacks) + len(others) + len(appetizer)

In [8]:
list_of_lists = [red_meat, white_meat, cheese, seafood, italian, dessert, vegetarian, snacks, others, appetizer]
names = ['Red Meat', 'White Meat', 'Cheese', 'Seafood', 'Italian', 'Dessert', 'Vegetarian', 'Snacks', 'Appetiser']

# define a function to re-assign the categories for each row
def reassign_categories(row):
    # splitting the food in the string and making it a list
    food_list = row.split(', ')

    # iterate through the list and re-assign the categories
    for i in range(len(food_list)):
        for lst, name in zip(list_of_lists, names):
            if food_list[i] in lst:
                food_list[i] = name

    # remove repeated food categories for each row
    new_row = list(set(food_list))

    # joining the list back into a string
    new_row = ', '.join(new_row)

    return new_row

# apply the function to each row of the DataFrame
df['Harmonize'] = df['Harmonize'].apply(reassign_categories)

### Body

In [9]:
# removing '-bodied' from body column
df['Body'] = df['Body'].str.replace('-bodied', '')

In [10]:
# saving the cleaned data
df.to_csv('wines_cleaned.csv')

### Type

In [11]:
# replace dessert/port to just dessert wine
df['Type'] = df['Type'].str.replace('Dessert/Port', 'Dessert')

In [12]:
# splitting data into catalogue and training data
df = df.sample(frac = 1, random_state = 100)
catalogue = df[:80000]
df = df[80000:]

## Model Training

In [13]:
df = df[['Type', 'Grapes', 'Harmonize', 'ABV', 'Body', 'Acidity', 'Country']]

In [14]:
# one-hot encoding the harmonise column
# Step 1: Create a list of all unique food types
food_types = df['Harmonize'].str.split(', ').explode().unique()
food_types = ['Harmonize_' + food_type for food_type in food_types]

# Step 2: Create a new DataFrame with a column for each unique food type
dummies = df['Harmonize'].str.get_dummies(', ').reindex(columns=food_types, fill_value=0)

# Step 3: Concatenate the original DataFrame with the new DataFrame
df = pd.concat([df, dummies], axis=1)
df = df.drop(columns = ['Harmonize'], axis = 1)

In [15]:
# splitting the data into training and test
X = df.drop(columns = ['Type'])
y = df['Type']

# train_test_split on dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

In [16]:
# checking if there is imbalanced data
# imbalanced data so we should use ensemble techniques
y.value_counts(normalize=True)

Type
Red          0.559237
White        0.289305
Sparkling    0.075705
Rosé         0.041606
Dessert      0.034147
Name: proportion, dtype: float64

In [17]:
# function to get all categorical variables

def getCategorical(X_train, data):
    categorical_variables = []
    
    for column in X_train.columns:
        if data[column].dtype == "object":
            categorical_variables.append(column)

    return categorical_variables

In [18]:
# function to create a transformer to encode categorical variables

def transformer(categorical_variables):
    # One-hot encoding
    enc_rf = OneHotEncoder(sparse_output = False, handle_unknown = "ignore")

    transformer_rf = ColumnTransformer([
        ("categorical", enc_rf, categorical_variables)
    ], remainder="passthrough")

    return transformer_rf

In [19]:
# function to transform data

def transformData(X_train, X_test, transformer_rf):
    # when making predictions, there is no X_train and X_test
    # this condition is to handle that case
    if X_train is X_test:
        X_encoded = pd.DataFrame(transformer_rf.fit_transform(X_train), columns = transformer_rf.get_feature_names_out())
        
        return X_encoded

    else:  
        X_train_encoded_rf = pd.DataFrame(transformer_rf.fit_transform(X_train), columns = transformer_rf.get_feature_names_out())
        X_test_encoded_rf = pd.DataFrame(transformer_rf.fit_transform(X_test), columns = transformer_rf.get_feature_names_out())
        
        return [X_train_encoded_rf, X_test_encoded_rf]

In [20]:
# function to rename the column to increase readability

def renameCol(categorical_variables, X_train_encoded_rf, X_test_encoded_rf):
    
    X_train_encoded_rf.columns = X_train_encoded_rf.columns.str.replace(re.compile(r'categorical__|remainder__'), '', regex = True)
    X_test_encoded_rf.columns = X_test_encoded_rf.columns.str.replace(re.compile(r'categorical__|remainder__'), '', regex = True)

    # used when making predictions
    if X_train_encoded_rf is X_test_encoded_rf:
        X_encoded = X_train_encoded_rf
        
        return X_encoded
    
    else:   
            return [X_train_encoded_rf, X_test_encoded_rf]


In [21]:
# function that combines all the above functions into a function called preprocess
def preprocess(X_train, X_test, data):
    
    # use the getCategorical function to get categorical variables in the dataset
    categorical_variables = getCategorical(X_train, data)
    
    # use tranformer function to create the transformer
    transformer_rf = transformer(categorical_variables)
    
    # use transformData function
    X_train_encoded_rf, X_test_encoded_rf = transformData(X_train, X_test, transformer_rf)

    # renaming the columns for readability
    X_train_encoded_rf, X_test_encoded_rf = renameCol(categorical_variables, X_train_encoded_rf, X_test_encoded_rf)

    return [X_train_encoded_rf, X_test_encoded_rf, transformer_rf]

### Random Forest Classifier

In [22]:
# unpacking values
X_train_encoded_rf, X_test_encoded_rf, transformer_rf = preprocess(X_train, X_test, df)

In [23]:
# class_weight = 'balanced' is used to give more weight to minority class
# the classes will be weighted inversely proportional to how frequently they appear in the data

rf = RandomForestClassifier(criterion = 'entropy', 
                            max_depth = 5, 
                            min_samples_leaf = 8, 
                            min_samples_split = 5, 
                            n_estimators = 100,
                            class_weight = 'balanced', 
                            random_state = 100)

rf.fit(X_train_encoded_rf, y_train)

In [24]:
# creating a pipeline
pipeline_rf = Pipeline([("transformer", transformer_rf), ("random_forest", rf)])

In [65]:
# Helper function for cross validating
def show_cv_results(pipeline):
  if pipeline is pipeline_xgb:
    scores = cross_validate(pipeline, X_train, y_train_encoded, cv = 10, return_train_score = True)
  
  else:
    scores = cross_validate(pipeline, X_train, y_train, cv = 10, return_train_score = True)
    
  print("Mean test score:", scores["test_score"].mean())
  display(pd.DataFrame(scores))

show_cv_results(pipeline_rf)

Mean test score: 0.5781672784708951


Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.259641,0.009371,0.558111,0.579723
1,0.240437,0.009057,0.59322,0.591564
2,0.244899,0.0095,0.566586,0.581876
3,0.249879,0.009861,0.57385,0.583288
4,0.247261,0.008932,0.575061,0.578377
5,0.252553,0.009663,0.591404,0.588469
6,0.244693,0.009902,0.568141,0.570938
7,0.241054,0.012579,0.59782,0.584057
8,0.247083,0.009623,0.574803,0.578944
9,0.252852,0.009874,0.582677,0.5889


In [26]:
# hyperparameters tuning
grid_rf = {
    'random_forest__n_estimators': [80, 100, 200, 300, 500, 700],
    'random_forest__max_depth': [5, 10, 15, 20],
    'random_forest__min_samples_split': [3, 5, 8, 10],
    'random_forest__min_samples_leaf': [2, 3, 5, 8]
}

rf_gs = GridSearchCV(estimator = pipeline_rf,
                      param_grid = grid_rf,
                      cv = 10,
                      n_jobs = -1,
                      return_train_score = True,
                      scoring = 'roc_auc_ovr')

rf_gs.fit(X_train, y_train)

In [27]:
# helper function to view grid search results

def report_GridSearchCV_results(gs):
    print("Best combination of hyperparams:\n", gs.best_params_, "\n")
    print("Best mean_test_score score:\n", gs.best_score_, "\n")
    
    scores = []
    for i in range(gs.n_splits_):
        scores.append(gs.cv_results_['split{}_test_score'.format(i)][gs.best_index_])
    print("Score by fold for best estimator:\n", scores, "\n")
    
    # View top 5 hyperparams combinations by mean_test_score (mean on "validation" set)
    print("Top 5 hyperparams combinations by mean_test_score:")
    display(pd.DataFrame(gs.cv_results_)[["rank_test_score", "mean_test_score"] 
                                            + ["param_" + param for param in gs.param_grid]]\
              .sort_values(by = "mean_test_score", ascending = False)\
              .set_index("rank_test_score").head(5))


report_GridSearchCV_results(rf_gs)

Best combination of hyperparams:
 {'random_forest__max_depth': 20, 'random_forest__min_samples_leaf': 2, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 300} 

Best mean_test_score score:
 0.8963809867617643 

Score by fold for best estimator:
 [0.8830147144408297, 0.892507810260652, 0.8974718157909802, 0.9149422523074817, 0.9047900177369037, 0.9004362922971509, 0.9082205972270743, 0.884677829761016, 0.885854627223847, 0.8918939105717083] 

Top 5 hyperparams combinations by mean_test_score:


Unnamed: 0_level_0,mean_test_score,param_random_forest__n_estimators,param_random_forest__max_depth,param_random_forest__min_samples_split,param_random_forest__min_samples_leaf
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.896381,300,20,10,2
2,0.896202,200,20,10,2
3,0.896187,300,20,8,2
4,0.896047,200,20,8,2
5,0.895759,300,15,10,2


#### Fitting the best model

In [28]:
best_params = rf_gs.best_params_

clf = RandomForestClassifier(n_estimators = best_params['random_forest__n_estimators'],  
                             max_depth = best_params['random_forest__max_depth'],
                             min_samples_split = best_params['random_forest__min_samples_split'], 
                             min_samples_leaf = best_params['random_forest__min_samples_leaf'],
                             class_weight='balanced',
                             random_state = 100)

clf.fit(X_train_encoded_rf, y_train)

### MLPClassifier

In [29]:
# unpacking values
X_train_encoded_nn, X_test_encoded_nn, transformer_nn = preprocess(X_train, X_test, df)

In [30]:
# Using MLPClassifier
nn = MLPClassifier(hidden_layer_sizes = (10,10,10), max_iter = 25000, random_state = 100)

nn.fit(X_train_encoded_nn, y_train)

In [31]:
# creating pipeline
pipeline_nn = Pipeline([("transformer", transformer_nn), ("neural_network", nn)])

In [32]:
# showing CV results
show_cv_results(pipeline_nn)

Mean test score: 0.7689515709167385


Unnamed: 0,fit_time,score_time,test_score,train_score
0,5.361612,0.007419,0.774213,0.781082
1,3.692469,0.004472,0.762712,0.772874
2,4.643907,0.012786,0.770581,0.78088
3,4.676416,0.003204,0.76937,0.777718
4,4.756309,0.00391,0.771792,0.777449
5,3.828138,0.002878,0.760896,0.7757
6,3.979933,0.002915,0.784373,0.780357
7,5.052426,0.003186,0.75954,0.778607
8,5.256257,0.003016,0.767414,0.773764
9,5.10031,0.002815,0.768625,0.7778


In [33]:
# hyperparameter tuning
grid_nn = {
    'neural_network__hidden_layer_sizes': [(6,6,6), (8,8,8), (10,10,10)],
    'neural_network__activation': ['relu', 'tanh', 'logistic'],
    'neural_network__alpha': [0.0001, 0.01, 0.05, 0.1],
    'neural_network__learning_rate': ['constant', 'adaptive']
}

nn_gs = GridSearchCV(estimator = pipeline_nn, 
                     param_grid = grid_nn,  
                     cv = 10, 
                     n_jobs = -1,
                     return_train_score = True,
                     scoring = 'roc_auc_ovr')

nn_gs.fit(X_train, y_train)

In [34]:
# getting hyperparameter tuning results
report_GridSearchCV_results(nn_gs)

Best combination of hyperparams:
 {'neural_network__activation': 'tanh', 'neural_network__alpha': 0.0001, 'neural_network__hidden_layer_sizes': (10, 10, 10), 'neural_network__learning_rate': 'constant'} 

Best mean_test_score score:
 0.9016295404075132 

Score by fold for best estimator:
 [0.9014290530530282, 0.8941003652225931, 0.9070235041284791, 0.9053846160109741, 0.9085684443139822, 0.9050446987453068, 0.9138986735599163, 0.8933505830178842, 0.8930228899831517, 0.8944725760398151] 

Top 5 hyperparams combinations by mean_test_score:


Unnamed: 0_level_0,mean_test_score,param_neural_network__hidden_layer_sizes,param_neural_network__activation,param_neural_network__alpha,param_neural_network__learning_rate
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.90163,"(10, 10, 10)",tanh,0.0001,constant
1,0.90163,"(10, 10, 10)",tanh,0.0001,adaptive
3,0.901336,"(10, 10, 10)",tanh,0.01,adaptive
3,0.901336,"(10, 10, 10)",tanh,0.01,constant
5,0.901246,"(10, 10, 10)",tanh,0.1,constant


#### Fitting the best model

In [35]:
nn_best_params = nn_gs.best_params_

clf1 = MLPClassifier(hidden_layer_sizes = nn_best_params['neural_network__hidden_layer_sizes'], 
                    activation = nn_best_params['neural_network__activation'], 
                    alpha = nn_best_params['neural_network__alpha'], 
                    learning_rate = nn_best_params['neural_network__learning_rate'],
                    random_state = 100)

clf1.fit(X_train_encoded_nn, y_train)

### XGBClassifier

In [114]:
# unpacking values
X_train_encoded_xgb, X_test_encoded_xgb, transformer_xgb = preprocess(X_train, X_test, df)

In [115]:
# label encoding y_train for xgb
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

In [116]:
xgb = XGBClassifier(learning_rate = 0.1,
                    max_depth = 5,
                    n_estimators = 100,
                    min_child_weight = 5,
                    gamma = 0.5,
                    random_state = 100)

xgb.fit(X_train_encoded_xgb, y_train_encoded)

In [117]:
# creating a pipeline
pipeline_xgb = Pipeline([("transformer", transformer_xgb), ("xgb", xgb)])

In [118]:
# showing CV results
show_cv_results(pipeline_xgb)

Mean test score: 0.7697987352298042


Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.495482,0.005117,0.763923,0.778458
1,0.48516,0.005923,0.77724,0.779602
2,0.487801,0.004952,0.765738,0.779804
3,0.518397,0.004825,0.780872,0.779871
4,0.556657,0.00454,0.763317,0.773547
5,0.481453,0.004479,0.771792,0.7831
6,0.503966,0.004808,0.776499,0.780558
7,0.519586,0.006786,0.754694,0.780155
8,0.45605,0.00447,0.777105,0.781366
9,0.469833,0.004414,0.766808,0.77854


In [119]:
# hyperparameter tuning
grid_xgb = {
    'xgb__n_estimators': [80, 100, 200, 300, 500, 700],
    'xgb__max_depth': [3, 5, 8, 10],
    'xgb__learning_rate': [0.01, 0.1, 1],
    'xgb__min_child_weight': [1, 5, 8, 10],
    'xgb__gamma': [0, 0.1, 0.5, 1]
}

xgb_gs = GridSearchCV(estimator = pipeline_xgb,
                      param_grid = grid_xgb,
                      cv = 10,
                      n_jobs = -1,
                      return_train_score = True,
                      scoring = 'roc_auc_ovr')

xgb_gs.fit(X_train, y_train_encoded)

In [None]:
# getting hyperparameter tuning results
report_GridSearchCV_results(xgb_gs)

#### Fitting the best model

In [None]:
xgb_best_params = xgb_gs.best_params_

clf2 = XGBClassifier(learning_rate = xgb_best_params['xgb__learning_rate'], 
                    max_depth = xgb_best_params['xgb__max_depth'], 
                    n_estimators = xgb_best_params['xgb__n_estimators'], 
                    min_child_weight = xgb_best_params['xgb__min_child_weight'], 
                    gamma = xgb_best_params['xgb__gamma'],
                    random_state = 100)

clf2.fit(X_train_encoded_xgb, y_train)

## Model Evaluation

In [None]:
# Helper function to score model
def score_model(model, x, y):
    pred = model.predict(x)

    if model is clf2:
        pred = le.inverse_transform(pred)
        
    print(classification_report(y, pred))

    cm = confusion_matrix(y, pred)
    columns = np.unique(y)
    df_cm = pd.DataFrame(cm, index=columns, columns=columns)
    ax = sns.heatmap(df_cm, cmap='Oranges', annot=True, fmt='g')
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

In [None]:
def get_AUC (model, X, y):
    
    ###########
    # Calculate the AUC score.
    # Input: reg_model is the classifier, X is the X_test, Y is Y_test
    # Output: The AUC value.
    ###########

    # Make predictions on the test set
    y_pred = model.predict_proba(X)

    # Calculate the AUC score
    auc_score = roc_auc_score(y, y_pred, multi_class='ovr')
   
    return auc_score

In [None]:
# function to calculate the AUC of base and best model, as well as the percentage improvment
def calculate_AUC_scores(base, best, X_test, y_test):
    base_model = round(get_AUC(base, X_test, y_test), 2)
    best_model = round(get_AUC(best, X_test, y_test), 2)

    # increase in peformance
    improvement = round((best_model - base_model)/base_model * 100, 2)

    print(f'The AUC of the base model is: {base_model}')
    print(f'The AUC of the best model is: {best_model}')
    print(f'The improvement in performance is: {improvement}%')

### Random Forest

#### Performance on test set

In [None]:
# Performance on the test set
score_model(clf, X_test_encoded_rf, y_test)

In [None]:
# Model improvement and AUC scores
calculate_AUC_scores(rf, clf, X_test_encoded_rf, y_test)

### MLPClassifier

In [None]:
# Performance on the test set
score_model(clf1, X_test_encoded_nn, y_test)

In [None]:
# Model improvement and AUC scores
calculate_AUC_scores(nn, clf1, X_test_encoded_nn, y_test)

### XGBClassifier

In [None]:
# Performance on test set
score_model(clf2, X_test_encoded_xgb, y_test)

In [None]:
# Model improvement and AUC scores
calculate_AUC_scores(xgb, clf2, X_test_encoded_xgb, y_test)

## Model Interpretation

### Random Forest

In [None]:
# helper function to get feature importance    
def plot_importance(model, X):

    importances = pd.Series(data=model.feature_importances_,
                            index= list(X.columns))
    
    importances_sorted = importances.sort_values()
    
    importances_sorted = importances_sorted[-10:]

    plt.figure(figsize = (12, 5))
    importances_sorted.plot(kind='barh', color='blue')
    plt.title('Top 10 Feature Importance')
    plt.xlabel("Importance", fontweight = 'bold')
    plt.ylabel("Features", fontweight = 'bold')
    plt.show()

plot_importance(rf_gs.best_estimator_.named_steps["random_forest"], X_train_encoded_rf)

### MLPClassifier

In [None]:
# plot feature importance
result = permutation_importance(clf, X_test_encoded_nn, y_test, n_repeats = 10, random_state = 100)

# Plot the feature importances
fig, ax = plt.subplots(figsize=(10, 6))
sorted_idx = result.importances_mean.argsort()[-10:]
ax.barh(X_test_encoded_nn.columns[sorted_idx], result.importances_mean[sorted_idx], color='blue')
ax.set_title("Top 10 Permutation Importances")
ax.set_facecolor('lightgrey')
ax.set_xlabel('Importance', fontweight = 'bold')
ax.set_ylabel('Feature', fontweight = 'bold')
ax.set_yticklabels(X_test_encoded_nn.columns[sorted_idx], fontsize=10)
fig.tight_layout()
plt.show()

### XGBClassifier

In [None]:
# fig, ax = plt.subplots(figsize=(10, 6))
# plot_importance(clf2, max_num_features=10, ax=ax)
# plt.show()

In [None]:
plot_importance(xgb_gs.best_estimator_.named_steps['xgb'], X_train_encoded_xgb)