## In this part, we use TF-IDF features as input to build a GICS sector classification model and try to improve the classification results

In [11]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter(action='ignore', category=Warning)

### Read in files and preprocess them

In [12]:
# Ticker list and GICS table are read with pandas' default integer index.
companies_df = pd.read_csv('tickers.csv')
GICS_df = pd.read_csv('gics.csv')
# TF-IDF matrix: the first CSV column is used as the row index.
tfidf_df = pd.read_csv('TF-IDF.csv', index_col=0)

In [13]:
# Use the same key column name ('ticker') in the GICS table as in the TF-IDF
# frame so the two can be merged on it. Renaming is a no-op if already done.
GICS_df = GICS_df.rename(columns={'Symbol': 'ticker'})

# Replace missing TF-IDF weights with 0.
# fillna() is used instead of a np.isnan() boolean mask: np.isnan raises
# TypeError on a frame that contains the string-valued 'ticker' column added
# below, which made this cell fail whenever it was run a second time.
tfidf_df = tfidf_df.fillna(0)

# Attach the tickers. pandas aligns companies_df['ticker'] to tfidf_df by
# index label, so both frames are expected to share the same row index.
tfidf_df['ticker'] = companies_df['ticker']

In [14]:
# Sanity check: display the ticker column that was just attached.
tfidf_df.loc[:, 'ticker']

0         A
1      AVGO
2      AMAT
3       ADI
4       BDX
       ... 
491     CEG
492     OGN
493    GEHC
494       C
495      MA
Name: ticker, Length: 496, dtype: object

In [15]:
# Left join: keep every TF-IDF row and attach the GICS columns of the matching ticker.
merged_df = tfidf_df.merge(GICS_df, how='left', on='ticker')

In [16]:
# Preview the first five merged rows (before dropping incomplete rows).
merged_df.head(n=5)

Unnamed: 0,aa,aaa,aaac,aaadm,aaahc,aaalac,aaam,aaammf,aabp,aac,...,zynq,zynteglo,zynyz,zypar,zyprexa,zyrtec,zytel,zytiga,zz,GICS Sector
0,0.003247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Health Care
1,0.001217,0.003171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Information Technology
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Information Technology
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Information Technology
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Health Care


In [17]:
# Discard rows with any missing value (e.g. tickers without a GICS match),
# then report the resulting (rows, columns).
merged_df = merged_df.dropna()
merged_df.shape

(496, 41094)

In [18]:
# Preview the first five rows after dropping incomplete rows.
merged_df.head(n=5)

Unnamed: 0,aa,aaa,aaac,aaadm,aaahc,aaalac,aaam,aaammf,aabp,aac,...,zynq,zynteglo,zynyz,zypar,zyprexa,zyrtec,zytel,zytiga,zz,GICS Sector
0,0.003247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Health Care
1,0.001217,0.003171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Information Technology
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Information Technology
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Information Technology
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Health Care


## We explore different ML models for classification

### Split the data into training data and testing data

In [19]:
# Target is the GICS sector; everything else is kept in X. 'ticker' stays in X
# through the split so each row's identifier ends up in the same partition.
y = merged_df['GICS Sector']
X = merged_df.drop(columns=['GICS Sector'])

# Hold out 25% of the rows for testing; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=40, test_size=0.25
)

# Separate the ticker identifiers from the model inputs in each partition.
X_train_label, X_test_label = X_train['ticker'], X_test['ticker']
X_train_input = X_train.drop(columns=['ticker'])
X_test_input = X_test.drop(columns=['ticker'])

## Logistic Regression

In [20]:
# Baseline: multinomial logistic regression with an L2 penalty.
# max_iter is raised to 10000 to give lbfgs room to converge.
classifier = LogisticRegression(
    penalty='l2',
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=10000,
)
classifier.fit(X_train_input, y_train)

# Score the held-out rows.
y_pred = classifier.predict(X_test_input)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 12.10%


## Random Forest

In [21]:
# Baseline random forest: 100 trees, fixed seed for repeatable results.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
clf.fit(X_train_input, y_train)

# Score the held-out rows; y_pred is reused by the result table below.
y_pred = clf.predict(X_test_input)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 69.35%


### Explore the difference between prediction results and actual GICS codes

In [22]:
# One row per test company: ticker, true sector, predicted sector.
# X_test_label and y_test share the test-split index; y_pred is a plain array
# and is therefore matched to the rows by position. y_pred holds whatever the
# most recently run model cell predicted.
df_result = pd.DataFrame(
    {
        'x_test_label': X_test_label,
        'y_test': y_test,
        'y_pred': y_pred,
    }
)
df_result

Unnamed: 0,x_test_label,y_test,y_pred
382,PHM,Consumer Discretionary,Industrials
429,WEC,Utilities,Utilities
227,LYB,Materials,Industrials
151,ETN,Industrials,Industrials
441,WTW,Financials,Financials
...,...,...,...
320,MRNA,Health Care,Health Care
65,BG,Consumer Staples,Financials
217,HBAN,Financials,Financials
372,PLD,Real Estate,Real Estate


In [23]:
# Keep only the rows where the predicted sector differs from the true sector.
df_result[df_result['y_test'].ne(df_result['y_pred'])]

Unnamed: 0,x_test_label,y_test,y_pred
382,PHM,Consumer Discretionary,Industrials
227,LYB,Materials,Industrials
370,TAP,Consumer Staples,Health Care
42,AVY,Materials,Industrials
463,SWK,Industrials,Information Technology
237,GNRC,Industrials,Information Technology
129,ECL,Materials,Consumer Discretionary
486,SLB,Energy,Industrials
266,ILMN,Health Care,Industrials
9,AZO,Consumer Discretionary,Industrials


### Use Grid Search to improve the Random Forest model

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Candidate values per hyperparameter; the search tries every combination
# (4 * 5 * 3 * 3 = 180 candidates).
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30, 35],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive 5-fold cross-validated search on the training split, using all cores.
clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train_input, y_train)
print(f"Best Parameters: {grid_search.best_params_}")

# Evaluate the best estimator found by the search on the held-out test rows.
best_grid = grid_search.best_estimator_
y_pred = best_grid.predict(X_test_input)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with GridSearch: {accuracy * 100:.2f}%")


Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy with GridSearch: 71.77%
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   2.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimator

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.6s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estima

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   3.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   1.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   2.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estim

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   2.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   2.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estim

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   2.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   2.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   1.8s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estim

### Model with the best parameters

In [15]:
# Random forest configured with the parameters reported by the grid search.
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42,
)
clf.fit(X_train_input, y_train)

# Score the held-out rows.
y_pred = clf.predict(X_test_input)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 71.77%
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   2.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   2.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   2.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   2.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estima

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   3.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   3.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   2.6s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   2.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_esti

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   3.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   2.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   2.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   2.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_esti

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   3.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   2.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   1.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   1.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estima

## Bagging with RandomForest

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bag 10 copies of the tuned random forest, each fitted on a bootstrap sample
# containing 80% of the training rows.
# The base estimator is passed positionally, exactly as in the original call.
base_forest = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_leaf=2,
    min_samples_split=2,
    random_state=42,
)
bag_clf = BaggingClassifier(
    base_forest,
    n_estimators=10,
    max_samples=0.8,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train_input, y_train)

# Score the held-out rows.
y_pred_bag = bag_clf.predict(X_test_input)
print(f"Bagging Accuracy: {accuracy_score(y_test, y_pred_bag) * 100:.2f}%")

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Baseline gradient boosting: 100 boosting stages, fixed seed.
gboost_clf = GradientBoostingClassifier(
    n_estimators=100,
    random_state=42,
)
gboost_clf.fit(X_train_input, y_train)

# Score the held-out rows.
y_pred_gboost = gboost_clf.predict(X_test_input)
print(f"Gradient Boosting Accuracy: {accuracy_score(y_test, y_pred_gboost) * 100:.2f}%")

### Grid Search for Gradient Boosting

In [None]:
# Exhaustive search space for gradient boosting:
# 3 * 4 * 4 * 3 * 3 * 3 = 1296 candidates, i.e. 6480 fits with 5-fold CV.
# GridSearchCV is imported in the random-forest grid-search cell above.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'subsample': [0.8, 0.9, 1.0],
}

gboost_clf = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(
    gboost_clf,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy',
)
grid_search.fit(X_train_input, y_train)

# Evaluate the best estimator found by the search on the held-out test rows.
best_gboost = grid_search.best_estimator_
y_pred_best = best_gboost.predict(X_test_input)
print(f"Best Gradient Boosting Accuracy: {accuracy_score(y_test, y_pred_best) * 100:.2f}%")
print(grid_search.best_params_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Same search space as the exhaustive gradient-boosting grid, but only a
# random subset of the combinations is evaluated.
param_distributions = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'subsample': [0.8, 0.9, 1.0],
}

# Model to tune.
gboost_clf = GradientBoostingClassifier(random_state=42)

# Sample 100 parameter combinations, score each with 5-fold cross-validation
# on accuracy, and run on all available cores. The seed fixes which
# combinations are sampled.
random_search = RandomizedSearchCV(
    estimator=gboost_clf,
    param_distributions=param_distributions,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy',
)
random_search.fit(X_train_input, y_train)

# Evaluate the best estimator found by the search on the held-out test rows.
best_gboost = random_search.best_estimator_
y_pred_best = best_gboost.predict(X_test_input)
print(f"Best Gradient Boosting Accuracy: {accuracy_score(y_test, y_pred_best) * 100:.2f}%")
print(random_search.best_params_)


## XGBoost

In [25]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score


# The classifier below is trained on integer-encoded sector labels.
# The encoder is fitted on the training labels only and reused for the test labels.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Baseline XGBoost classifier: 100 trees, multi-class log-loss as the eval metric.
xgboost_clf = xgb.XGBClassifier(
    n_estimators=100,
    random_state=40,
    use_label_encoder=False,
    eval_metric='mlogloss',
)
xgboost_clf.fit(X_train_input, y_train_encoded)

# Predictions come back as encoded integers, so accuracy is computed against
# the encoded test labels.
y_pred_xgboost_encoded = xgboost_clf.predict(X_test_input)
accuracy = accuracy_score(y_test_encoded, y_pred_xgboost_encoded)
print(f"XGBoost Accuracy: {accuracy * 100:.2f}%")

# Sector names for the predictions, for human-readable inspection.
y_pred_xgboost_labels = label_encoder.inverse_transform(y_pred_xgboost_encoded)


XGBoost Accuracy: 75.00%


### Grid Search for XGBoost

In [None]:
# Exhaustive search space for XGBoost: 3**7 = 2187 candidates, i.e. 6561 fits
# with 3-fold cross-validation. Expect a long runtime.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Model to tune (multi-class log-loss as the eval metric).
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Cross-validated search on the training split, scored on accuracy, all cores.
# GridSearchCV is imported in the random-forest grid-search cell above.
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# The model is trained on the integer-encoded labels produced by label_encoder.
grid_search.fit(X_train_input, y_train_encoded)

# Best estimator found by the search.
best_xgb_clf = grid_search.best_estimator_

# Predict the held-out test rows (encoded integer labels).
y_pred_xgboost_encoded = best_xgb_clf.predict(X_test_input)

# Score the test predictions against the encoded TEST labels. The original
# compared them with y_train_encoded, which has a different length and refers
# to different rows.
accuracy = accuracy_score(y_test_encoded, y_pred_xgboost_encoded)
print(f"Best XGBoost Accuracy: {accuracy * 100:.2f}%")

# Report the winning parameter combination.
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
