# Project: Classification

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from  sklearn.ensemble import IsolationForest
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing  import LabelEncoder
from sklearn import linear_model 
from sklearn import tree 
from sklearn import ensemble 
from sklearn import metrics 
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

## Part 1. Data Exploration, Handling Missing Values and Outliers

### Bank Client Data:
- **age**: age
- **job**: employment field
- **marital**: marital status
- **education**: education level
- **default**: has overdue credit
- **housing**: has housing loan
- **loan**: has personal loan
- **balance**: balance

### Data Related to Last Contact in the Current Marketing Campaign:
- **contact**: type of contact with the client
- **month**: month of last contact
- **day**: day of last contact
- **duration**: duration of contact in seconds

### Other Features:
- **campaign**: number of contacts during the current campaign
- **pdays**: number of days passed since the last marketing campaign until contact in the current campaign
- **previous**: number of contacts before the current campaign
- **poutcome**: result of the previous marketing campaign

### Target Variable:
- **deposit**: whether the client agreed to open a deposit (this is our target variable for prediction)

### Task 1

In [3]:
df = pd.read_csv('data/bank_fin.csv', sep = ';')
df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,"2 343,00 $",yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,"45,00 $",no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,"1 270,00 $",yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,"2 476,00 $",yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,"184,00 $",no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [4]:
# Investigate the data for missing values. Where are there missing values? How many are there?

df.info()
df['balance'].isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11137 non-null  object
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(6), object(11)
memory usage: 1.4+ MB


25

### Task 2

In [5]:
# Are there any missing values in the job feature? Maybe they are indicated by a specific word?

df['job'].value_counts()

job
management       2566
blue-collar      1944
technician       1823
admin.           1334
services          923
retired           778
self-employed     405
student           360
unemployed        357
entrepreneur      328
housemaid         274
unknown            70
Name: count, dtype: int64

### Task 3

In [6]:
# Convert the balance feature so that it is correctly interpreted as a float

df['balance'] = df['balance'].apply(lambda x: str(x).replace(' ', '').replace(',', '.').replace('$', '') if pd.notnull(x) else x).astype(float)

In [7]:
round(df['balance'].mean(), 3)  # mean value

1529.129

### Task 4

In [8]:
# Handle missing values in the balance feature by replacing them with the median value for this feature

df['balance'].fillna(df['balance'].median(), inplace=True)

In [9]:
round(df['balance'].mean(),3) # mean value

1526.936

### Task 5

In [10]:
# Handle missing values in categorical features: job and education by replacing them with the mode (most frequent value)

job_mode = df['job'].mode()[0]
education_mode = df['education'].mode()[0]

df['job'] = df['job'].apply(lambda x: job_mode if x == 'unknown' else x)
df['education'] = df['education'].apply(lambda x: education_mode if x == 'unknown' else x)

In [11]:
top_education = df['education'].mode()[0]
top_job = df['job'].mode()[0]

mask1 = df['education'] == top_education
mask2 = df['job'] == top_job

round(df[mask1 & mask2]['balance'].mean(), 3)

1598.883

### Task 6

In [12]:
# Remove all outliers from the balance feature

px.box(df, x='balance', title='Balance Box Plot', width=800, height=500).show()

In [13]:
def outliers_iqr(data, feature):
    x = data[feature]
    quartile_1, quartile_3 = x.quantile(0.25), x.quantile(0.75),
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned

In [15]:
outliers, cleaned = outliers_iqr(df, 'balance')
print(f"Number of outliers by Tukey's method: {outliers.shape[0]}")
print(f'Resulting number of records: {cleaned.shape[0]}')

Number of outliers by Tukey's method: 1057
Resulting number of records: 10105


In [16]:
df = cleaned

## Part 2: Exploratory Data Analysis

### Task 1

In [17]:
# Study the class distribution in your data to assess for imbalance, and illustrate the result

# Count the number of instances in each class
class_counts = cleaned['deposit'].value_counts()

# Visualize the class distribution
fig = px.bar(
  class_counts, 
  x=class_counts.index,
  y=class_counts.values, 
  labels={'x': 'Deposit Opened', 'y': 'Count'}, 
  title='Class Distribution of Deposit Opened vs Not Opened', 
  width=800, height=500,
)

fig.show()

The data looks balanced.

### Tasks 2 and 3

In [18]:
# Calculate descriptive statistics for numeric variables and interpret the result

numeric_features = ['age', 'day', 'duration', 'campaign']

# Loop over each feature to create individual boxplots
for feature in numeric_features:
    px.box(df, x='deposit', y=feature, color='deposit', title=f'{feature} Distribution for Deposited vs Not Deposited', 
       width=800, height=500).show()

- The median ages of clients who made a deposit and those who didn’t are very similar. Those who didn’t make a deposit are slightly younger, but the difference is not significant.
- Conversations with clients who agreed to open a deposit lasted, on average, longer than with those who did not agree.

### Tasks 4 and 5

In [19]:
px.histogram(df, x='housing', color='deposit', title='Housing Loan Distribution by Deposit', width=800, height=500).show()


People with a housing loan are less likely to open a deposit compared to those without a loan.

In [20]:
px.histogram(df, x='loan', color='deposit', title='Personal Loan Distribution by Deposit', width=800, height=500).show()


Clients without a loan are much more likely to open a deposit compared to those who have a loan.

In [21]:
px.histogram(df, x='contact', color='deposit', title='Contact Communication Type Distribution by Deposit', width=800, height=500).show()


Clients who were called on a mobile phone are more likely to open a deposit compared to those who were called on a landline.

In [22]:
px.histogram(df, x='poutcome', color='deposit', title='Outcome of Previous Marketing Campaign Distribution by Deposit', width=800, height=500).show()

### Task 6

In [23]:
# Find out for which previous campaign outcome the success rate in the current campaign exceeds the number of failures.

# Group data by previous campaign outcome and deposit result
grouped = df.groupby(['poutcome', 'deposit']).size().unstack()

# Normalize to show proportions (success vs. failure)
grouped_normalized = grouped.div(grouped.sum(axis=1), axis=0)

# Create a stacked bar plot
fig = px.bar(grouped_normalized, 
             title='Success Rate of Current Campaign Based on Previous Campaign Outcome',
             labels={'value': 'Proportion'},
             barmode='stack',
             width=800, 
             height=500)

# Show the plot
fig.show()

### Task 7

In [24]:
# Find out in which month clients most often declined the offer to open a deposit

px.histogram(df, x='month', color='deposit', title='Last Contact Month Distribution by Deposit', width=800, height=500).show()

The most successful months for opening deposits after contact are May, July, and August. However, May is also the least successful month for opening deposits.

### Task 8

In [25]:
# Create age groups and determine in which groups people are more likely to open a deposit than decline the offer

df['age_group'] = df['age'].apply(lambda x: '<30' if x < 30 else ('30-40' if x >=30 and x < 40 else ('40-50' if x >=40 and x < 50 else ('50-60' if x >=50 and x < 60 else '60+'))))

px.histogram(df, x='age_group', color='deposit', title='Age Group Distribution by Deposit', width=800, height=500).show()

### Tasks 9 and 10

In [26]:
# Create visualizations for clients who opened and did not open a deposit based on marital status

px.histogram(df, x='marital', color='deposit', title='Marital Status Distribution by Deposit', width=800, height=500).show()

The dataset contains mostly married clients, and they are also the most likely to open a deposit. Meanwhile, clients who are divorced are less likely to open a deposit.

In [27]:
# Create visualizations for clients who opened and did not open a deposit based on education level

px.histogram(df, x='education', color='deposit', title='Education Distribution by Deposit', width=800, height=500).show()

Most clients who agreed to open a deposit have either secondary or post-secondary education, while clients with a lower level of education are less likely to open a deposit.

In [28]:
# Create visualizations for clients who opened and did not open a deposit based on the type of professional occupation

px.histogram(df, x='job', color='deposit', title='Job Distribution by Deposit', width=800, height=500).show()

The dataset includes 11 areas of professional occupation. Most clients who made a deposit work in “management” and “technical professions.” Meanwhile, the fewest clients who agreed to open a deposit are in the “housemaid” and “self-employed” sectors.

### Task 11

In [29]:
# Create a pivot table to determine which combination of education and marital status is the most numerous
# (for those who opened a deposit)

df_deposit_yes = df[df['deposit'] == 'yes']
df_deposit_no = df[df['deposit'] == 'no']

px.density_heatmap(df_deposit_yes, x='education', y='marital', title='Heatmap of Education and Marital Status for Deposited', width=800, height=500).show()
px.density_heatmap(df_deposit_no, x='education', y='marital', title='Heatmap of Education and Marital Status for Not Deposited', width=800, height=500).show()

## Part 3: Data Transformation

### Task 1

In [30]:
# Transform education levels

LE = LabelEncoder()
df['education'] = LE.fit_transform(df['education'])

df['education'].sum()

11995

### Tasks 2 and 3

In [31]:
# Convert binary variables to a representation of zeros and ones

df['age_group'] = LE.fit_transform(df['age_group'])

In [32]:
df['deposit'] = df['deposit'].apply(lambda x: 1 if x == 'yes' else 0)

In [33]:
round(df['deposit'].describe()['std'],3)

0.499

In [34]:
df['default'] = df['default'].apply(lambda x: 1 if x == 'yes' else 0)
df['housing'] = df['housing'].apply(lambda x: 1 if x == 'yes' else 0)
df['loan'] = df['loan'].apply(lambda x: 1 if x == 'yes' else 0)

In [35]:
round(df['default'].mean() + df['housing'].mean() + df['loan'].mean(),3)

0.635

### Task 4

In [36]:
# Create dummy variables

nominal_features = ['job', 'marital', 'contact', 'month', 'poutcome']

dummy_data = pd.get_dummies(df[nominal_features], drop_first=False, dtype=int)

df = pd.concat([df, dummy_data], axis=1)

df.drop(columns=nominal_features, inplace=True)

df['deposit'].value_counts()

deposit
0    5424
1    4681
Name: count, dtype: int64

### Tasks 5 and 6

In [37]:
# Build a correlation matrix and evaluate the data for multicollinearity

px.imshow(df.corr(), title='Correlation Matrix', width=800, height=500).show()

# Table of features and their correlation with the target variable
correlation_with_target = df.corr()['deposit'].sort_values(ascending=False)
correlation_with_target

# Bar plot of features with their correlation with the target variable
fig = px.bar(correlation_with_target, 
             title='Correlation of Features with Deposit', 
             labels={'value': 'Correlation Coefficient'},
             width=800, 
             height=500)

fig.show()

### Tasks 7 and 8

In [38]:
X = df.drop(['deposit'], axis=1)
y = df['deposit']
 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42, test_size = 0.33)

In [39]:
# Calculate the necessary metrics

print(X_test.shape)  # Print the shape of the test feature set

print(y_test.mean())  # Calculate the mean of the target variable in the test set

(3335, 45)
0.46326836581709147


### Task 9

In [40]:
# Use SelectKBest to select the 15 most relevant features

selector = SelectKBest(f_classif, k=15)
selector.fit(X_train, y_train)

selector.get_feature_names_out()

array(['balance', 'housing', 'duration', 'campaign', 'pdays', 'previous',
       'age_group', 'contact_cellular', 'contact_unknown', 'month_mar',
       'month_may', 'month_oct', 'month_sep', 'poutcome_success',
       'poutcome_unknown'], dtype=object)

In [41]:
X_train = X_train[selector.get_feature_names_out()]
X_test = X_test[selector.get_feature_names_out()]


### Task 10

In [42]:
# Normalize the data using MinMaxScaler

# Create a Min-Max Scaler object
scaler = preprocessing.MinMaxScaler()
# Compute the min and max for each column to establish the scaling parameters
scaler.fit(X_train)
# Apply the transformation to both the training and test sets
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [43]:
first_column_mean = X_test_scaled[:, 0].mean()

# Round the result to two decimal places
first_column_mean_rounded = round(first_column_mean, 2)

print(first_column_mean_rounded)

0.47


# Part 4: Solving the Classification Task: Logistic Regression and Decision Trees

### Task 1

In [44]:
# Train the logistic regression model and calculate performance metrics

lr = linear_model.LogisticRegression(
  random_state=42,
  solver='sag',
  max_iter=1000,
)

# Train the model
lr.fit(X_train_scaled, y_train)

# Make predictions for the training set
y_train_pred = lr.predict(X_train_scaled)

# Output classification metrics for the training set
print('TRAIN')
print(metrics.classification_report(y_train, y_train_pred))

# Make predictions for the test set
y_test_pred = lr.predict(X_test_scaled)

# Output classification metrics for the test set
print('TEST')
print(metrics.classification_report(y_test, y_test_pred))

TRAIN
              precision    recall  f1-score   support

           0       0.81      0.88      0.84      3634
           1       0.84      0.76      0.80      3136

    accuracy                           0.82      6770
   macro avg       0.83      0.82      0.82      6770
weighted avg       0.83      0.82      0.82      6770

TEST
              precision    recall  f1-score   support

           0       0.79      0.87      0.83      1790
           1       0.83      0.73      0.78      1545

    accuracy                           0.80      3335
   macro avg       0.81      0.80      0.80      3335
weighted avg       0.81      0.80      0.80      3335



### Tasks 2,3,4

In [45]:
# Train the decision tree model and adjust the maximum depth

dt_model = tree.DecisionTreeClassifier(random_state=42, criterion='entropy')

# Train the model
dt_model.fit(X_train_scaled, y_train)

# Make predictions for the training set
y_train_pred = dt_model.predict(X_train_scaled)

# Output classification metrics for the training set
print('TRAIN')
print(metrics.classification_report(y_train, y_train_pred))

# Make predictions for the test set
y_test_pred = dt_model.predict(X_test_scaled)

# Output classification metrics for the test set
print('TEST')
print(metrics.classification_report(y_test, y_test_pred))

TRAIN
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3634
           1       1.00      1.00      1.00      3136

    accuracy                           1.00      6770
   macro avg       1.00      1.00      1.00      6770
weighted avg       1.00      1.00      1.00      6770

TEST
              precision    recall  f1-score   support

           0       0.76      0.78      0.77      1790
           1       0.74      0.71      0.72      1545

    accuracy                           0.75      3335
   macro avg       0.75      0.74      0.75      3335
weighted avg       0.75      0.75      0.75      3335



### Task 5

In [46]:
# select optimal parameters using gridsearch
from sklearn.model_selection import GridSearchCV

param_grid = {
        'min_samples_split': [2, 5, 7, 10],
        'max_depth':[3,5,7]
        }
      
grid_search_tree = GridSearchCV(
  estimator=dt_model,
  param_grid=param_grid, 
  cv=5, 
  n_jobs = -1
)  
%time grid_search_tree.fit(X_train_scaled, y_train) 
print("accuracy on the test set: {:.2f}".format(grid_search_tree.score(X_test_scaled, y_test)))
y_test_pred = grid_search_tree.predict(X_test_scaled)
print('f1_score on the test set: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Best hyperparameter values: {}".format(grid_search_tree.best_params_))

CPU times: user 103 ms, sys: 140 ms, total: 243 ms
Wall time: 2.26 s
accuracy on the test set: 0.82
f1_score on the test set: 0.81
Best hyperparameter values: {'max_depth': 7, 'min_samples_split': 10}



# Part 5: Solving the classification problem: ensembles of models and forecasting

### Task 1

In [47]:
# train a random forest on your data

rf_model = ensemble.RandomForestClassifier(random_state=42, n_estimators=100, criterion='gini', min_samples_leaf=5, max_depth=10)

rf_model.fit(X_train_scaled, y_train)


# Make predictions for the training set
y_train_pred = rf_model.predict(X_train_scaled)

#Output report on classification metrics
print('TRAIN')
print(metrics.classification_report(y_train, y_train_pred))

#Making a prediction for the test sample
y_test_pred = rf_model.predict(X_test_scaled)

#Output report on classification metrics
print('TEST')
print(metrics.classification_report(y_test, y_test_pred))

TRAIN
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      3634
           1       0.84      0.86      0.85      3136

    accuracy                           0.86      6770
   macro avg       0.86      0.86      0.86      6770
weighted avg       0.86      0.86      0.86      6770

TEST
              precision    recall  f1-score   support

           0       0.85      0.82      0.84      1790
           1       0.80      0.83      0.82      1545

    accuracy                           0.83      3335
   macro avg       0.83      0.83      0.83      3335
weighted avg       0.83      0.83      0.83      3335



### Task 2 and 3

In [48]:
# use gradient boosting for classification and compare quality with random forest

gr_bust_model = ensemble.GradientBoostingClassifier(random_state=42, n_estimators=300, learning_rate=0.05, max_depth=5, min_samples_leaf=5)

gr_bust_model.fit(X_train_scaled, y_train)

# Make predictions for the training set
y_train_pred = gr_bust_model.predict(X_train_scaled)

#Output report on classification metrics
print('TRAIN')
print(metrics.classification_report(y_train, y_train_pred))

#Making a prediction for the test sample
y_test_pred = gr_bust_model.predict(X_test_scaled)

#Output report on classification metrics
print('TEST')
print(metrics.classification_report(y_test, y_test_pred))

TRAIN
              precision    recall  f1-score   support

           0       0.90      0.89      0.89      3634
           1       0.87      0.89      0.88      3136

    accuracy                           0.89      6770
   macro avg       0.89      0.89      0.89      6770
weighted avg       0.89      0.89      0.89      6770

TEST
              precision    recall  f1-score   support

           0       0.85      0.82      0.84      1790
           1       0.80      0.83      0.82      1545

    accuracy                           0.83      3335
   macro avg       0.83      0.83      0.83      3335
weighted avg       0.83      0.83      0.83      3335



### Task 4

In [49]:
from sklearn.ensemble import StackingClassifier
# combine algorithms you already know using staking

# Create a list of tuples of the form: (model name, model)
estimators = [
    ('lr', lr),
    ('dt',  dt_model),
    ('rf',  rf_model),
]

lr_meta = linear_model.LogisticRegression(
  random_state=42
)

# Create an object of the stacking class
reg = StackingClassifier(
    estimators=estimators,
    final_estimator=lr_meta
  )

# Train the model
reg.fit(X_train_scaled, y_train)

# Make predictions for the training set
y_train_pred = reg.predict(X_train_scaled)

#Output report on classification metrics
print('TRAIN')
print(metrics.classification_report(y_train, y_train_pred))

#Making a prediction for the test sample
y_test_pred = reg.predict(X_test_scaled)

#Output report on classification metrics
print('TEST')
print(metrics.classification_report(y_test, y_test_pred))

TRAIN
              precision    recall  f1-score   support

           0       0.87      0.86      0.86      3634
           1       0.84      0.85      0.84      3136

    accuracy                           0.85      6770
   macro avg       0.85      0.85      0.85      6770
weighted avg       0.85      0.85      0.85      6770

TEST
              precision    recall  f1-score   support

           0       0.84      0.83      0.84      1790
           1       0.81      0.82      0.81      1545

    accuracy                           0.83      3335
   macro avg       0.82      0.82      0.82      3335
weighted avg       0.83      0.83      0.83      3335



### Task 5

In [50]:
# evaluate which features show the most importance in a gradient boosting model

# Visualize the importance of features
importance = gr_bust_model.feature_importances_

# Create a DataFrame for visualization
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

# Build a graph of the importance of features

fig = px.bar(feature_importance_df,
              x='Feature',
              y='Importance',
              title='Feature Importance in Gradient Boosting Model',
              width=800,
              height=500)

fig.show()

### Tasks 6,7,8

In [51]:
import optuna

In [52]:
# implement hyperparameter optimization using Optuna

def optuna_rf(trial):
  # set search spaces for hyperparameters
  n_estimators = trial.suggest_int('n_estimators', 100, 200, 1)
  max_depth = trial.suggest_int('max_depth', 10, 30, 1)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 10, 1)

  # create a model
  model = ensemble.RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          random_state=42)
  # train the model
  model.fit(X_train_scaled, y_train)
  score = metrics.f1_score(y_train, model.predict(X_train_scaled))

  return score
  
  

In [53]:
%%time
# create a study object
# we can directly indicate that we need to maximize the metric direction="maximize"
study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
# looking for the best combination of hyperparameters n_trials times
study.optimize(optuna_rf, n_trials=20)

[I 2024-09-19 10:57:15,026] A new study created in memory with name: RandomForestClassifier
[I 2024-09-19 10:57:15,747] Trial 0 finished with value: 0.8952887775864767 and parameters: {'n_estimators': 183, 'max_depth': 27, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8952887775864767.
[I 2024-09-19 10:57:16,231] Trial 1 finished with value: 0.8618369582225004 and parameters: {'n_estimators': 146, 'max_depth': 24, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.8952887775864767.
[I 2024-09-19 10:57:16,588] Trial 2 finished with value: 0.8516371612094626 and parameters: {'n_estimators': 109, 'max_depth': 27, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.8952887775864767.
[I 2024-09-19 10:57:16,952] Trial 3 finished with value: 0.8765374435621983 and parameters: {'n_estimators': 102, 'max_depth': 21, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.8952887775864767.
[I 2024-09-19 10:57:17,499] Trial 4 finished with value: 0.8688447709863999 and parameters: {'n_

CPU times: user 12 s, sys: 864 ms, total: 12.8 s
Wall time: 12 s


In [54]:
# display results on the training set
print("Best hyperparameter values ​​{}".format(study.best_params))
print("f1_score on training set: {:.2f}".format(study.best_value))

Best hyperparameter values ​​{'n_estimators': 200, 'max_depth': 30, 'min_samples_leaf': 2}
f1_score on training set: 0.92


In [55]:
# calculate the accuracy for the test sample
model = ensemble.RandomForestClassifier(**study.best_params,random_state=42, )
model.fit(X_train_scaled, y_train)
y_train_pred = model.predict(X_train_scaled)
print("accuracy on test set: {:.2f}".format(model.score(X_test_scaled, y_test)))
y_test_pred = model.predict(X_test_scaled)
print('f1_score on test set: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

accuracy on test set: 0.82
f1_score on test set: 0.82
