# <span style="color:#ff5f27"> 👨🏻‍🏫 Sklearn Transformation Functions Registration</span>

## <span style="color:#ff5f27">🗄️ Table of Contents</span>
- [📝 Imports](#1)
- [💽 Loading Data](#2)
- [🔮 Connecting to Hopsworks Feature Store](#3)
- [🪄 Creating Feature Groups](#4)
- [🖍 Feature View Creation](#5)
- [👩🏻‍🔬 Data Transformation Pipeline](#6)
- [🧬 Modeling](#7)
- [💾 Saving the Model in Model Registry](#8)
- [📮 Retrieving the Model from Model Registry](#9)
- [👨🏻‍⚖️ Batch Prediction](#10)
- [👨🏻‍⚖️ Serving Feature Vector Prediction](#11)

<a name='1'></a>
## <span style='color:#ff5f27'> 📝 Imports </span>

In [None]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, chi2

import xgboost as xgb
from sklearn.metrics import accuracy_score

<a name='2'></a>
## <span style="color:#ff5f27;"> 💽 Loading Data </span>

In [None]:
# Load the backfill CSV from the remote repository into a DataFrame.
df_original = pd.read_csv("https://repo.hops.works/dev/davit/air_quality/backfill_pm2_5_eu.csv")
# Attach a synthetic binary label: every row gets 0 or 1 drawn at random.
# The generator is not seeded here, so the labels differ between runs.
df_original['target'] = np.random.choice([0, 1], size=len(df_original))

# Preview the first three rows.
df_original.head(3)

<a name='3'></a>
## <span style="color:#ff5f27;"> 🔮 Connecting to Hopsworks Feature Store </span>

In [None]:
import hopsworks

# Log in to Hopsworks; the returned project handle is reused later to obtain
# both the feature store and the model registry.
project = hopsworks.login()

# Feature store handle used to create the feature group and feature view.
fs = project.get_feature_store() 

<a name='4'></a>
## <span style="color:#ff5f27;">🪄 Creating Feature Groups</span>

In [None]:
# Get the feature group, or create it if it does not exist yet.
# Rows are keyed by (city_name, date) and the group is online-enabled.
feature_group = fs.get_or_create_feature_group(
    name='feature_group_online',
    description='Online Feature Group',
    version=1,
    primary_key=['city_name', 'date'],
    online_enabled=True,
)    
# Write the loaded DataFrame into the feature group.
feature_group.insert(df_original)

<a name='5'></a>
## <span style="color:#ff5f27;"> 🖍 Feature View Creation</span>

In [None]:
# Select every feature of the group except 'date'.
query = feature_group.select_except(['date'])

# Get or create the feature view on top of that query; 'target' is declared
# as the label column.
feature_view = fs.get_or_create_feature_view(
    name='serving_fv',
    version=1,
    query=query,
    labels=['target']
)

## <span style="color:#ff5f27;"> 🏋️ Training Dataset Creation</span>


In [None]:
# Create a train-test split training dataset (10% held out for testing),
# stored in CSV format. The call returns the new dataset version and a job
# handle; td_version is reused below for retrieval and batch scoring.
td_version, job = feature_view.create_train_test_split(
    test_size=0.1,
    description='Description of the dataset',
    data_format='csv'
)

### <span style="color:#ff5f27;">🪝 Training Dataset Retrieval</span>

In [None]:
# Read back the split that was just created, as feature and label frames
# for the train and test parts.
X_train, X_test, y_train, y_test = feature_view.get_train_test_split(
    training_dataset_version=td_version
)

In [None]:
# Preview the first three rows of the training features.
X_train.head(3)

In [None]:
# Preview the first three training labels.
y_train.head(3)

<a name='6'></a>
## <span style="color:#ff5f27;">👩🏻‍🔬 Data Transformation Pipeline</span>

In [None]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: one-hot encode (categories unseen at fit time are
# ignored), then keep the top 50% of encoded columns ranked by chi-squared.
categorical_transformer = Pipeline(
    [
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(score_func=chi2, percentile=50)),
    ]
)

# Route 'pm2_5' through the numeric branch and 'city_name' through the
# categorical branch.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, ['pm2_5']),
        ("cat", categorical_transformer, ['city_name']),
    ]
)

<a name='7'></a>
## <span style="color:#ff5f27;">🧬 Modeling</span>

In [None]:
# Chain the preprocessing step and an XGBoost classifier into one estimator,
# so the same transformations are applied at fit and at predict time.
xgb_classifier = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", xgb.XGBClassifier()),
    ]
)

# Fit the whole pipeline on the training split
xgb_classifier.fit(X_train, y_train)

# Evaluate the model on the held-out test split
y_pred = xgb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# The emoji in this message was stored as mojibake; it is restored here so
# the printed output is readable.
print("👮🏻‍♂️ Accuracy:", accuracy)

## <span style="color:#ff5f27;">🗄 Model Registry</span>

In [None]:
# Model registry handle for the logged-in project; used to register and
# retrieve the model below.
mr = project.get_model_registry()

### <span style="color:#ff5f27;">⚙️ Model Schema</span>


In [None]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Describe the model's inputs and outputs from the training data.
# NOTE(review): the input schema is built from X_train.values (a NumPy array),
# so column names are not passed to Schema — confirm this is intended.
input_schema = Schema(X_train.values)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# Show the schema as a dictionary.
model_schema.to_dict()

<a name='8'></a>
### <span style="color:#ff5f27;">💾 Saving the Model</span>

In [None]:
# Local directory that will hold the serialized pipeline.
model_dir = "xgb_tf_model"

# Create the directory if needed. exist_ok=True makes this a no-op when it is
# already there, avoiding the check-then-create race of isdir() + mkdir().
os.makedirs(model_dir, exist_ok=True)

# Persist the whole fitted pipeline (preprocessing + classifier) as one file.
joblib.dump(xgb_classifier, os.path.join(model_dir, 'xgb_classifier.pkl'))

In [None]:
# Define the model entry in the registry: the accuracy metric computed above,
# a sample taken from X_train as the input example, and the model schema.
model = mr.python.create_model(
    name="xgb_model",
    metrics={"Accuracy": accuracy}, 
    description="XGB model",
    input_example=X_train.sample(),
    model_schema=model_schema
)

# Save the model, passing the local directory that holds the pickled pipeline.
model.save(model_dir)

<a name='9'></a>
## <span style="color:#ff5f27;"> 📮 Retrieving the Model from Model Registry </span>

In [None]:
# Retrieve the version that was just registered instead of a fixed number, so
# re-running the notebook (which registers a new version each time) does not
# silently pull an older model.
retrieved_model = mr.get_model(
    name="xgb_model",
    version=model.version,
)
# Download the model files; the returned local directory is used below to
# load the pickled pipeline.
saved_model_dir = retrieved_model.download()

In [None]:
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts that come from a model registry you trust.
retrieved_xgboost_model = joblib.load(
    os.path.join(saved_model_dir, "xgb_classifier.pkl")
)
# Display the restored pipeline.
retrieved_xgboost_model

<a name='10'></a>
## <span style="color:#ff5f27;"> 👨🏻‍⚖️ Batch Prediction </span>

In [None]:
# Initialise batch scoring against the training dataset version created above.
feature_view.init_batch_scoring(training_dataset_version=td_version)

# Fetch the batch data from the feature view and preview the first three rows.
batch_data = feature_view.get_batch_data()
batch_data.head(3)

In [None]:
# Score the batch with the pipeline loaded from the registry and show the
# first ten predictions.
predictions_batch = retrieved_xgboost_model.predict(batch_data)
predictions_batch[:10]

<a name='11'></a>
## <span style="color:#ff5f27;"> 👨🏻‍⚖️ Serving Feature Vector Prediction</span>

In [None]:
def to_df(feature_vector):
    """Convert one feature vector, or a batch of them, into a DataFrame.

    The model pipeline selects its inputs by column name, so the positional
    values are mapped back to named columns: position 0 becomes
    ``city_name`` and position 1 becomes ``pm2_5``. Any further positions
    are ignored.

    Args:
        feature_vector: Either a single vector (a flat sequence of values)
            or a sequence of vectors, each given as a list or tuple.

    Returns:
        A ``pandas.DataFrame`` with the columns ``city_name`` and ``pm2_5``
        and one row per vector. An empty input yields an empty frame that
        still has both columns.
    """
    columns = ('city_name', 'pm2_5')

    if len(feature_vector) == 0:
        # Nothing to convert; avoid indexing into an empty sequence.
        rows = []
    elif isinstance(feature_vector[0], (list, tuple)):
        # Batch: every element is already a full vector.
        rows = feature_vector
    else:
        # Single vector: wrap it so both cases share one construction path.
        rows = [feature_vector]

    return pd.DataFrame(
        {name: [row[position] for row in rows] for position, name in enumerate(columns)},
        columns=list(columns),
    )

In [None]:
# Initialise online serving with the same training dataset version that the
# model was trained on and that batch scoring uses, so serving stays aligned
# even when that version is not 1.
feature_view.init_serving(training_dataset_version=td_version)

# Look up a single feature vector by its primary-key values.
feature_vector = feature_view.get_feature_vector(
    entry={
        "city_name": 'Amsterdam',
        "date": '2013-01-01',
    }
)
feature_vector

In [None]:
# Turn the single feature vector into a one-row DataFrame with named columns.
feature_vector_df = to_df(feature_vector)
feature_vector_df

In [None]:
# Predict for the single feature vector with the pipeline loaded from the registry.
prediction_feature_vector = retrieved_xgboost_model.predict(feature_vector_df)
prediction_feature_vector

In [None]:
# Look up several feature vectors in one call, one per primary-key entry.
feature_vectors = feature_view.get_feature_vectors(
    entry = [
        {"city_name": 'Amsterdam', "date": '2013-01-01'},
        {"city_name": 'Amsterdam', "date": '2014-01-01'},
    ]
)
feature_vectors

In [None]:
# Turn the batch of feature vectors into a DataFrame with named columns.
feature_vectors_df = to_df(feature_vectors)
feature_vectors_df

In [None]:
# Predict for every retrieved feature vector with the pipeline loaded from the registry.
prediction_feature_vectors = retrieved_xgboost_model.predict(feature_vectors_df)
prediction_feature_vectors

---