# `hopsworks-cloud-sdk` Integration Tests

This notebook assumes:
* a hopsworks vm, where the `FeaturestoreTourPython` notebook was run
* the hopsworks vm has the following config
    * host: `ec2-18-223-107-57.us-east-2.compute.amazonaws.com`
    * project: `demo_featurestore_admin000`
    * port: `8181`


Config of Hopsworks VM:  
(__If you use a different VM than the one specified above, change the details below.__)

In [None]:
# Connection settings for the Hopsworks VM under test.
HOST = "ec2-18-223-107-57.us-east-2.compute.amazonaws.com"  # host name of the VM
PROJECT_NAME = "demo_featurestore_admin000"  # project whose feature store is tested
PORT = 8181  # port used together with HOST for the REST endpoint
AWS_REGION = "default"  # region name the environment is expected to report after connecting


## Imports


In [None]:
import os

import pandas as pd
import matplotlib

from hops import featurestore
from hops import constants

## Feature Store Tests

##### Connect to Featurestore

In [None]:
# Connect to the feature store of the configured project on the configured VM.
featurestore.connect(
    host=HOST,
    project_name=PROJECT_NAME,
    port=PORT,
)

In [None]:
# Assert that the environment variables describing the connection are set as expected.
_env_names = constants.ENV_VARIABLES
assert os.environ[_env_names.REST_ENDPOINT_END_VAR] == HOST + ':' + str(PORT), \
    "REST endpoint variable does not match HOST:PORT"
assert os.environ[_env_names.HOPSWORKS_PROJECT_NAME_ENV_VAR] == PROJECT_NAME, \
    "project name variable does not match PROJECT_NAME"
assert os.environ[_env_names.REGION_NAME_ENV_VAR] == AWS_REGION, \
    "region name variable does not match AWS_REGION"

# The project id must be numeric. The earlier `isinstance(int(x), int)` form was
# always true whenever int() succeeded, so the check is now stated explicitly.
_project_id = os.environ[_env_names.HOPSWORKS_PROJECT_ID_ENV_VAR]
assert _project_id.isdigit(), "project id variable is not a non-negative integer: %r" % _project_id
assert os.environ["CERT_KEY"], "CERT_KEY is empty"

# assert that keyfiles are downloaded from aws secrets manager
# (paths are relative to the current working directory)
assert os.path.isfile('keyStore.jks'), "keyStore.jks not found in working directory"
assert os.path.isfile('trustStore.jks'), "trustStore.jks not found in working directory"

#### Test Featurestore Utility Operations

- `featurestore.get_featurestore_metadata()`
- `featurestore.project_featurestore()`
- `featurestore.get_project_featurestores()`
- `featurestore.get_latest_featuregroup_version()`
- `featurestore.get_features_list()`
- `featurestore.get_storage_connectors()`

In [None]:
# Request the feature store metadata with update_cache=True; left as a bare
# expression so that the returned value is the result of this cell.
featurestore.get_featurestore_metadata(update_cache=True)

In [None]:
# Evaluate the name of the project's feature store; left as a bare expression
# so that the returned value is the result of this cell.
featurestore.project_featurestore()

In [None]:
# The project's feature store is expected to be named "<project name>_featurestore".
assert PROJECT_NAME + "_featurestore" == featurestore.project_featurestore()

In [None]:
# The project's own feature store must appear among the feature stores of the project.
_own_featurestore = featurestore.project_featurestore()
assert _own_featurestore in featurestore.get_project_featurestores()

In [None]:
# Exactly one feature store is expected for the project.
_featurestore_count = len(featurestore.get_project_featurestores())
assert _featurestore_count == 1

In [None]:
# "teams_features_spanish" is expected to have version 2 as its latest version.
_latest_version = featurestore.get_latest_featuregroup_version("teams_features_spanish")
assert _latest_version == 2

In [None]:
# "teams_features" is expected to have version 1 as its latest version.
_latest_version = featurestore.get_latest_featuregroup_version("teams_features")
assert _latest_version == 1

In [None]:
# The feature list must contain "away_team_id".
_feature_names = featurestore.get_features_list()
assert "away_team_id" in _feature_names

In [None]:
# The feature list must contain "home_team_id".
_feature_names = featurestore.get_features_list()
assert "home_team_id" in _feature_names

In [None]:
# A (name, 'JDBC') entry for the project's feature store must be among the storage connectors.
_expected_connector = (PROJECT_NAME + "_featurestore", 'JDBC')
assert _expected_connector in featurestore.get_storage_connectors()

In [None]:
# At least three storage connectors are expected.
_connector_count = len(featurestore.get_storage_connectors())
assert _connector_count >= 3

#### Test Read Operations of Features and Feature Groups

- `featurestore.get_feature()`
- `featurestore.get_features()`
- `featurestore.get_featuregroup()`
- `featurestore.sql()`

In [None]:
# Read a single feature by name only and check the shape of the result.
tmp = featurestore.get_feature("team_budget")
assert isinstance(tmp, pd.DataFrame)
# count() returns a Series labelled by column name; .iloc[0] reads the first
# column's non-null count by position instead of relying on the deprecated
# integer-key fallback of plain [0].
assert tmp.count().iloc[0] == 50
assert len(tmp.columns) == 1
assert "team_budget" in tmp.columns

In [None]:
# Read a single feature with the feature store, feature group and version given explicitly.
tmp = featurestore.get_feature(
    "team_budget",
    featurestore=featurestore.project_featurestore(),
    featuregroup="teams_features",
    featuregroup_version=1,
)
assert isinstance(tmp, pd.DataFrame)
# count() returns a Series labelled by column name; .iloc[0] reads the first
# column's non-null count by position instead of relying on the deprecated
# integer-key fallback of plain [0].
assert tmp.count().iloc[0] == 50
assert len(tmp.columns) == 1
assert "team_budget" in tmp.columns

In [None]:
# Read a whole feature group by name only and check its rows and columns.
tmp = featurestore.get_featuregroup("teams_features")
# count() returns a Series labelled by column name; .iloc[0] reads the first
# column's non-null count by position instead of relying on the deprecated
# integer-key fallback of plain [0].
assert tmp.count().iloc[0] == 50
assert len(tmp.columns) == 3
assert "team_budget" in tmp.columns
assert "team_id" in tmp.columns
assert "team_position" in tmp.columns

In [None]:
# Read a whole feature group with the feature store and version given explicitly.
tmp = featurestore.get_featuregroup(
    "teams_features",
    featurestore=featurestore.project_featurestore(),
    featuregroup_version=1,
)
# count() returns a Series labelled by column name; .iloc[0] reads the first
# column's non-null count by position instead of relying on the deprecated
# integer-key fallback of plain [0].
assert tmp.count().iloc[0] == 50
assert len(tmp.columns) == 3
assert "team_budget" in tmp.columns
assert "team_id" in tmp.columns
assert "team_position" in tmp.columns

In [None]:
# Read two features by name only; the expected columns are exactly the requested names.
features = ["team_budget", "average_attendance"]
tmp = featurestore.get_features(features)
assert set(features) == set(tmp.columns)
# count() returns a Series labelled by column name; .iloc[0] reads the first
# column's non-null count by position instead of relying on the deprecated
# integer-key fallback of plain [0].
assert tmp.count().iloc[0] == 50
assert len(tmp.columns) == len(features)

In [None]:
# Read two features using "<featuregroup>_<version>.<feature>" style names; the
# resulting columns are expected to carry the bare feature names.
features = ["teams_features_1.team_budget", "attendances_features_1.average_attendance"]
tmp = featurestore.get_features(features)
assert {"team_budget", "average_attendance"} == set(tmp.columns)
# count() returns a Series labelled by column name; .iloc[0] reads the first
# column's non-null count by position instead of relying on the deprecated
# integer-key fallback of plain [0].
assert tmp.count().iloc[0] == 50
assert len(tmp.columns) == len(features)

In [None]:
# Read two features with the feature store and the feature group versions given explicitly.
features = ["team_budget", "average_attendance"]
tmp = featurestore.get_features(
    features,
    featurestore=featurestore.project_featurestore(),
    featuregroups_version_dict={
        "teams_features": 1,
        "attendances_features": 1,
    },
)
assert set(features) == set(tmp.columns)
# count() returns a Series labelled by column name; .iloc[0] reads the first
# column's non-null count by position instead of relying on the deprecated
# integer-key fallback of plain [0].
assert tmp.count().iloc[0] == 50
assert len(tmp.columns) == len(features)

In [None]:
# TODO(review): the original cell carried a bare TODO with no recorded intent;
# confirm what remains to be done for the explicit join_key case.

# Define the requested features here so the cell does not depend on a variable
# left behind by whichever cell happened to run before it.
features = ["team_budget", "average_attendance"]
tmp = featurestore.get_features(
    features,
    featurestore=featurestore.project_featurestore(),
    featuregroups_version_dict={
        "teams_features": 1,
        "attendances_features": 1,
    },
    join_key="team_id",
)
assert set(features) == set(tmp.columns)
# count() returns a Series labelled by column name; .iloc[0] reads the first
# column's non-null count by position instead of relying on the deprecated
# integer-key fallback of plain [0].
assert tmp.count().iloc[0] == 50
assert len(tmp.columns) == len(features)

In [None]:
# Read four features by name only; the expected columns are exactly the requested names.
features = [
    "team_budget",
    "average_attendance",
    "team_position",
    "sum_attendance",
]
tmp = featurestore.get_features(features)
assert set(features) == set(tmp.columns)
# count() returns a Series labelled by column name; .iloc[0] reads the first
# column's non-null count by position instead of relying on the deprecated
# integer-key fallback of plain [0].
assert tmp.count().iloc[0] == 50
assert len(tmp.columns) == len(features)

In [None]:
# Read two features while pinning only the version of "teams_features".
features = ["team_budget", "team_id"]
tmp = featurestore.get_features(
    features,
    featuregroups_version_dict={
        "teams_features": 1,
    },
)
assert set(features) == set(tmp.columns)
# count() returns a Series labelled by column name; .iloc[0] reads the first
# column's non-null count by position instead of relying on the deprecated
# integer-key fallback of plain [0].
assert tmp.count().iloc[0] == 50
assert len(tmp.columns) == len(features)

In [None]:
# Run a filtered SQL query and check the returned columns and the filter itself.
tmp = featurestore.sql(
    "SELECT * FROM teams_features_1 WHERE team_position < 5"
)
assert len(tmp.columns) == 3
for _column in ("team_budget", "team_id", "team_position"):
    assert _column in tmp.columns
# Every returned row must satisfy the WHERE clause.
assert all(position < 5 for position in tmp["team_position"].values)

In [None]:
# Run the same filtered SQL query with the feature store given explicitly.
tmp = featurestore.sql(
    "SELECT * FROM teams_features_1 WHERE team_position < 5",
    featurestore=featurestore.project_featurestore(),
)
assert len(tmp.columns) == 3
for _column in ("team_budget", "team_id", "team_position"):
    assert _column in tmp.columns
# Every returned row must satisfy the WHERE clause.
assert all(position < 5 for position in tmp["team_position"].values)

#### Test Featurestore Partitions

* `featurestore.get_featuregroup_partitions()`

In [None]:
# The partition listing of the partitioned feature group must expose a "partition" column.
tmp = featurestore.get_featuregroup_partitions(
    "games_features_partitioned"
)
_partition_columns = tmp.columns
assert "partition" in _partition_columns

#### Test Featurestore Visualizations

- `featurestore.visualize_featuregroup_distributions()`
- `featurestore.visualize_featuregroup_correlations()`
- `featurestore.visualize_featuregroup_clusters()`
- `featurestore.visualize_featuregroup_descriptive_stats()`
- `featurestore.visualize_training_dataset_distributions()`
- `featurestore.visualize_training_dataset_correlations()`
- `featurestore.visualize_training_dataset_clusters()`
- `featurestore.visualize_training_dataset_descriptive_stats()`

In [None]:
# With plot=False the call is expected to hand back a matplotlib Figure.
fig = featurestore.visualize_featuregroup_distributions(
    "players_features",
    plot=False,
)
assert isinstance(fig, matplotlib.figure.Figure)

In [None]:
# With plot=False the call is expected to hand back a matplotlib Figure.
fig = featurestore.visualize_featuregroup_distributions(
    "games_features",
    plot=False,
)
assert isinstance(fig, matplotlib.figure.Figure)

In [None]:
# With plot=False the call is expected to hand back a matplotlib Figure.
fig = featurestore.visualize_featuregroup_distributions(
    "teams_features",
    plot=False,
)
assert isinstance(fig, matplotlib.figure.Figure)

In [None]:
# With plot=False the correlation plot is expected to come back as a matplotlib Figure.
fig = featurestore.visualize_featuregroup_correlations(
    "games_features",
    plot=False,
)
assert isinstance(fig, matplotlib.figure.Figure)

In [None]:
# With plot=False the cluster plot is expected to come back as a matplotlib Figure.
fig = featurestore.visualize_featuregroup_clusters(
    "games_features",
    plot=False,
)
assert isinstance(fig, matplotlib.figure.Figure)


In [None]:
# The descriptive statistics are expected to come back as a pandas DataFrame.
stats = featurestore.visualize_featuregroup_descriptive_stats(
    "games_features"
)
assert isinstance(stats, pd.DataFrame)

#### Test Training Dataset Utility Methods

- `featurestore.get_training_dataset_path()`
- `featurestore.get_training_dataset_tf_record_schema()`

In [None]:
# The training dataset path must contain the project's "<project name>_Training_Datasets" segment.
_expected_fragment = PROJECT_NAME + "_Training_Datasets"
assert _expected_fragment in featurestore.get_training_dataset_path("team_position_prediction_csv")


#### Test Featurestore Get Statistics

* `featurestore.get_featuregroup_statistics()`


In [None]:
# Fetch the statistics of "teams_features" and check that every section is
# present and populated down to its first element.
stats = featurestore.get_featuregroup_statistics("teams_features")

# Cluster analysis: clusters and datapoints must exist and pair up one-to-one.
assert stats.cluster_analysis is not None
assert stats.cluster_analysis.clusters is not None
assert stats.cluster_analysis.datapoints is not None
assert len(stats.cluster_analysis.clusters) == len(stats.cluster_analysis.datapoints)
# Guard the [0] lookups below so an empty result fails as an assertion rather
# than an IndexError, matching the checks made for the other sections.
assert len(stats.cluster_analysis.clusters) > 0
assert stats.cluster_analysis.clusters[0].datapoint_name is not None
assert stats.cluster_analysis.clusters[0].cluster is not None

# Correlation matrix: each feature carries one correlation value per feature.
assert stats.correlation_matrix is not None
assert stats.correlation_matrix.feature_correlations is not None
assert len(stats.correlation_matrix.feature_correlations) > 0
assert stats.correlation_matrix.feature_correlations[0].feature_name is not None
assert stats.correlation_matrix.feature_correlations[0].correlation_values is not None
assert len(stats.correlation_matrix.feature_correlations[0].correlation_values) == \
    len(stats.correlation_matrix.feature_correlations)

# Descriptive statistics: at least one feature with at least one named metric.
assert stats.descriptive_stats is not None
assert stats.descriptive_stats.descriptive_stats is not None
assert len(stats.descriptive_stats.descriptive_stats) > 0
assert stats.descriptive_stats.descriptive_stats[0].feature_name is not None
assert stats.descriptive_stats.descriptive_stats[0].metric_values is not None
assert len(stats.descriptive_stats.descriptive_stats[0].metric_values) > 0
assert stats.descriptive_stats.descriptive_stats[0].metric_values[0].metric_name is not None
assert stats.descriptive_stats.descriptive_stats[0].metric_values[0].value is not None

# Feature histograms: at least one feature with at least one bin/frequency pair.
assert stats.feature_histograms is not None
assert stats.feature_histograms.feature_distributions is not None
assert len(stats.feature_histograms.feature_distributions) > 0
assert stats.feature_histograms.feature_distributions[0].feature_name is not None
assert stats.feature_histograms.feature_distributions[0].frequency_distribution is not None
assert len(stats.feature_histograms.feature_distributions[0].frequency_distribution) > 0
assert stats.feature_histograms.feature_distributions[0].frequency_distribution[0].bin is not None
assert stats.feature_histograms.feature_distributions[0].frequency_distribution[0].frequency is not None