Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,6 @@
" description='Air Quality characteristics of each day',\n",
" version=1,\n",
" primary_key=['unix_time','city_name'],\n",
" online_enabled=False,\n",
" event_time=[\"unix_time\"],\n",
") "
]
Expand Down Expand Up @@ -613,7 +612,6 @@
" description='Weather characteristics of each day',\n",
" version=1,\n",
" primary_key=['unix_time','city_name'],\n",
" online_enabled=False,\n",
" event_time=[\"unix_time\"],\n",
") "
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Build a query object with selected features for training dataset\n",
"query = air_quality_fg.select_all().join(\n",
"# Select features for training data.\n",
"selected_features = air_quality_fg.select_all().join(\n",
" weather_fg.select_except(['unix_time']), \n",
" on=['city_name', 'date'],\n",
")"
Expand All @@ -145,22 +145,8 @@
},
"outputs": [],
"source": [
"# here you can check out the merged dataframe\n",
"\n",
"# query_df = query.read()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e582de6-09aa-4160-be66-0cdd831783d2",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"# query_df.city_name.value_counts()"
"# Uncomment this if you would like to view your selected features\n",
"# selected_features.show(5)"
]
},
{
Expand Down Expand Up @@ -198,7 +184,7 @@
"feature_view = fs.get_or_create_feature_view(\n",
" name='air_quality_fv',\n",
" version=1,\n",
" query=query,\n",
" query=selected_features,\n",
")"
]
},
Expand Down
17 changes: 5 additions & 12 deletions advanced_tutorials/citibike/3_citibike_training_pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,6 @@
" version=1,\n",
")\n",
"\n",
"citibike_stations_info_fg = fs.get_or_create_feature_group(\n",
" name=\"citibike_stations_info\",\n",
" version=1,\n",
")\n",
"\n",
"us_holidays_fg = fs.get_or_create_feature_group(\n",
" name=\"us_holidays\",\n",
" version=1,\n",
Expand Down Expand Up @@ -138,8 +133,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Select features for training data.\n",
"query = meteorological_measurements_fg.select_except([\"timestamp\"])\\\n",
"# Select features for training data\n",
"selected_features = meteorological_measurements_fg.select_except([\"timestamp\"])\\\n",
" .join(\n",
" us_holidays_fg.select_except([\"timestamp\"]),\n",
" on=\"date\", join_type=\"left\"\n",
Expand All @@ -159,10 +154,8 @@
},
"outputs": [],
"source": [
"# # uncomment and run cell below if you want to see some rows from this query\n",
"# # but you will have to wait some time\n",
"\n",
"# query.read()"
"# Uncomment this if you would like to view your selected features\n",
"# selected_features.show(5)"
]
},
{
Expand Down Expand Up @@ -198,7 +191,7 @@
"source": [
"feature_view = fs.get_or_create_feature_view(\n",
" name='citibike_fv',\n",
" query=query,\n",
" query=selected_features,\n",
" labels=[\"users_count\"],\n",
" version=1, \n",
")"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Build a query object \n",
"query = bureaus_fg.select_except(['sk_id_curr','sk_id_bureau','datetime'])\\\n",
"# Select features for training data\n",
"selected_features = bureaus_fg.select_except(['sk_id_curr','sk_id_bureau','datetime'])\\\n",
" .join(applications_fg.select_except(['sk_id_curr',\n",
" 'datetime',\n",
" 'flag_mobil',\n",
Expand All @@ -234,8 +234,8 @@
" .join(credit_card_balances_fg.select_except(['sk_id_prev', 'sk_id_curr']))\\\n",
" .join(previous_loan_counts_fg.select_except('sk_id_curr'))\n",
"\n",
"query_show5 = query.show(5)\n",
"query_show5"
"selected_features_show5 = selected_features.show(5)\n",
"selected_features_show5"
]
},
{
Expand Down Expand Up @@ -284,8 +284,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Extracting the names of categorical columns in the 'query_show5' DataFrame\n",
"cat_cols = query_show5.dtypes[query_show5.dtypes == 'object'].index\n",
"# Extracting the names of categorical columns in the 'selected_features_show5' DataFrame\n",
"cat_cols = selected_features_show5.dtypes[selected_features_show5.dtypes == 'object'].index\n",
"\n",
"# Retrieving the Label Encoder transformation function from the feature store\n",
"le = fs.get_transformation_function(name='label_encoder') \n",
Expand Down Expand Up @@ -338,7 +338,7 @@
" version=1,\n",
" labels=['target'],\n",
" transformation_functions=transformation_functions,\n",
" query=query,\n",
" query=selected_features,\n",
")"
]
},
Expand Down Expand Up @@ -619,7 +619,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
"version": "3.9.18"
}
},
"nbformat": 4,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@
"metadata": {},
"outputs": [],
"source": [
"fg_query = electricity_prices_fg.select_all()\\\n",
"# Select features for training data\n",
"selected_features = electricity_prices_fg.select_all()\\\n",
" .join(\n",
" meteorological_measurements_fg\\\n",
" .select_except([\"timestamp\"])\n",
Expand All @@ -136,8 +137,8 @@
"metadata": {},
"outputs": [],
"source": [
"# uncomment this if you would like to view query results\n",
"fg_query.show(5)"
"# Uncomment this if you would like to view your selected features\n",
"# selected_features.show(5)"
]
},
{
Expand Down Expand Up @@ -215,7 +216,7 @@
" version=1,\n",
" labels=[], # you will define 'y' manually later\n",
" transformation_functions=mapping_transformers,\n",
" query=fg_query,\n",
" query=selected_features,\n",
")"
]
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,6 @@
" event_time=\"pickup_datetime\",\n",
" description=\"Rides features\",\n",
" time_travel_format=\"HUDI\", \n",
" online_enabled=False, \n",
" statistics_config=True,\n",
")\n",
"\n",
Expand Down Expand Up @@ -208,7 +207,6 @@
" primary_key=[\"ride_id\"], \n",
" description=\"Taxi fares features\",\n",
" time_travel_format=\"HUDI\", \n",
" online_enabled=False,\n",
" statistics_config=True,\n",
") \n",
"\n",
Expand Down Expand Up @@ -246,7 +244,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
"version": "3.9.18"
}
},
"nbformat": 4,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,15 +107,15 @@
"metadata": {},
"outputs": [],
"source": [
"# Select features for training data.\n",
"query = fares_fg.select(['total_fare', \"tolls\"])\\\n",
"# Select features for training data\n",
"selected_features = fares_fg.select(['total_fare', \"tolls\"])\\\n",
" .join(rides_fg.select_except(['taxi_id', \"driver_id\", \"pickup_datetime\",\n",
" \"pickup_longitude\", \"pickup_latitude\",\n",
" \"dropoff_longitude\", \"dropoff_latitude\"]),\n",
" on=['ride_id'])\n",
"\n",
"# Uncomment the line below if you want to display the first 2 rows of the resulting DataFrame\n",
"# query.show(2)"
"# Uncomment this if you would like to view your selected features\n",
"# selected_features.show(5)"
]
},
{
Expand Down Expand Up @@ -151,7 +151,7 @@
"feature_view = fs.get_or_create_feature_view(\n",
" name='nyc_taxi_fares_fv',\n",
" version=1,\n",
" query=query,\n",
" query=selected_features,\n",
" labels=[\"total_fare\"],\n",
")"
]
Expand Down
74 changes: 16 additions & 58 deletions advanced_tutorials/recommender-system/1_feature_engineering.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
"source": [
"## <span style=\"color:#ff5f27\">👩🏻‍🔬 Feature Engineering </span>\n",
"\n",
"**Note**: This tutorial does not support Google Colab.\n",
"\n",
"**Your Python Jupyter notebook should be configured for >8GB of memory.**\n",
"\n",
"In this series of tutorials, we will build a recommender system for fashion items. It will consist of two models: a *retrieval model* and a *ranking model*. The idea is that the retrieval model should be able to quickly generate a small subset of candidate items from a large collection of items. This comes at the cost of granularity, which is why we also train a ranking model that can afford to use more features than the retrieval model.\n",
Expand All @@ -31,59 +33,6 @@
"## <span style=\"color:#ff5f27\">📝 Imports </span>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hosted notebook environments may not have the local features package\n",
"import os\n",
"\n",
"def need_download_modules():\n",
" if 'google.colab' in str(get_ipython()):\n",
" return True\n",
" if 'HOPSWORKS_PROJECT_ID' in os.environ:\n",
" return True\n",
" return False\n",
"\n",
"if need_download_modules():\n",
" print(\"⚙️ Downloading modules...\")\n",
" os.system('mkdir -p features')\n",
" os.system('cd features && wget https://raw.githubusercontent.com/logicalclocks/hopsworks-tutorials/master/advanced_tutorials/recommender-system/features/articles.py')\n",
" os.system('cd features && wget https://raw.githubusercontent.com/logicalclocks/hopsworks-tutorials/master/advanced_tutorials/recommender-system/features/customers.py')\n",
" os.system('cd features && wget https://raw.githubusercontent.com/logicalclocks/hopsworks-tutorials/master/advanced_tutorials/recommender-system/features/transactions.py')\n",
" os.system('cd features && wget https://raw.githubusercontent.com/logicalclocks/hopsworks-tutorials/master/advanced_tutorials/recommender-system/features/ranking.py') \n",
" print('✅ Done!')\n",
"else:\n",
" print(\"Local environment\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" from features.articles import prepare_articles\n",
" from features.customers import prepare_customers\n",
" from features.transactions import prepare_transactions\n",
" from features.ranking import compute_ranking_dataset\n",
"except ImportError:\n",
" print(\"⚙️ Downloading modules...\")\n",
" os.system('mkdir -p features')\n",
" os.system('cd features && wget https://raw.githubusercontent.com/logicalclocks/hopsworks-tutorials/master/advanced_tutorials/recommender-system/features/articles.py')\n",
" os.system('cd features && wget https://raw.githubusercontent.com/logicalclocks/hopsworks-tutorials/master/advanced_tutorials/recommender-system/features/customers.py')\n",
" os.system('cd features && wget https://raw.githubusercontent.com/logicalclocks/hopsworks-tutorials/master/advanced_tutorials/recommender-system/features/transactions.py')\n",
" os.system('cd features && wget https://raw.githubusercontent.com/logicalclocks/hopsworks-tutorials/master/advanced_tutorials/recommender-system/features/ranking.py') \n",
" print('✅ Done!')\n",
" from features.articles import prepare_articles\n",
" from features.customers import prepare_customers\n",
" from features.transactions import prepare_transactions\n",
" from features.ranking import compute_ranking_dataset "
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -94,7 +43,12 @@
"import numpy as np\n",
"\n",
"import great_expectations as ge\n",
"from great_expectations.core import ExpectationSuite, ExpectationConfiguration"
"from great_expectations.core import ExpectationSuite, ExpectationConfiguration\n",
"\n",
"from features.articles import prepare_articles\n",
"from features.customers import prepare_customers\n",
"from features.transactions import prepare_transactions\n",
"from features.ranking import compute_ranking_dataset "
]
},
{
Expand Down Expand Up @@ -613,7 +567,11 @@
"metadata": {},
"outputs": [],
"source": [
"ranking_df = compute_ranking_dataset(trans_fg, articles_fg, customers_fg)"
"ranking_df = compute_ranking_dataset(\n",
" trans_fg, \n",
" articles_fg, \n",
" customers_fg,\n",
")"
]
},
{
Expand Down Expand Up @@ -687,7 +645,7 @@
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
},
"kernelspec": {
"display_name": "Python",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -701,9 +659,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}
Loading