diff --git a/examples/User_and_product_embeddings.ipynb b/examples/User_and_product_embeddings.ipynb index 565ccd2890..877775f797 100644 --- a/examples/User_and_product_embeddings.ipynb +++ b/examples/User_and_product_embeddings.ipynb @@ -1,13 +1,15 @@ { "cells": [ { - "cell_type": "markdown", - "metadata": {}, + "cell_type": "markdown", "source": [ "## User and product embeddings\n", "\n", - "We calculate user and product embeddings based on the training set, and evaluate the results on the unseen test set. We will evaluate the results by plotting the user and product similarity versus the review score. The dataset is created in the [Obtain_dataset Notebook](Obtain_dataset.ipynb)." - ] + "We calculate user and product embeddings based on the training set, and evaluate the results on the unseen test set. We will evaluate the results by plotting the user and product similarity versus the review score." + ], + "metadata": { + "collapsed": false + } }, { "cell_type": "markdown", @@ -21,13 +23,16 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2023-06-21T23:42:37.817424499Z", + "start_time": "2023-06-21T23:42:35.111594233Z" + } + }, "outputs": [ { "data": { - "text/plain": [ - "(24502, 19035)" - ] + "text/plain": "(577, 706)" }, "execution_count": 2, "metadata": {}, @@ -39,12 +44,12 @@ "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "\n", - "df = pd.read_csv('output/embedded_babbage_similarity_50k.csv', index_col=0) # note that you will need to generate this file to run the code below\n", - "df['babbage_similarity'] = df.babbage_similarity.apply(eval).apply(np.array)\n", + "df = pd.read_csv('data/fine_food_reviews_with_embeddings_1k.csv', index_col=0)\n", + "df['embedding'] = df.embedding.apply(eval).apply(np.array)\n", "X_train, X_test, y_train, y_test = train_test_split(df, df.Score, test_size = 0.2, random_state=42)\n", "\n", - "user_embeddings = 
X_train.groupby('UserId').babbage_similarity.apply(np.mean)\n", - "prod_embeddings = X_train.groupby('ProductId').babbage_similarity.apply(np.mean)\n", + "user_embeddings = X_train.groupby('UserId').embedding.apply(np.mean)\n", + "prod_embeddings = X_train.groupby('ProductId').embedding.apply(np.mean)\n", "len(user_embeddings), len(prod_embeddings)\n" ] }, @@ -67,7 +72,12 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2023-06-21T23:42:38.241991598Z", + "start_time": "2023-06-21T23:42:37.819696246Z" + } + }, "outputs": [], "source": [ "from openai.embeddings_utils import cosine_similarity\n", @@ -99,26 +109,27 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2023-06-21T23:42:38.452315248Z", + "start_time": "2023-06-21T23:42:38.243944540Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Correlation between user&vector similarity percentile metric and review number of stars (score): 22.11%\n" + "Correlation between user & vector similarity percentile metric and review number of stars (score): 17.25%\n" ] }, { "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" + "text/plain": "
", + "image/png": "" }, + "metadata": {}, "output_type": "display_data" } ], @@ -152,8 +163,9 @@ "hash": "be4b5d5b73a21c599de40d6deb1129796d12dc1cc33a738f7bac13269cfcafe8" }, "kernelspec": { - "display_name": "Python 3.7.3 64-bit ('base': conda)", - "name": "python3" + "name": "python3", + "language": "python", + "display_name": "Python 3 (ipykernel)" }, "language_info": { "codemirror_mode": {