updates embedding examples with new embedding model

openai · Dec 15, 2022 · fd181ec · fd181ec
1 parent 7de3d50
commit fd181ec
Show file tree

Hide file tree

Showing 12 changed files with 12,317 additions and 12,320 deletions.
diff --git a/README.md b/README.md
@@ -446,11 +446,11 @@ Embeddings can be used for search either by themselves or as a feature in a larg
 The simplest way to use embeddings for search is as follows:
 
 * Before the search (precompute):
-  * Split your text corpus into chunks smaller than the token limit (e.g., ~2,000 tokens)
-  * Embed each chunk using a 'doc' model (e.g., `text-search-curie-doc-001`)
+  * Split your text corpus into chunks smaller than the token limit (e.g., <8,000 tokens)
+  * Embed each chunk
   * Store those embeddings in your own database or in a vector search provider like [Pinecone](https://www.pinecone.io) or [Weaviate](https://weaviate.io)
 * At the time of the search (live compute):
-  * Embed the search query using the corresponding 'query' model (e.g. `text-search-curie-query-001`)
+  * Embed the search query
   * Find the closest embeddings in your database
   * Return the top results, ranked by cosine similarity
 
@@ -460,7 +460,7 @@ In more advanced search systems, the cosine similarity of embeddings can be
 
 #### Recommendations
 
-Recommendations are quite similar to search, except that instead of a free-form text query, the inputs are items in a set. And instead of using pairs of doc-query models, you can use a single symmetric similarity model (e.g., `text-similarity-curie-001`).
+Recommendations are quite similar to search, except that instead of a free-form text query, the inputs are items in a set.
 
 An example of how to use embeddings for recommendations is shown in [Recommendation_using_embeddings.ipynb](examples/Recommendation_using_embeddings.ipynb).
 

diff --git a/examples/Classification_using_embeddings.ipynb b/examples/Classification_using_embeddings.ipynb
diff --git a/examples/Clustering.ipynb b/examples/Clustering.ipynb
diff --git a/examples/Code_search.ipynb b/examples/Code_search.ipynb
diff --git a/examples/Get_embeddings.ipynb b/examples/Get_embeddings.ipynb
@@ -17,7 +17,7 @@
     {
      "data": {
       "text/plain": [
-       "12288"
+       "1536"
       ]
      },
      "execution_count": 1,
@@ -29,8 +29,8 @@
     "import openai\n",
     "\n",
     "embedding = openai.Embedding.create(\n",
-    "    input=\"Sample document text goes here\",\n",
-    "    engine=\"text-similarity-davinci-001\"\n",
+    "    input=\"Your text goes here\",\n",
+    "    engine=\"text-embedding-ada-002\"\n",
     ")[\"data\"][0][\"embedding\"]\n",
     "len(embedding)\n"
    ]
@@ -44,7 +44,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "1024\n"
+      "1536\n"
      ]
     }
    ],
@@ -54,33 +54,15 @@
     "\n",
     "\n",
     "@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))\n",
-    "def get_embedding(text: str, engine=\"text-similarity-davinci-001\") -> list[float]:\n",
+    "def get_embedding(text: str, engine=\"text-embedding-ada-002\") -> list[float]:\n",
     "\n",
     "    # replace newlines, which can negatively affect performance.\n",
     "    text = text.replace(\"\\n\", \" \")\n",
     "\n",
     "    return openai.Embedding.create(input=[text], engine=engine)[\"data\"][0][\"embedding\"]\n",
     "\n",
     "\n",
-    "embedding = get_embedding(\"Sample query text goes here\", engine=\"text-search-ada-query-001\")\n",
-    "print(len(embedding))\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1024\n"
-     ]
-    }
-   ],
-   "source": [
-    "embedding = get_embedding(\"Sample document text goes here\", engine=\"text-search-ada-doc-001\")\n",
+    "embedding = get_embedding(\"Your text goes here\", engine=\"text-embedding-ada-002\")\n",
     "print(len(embedding))\n"
    ]
   }

diff --git a/examples/Obtain_dataset.ipynb b/examples/Obtain_dataset.ipynb
@@ -11,6 +11,14 @@
     "We will combine the review summary and review text into a single combined text. The model will encode this combined text and it will output a single vector embedding."
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To run this notebook, you will need to install: pandas, openai, transformers, plotly, matplotlib, scikit-learn, torch (a dependency of transformers), torchvision, and scipy."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
@@ -131,7 +139,7 @@
     "\n",
     "# remove reviews that are too long\n",
     "df['n_tokens'] = df.combined.apply(lambda x: len(tokenizer.encode(x)))\n",
-    "df = df[df.n_tokens<2000].tail(1_000)\n",
+    "df = df[df.n_tokens<8000].tail(1_000)\n",
     "len(df)"
    ]
   },
@@ -148,20 +156,22 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import openai\n",
     "from openai.embeddings_utils import get_embedding\n",
+    "# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage\n",
     "\n",
-    "# This will take just under 10 minutes\n",
-    "df['babbage_similarity'] = df.combined.apply(lambda x: get_embedding(x, engine='text-similarity-babbage-001'))\n",
-    "df['babbage_search'] = df.combined.apply(lambda x: get_embedding(x, engine='text-search-babbage-doc-001'))\n",
+    "# This will take between 5 and 10 minutes\n",
+    "df['ada_similarity'] = df.combined.apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))\n",
+    "df['ada_search'] = df.combined.apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))\n",
     "df.to_csv('data/fine_food_reviews_with_embeddings_1k.csv')"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3.9.9 ('openai')",
+   "display_name": "openai-cookbook",
    "language": "python",
-   "name": "python3"
+   "name": "openai-cookbook"
   },
   "language_info": {
    "codemirror_mode": {
@@ -173,12 +183,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.9"
+   "version": "3.9.6"
   },
   "orig_nbformat": 4,
   "vscode": {
    "interpreter": {
-    "hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97"
+    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
    }
   }
  },