diff --git a/.github/workflows/build_deploy_doc.yml b/.github/workflows/build_deploy_doc.yml new file mode 100644 index 00000000..3137afbf --- /dev/null +++ b/.github/workflows/build_deploy_doc.yml @@ -0,0 +1,27 @@ +name: Build & Deploy Doc +on: + push: + branches: [ main ] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout Repository + uses: actions/checkout@v2 + - name: Set up Python 3 + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install dependencies + run: python3 -m pip install nox + - name: Build the docs + run: nox -s docs + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./docs/build/html diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..d0c3cbf1 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..747ffb7b --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. 
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt new file mode 100644 index 00000000..b2e8151e --- /dev/null +++ b/docs/requirements-docs.txt @@ -0,0 +1,10 @@ +opensearch-py>=2 +pandas>=1.5,<2 +matplotlib>=3.6.0,<4 +nbval +sphinx +sphinx_rtd_theme +nbsphinx + +# traitlets has been having all sorts of release problems lately. +traitlets<5.1 diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 00000000..4d49531a --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. + + +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +import datetime +import os +import sys + +sys.path.insert(0, os.path.abspath("../../")) + + +# -- Project information ----------------------------------------------------- + +project = "Opensearch-py-ml" +copyright = f"{datetime.date.today().year}, Opensearch" +author = "OpenSearch Project Contributors" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.extlinks", + "matplotlib.sphinxext.plot_directive", + "sphinx.ext.viewcode", + "nbsphinx", + "sphinx.ext.todo", +] + + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["**.ipynb_checkpoints"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ["_static"] diff --git a/docs/source/examples/data/online-retail.csv.gz b/docs/source/examples/data/online-retail.csv.gz new file mode 100644 index 00000000..369660e2 Binary files /dev/null and b/docs/source/examples/data/online-retail.csv.gz differ diff --git a/docs/source/examples/demo_notebook.ipynb b/docs/source/examples/demo_notebook.ipynb new file mode 100644 index 00000000..77537b49 --- /dev/null +++ b/docs/source/examples/demo_notebook.ipynb @@ -0,0 +1,4153 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Demo Notebook for Dataframe" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Step 0: Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# import this to stop opensearch-py-ml from yelling every time a DataFrame connection is made\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:28.813370Z", + "iopub.status.busy": "2021-12-15T20:24:28.802670Z", + "iopub.status.idle": "2021-12-15T20:24:30.192643Z", + "shell.execute_reply": "2021-12-15T20:24:30.192931Z" + }, + "pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "# imports to demonstrate DataFrame support\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import opensearch_py_ml as oml\n", + "from opensearchpy import OpenSearch\n", + "\n", + "# Import standard test settings for consistent results\n", + "from opensearch_py_ml.conftest import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Setup clients" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "CLUSTER_URL = 'https://localhost:9200'\n", + "\n", + "def get_os_client(cluster_url = CLUSTER_URL,\n", + " username='admin',\n", + " password='admin'):\n", + " '''\n", + " Get OpenSearch client\n", + " :param cluster_url: cluster URL like https://ml-te-netwo-1s12ba42br23v-ff1736fa7db98ff2.elb.us-west-2.amazonaws.com:443\n", + " :return: OpenSearch client\n", + " '''\n", + " client = OpenSearch(\n", + " hosts=[cluster_url],\n", + " http_auth=(username, password),\n", + " verify_certs=False\n", + " )\n", + " return client" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "client = get_os_client()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparing similarities and differences between pandas DataFrame vs opensearch_py_ml DataFrame" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create an opensearch_py_ml.DataFrame from a `flights` index" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:30.195931Z", + "iopub.status.busy": "2021-12-15T20:24:30.195563Z", + "iopub.status.idle": "2021-12-15T20:24:30.668077Z", + "shell.execute_reply": "2021-12-15T20:24:30.667531Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [], + "source": [ + "oml_flights = oml.DataFrame(client, 'flights')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:30.673583Z", + "iopub.status.busy": "2021-12-15T20:24:30.673088Z", + "iopub.status.idle": "2021-12-15T20:24:30.677254Z", + "shell.execute_reply": "2021-12-15T20:24:30.677676Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "opensearch_py_ml.dataframe.DataFrame" + ] + }, + 
"execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(oml_flights)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compare to pandas DataFrame (created from the same data)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:30.682438Z", + "iopub.status.busy": "2021-12-15T20:24:30.681925Z", + "iopub.status.idle": "2021-12-15T20:24:35.046707Z", + "shell.execute_reply": "2021-12-15T20:24:35.046060Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [], + "source": [ + "pd_flights = oml.opensearch_to_pandas(oml_flights)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:35.050698Z", + "iopub.status.busy": "2021-12-15T20:24:35.050158Z", + "iopub.status.idle": "2021-12-15T20:24:35.053100Z", + "shell.execute_reply": "2021-12-15T20:24:35.052560Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.frame.DataFrame" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(pd_flights)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Attributes and underlying data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:35.057245Z", + "iopub.status.busy": "2021-12-15T20:24:35.056812Z", + "iopub.status.idle": "2021-12-15T20:24:35.059852Z", + "shell.execute_reply": "2021-12-15T20:24:35.059182Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',\n", 
+ " 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',\n", + " 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',\n", + " 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',\n", + " 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',\n", + " 'timestamp'],\n", + " dtype='object')" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:35.063057Z", + "iopub.status.busy": "2021-12-15T20:24:35.062660Z", + "iopub.status.idle": "2021-12-15T20:24:35.065130Z", + "shell.execute_reply": "2021-12-15T20:24:35.064757Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',\n", + " 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',\n", + " 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',\n", + " 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',\n", + " 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',\n", + " 'timestamp'],\n", + " dtype='object')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:35.076896Z", + "iopub.status.busy": "2021-12-15T20:24:35.076493Z", + "iopub.status.idle": "2021-12-15T20:24:35.079007Z", + "shell.execute_reply": "2021-12-15T20:24:35.078592Z" + }, + 
"pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "AvgTicketPrice float64\n", + "Cancelled bool\n", + "Carrier object\n", + "Dest object\n", + "DestAirportID object\n", + " ... \n", + "OriginLocation object\n", + "OriginRegion object\n", + "OriginWeather object\n", + "dayOfWeek int64\n", + "timestamp datetime64[ns]\n", + "Length: 27, dtype: object" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:35.083103Z", + "iopub.status.busy": "2021-12-15T20:24:35.082693Z", + "iopub.status.idle": "2021-12-15T20:24:35.085336Z", + "shell.execute_reply": "2021-12-15T20:24:35.084836Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "AvgTicketPrice float64\n", + "Cancelled bool\n", + "Carrier object\n", + "Dest object\n", + "DestAirportID object\n", + " ... \n", + "OriginLocation object\n", + "OriginRegion object\n", + "OriginWeather object\n", + "dayOfWeek int64\n", + "timestamp datetime64[ns]\n", + "Length: 27, dtype: object" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.select_dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:35.090387Z", + "iopub.status.busy": "2021-12-15T20:24:35.089910Z", + "iopub.status.idle": "2021-12-15T20:24:35.113517Z", + "shell.execute_reply": "2021-12-15T20:24:35.113801Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceDistanceKilometers...FlightTimeMindayOfWeek
0841.26564216492.326654...1030.7704160
1882.9826628823.400140...464.3894810
2190.6369040.000000...0.0000000
3181.694216555.737767...222.7490590
4730.04177813358.244200...785.7790710
..................
130541080.4462798058.581753...402.9290886
13055646.6129417088.598322...644.4180296
13056997.75187610920.652972...937.5408116
130571102.81446518748.859647...1697.4049716
13058858.14433716809.141923...1610.7618276
\n", + "

13059 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n", + "0 841.265642 16492.326654 ... 1030.770416 0\n", + "1 882.982662 8823.400140 ... 464.389481 0\n", + "2 190.636904 0.000000 ... 0.000000 0\n", + "3 181.694216 555.737767 ... 222.749059 0\n", + "4 730.041778 13358.244200 ... 785.779071 0\n", + "... ... ... ... ... ...\n", + "13054 1080.446279 8058.581753 ... 402.929088 6\n", + "13055 646.612941 7088.598322 ... 644.418029 6\n", + "13056 997.751876 10920.652972 ... 937.540811 6\n", + "13057 1102.814465 18748.859647 ... 1697.404971 6\n", + "13058 858.144337 16809.141923 ... 1610.761827 6\n", + "\n", + "[13059 rows x 7 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.select_dtypes(include=np.number)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:35.131083Z", + "iopub.status.busy": "2021-12-15T20:24:35.130699Z", + "iopub.status.idle": "2021-12-15T20:24:38.362018Z", + "shell.execute_reply": "2021-12-15T20:24:38.360520Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceDistanceKilometers...FlightTimeMindayOfWeek
0841.26564216492.326654...1030.7704160
1882.9826628823.400140...464.3894810
2190.6369040.000000...0.0000000
3181.694216555.737767...222.7490590
4730.04177813358.244200...785.7790710
..................
130541080.4462798058.581753...402.9290886
13055646.6129417088.598322...644.4180296
13056997.75187610920.652972...937.5408116
130571102.81446518748.859647...1697.4049716
13058858.14433716809.141923...1610.7618276
\n", + "
\n", + "

13059 rows × 7 columns

" + ], + "text/plain": [ + " AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n", + "0 841.265642 16492.326654 ... 1030.770416 0\n", + "1 882.982662 8823.400140 ... 464.389481 0\n", + "2 190.636904 0.000000 ... 0.000000 0\n", + "3 181.694216 555.737767 ... 222.749059 0\n", + "4 730.041778 13358.244200 ... 785.779071 0\n", + "... ... ... ... ... ...\n", + "13054 1080.446279 8058.581753 ... 402.929088 6\n", + "13055 646.612941 7088.598322 ... 644.418029 6\n", + "13056 997.751876 10920.652972 ... 937.540811 6\n", + "13057 1102.814465 18748.859647 ... 1697.404971 6\n", + "13058 858.144337 16809.141923 ... 1610.761827 6\n", + "\n", + "[13059 rows x 7 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.select_dtypes(include=np.number)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.empty" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:38.376647Z", + "iopub.status.busy": "2021-12-15T20:24:38.374422Z", + "iopub.status.idle": "2021-12-15T20:24:38.382068Z", + "shell.execute_reply": "2021-12-15T20:24:38.383242Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.empty" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:38.391560Z", + "iopub.status.busy": "2021-12-15T20:24:38.390590Z", + "iopub.status.idle": "2021-12-15T20:24:38.463948Z", + "shell.execute_reply": "2021-12-15T20:24:38.463507Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + 
} + ], + "source": [ + "oml_flights.empty" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:38.467067Z", + "iopub.status.busy": "2021-12-15T20:24:38.466671Z", + "iopub.status.idle": "2021-12-15T20:24:38.469107Z", + "shell.execute_reply": "2021-12-15T20:24:38.468710Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(13059, 27)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:38.471904Z", + "iopub.status.busy": "2021-12-15T20:24:38.471491Z", + "iopub.status.idle": "2021-12-15T20:24:38.558583Z", + "shell.execute_reply": "2021-12-15T20:24:38.557300Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(13059, 27)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.index\n", + "\n", + "Note, `opensearch_py_ml.DataFrame.index` does not mirror `pandas.DataFrame.index`. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:38.569600Z", + "iopub.status.busy": "2021-12-15T20:24:38.568315Z", + "iopub.status.idle": "2021-12-15T20:24:38.575273Z", + "shell.execute_reply": "2021-12-15T20:24:38.574007Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',\n", + " ...\n", + " '13049', '13050', '13051', '13052', '13053', '13054', '13055', '13056', '13057', '13058'],\n", + " dtype='object', length=13059)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.index" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:38.583800Z", + "iopub.status.busy": "2021-12-15T20:24:38.582630Z", + "iopub.status.idle": "2021-12-15T20:24:38.588504Z", + "shell.execute_reply": "2021-12-15T20:24:38.589469Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "oml_flights.index" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:38.598654Z", + "iopub.status.busy": "2021-12-15T20:24:38.597416Z", + "iopub.status.idle": "2021-12-15T20:24:38.604549Z", + "shell.execute_reply": "2021-12-15T20:24:38.603361Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'_id'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.index.os_index_field" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 
DataFrame.values\n", + "\n", + "Note, `opensearch_py_ml.DataFrame.values` is not supported." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:38.634011Z", + "iopub.status.busy": "2021-12-15T20:24:38.629434Z", + "iopub.status.idle": "2021-12-15T20:24:38.681698Z", + "shell.execute_reply": "2021-12-15T20:24:38.681982Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[841.2656419677076, False, 'Kibana Airlines', ..., 'Sunny', 0,\n", + " Timestamp('2018-01-01 00:00:00')],\n", + " [882.9826615595518, False, 'Logstash Airways', ..., 'Clear', 0,\n", + " Timestamp('2018-01-01 18:27:00')],\n", + " [190.6369038508356, False, 'Logstash Airways', ..., 'Rain', 0,\n", + " Timestamp('2018-01-01 17:11:14')],\n", + " ...,\n", + " [997.7518761454494, False, 'Logstash Airways', ..., 'Sunny', 6,\n", + " Timestamp('2018-02-11 04:09:27')],\n", + " [1102.8144645388556, False, 'JetBeats', ..., 'Hail', 6,\n", + " Timestamp('2018-02-11 08:28:21')],\n", + " [858.1443369038839, False, 'JetBeats', ..., 'Rain', 6,\n", + " Timestamp('2018-02-11 14:54:34')]], dtype=object)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.values" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:38.685031Z", + "iopub.status.busy": "2021-12-15T20:24:38.684450Z", + "iopub.status.idle": "2021-12-15T20:24:38.687106Z", + "shell.execute_reply": "2021-12-15T20:24:38.686771Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This method would scan/scroll the entire OpenSearch index(s) into memory. 
If this is explicitly required, and there is sufficient memory, call `ed.opensearch_to_pandas(ed_df).values`\n" + ] + } + ], + "source": [ + "try:\n", + " oml_flights.values\n", + "except AttributeError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Indexing, iteration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.head" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:38.694742Z", + "iopub.status.busy": "2021-12-15T20:24:38.694371Z", + "iopub.status.idle": "2021-12-15T20:24:38.696502Z", + "shell.execute_reply": "2021-12-15T20:24:38.696207Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceCancelled...dayOfWeektimestamp
0841.265642False...02018-01-01 00:00:00
1882.982662False...02018-01-01 18:27:00
2190.636904False...02018-01-01 17:11:14
3181.694216True...02018-01-01 10:33:28
4730.041778False...02018-01-01 05:13:00
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", + "0 841.265642 False ... 0 2018-01-01 00:00:00\n", + "1 882.982662 False ... 0 2018-01-01 18:27:00\n", + "2 190.636904 False ... 0 2018-01-01 17:11:14\n", + "3 181.694216 True ... 0 2018-01-01 10:33:28\n", + "4 730.041778 False ... 0 2018-01-01 05:13:00\n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:38.699691Z", + "iopub.status.busy": "2021-12-15T20:24:38.699271Z", + "iopub.status.idle": "2021-12-15T20:24:40.675206Z", + "shell.execute_reply": "2021-12-15T20:24:40.676183Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceCancelled...dayOfWeektimestamp
0841.265642False...02018-01-01 00:00:00
1882.982662False...02018-01-01 18:27:00
2190.636904False...02018-01-01 17:11:14
3181.694216True...02018-01-01 10:33:28
4730.041778False...02018-01-01 05:13:00
\n", + "
\n", + "

5 rows × 27 columns

" + ], + "text/plain": [ + " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", + "0 841.265642 False ... 0 2018-01-01 00:00:00\n", + "1 882.982662 False ... 0 2018-01-01 18:27:00\n", + "2 190.636904 False ... 0 2018-01-01 17:11:14\n", + "3 181.694216 True ... 0 2018-01-01 10:33:28\n", + "4 730.041778 False ... 0 2018-01-01 05:13:00\n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.tail" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:40.695639Z", + "iopub.status.busy": "2021-12-15T20:24:40.694910Z", + "iopub.status.idle": "2021-12-15T20:24:40.698148Z", + "shell.execute_reply": "2021-12-15T20:24:40.698704Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceCancelled...dayOfWeektimestamp
130541080.446279False...62018-02-11 20:42:25
13055646.612941False...62018-02-11 01:41:57
13056997.751876False...62018-02-11 04:09:27
130571102.814465False...62018-02-11 08:28:21
13058858.144337False...62018-02-11 14:54:34
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", + "13054 1080.446279 False ... 6 2018-02-11 20:42:25\n", + "13055 646.612941 False ... 6 2018-02-11 01:41:57\n", + "13056 997.751876 False ... 6 2018-02-11 04:09:27\n", + "13057 1102.814465 False ... 6 2018-02-11 08:28:21\n", + "13058 858.144337 False ... 6 2018-02-11 14:54:34\n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:40.703670Z", + "iopub.status.busy": "2021-12-15T20:24:40.702923Z", + "iopub.status.idle": "2021-12-15T20:24:42.789898Z", + "shell.execute_reply": "2021-12-15T20:24:42.789460Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceCancelled...dayOfWeektimestamp
130541080.446279False...62018-02-11 20:42:25
13055646.612941False...62018-02-11 01:41:57
13056997.751876False...62018-02-11 04:09:27
130571102.814465False...62018-02-11 08:28:21
13058858.144337False...62018-02-11 14:54:34
\n", + "
\n", + "

5 rows × 27 columns

" + ], + "text/plain": [ + " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", + "13054 1080.446279 False ... 6 2018-02-11 20:42:25\n", + "13055 646.612941 False ... 6 2018-02-11 01:41:57\n", + "13056 997.751876 False ... 6 2018-02-11 04:09:27\n", + "13057 1102.814465 False ... 6 2018-02-11 08:28:21\n", + "13058 858.144337 False ... 6 2018-02-11 14:54:34\n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.keys" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:42.793363Z", + "iopub.status.busy": "2021-12-15T20:24:42.791765Z", + "iopub.status.idle": "2021-12-15T20:24:42.796116Z", + "shell.execute_reply": "2021-12-15T20:24:42.795742Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',\n", + " 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',\n", + " 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',\n", + " 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',\n", + " 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',\n", + " 'timestamp'],\n", + " dtype='object')" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:42.799972Z", + "iopub.status.busy": "2021-12-15T20:24:42.799336Z", + "iopub.status.idle": "2021-12-15T20:24:42.802144Z", + "shell.execute_reply": "2021-12-15T20:24:42.801772Z" + }, + 
"pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',\n", + " 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',\n", + " 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',\n", + " 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',\n", + " 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',\n", + " 'timestamp'],\n", + " dtype='object')" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.get" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:42.807664Z", + "iopub.status.busy": "2021-12-15T20:24:42.807137Z", + "iopub.status.idle": "2021-12-15T20:24:42.809592Z", + "shell.execute_reply": "2021-12-15T20:24:42.809225Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Kibana Airlines\n", + "1 Logstash Airways\n", + "2 Logstash Airways\n", + "3 Kibana Airlines\n", + "4 Kibana Airlines\n", + " ... 
\n", + "13054 Logstash Airways\n", + "13055 Logstash Airways\n", + "13056 Logstash Airways\n", + "13057 JetBeats\n", + "13058 JetBeats\n", + "Name: Carrier, Length: 13059, dtype: object" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.get('Carrier')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:42.813306Z", + "iopub.status.busy": "2021-12-15T20:24:42.812951Z", + "iopub.status.idle": "2021-12-15T20:24:44.238665Z", + "shell.execute_reply": "2021-12-15T20:24:44.239468Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Kibana Airlines\n", + "1 Logstash Airways\n", + "2 Logstash Airways\n", + "3 Kibana Airlines\n", + "4 Kibana Airlines\n", + " ... \n", + "13054 Logstash Airways\n", + "13055 Logstash Airways\n", + "13056 Logstash Airways\n", + "13057 JetBeats\n", + "13058 JetBeats\n", + "Name: Carrier, Length: 13059, dtype: object" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.get('Carrier')" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:44.252784Z", + "iopub.status.busy": "2021-12-15T20:24:44.244955Z", + "iopub.status.idle": "2021-12-15T20:24:44.257030Z", + "shell.execute_reply": "2021-12-15T20:24:44.258040Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CarrierOrigin
0Kibana AirlinesFrankfurt am Main Airport
1Logstash AirwaysCape Town International Airport
2Logstash AirwaysVenice Marco Polo Airport
3Kibana AirlinesNaples International Airport
4Kibana AirlinesLicenciado Benito Juarez International Airport
.........
13054Logstash AirwaysPisa International Airport
13055Logstash AirwaysWinnipeg / James Armstrong Richardson Internat...
13056Logstash AirwaysLicenciado Benito Juarez International Airport
13057JetBeatsItami Airport
13058JetBeatsAdelaide International Airport
\n", + "

13059 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " Carrier Origin\n", + "0 Kibana Airlines Frankfurt am Main Airport\n", + "1 Logstash Airways Cape Town International Airport\n", + "2 Logstash Airways Venice Marco Polo Airport\n", + "3 Kibana Airlines Naples International Airport\n", + "4 Kibana Airlines Licenciado Benito Juarez International Airport\n", + "... ... ...\n", + "13054 Logstash Airways Pisa International Airport\n", + "13055 Logstash Airways Winnipeg / James Armstrong Richardson Internat...\n", + "13056 Logstash Airways Licenciado Benito Juarez International Airport\n", + "13057 JetBeats Itami Airport\n", + "13058 JetBeats Adelaide International Airport\n", + "\n", + "[13059 rows x 2 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.get(['Carrier', 'Origin'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "List input not currently supported by `opensearch_py_ml.DataFrame.get`" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:44.263674Z", + "iopub.status.busy": "2021-12-15T20:24:44.261671Z", + "iopub.status.idle": "2021-12-15T20:24:44.266564Z", + "shell.execute_reply": "2021-12-15T20:24:44.267454Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "unhashable type: 'list'\n" + ] + } + ], + "source": [ + "try:\n", + " oml_flights.get(['Carrier', 'Origin'])\n", + "except TypeError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.query" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:44.291893Z", + "iopub.status.busy": "2021-12-15T20:24:44.279615Z", + "iopub.status.idle": "2021-12-15T20:24:44.294715Z", + "shell.execute_reply": 
"2021-12-15T20:24:44.294192Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceCancelled...dayOfWeektimestamp
8960.869736True...02018-01-01 12:09:35
26975.812632True...02018-01-01 15:38:32
311946.358410True...02018-01-01 11:51:12
651975.383864True...22018-01-03 21:13:17
950907.836523True...22018-01-03 05:14:51
..................
12820909.973606True...52018-02-10 05:11:35
12906983.429244True...62018-02-11 06:19:58
129181136.678150True...62018-02-11 16:03:10
129191105.211803True...62018-02-11 05:36:05
130131055.350213True...62018-02-11 13:20:16
\n", + "

68 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", + "8 960.869736 True ... 0 2018-01-01 12:09:35\n", + "26 975.812632 True ... 0 2018-01-01 15:38:32\n", + "311 946.358410 True ... 0 2018-01-01 11:51:12\n", + "651 975.383864 True ... 2 2018-01-03 21:13:17\n", + "950 907.836523 True ... 2 2018-01-03 05:14:51\n", + "... ... ... ... ... ...\n", + "12820 909.973606 True ... 5 2018-02-10 05:11:35\n", + "12906 983.429244 True ... 6 2018-02-11 06:19:58\n", + "12918 1136.678150 True ... 6 2018-02-11 16:03:10\n", + "12919 1105.211803 True ... 6 2018-02-11 05:36:05\n", + "13013 1055.350213 True ... 6 2018-02-11 13:20:16\n", + "\n", + "[68 rows x 27 columns]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled == True')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`opensearch_py_ml.DataFrame.query` requires an explicit comparison on boolean columns, i.e.\n", + "\n", + "`oml_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled')` fails" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:44.317223Z", + "iopub.status.busy": "2021-12-15T20:24:44.316680Z", + "iopub.status.idle": "2021-12-15T20:24:46.365286Z", + "shell.execute_reply": "2021-12-15T20:24:46.366706Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceCancelled...dayOfWeektimestamp
8960.869736True...02018-01-01 12:09:35
26975.812632True...02018-01-01 15:38:32
311946.358410True...02018-01-01 11:51:12
651975.383864True...22018-01-03 21:13:17
950907.836523True...22018-01-03 05:14:51
..................
12820909.973606True...52018-02-10 05:11:35
12906983.429244True...62018-02-11 06:19:58
129181136.678150True...62018-02-11 16:03:10
129191105.211803True...62018-02-11 05:36:05
130131055.350213True...62018-02-11 13:20:16
\n", + "
\n", + "

68 rows × 27 columns

" + ], + "text/plain": [ + " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", + "8 960.869736 True ... 0 2018-01-01 12:09:35\n", + "26 975.812632 True ... 0 2018-01-01 15:38:32\n", + "311 946.358410 True ... 0 2018-01-01 11:51:12\n", + "651 975.383864 True ... 2 2018-01-03 21:13:17\n", + "950 907.836523 True ... 2 2018-01-03 05:14:51\n", + "... ... ... ... ... ...\n", + "12820 909.973606 True ... 5 2018-02-10 05:11:35\n", + "12906 983.429244 True ... 6 2018-02-11 06:19:58\n", + "12918 1136.678150 True ... 6 2018-02-11 16:03:10\n", + "12919 1105.211803 True ... 6 2018-02-11 05:36:05\n", + "13013 1055.350213 True ... 6 2018-02-11 13:20:16\n", + "\n", + "[68 rows x 27 columns]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled == True')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Boolean indexing query" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:46.417274Z", + "iopub.status.busy": "2021-12-15T20:24:46.416156Z", + "iopub.status.idle": "2021-12-15T20:24:46.421120Z", + "shell.execute_reply": "2021-12-15T20:24:46.421781Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceCancelled...dayOfWeektimestamp
8960.869736True...02018-01-01 12:09:35
26975.812632True...02018-01-01 15:38:32
311946.358410True...02018-01-01 11:51:12
651975.383864True...22018-01-03 21:13:17
950907.836523True...22018-01-03 05:14:51
..................
12820909.973606True...52018-02-10 05:11:35
12906983.429244True...62018-02-11 06:19:58
129181136.678150True...62018-02-11 16:03:10
129191105.211803True...62018-02-11 05:36:05
130131055.350213True...62018-02-11 13:20:16
\n", + "

68 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", + "8 960.869736 True ... 0 2018-01-01 12:09:35\n", + "26 975.812632 True ... 0 2018-01-01 15:38:32\n", + "311 946.358410 True ... 0 2018-01-01 11:51:12\n", + "651 975.383864 True ... 2 2018-01-03 21:13:17\n", + "950 907.836523 True ... 2 2018-01-03 05:14:51\n", + "... ... ... ... ... ...\n", + "12820 909.973606 True ... 5 2018-02-10 05:11:35\n", + "12906 983.429244 True ... 6 2018-02-11 06:19:58\n", + "12918 1136.678150 True ... 6 2018-02-11 16:03:10\n", + "12919 1105.211803 True ... 6 2018-02-11 05:36:05\n", + "13013 1055.350213 True ... 6 2018-02-11 13:20:16\n", + "\n", + "[68 rows x 27 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights[(pd_flights.Carrier==\"Kibana Airlines\") & \n", + " (pd_flights.AvgTicketPrice > 900.0) &\n", + " (pd_flights.Cancelled == True)]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:46.431697Z", + "iopub.status.busy": "2021-12-15T20:24:46.431114Z", + "iopub.status.idle": "2021-12-15T20:24:48.414684Z", + "shell.execute_reply": "2021-12-15T20:24:48.415148Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceCancelled...dayOfWeektimestamp
8960.869736True...02018-01-01 12:09:35
26975.812632True...02018-01-01 15:38:32
311946.358410True...02018-01-01 11:51:12
651975.383864True...22018-01-03 21:13:17
950907.836523True...22018-01-03 05:14:51
..................
12820909.973606True...52018-02-10 05:11:35
12906983.429244True...62018-02-11 06:19:58
129181136.678150True...62018-02-11 16:03:10
129191105.211803True...62018-02-11 05:36:05
130131055.350213True...62018-02-11 13:20:16
\n", + "
\n", + "

68 rows × 27 columns

" + ], + "text/plain": [ + " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", + "8 960.869736 True ... 0 2018-01-01 12:09:35\n", + "26 975.812632 True ... 0 2018-01-01 15:38:32\n", + "311 946.358410 True ... 0 2018-01-01 11:51:12\n", + "651 975.383864 True ... 2 2018-01-03 21:13:17\n", + "950 907.836523 True ... 2 2018-01-03 05:14:51\n", + "... ... ... ... ... ...\n", + "12820 909.973606 True ... 5 2018-02-10 05:11:35\n", + "12906 983.429244 True ... 6 2018-02-11 06:19:58\n", + "12918 1136.678150 True ... 6 2018-02-11 16:03:10\n", + "12919 1105.211803 True ... 6 2018-02-11 05:36:05\n", + "13013 1055.350213 True ... 6 2018-02-11 13:20:16\n", + "\n", + "[68 rows x 27 columns]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights[(oml_flights.Carrier==\"Kibana Airlines\") & \n", + " (oml_flights.AvgTicketPrice > 900.0) &\n", + " (oml_flights.Cancelled == True)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Function application, GroupBy & window" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.aggs" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:48.421015Z", + "iopub.status.busy": "2021-12-15T20:24:48.420431Z", + "iopub.status.idle": "2021-12-15T20:24:48.428382Z", + "shell.execute_reply": "2021-12-15T20:24:48.428690Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DistanceKilometersAvgTicketPrice
sum9.261629e+078.204365e+06
min0.000000e+001.000205e+02
std4.578438e+032.663969e+02
\n", + "
" + ], + "text/plain": [ + " DistanceKilometers AvgTicketPrice\n", + "sum 9.261629e+07 8.204365e+06\n", + "min 0.000000e+00 1.000205e+02\n", + "std 4.578438e+03 2.663969e+02" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`opensearch_py_ml.DataFrame.aggregate` currently only supports numeric columns" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:48.433082Z", + "iopub.status.busy": "2021-12-15T20:24:48.432716Z", + "iopub.status.idle": "2021-12-15T20:24:48.541272Z", + "shell.execute_reply": "2021-12-15T20:24:48.542346Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DistanceKilometersAvgTicketPrice
sum9.261629e+078.204365e+06
min0.000000e+001.000205e+02
std4.578614e+032.664071e+02
\n", + "
" + ], + "text/plain": [ + " DistanceKilometers AvgTicketPrice\n", + "sum 9.261629e+07 8.204365e+06\n", + "min 0.000000e+00 1.000205e+02\n", + "std 4.578614e+03 2.664071e+02" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Computations / descriptive stats" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.count" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:48.574692Z", + "iopub.status.busy": "2021-12-15T20:24:48.574008Z", + "iopub.status.idle": "2021-12-15T20:24:48.580240Z", + "shell.execute_reply": "2021-12-15T20:24:48.579845Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "AvgTicketPrice 13059\n", + "Cancelled 13059\n", + "Carrier 13059\n", + "Dest 13059\n", + "DestAirportID 13059\n", + " ... \n", + "OriginLocation 13059\n", + "OriginRegion 13059\n", + "OriginWeather 13059\n", + "dayOfWeek 13059\n", + "timestamp 13059\n", + "Length: 27, dtype: int64" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:48.583948Z", + "iopub.status.busy": "2021-12-15T20:24:48.583565Z", + "iopub.status.idle": "2021-12-15T20:24:50.642201Z", + "shell.execute_reply": "2021-12-15T20:24:50.643158Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "AvgTicketPrice 13059\n", + "Cancelled 13059\n", + "Carrier 13059\n", + "Dest 13059\n", + "DestAirportID 13059\n", + " ... 
\n", + "OriginLocation 13059\n", + "OriginRegion 13059\n", + "OriginWeather 13059\n", + "dayOfWeek 13059\n", + "timestamp 13059\n", + "Length: 27, dtype: int64" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.describe" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:50.655939Z", + "iopub.status.busy": "2021-12-15T20:24:50.654654Z", + "iopub.status.idle": "2021-12-15T20:24:50.702251Z", + "shell.execute_reply": "2021-12-15T20:24:50.701757Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceDistanceKilometers...FlightTimeMindayOfWeek
count13059.00000013059.000000...13059.00000013059.000000
mean628.2536897092.142455...511.1278422.835975
std266.3968614578.438497...334.7539521.939439
min100.0205280.000000...0.0000000.000000
25%409.8938162459.705673...252.3331921.000000
50%640.5566687610.330866...503.0451703.000000
75%842.1854709736.637600...720.4160364.000000
max1199.72905319881.482315...1902.9020326.000000
\n", + "

8 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n", + "count 13059.000000 13059.000000 ... 13059.000000 13059.000000\n", + "mean 628.253689 7092.142455 ... 511.127842 2.835975\n", + "std 266.396861 4578.438497 ... 334.753952 1.939439\n", + "min 100.020528 0.000000 ... 0.000000 0.000000\n", + "25% 409.893816 2459.705673 ... 252.333192 1.000000\n", + "50% 640.556668 7610.330866 ... 503.045170 3.000000\n", + "75% 842.185470 9736.637600 ... 720.416036 4.000000\n", + "max 1199.729053 19881.482315 ... 1902.902032 6.000000\n", + "\n", + "[8 rows x 7 columns]" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Values returned from `opensearch_py_ml.DataFrame.describe` may differ slightly from pandas because they are computed by OpenSearch aggregations." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:50.710321Z", + "iopub.status.busy": "2021-12-15T20:24:50.709905Z", + "iopub.status.idle": "2021-12-15T20:24:50.912876Z", + "shell.execute_reply": "2021-12-15T20:24:50.913469Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceCancelled...FlightTimeMindayOfWeek
count13059.00000013059.000000...13059.00000013059.000000
mean628.2536890.128494...511.1278422.835975
std266.4070610.334664...334.7667701.939513
min100.0205310.000000...0.0000000.000000
25%410.0089180.000000...251.9387101.000000
50%640.3872850.000000...503.1489753.000000
75%842.2134900.000000...720.5057054.000000
max1199.7290041.000000...1902.9019786.000000
\n", + "

8 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " AvgTicketPrice Cancelled ... FlightTimeMin dayOfWeek\n", + "count 13059.000000 13059.000000 ... 13059.000000 13059.000000\n", + "mean 628.253689 0.128494 ... 511.127842 2.835975\n", + "std 266.407061 0.334664 ... 334.766770 1.939513\n", + "min 100.020531 0.000000 ... 0.000000 0.000000\n", + "25% 410.008918 0.000000 ... 251.938710 1.000000\n", + "50% 640.387285 0.000000 ... 503.148975 3.000000\n", + "75% 842.213490 0.000000 ... 720.505705 4.000000\n", + "max 1199.729004 1.000000 ... 1902.901978 6.000000\n", + "\n", + "[8 rows x 9 columns]" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "oml_flights.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.info" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:50.944997Z", + "iopub.status.busy": "2021-12-15T20:24:50.918277Z", + "iopub.status.idle": "2021-12-15T20:24:50.958394Z", + "shell.execute_reply": "2021-12-15T20:24:50.958030Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 13059 entries, 0 to 13058\n", + "Data columns (total 27 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 AvgTicketPrice 13059 non-null float64 \n", + " 1 Cancelled 13059 non-null bool \n", + " 2 Carrier 13059 non-null object \n", + " 3 Dest 13059 non-null object \n", + " 4 DestAirportID 13059 non-null object \n", + " 5 DestCityName 13059 non-null object \n", + " 6 DestCountry 13059 non-null object \n", + " 7 DestLocation 13059 non-null object \n", + " 8 DestRegion 13059 non-null object \n", + " 9 DestWeather 13059 non-null object \n", + " 10 DistanceKilometers 13059 non-null float64 \n", + " 11 DistanceMiles 13059 non-null float64 \n", + " 12 
FlightDelay 13059 non-null bool \n", + " 13 FlightDelayMin 13059 non-null int64 \n", + " 14 FlightDelayType 13059 non-null object \n", + " 15 FlightNum 13059 non-null object \n", + " 16 FlightTimeHour 13059 non-null float64 \n", + " 17 FlightTimeMin 13059 non-null float64 \n", + " 18 Origin 13059 non-null object \n", + " 19 OriginAirportID 13059 non-null object \n", + " 20 OriginCityName 13059 non-null object \n", + " 21 OriginCountry 13059 non-null object \n", + " 22 OriginLocation 13059 non-null object \n", + " 23 OriginRegion 13059 non-null object \n", + " 24 OriginWeather 13059 non-null object \n", + " 25 dayOfWeek 13059 non-null int64 \n", + " 26 timestamp 13059 non-null datetime64[ns]\n", + "dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n", + "memory usage: 3.1+ MB\n" + ] + } + ], + "source": [ + "pd_flights.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:50.961762Z", + "iopub.status.busy": "2021-12-15T20:24:50.961398Z", + "iopub.status.idle": "2021-12-15T20:24:55.507758Z", + "shell.execute_reply": "2021-12-15T20:24:55.507382Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 13059 entries, 0 to 13058\n", + "Data columns (total 27 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 AvgTicketPrice 13059 non-null float64 \n", + " 1 Cancelled 13059 non-null bool \n", + " 2 Carrier 13059 non-null object \n", + " 3 Dest 13059 non-null object \n", + " 4 DestAirportID 13059 non-null object \n", + " 5 DestCityName 13059 non-null object \n", + " 6 DestCountry 13059 non-null object \n", + " 7 DestLocation 13059 non-null object \n", + " 8 DestRegion 13059 non-null object \n", + " 9 DestWeather 13059 non-null object \n", + " 10 DistanceKilometers 13059 non-null float64 \n", + " 11 DistanceMiles 13059 non-null 
float64 \n", + " 12 FlightDelay 13059 non-null bool \n", + " 13 FlightDelayMin 13059 non-null int64 \n", + " 14 FlightDelayType 13059 non-null object \n", + " 15 FlightNum 13059 non-null object \n", + " 16 FlightTimeHour 13059 non-null float64 \n", + " 17 FlightTimeMin 13059 non-null float64 \n", + " 18 Origin 13059 non-null object \n", + " 19 OriginAirportID 13059 non-null object \n", + " 20 OriginCityName 13059 non-null object \n", + " 21 OriginCountry 13059 non-null object \n", + " 22 OriginLocation 13059 non-null object \n", + " 23 OriginRegion 13059 non-null object \n", + " 24 OriginWeather 13059 non-null object \n", + " 25 dayOfWeek 13059 non-null int64 \n", + " 26 timestamp 13059 non-null datetime64[ns]\n", + "dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n", + "memory usage: 64.000 bytes\n", + "OpenSearch storage usage: 10.714 MB\n" + ] + } + ], + "source": [ + "oml_flights.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.max, DataFrame.min, DataFrame.mean, DataFrame.sum" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### max" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:55.511067Z", + "iopub.status.busy": "2021-12-15T20:24:55.510706Z", + "iopub.status.idle": "2021-12-15T20:24:55.515166Z", + "shell.execute_reply": "2021-12-15T20:24:55.514795Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "AvgTicketPrice 1199.729053\n", + "Cancelled True\n", + "DistanceKilometers 19881.482315\n", + "DistanceMiles 12353.780369\n", + "FlightDelay True\n", + "FlightDelayMin 360\n", + "FlightTimeHour 31.715034\n", + "FlightTimeMin 1902.902032\n", + "dayOfWeek 6\n", + "dtype: object" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.max(numeric_only=True)" + ] + 
}, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`opensearch_py_ml.DataFrame.max,min,mean,sum` only aggregate numeric columns" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:55.521502Z", + "iopub.status.busy": "2021-12-15T20:24:55.521124Z", + "iopub.status.idle": "2021-12-15T20:24:55.615898Z", + "shell.execute_reply": "2021-12-15T20:24:55.614418Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "AvgTicketPrice 1199.729004\n", + "Cancelled 1.000000\n", + "DistanceKilometers 19881.482422\n", + "DistanceMiles 12353.780273\n", + "FlightDelay 1.000000\n", + "FlightDelayMin 360.000000\n", + "FlightTimeHour 31.715034\n", + "FlightTimeMin 1902.901978\n", + "dayOfWeek 6.000000\n", + "dtype: float64" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.max(numeric_only=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### min" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:55.629444Z", + "iopub.status.busy": "2021-12-15T20:24:55.627340Z", + "iopub.status.idle": "2021-12-15T20:24:55.638818Z", + "shell.execute_reply": "2021-12-15T20:24:55.637856Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "AvgTicketPrice 100.020528\n", + "Cancelled False\n", + "DistanceKilometers 0.0\n", + "DistanceMiles 0.0\n", + "FlightDelay False\n", + "FlightDelayMin 0\n", + "FlightTimeHour 0.0\n", + "FlightTimeMin 0.0\n", + "dayOfWeek 0\n", + "dtype: object" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.min(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + 
"execution": { + "iopub.execute_input": "2021-12-15T20:24:55.649953Z", + "iopub.status.busy": "2021-12-15T20:24:55.644304Z", + "iopub.status.idle": "2021-12-15T20:24:55.731911Z", + "shell.execute_reply": "2021-12-15T20:24:55.731393Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "AvgTicketPrice 100.020531\n", + "Cancelled 0.000000\n", + "DistanceKilometers 0.000000\n", + "DistanceMiles 0.000000\n", + "FlightDelay 0.000000\n", + "FlightDelayMin 0.000000\n", + "FlightTimeHour 0.000000\n", + "FlightTimeMin 0.000000\n", + "dayOfWeek 0.000000\n", + "dtype: float64" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.min(numeric_only=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### mean" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:55.735976Z", + "iopub.status.busy": "2021-12-15T20:24:55.735398Z", + "iopub.status.idle": "2021-12-15T20:24:55.740446Z", + "shell.execute_reply": "2021-12-15T20:24:55.740037Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "AvgTicketPrice 628.253689\n", + "Cancelled 0.128494\n", + "DistanceKilometers 7092.142455\n", + "DistanceMiles 4406.853013\n", + "FlightDelay 0.251168\n", + "FlightDelayMin 47.335171\n", + "FlightTimeHour 8.518797\n", + "FlightTimeMin 511.127842\n", + "dayOfWeek 2.835975\n", + "dtype: float64" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.mean(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:55.746976Z", + "iopub.status.busy": "2021-12-15T20:24:55.746605Z", + "iopub.status.idle": "2021-12-15T20:24:55.852359Z", + 
"shell.execute_reply": "2021-12-15T20:24:55.850558Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "AvgTicketPrice 628.253689\n", + "Cancelled 0.128494\n", + "DistanceKilometers 7092.142457\n", + "DistanceMiles 4406.853010\n", + "FlightDelay 0.251168\n", + "FlightDelayMin 47.335171\n", + "FlightTimeHour 8.518797\n", + "FlightTimeMin 511.127842\n", + "dayOfWeek 2.835975\n", + "dtype: float64" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.mean(numeric_only=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### sum" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:55.867747Z", + "iopub.status.busy": "2021-12-15T20:24:55.865996Z", + "iopub.status.idle": "2021-12-15T20:24:55.882127Z", + "shell.execute_reply": "2021-12-15T20:24:55.883534Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "AvgTicketPrice 8.204365e+06\n", + "Cancelled 1.678000e+03\n", + "DistanceKilometers 9.261629e+07\n", + "DistanceMiles 5.754909e+07\n", + "FlightDelay 3.280000e+03\n", + "FlightDelayMin 6.181500e+05\n", + "FlightTimeHour 1.112470e+05\n", + "FlightTimeMin 6.674818e+06\n", + "dayOfWeek 3.703500e+04\n", + "dtype: float64" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.sum(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:55.906057Z", + "iopub.status.busy": "2021-12-15T20:24:55.904964Z", + "iopub.status.idle": "2021-12-15T20:24:55.997011Z", + "shell.execute_reply": "2021-12-15T20:24:55.997419Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + 
"AvgTicketPrice 8.204365e+06\n", + "Cancelled 1.678000e+03\n", + "DistanceKilometers 9.261629e+07\n", + "DistanceMiles 5.754909e+07\n", + "FlightDelay 3.280000e+03\n", + "FlightDelayMin 6.181500e+05\n", + "FlightTimeHour 1.112470e+05\n", + "FlightTimeMin 6.674818e+06\n", + "dayOfWeek 3.703500e+04\n", + "dtype: float64" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.sum(numeric_only=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.nunique" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:56.003863Z", + "iopub.status.busy": "2021-12-15T20:24:56.003311Z", + "iopub.status.idle": "2021-12-15T20:24:56.012352Z", + "shell.execute_reply": "2021-12-15T20:24:56.012798Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Carrier 4\n", + "Origin 156\n", + "Dest 156\n", + "dtype: int64" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights[['Carrier', 'Origin', 'Dest']].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:56.018633Z", + "iopub.status.busy": "2021-12-15T20:24:56.018038Z", + "iopub.status.idle": "2021-12-15T20:24:56.109194Z", + "shell.execute_reply": "2021-12-15T20:24:56.108015Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Carrier 4\n", + "Origin 156\n", + "Dest 156\n", + "dtype: int64" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights[['Carrier', 'Origin', 'Dest']].nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame.drop" + ] + }, + { + "cell_type": "code", + 
"execution_count": 60, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:56.124377Z", + "iopub.status.busy": "2021-12-15T20:24:56.123206Z", + "iopub.status.idle": "2021-12-15T20:24:56.145226Z", + "shell.execute_reply": "2021-12-15T20:24:56.145628Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CarrierDestRegion...dayOfWeektimestamp
0Kibana AirlinesSE-BD...02018-01-01 00:00:00
1Logstash AirwaysIT-34...02018-01-01 18:27:00
2Logstash AirwaysIT-34...02018-01-01 17:11:14
3Kibana AirlinesIT-34...02018-01-01 10:33:28
4Kibana AirlinesSE-BD...02018-01-01 05:13:00
..................
13054Logstash AirwaysSE-BD...62018-02-11 20:42:25
13055Logstash AirwaysCH-ZH...62018-02-11 01:41:57
13056Logstash AirwaysRU-AMU...62018-02-11 04:09:27
13057JetBeatsSE-BD...62018-02-11 08:28:21
13058JetBeatsUS-DC...62018-02-11 14:54:34
\n", + "

13059 rows × 20 columns

\n", + "
" + ], + "text/plain": [ + " Carrier DestRegion ... dayOfWeek timestamp\n", + "0 Kibana Airlines SE-BD ... 0 2018-01-01 00:00:00\n", + "1 Logstash Airways IT-34 ... 0 2018-01-01 18:27:00\n", + "2 Logstash Airways IT-34 ... 0 2018-01-01 17:11:14\n", + "3 Kibana Airlines IT-34 ... 0 2018-01-01 10:33:28\n", + "4 Kibana Airlines SE-BD ... 0 2018-01-01 05:13:00\n", + "... ... ... ... ... ...\n", + "13054 Logstash Airways SE-BD ... 6 2018-02-11 20:42:25\n", + "13055 Logstash Airways CH-ZH ... 6 2018-02-11 01:41:57\n", + "13056 Logstash Airways RU-AMU ... 6 2018-02-11 04:09:27\n", + "13057 JetBeats SE-BD ... 6 2018-02-11 08:28:21\n", + "13058 JetBeats US-DC ... 6 2018-02-11 14:54:34\n", + "\n", + "[13059 rows x 20 columns]" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd_flights.drop(columns=['AvgTicketPrice', \n", + " 'Cancelled', \n", + " 'DestLocation',\n", + " 'Dest', \n", + " 'DestAirportID', \n", + " 'DestCityName', \n", + " 'DestCountry'])" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:24:56.151182Z", + "iopub.status.busy": "2021-12-15T20:24:56.150754Z", + "iopub.status.idle": "2021-12-15T20:25:00.584497Z", + "shell.execute_reply": "2021-12-15T20:25:00.583176Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CarrierDestRegion...dayOfWeektimestamp
0Kibana AirlinesSE-BD...02018-01-01 00:00:00
1Logstash AirwaysIT-34...02018-01-01 18:27:00
2Logstash AirwaysIT-34...02018-01-01 17:11:14
3Kibana AirlinesIT-34...02018-01-01 10:33:28
4Kibana AirlinesSE-BD...02018-01-01 05:13:00
..................
13054Logstash AirwaysSE-BD...62018-02-11 20:42:25
13055Logstash AirwaysCH-ZH...62018-02-11 01:41:57
13056Logstash AirwaysRU-AMU...62018-02-11 04:09:27
13057JetBeatsSE-BD...62018-02-11 08:28:21
13058JetBeatsUS-DC...62018-02-11 14:54:34
\n", + "
\n", + "

13059 rows × 20 columns

" + ], + "text/plain": [ + " Carrier DestRegion ... dayOfWeek timestamp\n", + "0 Kibana Airlines SE-BD ... 0 2018-01-01 00:00:00\n", + "1 Logstash Airways IT-34 ... 0 2018-01-01 18:27:00\n", + "2 Logstash Airways IT-34 ... 0 2018-01-01 17:11:14\n", + "3 Kibana Airlines IT-34 ... 0 2018-01-01 10:33:28\n", + "4 Kibana Airlines SE-BD ... 0 2018-01-01 05:13:00\n", + "... ... ... ... ... ...\n", + "13054 Logstash Airways SE-BD ... 6 2018-02-11 20:42:25\n", + "13055 Logstash Airways CH-ZH ... 6 2018-02-11 01:41:57\n", + "13056 Logstash Airways RU-AMU ... 6 2018-02-11 04:09:27\n", + "13057 JetBeats SE-BD ... 6 2018-02-11 08:28:21\n", + "13058 JetBeats US-DC ... 6 2018-02-11 14:54:34\n", + "\n", + "[13059 rows x 20 columns]" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oml_flights.drop(columns=['AvgTicketPrice', \n", + " 'Cancelled', \n", + " 'DestLocation',\n", + " 'Dest', \n", + " 'DestAirportID', \n", + " 'DestCityName', \n", + " 'DestCountry'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plotting" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:00.608502Z", + "iopub.status.busy": "2021-12-15T20:25:00.607311Z", + "iopub.status.idle": "2021-12-15T20:25:01.255494Z", + "shell.execute_reply": "2021-12-15T20:25:01.255790Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "pd_flights.select_dtypes(include=np.number).hist(figsize=[10,10])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:01.270515Z", + "iopub.status.busy": "2021-12-15T20:25:01.269865Z", + "iopub.status.idle": "2021-12-15T20:25:02.205574Z", + "shell.execute_reply": "2021-12-15T20:25:02.205194Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oml_flights.select_dtypes(include=np.number).hist(figsize=[10,10])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Opensearch utilities" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:02.211768Z", + "iopub.status.busy": "2021-12-15T20:25:02.211404Z", + "iopub.status.idle": "2021-12-15T20:25:02.297769Z", + "shell.execute_reply": "2021-12-15T20:25:02.295526Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [], + "source": [ + "oml_flights2 = oml_flights[(oml_flights.OriginAirportID == 'AMS') & (oml_flights.FlightDelayMin > 60)]\n", + "oml_flights2 = oml_flights2[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']]\n", + "oml_flights2 = oml_flights2.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:02.319861Z", + "iopub.status.busy": "2021-12-15T20:25:02.305735Z", + "iopub.status.idle": "2021-12-15T20:25:02.325635Z", + "shell.execute_reply": "2021-12-15T20:25:02.326869Z" + }, + "pycharm": { + "is_executing": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "os_index_pattern: flights\n", + "Index:\n", + " os_index_field: _id\n", + " is_source_field: False\n", + "Mappings:\n", + " capabilities:\n", + " os_field_name is_source os_dtype os_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_os_field_name\n", + "timestamp timestamp True date strict_date_hour_minute_second datetime64[ns] True True False timestamp\n", + "OriginAirportID OriginAirportID True keyword None object True True False OriginAirportID\n", + "DestAirportID DestAirportID True keyword None object True True False DestAirportID\n", + "FlightDelayMin 
FlightDelayMin True integer None int64 True True False FlightDelayMin\n", + "Operations:\n", + " tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))]\n", + " size: 5\n", + " sort_params: {'_doc': 'desc'}\n", + " _source: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']\n", + " body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}}\n", + " post_processing: [('sort_index')]\n", + "\n" + ] + } + ], + "source": [ + "print(oml_flights2.os_info())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst new file mode 100644 index 00000000..286f3d0d --- /dev/null +++ b/docs/source/examples/index.rst @@ -0,0 +1,11 @@ +.. _examples: + +======== +Examples +======== + +.. 
toctree:: + :maxdepth: 1 + + demo_notebook + online_retail_analysis diff --git a/docs/source/examples/online_retail_analysis.ipynb b/docs/source/examples/online_retail_analysis.ipynb new file mode 100644 index 00000000..4d4aa497 --- /dev/null +++ b/docs/source/examples/online_retail_analysis.ipynb @@ -0,0 +1,1707 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Online Retail analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 0: Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# import this to stop opensearch-py-ml from yelling every time a DataFrame connection made\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:06.764412Z", + "iopub.status.busy": "2021-12-15T20:25:06.755567Z", + "iopub.status.idle": "2021-12-15T20:25:07.316950Z", + "shell.execute_reply": "2021-12-15T20:25:07.316561Z" + } + }, + "outputs": [], + "source": [ + "# imports to demonstrate DataFrame support\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import opensearch_py_ml as oml\n", + "from opensearchpy import OpenSearch\n", + "\n", + "# Import standard test settings for consistent results\n", + "from opensearch_py_ml.conftest import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Setup clients" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "CLUSTER_URL = 'https://localhost:9200'\n", + "\n", + "def get_os_client(cluster_url = CLUSTER_URL,\n", + " 
username='admin',\n", + " password='admin'):\n", + " '''\n", + " Get OpenSearch client\n", + " :param cluster_url: cluster URL like https://ml-te-netwo-1s12ba42br23v-ff1736fa7db98ff2.elb.us-west-2.amazonaws.com:443\n", + " :return: OpenSearch client\n", + " '''\n", + " client = OpenSearch(\n", + " hosts=[cluster_url],\n", + " http_auth=(username, password),\n", + " verify_certs=False\n", + " )\n", + " return client" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "client = get_os_client()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Getting Started\n", + "\n", + "To get started, let's create an `opensearch_py_ml.DataFrame` by reading a csv file. This creates and populates the \n", + "`online-retail` index in the local Opensearch cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:07.324283Z", + "iopub.status.busy": "2021-12-15T20:25:07.323764Z", + "iopub.status.idle": "2021-12-15T20:25:16.241379Z", + "shell.execute_reply": "2021-12-15T20:25:16.241877Z" + } + }, + "outputs": [], + "source": [ + "df = oml.csv_to_opensearch(\"data/online-retail.csv.gz\",\n", + " os_client=client, \n", + " os_dest_index='online-retail', \n", + " es_if_exists='replace', \n", + " os_dropna=True,\n", + " es_refresh=True,\n", + " compression='gzip',\n", + " index_col=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we see that the `\"_id\"` field was used to index our data frame. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:16.246737Z", + "iopub.status.busy": "2021-12-15T20:25:16.244084Z", + "iopub.status.idle": "2021-12-15T20:25:16.250080Z", + "shell.execute_reply": "2021-12-15T20:25:16.250410Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'_id'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.index.os_index_field" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we can check which field from opensearch are available to our opensearch_py_ml data frame. `columns` is available as a parameter when instantiating the data frame which allows one to choose only a subset of fields from your index to be included in the data frame. Since we didn't set this parameter, we have access to all fields." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:16.254703Z", + "iopub.status.busy": "2021-12-15T20:25:16.254060Z", + "iopub.status.idle": "2021-12-15T20:25:16.256567Z", + "shell.execute_reply": "2021-12-15T20:25:16.256138Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Country', 'CustomerID', 'Description', 'InvoiceDate', 'InvoiceNo', 'Quantity', 'StockCode',\n", + " 'UnitPrice'],\n", + " dtype='object')" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's see the data types of our fields. Running `df.dtypes`, we can see that opensearch field types are mapped to pandas field types." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:16.261335Z", + "iopub.status.busy": "2021-12-15T20:25:16.260762Z", + "iopub.status.idle": "2021-12-15T20:25:16.263024Z", + "shell.execute_reply": "2021-12-15T20:25:16.263323Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Country object\n", + "CustomerID float64\n", + "Description object\n", + "InvoiceDate object\n", + "InvoiceNo object\n", + "Quantity int64\n", + "StockCode object\n", + "UnitPrice float64\n", + "dtype: object" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also offer a `.os_info()` data frame method that shows all info about the underlying index. It also contains information about operations being passed from data frame methods to opensearch. More on this later." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:16.266245Z", + "iopub.status.busy": "2021-12-15T20:25:16.265860Z", + "iopub.status.idle": "2021-12-15T20:25:16.271135Z", + "shell.execute_reply": "2021-12-15T20:25:16.270816Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "os_index_pattern: online-retail\n", + "Index:\n", + " os_index_field: _id\n", + " is_source_field: False\n", + "Mappings:\n", + " capabilities:\n", + " os_field_name is_source os_dtype os_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_os_field_name\n", + "Country Country True keyword None object True True False Country\n", + "CustomerID CustomerID True double None float64 True True False CustomerID\n", + "Description Description True keyword None object True True False Description\n", + "InvoiceDate InvoiceDate True keyword None object True True False 
InvoiceDate\n", + "InvoiceNo InvoiceNo True keyword None object True True False InvoiceNo\n", + "Quantity Quantity True long None int64 True True False Quantity\n", + "StockCode StockCode True keyword None object True True False StockCode\n", + "UnitPrice UnitPrice True double None float64 True True False UnitPrice\n", + "Operations:\n", + " tasks: []\n", + " size: None\n", + " sort_params: None\n", + " _source: ['Country', 'CustomerID', 'Description', 'InvoiceDate', 'InvoiceNo', 'Quantity', 'StockCode', 'UnitPrice']\n", + " body: {}\n", + " post_processing: []\n", + "\n" + ] + } + ], + "source": [ + "print(df.os_info())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Selecting and Indexing Data\n", + "\n", + "Now that we understand how to create a data frame and get access to its underlying attributes, let's see how we can select subsets of our data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### head and tail\n", + "\n", + "Much like pandas, opensearch_py_ml data frames offer `.head(n)` and `.tail(n)` methods that return the first and last n rows, respectively." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:16.274779Z", + "iopub.status.busy": "2021-12-15T20:25:16.274393Z", + "iopub.status.idle": "2021-12-15T20:25:17.555325Z", + "shell.execute_reply": "2021-12-15T20:25:17.555642Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CountryCustomerID...StockCodeUnitPrice
0United Kingdom17850.0...85123A2.55
1United Kingdom17850.0...710533.39
\n", + "
\n", + "

2 rows × 8 columns

" + ], + "text/plain": [ + " Country CustomerID ... StockCode UnitPrice\n", + "0 United Kingdom 17850.0 ... 85123A 2.55\n", + "1 United Kingdom 17850.0 ... 71053 3.39\n", + "\n", + "[2 rows x 8 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:17.559534Z", + "iopub.status.busy": "2021-12-15T20:25:17.559123Z", + "iopub.status.idle": "2021-12-15T20:25:17.637500Z", + "shell.execute_reply": "2021-12-15T20:25:17.637125Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "os_index_pattern: online-retail\n", + "Index:\n", + " os_index_field: _id\n", + " is_source_field: False\n", + "Mappings:\n", + " capabilities:\n", + " os_field_name is_source os_dtype os_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_os_field_name\n", + "Country Country True keyword None object True True False Country\n", + "CustomerID CustomerID True double None float64 True True False CustomerID\n", + "Description Description True keyword None object True True False Description\n", + "InvoiceDate InvoiceDate True keyword None object True True False InvoiceDate\n", + "InvoiceNo InvoiceNo True keyword None object True True False InvoiceNo\n", + "Quantity Quantity True long None int64 True True False Quantity\n", + "StockCode StockCode True keyword None object True True False StockCode\n", + "UnitPrice UnitPrice True double None float64 True True False UnitPrice\n", + "Operations:\n", + " tasks: [('tail': ('sort_field': '_doc', 'count': 2)), ('head': ('sort_field': '_doc', 'count': 2)), ('tail': ('sort_field': '_doc', 'count': 2))]\n", + " size: 2\n", + " sort_params: {'_doc': 'desc'}\n", + " _source: ['Country', 'CustomerID', 'Description', 'InvoiceDate', 'InvoiceNo', 'Quantity', 'StockCode', 'UnitPrice']\n", + " body: 
{}\n", + " post_processing: [('sort_index'), ('head': ('count': 2)), ('tail': ('count': 2))]\n", + "\n" + ] + } + ], + "source": [ + "print(df.tail(2).head(2).tail(2).os_info())" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:17.640519Z", + "iopub.status.busy": "2021-12-15T20:25:17.640139Z", + "iopub.status.idle": "2021-12-15T20:25:18.647340Z", + "shell.execute_reply": "2021-12-15T20:25:18.646548Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CountryCustomerID...StockCodeUnitPrice
14998United Kingdom17419.0...217731.25
14999United Kingdom17419.0...221492.10
\n", + "
\n", + "

2 rows × 8 columns

" + ], + "text/plain": [ + " Country CustomerID ... StockCode UnitPrice\n", + "14998 United Kingdom 17419.0 ... 21773 1.25\n", + "14999 United Kingdom 17419.0 ... 22149 2.10\n", + "\n", + "[2 rows x 8 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.tail(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Selecting columns\n", + "\n", + "you can also pass a list of columns to select columns from the data frame in a specified order." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:18.654238Z", + "iopub.status.busy": "2021-12-15T20:25:18.653517Z", + "iopub.status.idle": "2021-12-15T20:25:19.431749Z", + "shell.execute_reply": "2021-12-15T20:25:19.431127Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CountryInvoiceDate
0United Kingdom2010-12-01 08:26:00
1United Kingdom2010-12-01 08:26:00
2United Kingdom2010-12-01 08:26:00
3United Kingdom2010-12-01 08:26:00
4United Kingdom2010-12-01 08:26:00
\n", + "
\n", + "

5 rows × 2 columns

" + ], + "text/plain": [ + " Country InvoiceDate\n", + "0 United Kingdom 2010-12-01 08:26:00\n", + "1 United Kingdom 2010-12-01 08:26:00\n", + "2 United Kingdom 2010-12-01 08:26:00\n", + "3 United Kingdom 2010-12-01 08:26:00\n", + "4 United Kingdom 2010-12-01 08:26:00\n", + "\n", + "[5 rows x 2 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[['Country', 'InvoiceDate']].head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Boolean Indexing\n", + "\n", + "we also allow you to filter the data frame using boolean indexing. Under the hood, a boolean index maps to a `terms` query that is then passed to opensearch to filter the index." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:19.440640Z", + "iopub.status.busy": "2021-12-15T20:25:19.439831Z", + "iopub.status.idle": "2021-12-15T20:25:20.066747Z", + "shell.execute_reply": "2021-12-15T20:25:20.067477Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'term': {'Country': 'Germany'}}\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CountryCustomerID...StockCodeUnitPrice
1109Germany12662.0...228092.95
1110Germany12662.0...843472.55
1111Germany12662.0...849450.85
1112Germany12662.0...222421.65
1113Germany12662.0...222441.95
\n", + "
\n", + "

5 rows × 8 columns

" + ], + "text/plain": [ + " Country CustomerID ... StockCode UnitPrice\n", + "1109 Germany 12662.0 ... 22809 2.95\n", + "1110 Germany 12662.0 ... 84347 2.55\n", + "1111 Germany 12662.0 ... 84945 0.85\n", + "1112 Germany 12662.0 ... 22242 1.65\n", + "1113 Germany 12662.0 ... 22244 1.95\n", + "\n", + "[5 rows x 8 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the construction of a boolean vector maps directly to an opensearch query\n", + "print(df['Country']=='Germany')\n", + "df[(df['Country']=='Germany')].head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "we can also filter the data frame using a list of values." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:20.077022Z", + "iopub.status.busy": "2021-12-15T20:25:20.076412Z", + "iopub.status.idle": "2021-12-15T20:25:21.233013Z", + "shell.execute_reply": "2021-12-15T20:25:21.234073Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'terms': {'Country': ['Germany', 'United States']}}\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CountryCustomerID...StockCodeUnitPrice
0United Kingdom17850.0...85123A2.55
1United Kingdom17850.0...710533.39
2United Kingdom17850.0...84406B2.75
3United Kingdom17850.0...84029G3.39
4United Kingdom17850.0...84029E3.39
\n", + "
\n", + "

5 rows × 8 columns

" + ], + "text/plain": [ + " Country CustomerID ... StockCode UnitPrice\n", + "0 United Kingdom 17850.0 ... 85123A 2.55\n", + "1 United Kingdom 17850.0 ... 71053 3.39\n", + "2 United Kingdom 17850.0 ... 84406B 2.75\n", + "3 United Kingdom 17850.0 ... 84029G 3.39\n", + "4 United Kingdom 17850.0 ... 84029E 3.39\n", + "\n", + "[5 rows x 8 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(df['Country'].isin(['Germany', 'United States']))\n", + "df[df['Country'].isin(['Germany', 'United Kingdom'])].head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also combine boolean vectors to further filter the data frame." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:21.245390Z", + "iopub.status.busy": "2021-12-15T20:25:21.244737Z", + "iopub.status.idle": "2021-12-15T20:25:22.358701Z", + "shell.execute_reply": "2021-12-15T20:25:22.355150Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CountryCustomerID...StockCodeUnitPrice
\n", + "
\n", + "

0 rows × 8 columns

" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Country, CustomerID, Description, InvoiceDate, InvoiceNo, Quantity, StockCode, UnitPrice]\n", + "Index: []\n", + "\n", + "[0 rows x 8 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[(df['Country']=='Germany') & (df['Quantity']>90)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using this example, let see how opensearch_py_ml translates this boolean filter to an opensearch `bool` query." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:22.383610Z", + "iopub.status.busy": "2021-12-15T20:25:22.370577Z", + "iopub.status.idle": "2021-12-15T20:25:22.390275Z", + "shell.execute_reply": "2021-12-15T20:25:22.388963Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "os_index_pattern: online-retail\n", + "Index:\n", + " os_index_field: _id\n", + " is_source_field: False\n", + "Mappings:\n", + " capabilities:\n", + " os_field_name is_source os_dtype os_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_os_field_name\n", + "Country Country True keyword None object True True False Country\n", + "CustomerID CustomerID True double None float64 True True False CustomerID\n", + "Description Description True keyword None object True True False Description\n", + "InvoiceDate InvoiceDate True keyword None object True True False InvoiceDate\n", + "InvoiceNo InvoiceNo True keyword None object True True False InvoiceNo\n", + "Quantity Quantity True long None int64 True True False Quantity\n", + "StockCode StockCode True keyword None object True True False StockCode\n", + "UnitPrice UnitPrice True double None float64 True True False UnitPrice\n", + "Operations:\n", + " tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'Country': 'Germany'}}, 
{'range': {'Quantity': {'gt': 90}}}]}}))]\n", + " size: None\n", + " sort_params: None\n", + " _source: ['Country', 'CustomerID', 'Description', 'InvoiceDate', 'InvoiceNo', 'Quantity', 'StockCode', 'UnitPrice']\n", + " body: {'query': {'bool': {'must': [{'term': {'Country': 'Germany'}}, {'range': {'Quantity': {'gt': 90}}}]}}}\n", + " post_processing: []\n", + "\n" + ] + } + ], + "source": [ + "print(df[(df['Country']=='Germany') & (df['Quantity']>90)].os_info())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregation and Descriptive Statistics\n", + "\n", + "Let's begin to ask some questions of our data and use opensearch_py_ml to get the answers." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**How many different countries are there?**" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:22.398231Z", + "iopub.status.busy": "2021-12-15T20:25:22.397459Z", + "iopub.status.idle": "2021-12-15T20:25:22.482238Z", + "shell.execute_reply": "2021-12-15T20:25:22.481338Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "16" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Country'].nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**What is the total sum of products ordered?**" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:22.492668Z", + "iopub.status.busy": "2021-12-15T20:25:22.491590Z", + "iopub.status.idle": "2021-12-15T20:25:22.580015Z", + "shell.execute_reply": "2021-12-15T20:25:22.578300Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "111960" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Quantity'].sum()" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "**Show me the sum, mean, min, and max of the Quantity and UnitPrice fields**" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:22.601432Z", + "iopub.status.busy": "2021-12-15T20:25:22.600117Z", + "iopub.status.idle": "2021-12-15T20:25:22.702450Z", + "shell.execute_reply": "2021-12-15T20:25:22.701499Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
QuantityUnitPrice
sum111960.00061548.490000
mean7.4644.103233
max2880.000950.990000
min-9360.0000.000000
\n", + "
" + ], + "text/plain": [ + " Quantity UnitPrice\n", + "sum 111960.000 61548.490000\n", + "mean 7.464 4.103233\n", + "max 2880.000 950.990000\n", + "min -9360.000 0.000000" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[['Quantity','UnitPrice']].agg(['sum', 'mean', 'max', 'min'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Give me descriptive statistics for the entire data frame**" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:22.712002Z", + "iopub.status.busy": "2021-12-15T20:25:22.711114Z", + "iopub.status.idle": "2021-12-15T20:25:22.982698Z", + "shell.execute_reply": "2021-12-15T20:25:22.981770Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerIDQuantityUnitPrice
count10729.00000015000.00000015000.000000
mean15590.7766807.4640004.103233
std1764.18959285.93011620.106214
min12347.000000-9360.0000000.000000
25%14222.6894661.0000001.250000
50%15668.0196082.0000002.510000
75%17218.8066046.4720004.212788
max18239.0000002880.000000950.990000
\n", + "
" + ], + "text/plain": [ + " CustomerID Quantity UnitPrice\n", + "count 10729.000000 15000.000000 15000.000000\n", + "mean 15590.776680 7.464000 4.103233\n", + "std 1764.189592 85.930116 20.106214\n", + "min 12347.000000 -9360.000000 0.000000\n", + "25% 14222.689466 1.000000 1.250000\n", + "50% 15668.019608 2.000000 2.510000\n", + "75% 17218.806604 6.472000 4.212788\n", + "max 18239.000000 2880.000000 950.990000" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Show me a histogram of numeric columns**" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:23.000466Z", + "iopub.status.busy": "2021-12-15T20:25:22.999571Z", + "iopub.status.idle": "2021-12-15T20:25:23.576387Z", + "shell.execute_reply": "2021-12-15T20:25:23.576703Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df[(df['Quantity']>-50) & \n", + " (df['Quantity']<50) & \n", + " (df['UnitPrice']>0) & \n", + " (df['UnitPrice']<100)][['Quantity', 'UnitPrice']].hist(figsize=[12,4], bins=30)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:23.584264Z", + "iopub.status.busy": "2021-12-15T20:25:23.583784Z", + "iopub.status.idle": "2021-12-15T20:25:24.494000Z", + "shell.execute_reply": "2021-12-15T20:25:24.493618Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAs4AAAEICAYAAABPtXIYAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy89olMNAAAACXBIWXMAAAsTAAALEwEAmpwYAAAeNUlEQVR4nO3df5Qd91nf8fcndpwEq5FJDIojuciJjKmxaEP22ElpqVwSkLEVAycNdkzBqbFOODWFVoUqQMuPklND40KMDRwRGyVgrLgmTaRYIaGUbQgEcMyP2vGPIhwllnGsGBLBKjSJzNM/7iy5bLTauXvv3bt39v06x8d7Z+6deR7Nndlnn/nOTKoKSZIkSaf2jEkHIEmSJE0DC2dJkiSpBQtnSZIkqQULZ0mSJKkFC2dJkiSpBQtnSZIkqQULZ61JSeaSvGjScUiSIMnPJ/mPq3V50jwLZ62YJNcmuT/Jp5N8PMnPJlm/AuudTfKd/dOqal1VPdrM35vkx8cdhyR1VZJKsmXBtB9J8sttPl9Vr6+q/9x8bluSIydZ1ueapsenkvxOkpe3WZ40ShbOWhFJdgE/AXwfsB54GbAZeF+SZ04wNEnSdHh7Va0DvgT4APCOJFn4piSnrXhkWjMsnDV2SZ4L/Cjw3VX1a1X1uao6DLwGeBHw2oVd34UdhyS7k/xpkr9K8mCSb+6bd22SDyR5U5JPJvlIksuaeW8E/ilwS9OpuKWZXkm2JNkJXAN8fzP/QJLvS/KrC3K4Ocmbx/VvJEldNn9MT7IrydEkTyR5Xd/8vUl+PMmZwHuAFzbH5LkkL+xfVlV9Dngr8ALg+c1nfy7JwSTHgUtP8jvlyiR/lOQvm98l25vp65Pc1sTzeBODhbcWZeGslfCPgWcD7+ifWFVzwEHg61ss40/pFcDr6RXhv5zknL75lwCPAGcDPwncliRV9YPAbwE3NMMzblgQwx7gDuAnm/k7gF8Gtic5CyDJ6cBVwNsGylqS1O8F9I7hG4HrgFuTfHH/G6rqOHAZ8GfNMXldVf1Z/3uSPAu4Fnisqp5qJr8WeCPw9+h1o/vffzG94/f3AWcBXwscbmbvBU4AW4CX0Pt99HeG9kn9LJy1Es4GnqqqEyeZ9wS9026nVFX/var+rKr+pqreDvwJcHHfWz5aVb9QVU/T60ScA2xYTrBV9QTwfuBfNJO2N/Hft5zlSZIA+BzwY81Zx4PAHHDBAJ9/TZJPAY8BLwW+uW/eu6rqt5vfEf9vweeuA26vql9v5j9eVQ8n2QB8I/C9VXW8qo4CP0WvUSKd1OmTDkBrwlPA2UlOP0n
xfE4z/5SSfDvw7+iNiwZYR68gn/fx+R+q6tPNsLd1Q8T8VuC7gF8Avg34pSGWJUld9zSw8HqVZ9Irluf9+YLfAZ9msOP0XVX1bYvMe+wUnzuX3tnNhb6sifGJvqHSz1hiWVrj7DhrJXwQ+AzwLf0Tk6yjd0puFjgOfFHf7Bf0ve/L6BWwNwDPr6qzgAeAL7goZBG1jPnvBL4qyUXAFfSGc0iSTu5jfL6xMe884KPLWNZSx+xBP/MY8OJFpn8GOLuqzmr+e25VfeUy1q81wsJZY1dVx+iNS/6ZJNuTPDPJZuAuet3mO4A/Ar4xyfOSvAD43r5FnEnvoPgJgOaCkosGCOFJehchtp7fnOq7G/gV4Per6mMDrE+S1pq3Az+UZFOSZyR5BbCD3nF0UE/Su+hvVLcrvQ14XZKva2LbmOQrmmF57wNuSvLcZt6Lk/yzEa1XHWThrBVRVT8J/ADwJuCvgI/Q6zC/orkY5JeAP6Z3wcb76B2E5z/7IHATvc71k8BW4LcHWP2bgVc3d9y4+STzbwMubO4N+s6+6W9t1uUwDUk6tR8DfofehXmfpHeR9jVV9cCgC6qqh4E7gUeb4/ILl/rMEsv7feB19MYvHwP+N71hGgDfDpwBPNjEfTe9IYTSSaVqOWdEpOE0XeMfA75mtXZzk/x94GHgBVX1l5OOR5IkTZYXB2oiquoXk5ygd6u6VVc4J3kGvYsR91k0S5IksOMsfYHmBvxP0ruoZXtVeYW1JEmycJYkSZLa8OJASZIkqYVVMcb57LPPrs2bN086DACOHz/OmWeeOekwxqKruXU1L+hubqs1r/vuu++pqlrySZZankGP9av1ezIq5je9upwbdDu/48eP8/DDDy/7WL8qCufNmzfzoQ99aNJhADA7O8u2bdsmHcZYdDW3ruYF3c1tteaVZDkPa1BLgx7rV+v3ZFTMb3p1OTfodn6zs7Nceumlyz7WO1RDkiRJamEshXOSM5N8KMkV41i+JEmStNJaFc5Jbk9yNMkDC6ZvT/JIkkNJdvfN+g/0HqcsSZIkdULbjvNeYHv/hCSnAbcClwEXAlcnuTDJK+k9uvLoCOOUJK0ynl2UtNa0ujiwqt6fZPOCyRcDh6rqUYAk+4ArgXXAmfSK6b9OcrCq/mbhMpPsBHYCbNiwgdnZ2eXmMFJzc3OrJpZR62puXc0LuptbV/OadkluB64AjlbVRX3TtwNvBk4D3lJVNzazPLsoaU0Z5q4aG4H+J6odAS6pqhsAklwLPHWyohmgqvYAewBmZmZqtVy92fUrSbuYW1fzgu7m1tW8OmAvcAvwtvkJfWcXX0nvOH9vkv30fgc8CDx75cOUpMkY2+3oqmrvuJYtSRq91XZ2setnJsxvenU5N+h2fnNzc0N9fpjC+XHg3L7Xm5pprSXZAezYsmXLEGFIksZoYmcXu35mwvymV5dzg27nN+wfBMMUzvcC5yc5j17BfBXw2kEWUFUHgAMzMzPXDxGHBMDm3feccv7hGy9foUiktaPN2cVhmiRL7dfgvi1p5bS9Hd2dwAeBC5IcSXJdVZ0AbgDeCzwE3FVVHx5k5Ul2JNlz7NixQeOWJK2Moc8uVtWBqtq5fv36kQYmSSut7V01rl5k+kHg4HJXbsdZkla9oc8uSlJX+MhtSRLg2UVJWsrY7qrRhhcHStLq4dlFSTq1iXacHfcmSZKkaeFQDUnSWDlUQ1JXWDhLksbKs4uSumKihbNdCEmSJE0LxzhLksbKJomkrnCohiRprGySSOoKC2dJkiSpBcc4S5IkSS04xlmSNFY2SSR1hUM1JEljZZNEUldYOEuSJEktWDhLkiRJLXhxoCRJktSCFwdKksbKJomkrnCohiRprGySSOoKC2dJkiSphdMnHYAkScPYvPueU84/fOPlKxSJpK6z4yxJkiS14F01JEmSpBa8q4YkaaxskkjqCodqSJLGyiaJpK6wcJYkSZJa8K4aWjOWuvIevPpekiQtzo6zJEmS1IK
FsyRJktSChbMkSZLUgoWzJEmS1IIPQJEkSZJa8AEokqSxskkiqSscqiFJGiubJJK6wsJZkiRJasHCWZIkSWrBwlmSJElqwcJZkiRJauH0SQcgTZvNu+9Z8j2Hb7x8BSKRJEkryY6zJEmS1IKFsyRJktSChbMkSZLUgoWzJEmS1MLIC+ck/yDJzye5O8l3jXr5kiRJ0iS0KpyT3J7kaJIHFkzfnuSRJIeS7Aaoqoeq6vXAa4CvGX3IkqRJs0kiaS1q23HeC2zvn5DkNOBW4DLgQuDqJBc2814F3AMcHFmkkqSxskkiSafW6j7OVfX+JJsXTL4YOFRVjwIk2QdcCTxYVfuB/UnuAX7lZMtMshPYCbBhwwZmZ2eXlcCozc3NrZpYRq2ruc3ntWvriaGX1ebfp816RvXv3PVtplVnL3AL8Lb5CX1NklcCR4B7k+yvqgebJsl3Ab80gVhb897rkkZlmAegbAQe63t9BLgkyTbgW4BncYqOc1XtAfYAzMzM1LZt24YIZXRmZ2dZLbGMWldzm8/r2ha/HJdy+JptS76nzXraLKeNrm8zrS6rrUkyNzfHrq1PD5rGskziD7mu/wHZ5fy6nBt0O7+5ubmhPj/yJwdW1Sww2+a9SXYAO7Zs2TLqMCRJozGxJsns7Cw3feD4wAEvx6j+2B1E1/+A7HJ+Xc4Nup3fsH8QDFM4Pw6c2/d6UzOttao6AByYmZm5fog4JEkrzCaJpLVomML5XuD8JOfRK5ivAl47kqikCWkzFlJaY2ySSFKj7e3o7gQ+CFyQ5EiS66rqBHAD8F7gIeCuqvrwICtPsiPJnmPHjg0atyRpZfxtkyTJGfSaJPsnHJMkTUSrwrmqrq6qc6rqmVW1qapua6YfrKovr6oXV9UbB115VR2oqp3r168f9KOSpBGzSSJJpzbyiwMlSdOpqq5eZPpBhrgvv0M1JHXFyB+5PQi7EJIkSZoWEy2cHaohSd1nk0RSV0y0cJYkdZ9NEkld4VANSZIkqYWJXhzoBSPScNrcd/rwjZevQCTS4nwAiqSucKiGJGmsHKohqSssnCVJkqQWJjpUw9N3kqTVwGFPktrwdnSSpLHyQnBJXeFQDUnSWNkkkdQVFs6SJElSCxbOkiRJUgs+AEWSJElqwYsDJUljZZNEUlc4VEOSNFY2SSR1hYWzJEmS1IKFsyRJktSChbMkSZLUgnfVkCRJklrwrhqSpLGySSKpKxyqIUkaK5skkrrCwlmSJElqwcJZkiRJasHCWZIkSWrBwlmSJElqwcJZkiRJasHCWZIkSWrBB6BIkiRJLfgAFEnSWNkkkdQVDtWQJI2VTRJJXWHhLEmSJLVg4SxJkiS1YOEsSZIktXD6pAOQpEFt3n3Pku85fOPlKxCJJGktseMsSZIktWDhLEmSJLVg4SxJkiS14BhnSZJGxPH3UrdZOEtqxYJAkrTWjaVwTvJNwOXAc4Hbqup941iPJEmStFJaj3FOcnuSo0keWDB9e5JHkhxKshugqt5ZVdcDrwe+dbQhS5ImLck3JfmFJG9P8vWTjkeSVsIgHee9wC3A2+YnJDkNuBV4JXAEuDfJ/qp6sHnLDzXzJUmrXJLbgSuAo1V1Ud/07cCbgdOAt1TVjVX1TuCdSb4YeBPQ+TOLbYYrSeq21oVzVb0/yeYFky8GDlXVowBJ9gFXJnkIuBF4T1X9wcmWl2QnsBNgw4YNzM7ODh79GMzNza2aWEatq7nN57Vr64lJh/K3fuaOdy35nq0b1y/5nqW2WZucR7XNR7muYb+LK5n3GrMXGySStKhhxzhvBB7re30EuAT4buAVwPokW6rq5xd+sKr2AHsAZmZmatu2bUOGMhqzs7OsllhGbZpzO1WnZ9fWp7npA8eZtmtdD1+zbcn3/Mwd72pyW8zSObdZTxvXtrk4sOW6hv0ujjIWfd6oGyTN+5fdJJmbm2PX1qcHymEazP8bdLWZMa/L+XU5N+h2fnNzc0N9fiyVRlXdDNy81Pu
S7AB2bNmyZRxhSJKGt+wGCQzXJJmdnV3iD8fpNP9H3TQ3M9rocn5dzg26nd+wfxAM+wCUx4Fz+15vaqa1UlUHqmrn+vVLn7KWJK0eVXVzVb20ql6/WNE8L8mOJHuOHTu2UuFJ0lgMWzjfC5yf5LwkZwBXAfuHD0uStEoM1SABmySSumOQ29HdCXwQuCDJkSTXVdUJ4AbgvcBDwF1V9eEBlmkXQpJWNxskktRoXThX1dVVdU5VPbOqNlXVbc30g1X15VX14qp64yArtwshSavHOBokzXJtkkjqhOm6DYEkaWyq6upFph8EDg6x3APAgZmZmeuXuwxJWg2GHeM8FLsQkiRJmhYTLZwdqiFJ3WeTRFJXOFRDmpA2j+/dtXUFApHGzKEakrpiooWzD0CRVoc2RbwkSWudQzUkSWPlUA1JXTHRwlmS1H02SSR1hYWzJEmS1IK3o5MkSZJacIyzJGmsbJJI6gqHakiSxsomiaSu8D7OkiStoPnbP+7aeoJrF7kV5OEbL1/JkCS1ZMdZkiRJasGLAyVJkqQWvDhQkjRWNkkkdYVDNSRJY2WTRFJXWDhLkiRJLXhXDUkrxrsJSJKmmR1nSZIkqQULZ0mSJKmFiQ7VSLID2LFly5ZJhqEJ27zIKXtJ3eCxXlJXTLRwrqoDwIGZmZnrJxmHJGl8PNYPbqmGgtcCSJPhUA1JkiSpBQtnSZIkqQULZ0mSJKkFC2dJkiSpBQtnSZIkqQULZ0mSJKmFiRbOSXYk2XPs2LFJhiFJkiQtaaKFc1UdqKqd69evn2QYkqQxskkiqSscqiFJGiubJJK6wsJZkiRJamGij9xW9y312FhJkqRpYcdZkiRJasHCWZIkSWrBoRqSJHVQm6Fyh2+8fAUikbrDjrMkSZLUgoWzJEmS1IJDNSRJWqMcziENxo6zJEmS1MLIC+ckL0pyW5K7R71sSZIkaVJaFc5Jbk9yNMkDC6ZvT/JIkkNJdgNU1aNVdd04gpUkrQ42SSStRW07znuB7f0TkpwG3ApcBlwIXJ3kwpFGJ0laMTZJJOnUWhXOVfV+4C8WTL4YONQcPD8L7AOuHHF8kqSVsxebJJK0qFRVuzcmm4F3V9VFzetXA9ur6jub1/8SuAT4YeCNwCuBt1TVf1lkeTuBnQAbNmx46b59+4bLZETm5uZYt27dpMMYi0nkdv/jx8a+jg3PgSf/euyrmYhpy23rxvWnnD//fThVXksto385w8RyMpdeeul9VTUz8Ac75CTH+pcDP1JV39C8fgPA/LE9yd1V9epTLG/Zx/q5uTk+cuzpZWay+g2zf49qPxnVuk7G36fTq8v5zc3NsWPHjmUf60d+O7qq+nPg9S3etwfYAzAzM1Pbtm0bdSjLMjs7y2qJZdQmkdu1LW51NKxdW09w0/3dvLPitOV2+Jptp5w//304VV5LLaN/OcPEotY2Ao/1vT4CXJLk+fSaJC9J8obFmiTDHOtnZ2e56QPHlxv3qjfM/j2q/WRU6zoZf59Ory7nNzs7O9Tnh/mN/Dhwbt/rTc201pLsAHZs2bJliDA0KW3u/ympm9o2ScBjvaTuGOZ2dPcC5yc5L8kZwFXA/kEWUFUHqmrn+vXLOw0kSRq7oZskHusldUXb29HdCXwQuCDJkSTXVdUJ4AbgvcBDwF1V9eFBVp5kR5I9x46NfxysJGlZhm6SSFJXtBqqUVVXLzL9IHBwuSuvqgPAgZmZmeuXuwxJ0mg0TZJtwNlJjgA/XFW3JZlvkpwG3L6cJgkO1ZhaPpZb+rzpuepIkjRWNkkk6dRG/sjtQThUQ5IkSdNiooWzF4xIUvfZJJHUFRMtnCVJ3WeTRFJXWDhLkiRJLUz04kCvtJak7vNYP3o+gEqaDMc4S5LGymO9pK5wqIYkSZLUgoWzJEmS1IJjnKeMT3CSNG081kvqCsc4S5LGymO9pK5wqIYkSZLUgoWzJEmS1IKFsyRJktSCFwdKksbKY71GxQvkNWleHChJGiuP9ZK
6wqEakiRJUgsWzpIkSVILFs6SJElSCxbOkiRJUgtTe1eNlbyy1qt4pXba7Ctae7yrhqSu8K4akqSx8lgvqSscqiFJkiS1YOEsSZIktWDhLEmSJLVg4SxJkiS1YOEsSZIktWDhLEmSJLVg4SxJkiS1MLUPQJEkTQeP9d13socf7dp6gmub6Sv5kDAfWqZx8gEokqSx8lgvqSscqiFJkiS1YOEsSZIktWDhLEmSJLVg4SxJkiS1YOEsSZIktWDhLEmSJLVg4SxJkiS1YOEsSZIktWDhLEmSJLVg4SxJkiS1cPqoF5jkTOBngc8Cs1V1x6jXIUmaLI/1ktaiVh3nJLcnOZrkgQXTtyd5JMmhJLubyd8C3F1V1wOvGnG8kqQx8VgvSafWdqjGXmB7/4QkpwG3ApcBFwJXJ7kQ2AQ81rzt6dGEKUlaAXvxWC9Ji0pVtXtjshl4d1Vd1Lx+OfAjVfUNzes3NG89Anyyqt6dZF9VXbXI8nYCOwE2bNjw0n379g0U+P2PH1vyPVs3rh9omQBzc3OsW7duRda1HG1iWcyG58CTf937eRTxDhPLKPXn1TVdze1UebX5bo5rn7z00kvvq6qZgT/YIavpWD83N8dHjnW3Ju/q/j1v0PxGte+vxLo2PAe+9Hkr83t/VAb5txv2GD0qS8W83Dpvx44dyz7WDzPGeSOf7zZA7yB6CXAzcEuSy4EDi324qvYAewBmZmZq27ZtA6382t33LPmew9cMtkyA2dlZFsYyrnUtR5tYFrNr6wluur+3yUcR7zCxjFJ/Xl3T1dxOlVeb7+Zq2ifXgIkd62dnZ7npA8eXEfJ06Or+PW/Q/Ea176/EunZtPcFrBqxbJm2Qf7thj9GjslTMy63zhjHyPbaqjgOva/PeJDuAHVu2bBl1GJKkMfJYL2ktGuZ2dI8D5/a93tRMa62qDlTVzvXrp+t0hyStIR7rJakxTOF8L3B+kvOSnAFcBewfTViSpFXCY70kNdreju5O4IPABUmOJLmuqk4ANwDvBR4C7qqqDw+y8iQ7kuw5dmx1XGQmSWuZx3pJOrVWY5yr6upFph8EDi535VV1ADgwMzNz/XKXIUkaDY/1knRqPnJbkiRJamGihbOn7ySp+zzWS+qKiRbOXmktSd3nsV5SV7R+cuBYg0g+AXx00nE0zgaemnQQY9LV3LqaF3Q3t9Wa15dV1ZdMOoiuWsaxfrV+T0bF/KZXl3ODbud3NnDmco/1q6JwXk2SfKirj9ztam5dzQu6m1tX89Jodf17Yn7Tq8u5QbfzGzY3Lw6UJEmSWrBwliRJklqwcP5CeyYdwBh1Nbeu5gXdza2reWm0uv49Mb/p1eXcoNv5DZWbY5wlSZKkFuw4S5IkSS1YOEuSJEktWDgvkGRXkkpydvM6SW5OcijJ/0ny1ZOOcRBJ/muSh5vY/0eSs/rmvaHJ65Ek3zDBMJctyfYm/kNJdk86nuVKcm6S30zyYJIPJ/meZvrzkvx6kj9p/v/Fk451uZKcluQPk7y7eX1ekt9rtt3bk5wx6Ri1enRl34a1sX9Dt/fxJGclubv5ffpQkpd3Zfsl+bfN9/KBJHcmefY0b7sktyc5muSBvmkn3VbLqfEsnPskORf4euBjfZMvA85v/tsJ/NwEQhvGrwMXVdVXAf8XeANAkguBq4CvBLYDP5vktIlFuQxNvLfS20YXAlc3eU2jE8CuqroQeBnwr5tcdgO/UVXnA7/RvJ5W3wM81Pf6J4CfqqotwCeB6yYSlVadju3bsDb2b+j2Pv5m4Neq6iuAf0gvz6nffkk2Av8GmKmqi4DT6NUG07zt9tKra/ottq0GrvEsnP+unwK+H+i/YvJK4G3V87vAWUnOmUh0y1BV76uqE83L3wU2NT9fCeyrqs9U1UeAQ8DFk4hxCBcDh6rq0ar6LLCPXl5Tp6qeqKo/aH7+K3oH5Y308nlr87a3At80kQCHlGQTcDnwluZ1gH8O3N28ZWpz01h0Zt+G7u/f0O1
9PMl64GuB2wCq6rNV9Sm6s/1OB56T5HTgi4AnmOJtV1XvB/5iweTFttXANZ6FcyPJlcDjVfXHC2ZtBB7re32kmTaN/hXwnubnLuTVhRy+QJLNwEuA3wM2VNUTzayPAxsmFdeQfpreH6V/07x+PvCpvj/qOrHtNDKd3Lehs/s3dHsfPw/4BPCLzVCUtyQ5kw5sv6p6HHgTvTPtTwDHgPvozrabt9i2GvhYs6YK5yT/sxnDs/C/K4EfAP7TpGNcjiXymn/PD9I7XXjH5CLVUpKsA34V+N6q+sv+edW7d+TU3T8yyRXA0aq6b9KxSJPUxf0b1sQ+fjrw1cDPVdVLgOMsGJYxrduvGet7Jb0/Dl4InMkXDnPolGG31ekjjGXVq6pXnGx6kq30vjR/3Du7xCbgD5JcDDwOnNv39k3NtFVjsbzmJbkWuAL4uvr8jbtXfV4tdCGHv5XkmfR+qd5RVe9oJj+Z5JyqeqI5fXR0chEu29cAr0ryjcCzgefSGy94VpLTm67GVG87jVyn9m3o9P4N3d/HjwBHqur3mtd30yucu7D9XgF8pKo+AZDkHfS2Z1e23bzFttXAx5o11XFeTFXdX1VfWlWbq2ozvZ3kq6vq48B+4NubKy9fBhzra/evekm20zt99qqq+nTfrP3AVUmeleQ8egPjf38SMQ7hXuD85urfM+hd0LB/wjEtSzMe8Dbgoar6b32z9gPf0fz8HcC7Vjq2YVXVG6pqU7NvXQX8r6q6BvhN4NXN26YyN41NZ/Zt6Pb+Dd3fx5ta4LEkFzSTvg54kG5sv48BL0vyRc33dD63Tmy7Pottq4FrPJ8ceBJJDtO7wvSp5ot0C71TF58GXldVH5pkfINIcgh4FvDnzaTfrarXN/N+kN645xP0Th2+5+RLWb2aDsdP07sS+PaqeuNkI1qeJP8E+C3gfj4/RvAH6I2DvAv4+8BHgddU1cKLHqZGkm3Av6+qK5K8iN5FX88D/hD4tqr6zATD0yrSlX0b1s7+Dd3dx5P8I3oXPp4BPAq8jl7zceq3X5IfBb6VXi3wh8B30hvnO5XbLsmdwDbgbOBJ4IeBd3KSbbWcGs/CWZIkSWrBoRqSJElSCxbOkiRJUgsWzpIkSVILFs6SJElSCxbOkiRJUgsWzpIkSVILFs6SJElSC/8fA6+SxudduSkAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df[(df['Quantity']>-50) & \n", + " (df['Quantity']<50) & \n", + " (df['UnitPrice']>0) & \n", + " (df['UnitPrice']<100)][['Quantity', 'UnitPrice']].hist(figsize=[12,4], bins=30, log=True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:24.504460Z", + "iopub.status.busy": "2021-12-15T20:25:24.504086Z", + "iopub.status.idle": "2021-12-15T20:25:26.468550Z", + "shell.execute_reply": "2021-12-15T20:25:26.466711Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CountryCustomerID...StockCodeUnitPrice
46United Kingdom13748.0...220862.55
83United Kingdom15291.0...217332.55
96United Kingdom14688.0...212120.42
102United Kingdom14688.0...85071B0.38
176United Kingdom16029.0...85099C1.65
..................
14784United Kingdom15061.0...2242310.95
14785United Kingdom15061.0...220751.45
14788United Kingdom15061.0...170380.07
14974United Kingdom14739.0...217040.72
14980United Kingdom14739.0...221781.06
\n", + "
\n", + "

258 rows × 8 columns

" + ], + "text/plain": [ + " Country CustomerID ... StockCode UnitPrice\n", + "46 United Kingdom 13748.0 ... 22086 2.55\n", + "83 United Kingdom 15291.0 ... 21733 2.55\n", + "96 United Kingdom 14688.0 ... 21212 0.42\n", + "102 United Kingdom 14688.0 ... 85071B 0.38\n", + "176 United Kingdom 16029.0 ... 85099C 1.65\n", + "... ... ... ... ... ...\n", + "14784 United Kingdom 15061.0 ... 22423 10.95\n", + "14785 United Kingdom 15061.0 ... 22075 1.45\n", + "14788 United Kingdom 15061.0 ... 17038 0.07\n", + "14974 United Kingdom 14739.0 ... 21704 0.72\n", + "14980 United Kingdom 14739.0 ... 22178 1.06\n", + "\n", + "[258 rows x 8 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.query('Quantity>50 & UnitPrice<100')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Arithmetic Operations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Numeric values" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:26.483774Z", + "iopub.status.busy": "2021-12-15T20:25:26.482084Z", + "iopub.status.idle": "2021-12-15T20:25:26.907406Z", + "shell.execute_reply": "2021-12-15T20:25:26.906448Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 6\n", + "1 6\n", + "2 8\n", + "3 6\n", + "4 6\n", + "Name: Quantity, dtype: int64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Quantity'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:26.912916Z", + "iopub.status.busy": "2021-12-15T20:25:26.910149Z", + "iopub.status.idle": "2021-12-15T20:25:27.361783Z", + "shell.execute_reply": "2021-12-15T20:25:27.362723Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2.55\n", + "1 3.39\n", + "2 2.75\n", + "3 
3.39\n", + "4 3.39\n", + "Name: UnitPrice, dtype: float64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['UnitPrice'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:27.383414Z", + "iopub.status.busy": "2021-12-15T20:25:27.374098Z", + "iopub.status.idle": "2021-12-15T20:25:27.387546Z", + "shell.execute_reply": "2021-12-15T20:25:27.388753Z" + } + }, + "outputs": [], + "source": [ + "product = df['Quantity'] * df['UnitPrice']" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:27.398754Z", + "iopub.status.busy": "2021-12-15T20:25:27.397557Z", + "iopub.status.idle": "2021-12-15T20:25:27.818022Z", + "shell.execute_reply": "2021-12-15T20:25:27.819640Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 15.30\n", + "1 20.34\n", + "2 22.00\n", + "3 20.34\n", + "4 20.34\n", + "dtype: float64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "product.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "String concatenation" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-15T20:25:27.837007Z", + "iopub.status.busy": "2021-12-15T20:25:27.836370Z", + "iopub.status.idle": "2021-12-15T20:25:29.072872Z", + "shell.execute_reply": "2021-12-15T20:25:29.074153Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 United Kingdom85123A\n", + "1 United Kingdom71053\n", + "2 United Kingdom84406B\n", + "3 United Kingdom84029G\n", + "4 United Kingdom84029E\n", + " ... 
\n", + "14995 United Kingdom72349B\n", + "14996 United Kingdom72741\n", + "14997 United Kingdom22762\n", + "14998 United Kingdom21773\n", + "14999 United Kingdom22149\n", + "Length: 15000, dtype: object" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Country'] + df['StockCode']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 00000000..9d0e8b79 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,62 @@ +.. module:: opensearch_py_ml + +********************************************************************** +Opensearch-py-ml: DataFrames and Machine Learning backed by Opensearch +********************************************************************** + +**Date**: |today| **Version**: |version| + +**Useful links**: +`Source Repository `__ | +`Issues & Ideas `__ + +Opensearch-py-ml is a Python Opensearch client for exploring and analyzing data +in Opensearch with a familiar Pandas-compatible API. + +Where possible, the package uses existing Python APIs and data structures to make it easy to switch from numpy, +pandas, and scikit-learn to their Opensearch-powered equivalents. In general, the data resides in Opensearch and +not in memory, which allows Opensearch-py-ml to access large datasets stored in Opensearch. 
+ +Installing Opensearch-py-ml +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Opensearch-py-ml can be installed from `PyPI `_ via pip: + + .. code-block:: bash + + $ python -m pip install opensearch-py-ml + +Opensearch-py-ml can also be installed from `Conda Forge `_ with Conda: + + .. code-block:: bash + + $ conda install -c conda-forge opensearch-py-ml + +Getting Started +~~~~~~~~~~~~~~~ + +If it's your first time using Opensearch-py-ml, we recommend looking through the +:doc:`examples/index` documentation for ideas on what Opensearch-py-ml is capable of. + +If you're new to Opensearch we recommend `reading the documentation `_. + +.. toctree:: + :maxdepth: 2 + :hidden: + + reference/index + development/index + examples/index + +* :doc:`reference/index` + + * :doc:`reference/supported_apis` + * :doc:`reference/dataframe` + * :doc:`reference/series` + * :doc:`reference/general_utility_functions` + * :doc:`reference/io` + +* :doc:`examples/index` + + * :doc:`examples/demo_notebook` + * :doc:`examples/online_retail_analysis` diff --git a/docs/source/reference/api/DataFrame.agg.rst b/docs/source/reference/api/DataFrame.agg.rst new file mode 100644 index 00000000..2edec28e --- /dev/null +++ b/docs/source/reference/api/DataFrame.agg.rst @@ -0,0 +1,6 @@ +DataFrame.agg +=================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.agg diff --git a/docs/source/reference/api/DataFrame.aggregate.rst b/docs/source/reference/api/DataFrame.aggregate.rst new file mode 100644 index 00000000..4a6306ed --- /dev/null +++ b/docs/source/reference/api/DataFrame.aggregate.rst @@ -0,0 +1,6 @@ +DataFrame.aggregate +========================= + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.DataFrame.aggregate diff --git a/docs/source/reference/api/DataFrame.columns.rst b/docs/source/reference/api/DataFrame.columns.rst new file mode 100644 index 00000000..2561e435 --- /dev/null +++ b/docs/source/reference/api/DataFrame.columns.rst @@ -0,0 +1,6 @@ +DataFrame.columns +======================= + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.DataFrame.columns diff --git a/docs/source/reference/api/DataFrame.count.rst b/docs/source/reference/api/DataFrame.count.rst new file mode 100644 index 00000000..f7b0a2f4 --- /dev/null +++ b/docs/source/reference/api/DataFrame.count.rst @@ -0,0 +1,6 @@ +DataFrame.count +===================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.count diff --git a/docs/source/reference/api/DataFrame.describe.rst b/docs/source/reference/api/DataFrame.describe.rst new file mode 100644 index 00000000..be4b2f29 --- /dev/null +++ b/docs/source/reference/api/DataFrame.describe.rst @@ -0,0 +1,6 @@ +DataFrame.describe +======================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.describe diff --git a/docs/source/reference/api/DataFrame.drop.rst b/docs/source/reference/api/DataFrame.drop.rst new file mode 100644 index 00000000..ee441e0f --- /dev/null +++ b/docs/source/reference/api/DataFrame.drop.rst @@ -0,0 +1,6 @@ +DataFrame.drop +==================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.drop diff --git a/docs/source/reference/api/DataFrame.dtypes.rst b/docs/source/reference/api/DataFrame.dtypes.rst new file mode 100644 index 00000000..b5ad7e3e --- /dev/null +++ b/docs/source/reference/api/DataFrame.dtypes.rst @@ -0,0 +1,6 @@ +DataFrame.dtypes +====================== + +.. currentmodule:: opensearch_py_ml + +.. 
autoattribute:: opensearch_py_ml.DataFrame.dtypes diff --git a/docs/source/reference/api/DataFrame.empty.rst b/docs/source/reference/api/DataFrame.empty.rst new file mode 100644 index 00000000..80a70d69 --- /dev/null +++ b/docs/source/reference/api/DataFrame.empty.rst @@ -0,0 +1,6 @@ +DataFrame.empty +===================== + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.DataFrame.empty diff --git a/docs/source/reference/api/DataFrame.es_match.rst b/docs/source/reference/api/DataFrame.es_match.rst new file mode 100644 index 00000000..bf141ffe --- /dev/null +++ b/docs/source/reference/api/DataFrame.es_match.rst @@ -0,0 +1,6 @@ +DataFrame.es_match +======================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.es_match diff --git a/docs/source/reference/api/DataFrame.es_query.rst b/docs/source/reference/api/DataFrame.es_query.rst new file mode 100644 index 00000000..b4d5dfe4 --- /dev/null +++ b/docs/source/reference/api/DataFrame.es_query.rst @@ -0,0 +1,6 @@ +DataFrame.es_query +======================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.es_query diff --git a/docs/source/reference/api/DataFrame.filter.rst b/docs/source/reference/api/DataFrame.filter.rst new file mode 100644 index 00000000..9c0a4fcf --- /dev/null +++ b/docs/source/reference/api/DataFrame.filter.rst @@ -0,0 +1,6 @@ +DataFrame.filter +====================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.filter diff --git a/docs/source/reference/api/DataFrame.get.rst b/docs/source/reference/api/DataFrame.get.rst new file mode 100644 index 00000000..335fe293 --- /dev/null +++ b/docs/source/reference/api/DataFrame.get.rst @@ -0,0 +1,6 @@ +DataFrame.get +=================== + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.DataFrame.get diff --git a/docs/source/reference/api/DataFrame.groupby.rst b/docs/source/reference/api/DataFrame.groupby.rst new file mode 100644 index 00000000..b3f0792f --- /dev/null +++ b/docs/source/reference/api/DataFrame.groupby.rst @@ -0,0 +1,6 @@ +DataFrame.groupby +======================= + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.groupby diff --git a/docs/source/reference/api/DataFrame.head.rst b/docs/source/reference/api/DataFrame.head.rst new file mode 100644 index 00000000..d7f1fbbf --- /dev/null +++ b/docs/source/reference/api/DataFrame.head.rst @@ -0,0 +1,6 @@ +DataFrame.head +==================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.head diff --git a/docs/source/reference/api/DataFrame.hist.rst b/docs/source/reference/api/DataFrame.hist.rst new file mode 100644 index 00000000..8bb7f064 --- /dev/null +++ b/docs/source/reference/api/DataFrame.hist.rst @@ -0,0 +1,8 @@ +DataFrame.hist +==================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.hist +.. image:: opensearch_py_ml-DataFrame-hist-1.png + diff --git a/docs/source/reference/api/DataFrame.idxmax.rst b/docs/source/reference/api/DataFrame.idxmax.rst new file mode 100644 index 00000000..efe0d46e --- /dev/null +++ b/docs/source/reference/api/DataFrame.idxmax.rst @@ -0,0 +1,6 @@ +DataFrame.idxmax +======================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.idxmax diff --git a/docs/source/reference/api/DataFrame.idxmin.rst b/docs/source/reference/api/DataFrame.idxmin.rst new file mode 100644 index 00000000..6cbb54f2 --- /dev/null +++ b/docs/source/reference/api/DataFrame.idxmin.rst @@ -0,0 +1,6 @@ +DataFrame.idxmin +======================== + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.DataFrame.idxmin diff --git a/docs/source/reference/api/DataFrame.index.rst b/docs/source/reference/api/DataFrame.index.rst new file mode 100644 index 00000000..e89e4f8c --- /dev/null +++ b/docs/source/reference/api/DataFrame.index.rst @@ -0,0 +1,6 @@ +DataFrame.index +===================== + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.DataFrame.index diff --git a/docs/source/reference/api/DataFrame.info.rst b/docs/source/reference/api/DataFrame.info.rst new file mode 100644 index 00000000..4637a50d --- /dev/null +++ b/docs/source/reference/api/DataFrame.info.rst @@ -0,0 +1,6 @@ +DataFrame.info +==================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.info diff --git a/docs/source/reference/api/DataFrame.iterrows.rst b/docs/source/reference/api/DataFrame.iterrows.rst new file mode 100644 index 00000000..7f7a27ab --- /dev/null +++ b/docs/source/reference/api/DataFrame.iterrows.rst @@ -0,0 +1,6 @@ +DataFrame.iterrows +======================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.iterrows diff --git a/docs/source/reference/api/DataFrame.itertuples.rst b/docs/source/reference/api/DataFrame.itertuples.rst new file mode 100644 index 00000000..e2398bf0 --- /dev/null +++ b/docs/source/reference/api/DataFrame.itertuples.rst @@ -0,0 +1,6 @@ +DataFrame.itertuples +========================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.itertuples diff --git a/docs/source/reference/api/DataFrame.keys.rst b/docs/source/reference/api/DataFrame.keys.rst new file mode 100644 index 00000000..6887f05a --- /dev/null +++ b/docs/source/reference/api/DataFrame.keys.rst @@ -0,0 +1,6 @@ +DataFrame.keys +==================== + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.DataFrame.keys diff --git a/docs/source/reference/api/DataFrame.mad.rst b/docs/source/reference/api/DataFrame.mad.rst new file mode 100644 index 00000000..e9db9df9 --- /dev/null +++ b/docs/source/reference/api/DataFrame.mad.rst @@ -0,0 +1,6 @@ +DataFrame.mad +=================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.mad diff --git a/docs/source/reference/api/DataFrame.max.rst b/docs/source/reference/api/DataFrame.max.rst new file mode 100644 index 00000000..25344b82 --- /dev/null +++ b/docs/source/reference/api/DataFrame.max.rst @@ -0,0 +1,6 @@ +DataFrame.max +=================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.max diff --git a/docs/source/reference/api/DataFrame.mean.rst b/docs/source/reference/api/DataFrame.mean.rst new file mode 100644 index 00000000..ac70ab62 --- /dev/null +++ b/docs/source/reference/api/DataFrame.mean.rst @@ -0,0 +1,6 @@ +DataFrame.mean +==================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.mean diff --git a/docs/source/reference/api/DataFrame.median.rst b/docs/source/reference/api/DataFrame.median.rst new file mode 100644 index 00000000..47d826cb --- /dev/null +++ b/docs/source/reference/api/DataFrame.median.rst @@ -0,0 +1,6 @@ +DataFrame.median +====================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.median diff --git a/docs/source/reference/api/DataFrame.min.rst b/docs/source/reference/api/DataFrame.min.rst new file mode 100644 index 00000000..8ada553c --- /dev/null +++ b/docs/source/reference/api/DataFrame.min.rst @@ -0,0 +1,6 @@ +DataFrame.min +=================== + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.DataFrame.min diff --git a/docs/source/reference/api/DataFrame.mode.rst b/docs/source/reference/api/DataFrame.mode.rst new file mode 100644 index 00000000..56f4d0e5 --- /dev/null +++ b/docs/source/reference/api/DataFrame.mode.rst @@ -0,0 +1,6 @@ +DataFrame.mode +==================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.mode \ No newline at end of file diff --git a/docs/source/reference/api/DataFrame.ndim.rst b/docs/source/reference/api/DataFrame.ndim.rst new file mode 100644 index 00000000..416985f6 --- /dev/null +++ b/docs/source/reference/api/DataFrame.ndim.rst @@ -0,0 +1,6 @@ +DataFrame.ndim +==================== + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.DataFrame.ndim diff --git a/docs/source/reference/api/DataFrame.nunique.rst b/docs/source/reference/api/DataFrame.nunique.rst new file mode 100644 index 00000000..b1455f62 --- /dev/null +++ b/docs/source/reference/api/DataFrame.nunique.rst @@ -0,0 +1,6 @@ +DataFrame.nunique +======================= + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.nunique diff --git a/docs/source/reference/api/DataFrame.os_dtypes.rst b/docs/source/reference/api/DataFrame.os_dtypes.rst new file mode 100644 index 00000000..88223421 --- /dev/null +++ b/docs/source/reference/api/DataFrame.os_dtypes.rst @@ -0,0 +1,6 @@ +DataFrame.os_dtypes +========================= + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.DataFrame.os_dtypes diff --git a/docs/source/reference/api/DataFrame.os_info.rst b/docs/source/reference/api/DataFrame.os_info.rst new file mode 100644 index 00000000..23515eeb --- /dev/null +++ b/docs/source/reference/api/DataFrame.os_info.rst @@ -0,0 +1,6 @@ +DataFrame.os_info +======================= + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.DataFrame.os_info diff --git a/docs/source/reference/api/DataFrame.quantile.rst b/docs/source/reference/api/DataFrame.quantile.rst new file mode 100644 index 00000000..b09387e4 --- /dev/null +++ b/docs/source/reference/api/DataFrame.quantile.rst @@ -0,0 +1,6 @@ +DataFrame.quantile +======================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.quantile \ No newline at end of file diff --git a/docs/source/reference/api/DataFrame.query.rst b/docs/source/reference/api/DataFrame.query.rst new file mode 100644 index 00000000..df465b2e --- /dev/null +++ b/docs/source/reference/api/DataFrame.query.rst @@ -0,0 +1,6 @@ +DataFrame.query +===================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.query diff --git a/docs/source/reference/api/DataFrame.rst b/docs/source/reference/api/DataFrame.rst new file mode 100644 index 00000000..d8eb65b3 --- /dev/null +++ b/docs/source/reference/api/DataFrame.rst @@ -0,0 +1,18 @@ +DataFrame +================ + +.. currentmodule:: opensearch_py_ml + +.. autoclass:: opensearch_py_ml.DataFrame + + + + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + + opensearch_py_ml.DataFrame.abs + opensearch_py_ml.DataFrame.add + diff --git a/docs/source/reference/api/DataFrame.sample.rst b/docs/source/reference/api/DataFrame.sample.rst new file mode 100644 index 00000000..c085ccf2 --- /dev/null +++ b/docs/source/reference/api/DataFrame.sample.rst @@ -0,0 +1,6 @@ +DataFrame.sample +====================== + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.DataFrame.sample diff --git a/docs/source/reference/api/DataFrame.select_dtypes.rst b/docs/source/reference/api/DataFrame.select_dtypes.rst new file mode 100644 index 00000000..f4c52447 --- /dev/null +++ b/docs/source/reference/api/DataFrame.select_dtypes.rst @@ -0,0 +1,6 @@ +DataFrame.select_dtypes +============================= + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.select_dtypes diff --git a/docs/source/reference/api/DataFrame.shape.rst b/docs/source/reference/api/DataFrame.shape.rst new file mode 100644 index 00000000..30c15015 --- /dev/null +++ b/docs/source/reference/api/DataFrame.shape.rst @@ -0,0 +1,6 @@ +DataFrame.shape +===================== + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.DataFrame.shape diff --git a/docs/source/reference/api/DataFrame.size.rst b/docs/source/reference/api/DataFrame.size.rst new file mode 100644 index 00000000..d791c89f --- /dev/null +++ b/docs/source/reference/api/DataFrame.size.rst @@ -0,0 +1,6 @@ +DataFrame.size +==================== + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.DataFrame.size diff --git a/docs/source/reference/api/DataFrame.std.rst b/docs/source/reference/api/DataFrame.std.rst new file mode 100644 index 00000000..8a5c9bfd --- /dev/null +++ b/docs/source/reference/api/DataFrame.std.rst @@ -0,0 +1,6 @@ +DataFrame.std +=================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.std diff --git a/docs/source/reference/api/DataFrame.sum.rst b/docs/source/reference/api/DataFrame.sum.rst new file mode 100644 index 00000000..1a083f44 --- /dev/null +++ b/docs/source/reference/api/DataFrame.sum.rst @@ -0,0 +1,6 @@ +DataFrame.sum +=================== + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.DataFrame.sum diff --git a/docs/source/reference/api/DataFrame.tail.rst b/docs/source/reference/api/DataFrame.tail.rst new file mode 100644 index 00000000..d323b860 --- /dev/null +++ b/docs/source/reference/api/DataFrame.tail.rst @@ -0,0 +1,6 @@ +DataFrame.tail +==================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.tail diff --git a/docs/source/reference/api/DataFrame.to_csv.rst b/docs/source/reference/api/DataFrame.to_csv.rst new file mode 100644 index 00000000..398b7454 --- /dev/null +++ b/docs/source/reference/api/DataFrame.to_csv.rst @@ -0,0 +1,6 @@ +DataFrame.to_csv +====================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.to_csv diff --git a/docs/source/reference/api/DataFrame.to_html.rst b/docs/source/reference/api/DataFrame.to_html.rst new file mode 100644 index 00000000..0fc5e110 --- /dev/null +++ b/docs/source/reference/api/DataFrame.to_html.rst @@ -0,0 +1,6 @@ +DataFrame.to_html +======================= + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.to_html diff --git a/docs/source/reference/api/DataFrame.to_numpy.rst b/docs/source/reference/api/DataFrame.to_numpy.rst new file mode 100644 index 00000000..ef9e0c2c --- /dev/null +++ b/docs/source/reference/api/DataFrame.to_numpy.rst @@ -0,0 +1,6 @@ +DataFrame.to_numpy +======================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.to_numpy diff --git a/docs/source/reference/api/DataFrame.to_pandas.rst b/docs/source/reference/api/DataFrame.to_pandas.rst new file mode 100644 index 00000000..251ed3c5 --- /dev/null +++ b/docs/source/reference/api/DataFrame.to_pandas.rst @@ -0,0 +1,6 @@ +DataFrame.to_pandas +========================= + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.DataFrame.to_pandas diff --git a/docs/source/reference/api/DataFrame.to_string.rst b/docs/source/reference/api/DataFrame.to_string.rst new file mode 100644 index 00000000..051b4edf --- /dev/null +++ b/docs/source/reference/api/DataFrame.to_string.rst @@ -0,0 +1,6 @@ +DataFrame.to_string +========================= + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.to_string diff --git a/docs/source/reference/api/DataFrame.values.rst b/docs/source/reference/api/DataFrame.values.rst new file mode 100644 index 00000000..3b30d858 --- /dev/null +++ b/docs/source/reference/api/DataFrame.values.rst @@ -0,0 +1,6 @@ +DataFrame.values +====================== + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.DataFrame.values diff --git a/docs/source/reference/api/DataFrame.var.rst b/docs/source/reference/api/DataFrame.var.rst new file mode 100644 index 00000000..ca0c3a97 --- /dev/null +++ b/docs/source/reference/api/DataFrame.var.rst @@ -0,0 +1,6 @@ +DataFrame.var +=================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.DataFrame.var diff --git a/docs/source/reference/api/Index.rst b/docs/source/reference/api/Index.rst new file mode 100644 index 00000000..6998468e --- /dev/null +++ b/docs/source/reference/api/Index.rst @@ -0,0 +1,6 @@ +opensearch_py_ml.Index +====================== + +.. currentmodule:: opensearch_py_ml + +.. autoclass:: Index diff --git a/docs/source/reference/api/Series.add.rst b/docs/source/reference/api/Series.add.rst new file mode 100644 index 00000000..53c713a5 --- /dev/null +++ b/docs/source/reference/api/Series.add.rst @@ -0,0 +1,6 @@ +Series.add +================ + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.Series.add diff --git a/docs/source/reference/api/Series.describe.rst b/docs/source/reference/api/Series.describe.rst new file mode 100644 index 00000000..b0275409 --- /dev/null +++ b/docs/source/reference/api/Series.describe.rst @@ -0,0 +1,6 @@ +Series.describe +===================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.describe diff --git a/docs/source/reference/api/Series.div.rst b/docs/source/reference/api/Series.div.rst new file mode 100644 index 00000000..13df28a9 --- /dev/null +++ b/docs/source/reference/api/Series.div.rst @@ -0,0 +1,6 @@ +Series.div +================ + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.div diff --git a/docs/source/reference/api/Series.divide.rst b/docs/source/reference/api/Series.divide.rst new file mode 100644 index 00000000..69097254 --- /dev/null +++ b/docs/source/reference/api/Series.divide.rst @@ -0,0 +1,6 @@ +Series.divide +=================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.divide diff --git a/docs/source/reference/api/Series.dtype.rst b/docs/source/reference/api/Series.dtype.rst new file mode 100644 index 00000000..729179ca --- /dev/null +++ b/docs/source/reference/api/Series.dtype.rst @@ -0,0 +1,6 @@ +Series.dtype +================== + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.Series.dtype diff --git a/docs/source/reference/api/Series.dtypes.rst b/docs/source/reference/api/Series.dtypes.rst new file mode 100644 index 00000000..ea59398b --- /dev/null +++ b/docs/source/reference/api/Series.dtypes.rst @@ -0,0 +1,6 @@ +Series.dtypes +=================== + +.. currentmodule:: opensearch_py_ml + +.. 
autoattribute:: opensearch_py_ml.Series.dtypes diff --git a/docs/source/reference/api/Series.empty.rst b/docs/source/reference/api/Series.empty.rst new file mode 100644 index 00000000..5b33620b --- /dev/null +++ b/docs/source/reference/api/Series.empty.rst @@ -0,0 +1,6 @@ +Series.empty +================== + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.Series.empty diff --git a/docs/source/reference/api/Series.es_dtypes.rst b/docs/source/reference/api/Series.es_dtypes.rst new file mode 100644 index 00000000..be0066ac --- /dev/null +++ b/docs/source/reference/api/Series.es_dtypes.rst @@ -0,0 +1,6 @@ +Series.es_dtypes +====================== + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.Series.es_dtypes diff --git a/docs/source/reference/api/Series.es_match.rst b/docs/source/reference/api/Series.es_match.rst new file mode 100644 index 00000000..4657aa88 --- /dev/null +++ b/docs/source/reference/api/Series.es_match.rst @@ -0,0 +1,6 @@ +Series.es_match +===================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.es_match diff --git a/docs/source/reference/api/Series.filter.rst b/docs/source/reference/api/Series.filter.rst new file mode 100644 index 00000000..c89e6a28 --- /dev/null +++ b/docs/source/reference/api/Series.filter.rst @@ -0,0 +1,6 @@ +Series.filter +=================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.filter diff --git a/docs/source/reference/api/Series.floordiv.rst b/docs/source/reference/api/Series.floordiv.rst new file mode 100644 index 00000000..2e2571a4 --- /dev/null +++ b/docs/source/reference/api/Series.floordiv.rst @@ -0,0 +1,6 @@ +Series.floordiv +===================== + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.Series.floordiv diff --git a/docs/source/reference/api/Series.head.rst b/docs/source/reference/api/Series.head.rst new file mode 100644 index 00000000..b8314e70 --- /dev/null +++ b/docs/source/reference/api/Series.head.rst @@ -0,0 +1,6 @@ +Series.head +================= + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.head diff --git a/docs/source/reference/api/Series.hist.rst b/docs/source/reference/api/Series.hist.rst new file mode 100644 index 00000000..31ff64cf --- /dev/null +++ b/docs/source/reference/api/Series.hist.rst @@ -0,0 +1,8 @@ +Series.hist +==================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.hist +.. image:: opensearch_py_ml-Series-hist-2.png + diff --git a/docs/source/reference/api/Series.index.rst b/docs/source/reference/api/Series.index.rst new file mode 100644 index 00000000..139cb2fa --- /dev/null +++ b/docs/source/reference/api/Series.index.rst @@ -0,0 +1,6 @@ +Series.index +================== + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.Series.index diff --git a/docs/source/reference/api/Series.isin.rst b/docs/source/reference/api/Series.isin.rst new file mode 100644 index 00000000..32b371da --- /dev/null +++ b/docs/source/reference/api/Series.isin.rst @@ -0,0 +1,6 @@ +Series.isin +================= + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.isin diff --git a/docs/source/reference/api/Series.isna.rst b/docs/source/reference/api/Series.isna.rst new file mode 100644 index 00000000..26cea28c --- /dev/null +++ b/docs/source/reference/api/Series.isna.rst @@ -0,0 +1,6 @@ +Series.isna +================== + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.Series.isna diff --git a/docs/source/reference/api/Series.isnull.rst b/docs/source/reference/api/Series.isnull.rst new file mode 100644 index 00000000..95892229 --- /dev/null +++ b/docs/source/reference/api/Series.isnull.rst @@ -0,0 +1,6 @@ +Series.isnull +=================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.isnull diff --git a/docs/source/reference/api/Series.mad.rst b/docs/source/reference/api/Series.mad.rst new file mode 100644 index 00000000..535f1140 --- /dev/null +++ b/docs/source/reference/api/Series.mad.rst @@ -0,0 +1,6 @@ +Series.mad +================ + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.mad diff --git a/docs/source/reference/api/Series.max.rst b/docs/source/reference/api/Series.max.rst new file mode 100644 index 00000000..db22d313 --- /dev/null +++ b/docs/source/reference/api/Series.max.rst @@ -0,0 +1,6 @@ +Series.max +================ + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.max diff --git a/docs/source/reference/api/Series.mean.rst b/docs/source/reference/api/Series.mean.rst new file mode 100644 index 00000000..5703ecf1 --- /dev/null +++ b/docs/source/reference/api/Series.mean.rst @@ -0,0 +1,6 @@ +Series.mean +================= + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.mean diff --git a/docs/source/reference/api/Series.median.rst b/docs/source/reference/api/Series.median.rst new file mode 100644 index 00000000..38a82db1 --- /dev/null +++ b/docs/source/reference/api/Series.median.rst @@ -0,0 +1,6 @@ +Series.median +=================== + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.Series.median diff --git a/docs/source/reference/api/Series.min.rst b/docs/source/reference/api/Series.min.rst new file mode 100644 index 00000000..692e6e21 --- /dev/null +++ b/docs/source/reference/api/Series.min.rst @@ -0,0 +1,6 @@ +Series.min +================ + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.min diff --git a/docs/source/reference/api/Series.mod.rst b/docs/source/reference/api/Series.mod.rst new file mode 100644 index 00000000..a072011e --- /dev/null +++ b/docs/source/reference/api/Series.mod.rst @@ -0,0 +1,6 @@ +Series.mod +================ + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.mod diff --git a/docs/source/reference/api/Series.mode.rst b/docs/source/reference/api/Series.mode.rst new file mode 100644 index 00000000..8ce6e8f1 --- /dev/null +++ b/docs/source/reference/api/Series.mode.rst @@ -0,0 +1,6 @@ +Series.mode +==================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.mode \ No newline at end of file diff --git a/docs/source/reference/api/Series.mul.rst b/docs/source/reference/api/Series.mul.rst new file mode 100644 index 00000000..be56d674 --- /dev/null +++ b/docs/source/reference/api/Series.mul.rst @@ -0,0 +1,6 @@ +Series.mul +================ + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.mul diff --git a/docs/source/reference/api/Series.multiply.rst b/docs/source/reference/api/Series.multiply.rst new file mode 100644 index 00000000..5a2c3dba --- /dev/null +++ b/docs/source/reference/api/Series.multiply.rst @@ -0,0 +1,6 @@ +Series.multiply +===================== + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.Series.multiply diff --git a/docs/source/reference/api/Series.name.rst b/docs/source/reference/api/Series.name.rst new file mode 100644 index 00000000..c3deeeaf --- /dev/null +++ b/docs/source/reference/api/Series.name.rst @@ -0,0 +1,6 @@ +Series.name +================= + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.Series.name diff --git a/docs/source/reference/api/Series.ndim.rst b/docs/source/reference/api/Series.ndim.rst new file mode 100644 index 00000000..c76ad455 --- /dev/null +++ b/docs/source/reference/api/Series.ndim.rst @@ -0,0 +1,6 @@ +Series.ndim +================= + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.Series.ndim diff --git a/docs/source/reference/api/Series.notna.rst b/docs/source/reference/api/Series.notna.rst new file mode 100644 index 00000000..6dcd69da --- /dev/null +++ b/docs/source/reference/api/Series.notna.rst @@ -0,0 +1,6 @@ +Series.notna +================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.notna diff --git a/docs/source/reference/api/Series.notnull.rst b/docs/source/reference/api/Series.notnull.rst new file mode 100644 index 00000000..6dc46bbc --- /dev/null +++ b/docs/source/reference/api/Series.notnull.rst @@ -0,0 +1,6 @@ +Series.notnull +==================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.notnull diff --git a/docs/source/reference/api/Series.nunique.rst b/docs/source/reference/api/Series.nunique.rst new file mode 100644 index 00000000..999f2a15 --- /dev/null +++ b/docs/source/reference/api/Series.nunique.rst @@ -0,0 +1,6 @@ +Series.nunique +==================== + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.Series.nunique diff --git a/docs/source/reference/api/Series.os_dtype.rst b/docs/source/reference/api/Series.os_dtype.rst new file mode 100644 index 00000000..49831a91 --- /dev/null +++ b/docs/source/reference/api/Series.os_dtype.rst @@ -0,0 +1,6 @@ +Series.os_dtype +===================== + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.Series.os_dtype diff --git a/docs/source/reference/api/Series.os_info.rst b/docs/source/reference/api/Series.os_info.rst new file mode 100644 index 00000000..8354f11b --- /dev/null +++ b/docs/source/reference/api/Series.os_info.rst @@ -0,0 +1,6 @@ +Series.os_info +==================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.os_info diff --git a/docs/source/reference/api/Series.pow.rst b/docs/source/reference/api/Series.pow.rst new file mode 100644 index 00000000..004d1e19 --- /dev/null +++ b/docs/source/reference/api/Series.pow.rst @@ -0,0 +1,6 @@ +Series.pow +================ + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.pow diff --git a/docs/source/reference/api/Series.quantile.rst b/docs/source/reference/api/Series.quantile.rst new file mode 100644 index 00000000..8f5f12cb --- /dev/null +++ b/docs/source/reference/api/Series.quantile.rst @@ -0,0 +1,6 @@ +Series.quantile +===================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.quantile \ No newline at end of file diff --git a/docs/source/reference/api/Series.radd.rst b/docs/source/reference/api/Series.radd.rst new file mode 100644 index 00000000..44e9bb1d --- /dev/null +++ b/docs/source/reference/api/Series.radd.rst @@ -0,0 +1,6 @@ +Series.radd +================= + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.Series.radd diff --git a/docs/source/reference/api/Series.rdiv.rst b/docs/source/reference/api/Series.rdiv.rst new file mode 100644 index 00000000..b5ec4dd3 --- /dev/null +++ b/docs/source/reference/api/Series.rdiv.rst @@ -0,0 +1,6 @@ +Series.rdiv +================= + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.rdiv diff --git a/docs/source/reference/api/Series.rdivide.rst b/docs/source/reference/api/Series.rdivide.rst new file mode 100644 index 00000000..0b7caa1f --- /dev/null +++ b/docs/source/reference/api/Series.rdivide.rst @@ -0,0 +1,6 @@ +Series.rdivide +==================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.rdivide diff --git a/docs/source/reference/api/Series.rename.rst b/docs/source/reference/api/Series.rename.rst new file mode 100644 index 00000000..c4207c2e --- /dev/null +++ b/docs/source/reference/api/Series.rename.rst @@ -0,0 +1,6 @@ +Series.rename +=================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.rename diff --git a/docs/source/reference/api/Series.rfloordiv.rst b/docs/source/reference/api/Series.rfloordiv.rst new file mode 100644 index 00000000..eddcb4ce --- /dev/null +++ b/docs/source/reference/api/Series.rfloordiv.rst @@ -0,0 +1,6 @@ +Series.rfloordiv +====================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.rfloordiv diff --git a/docs/source/reference/api/Series.rmod.rst b/docs/source/reference/api/Series.rmod.rst new file mode 100644 index 00000000..1bd36e0d --- /dev/null +++ b/docs/source/reference/api/Series.rmod.rst @@ -0,0 +1,6 @@ +Series.rmod +================= + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.Series.rmod diff --git a/docs/source/reference/api/Series.rmul.rst b/docs/source/reference/api/Series.rmul.rst new file mode 100644 index 00000000..95d4ebaf --- /dev/null +++ b/docs/source/reference/api/Series.rmul.rst @@ -0,0 +1,6 @@ +Series.rmul +================= + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.rmul diff --git a/docs/source/reference/api/Series.rmultiply.rst b/docs/source/reference/api/Series.rmultiply.rst new file mode 100644 index 00000000..643b1cea --- /dev/null +++ b/docs/source/reference/api/Series.rmultiply.rst @@ -0,0 +1,6 @@ +Series.rmultiply +====================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.rmultiply diff --git a/docs/source/reference/api/Series.rpow.rst b/docs/source/reference/api/Series.rpow.rst new file mode 100644 index 00000000..2c4b0c52 --- /dev/null +++ b/docs/source/reference/api/Series.rpow.rst @@ -0,0 +1,6 @@ +Series.rpow +================= + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.rpow diff --git a/docs/source/reference/api/Series.rst b/docs/source/reference/api/Series.rst new file mode 100644 index 00000000..5806fc94 --- /dev/null +++ b/docs/source/reference/api/Series.rst @@ -0,0 +1,6 @@ +Series +============ + +.. currentmodule:: opensearch_py_ml + +.. autoclass:: opensearch_py_ml.Series diff --git a/docs/source/reference/api/Series.rsub.rst b/docs/source/reference/api/Series.rsub.rst new file mode 100644 index 00000000..17827573 --- /dev/null +++ b/docs/source/reference/api/Series.rsub.rst @@ -0,0 +1,6 @@ +Series.rsub +================= + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.Series.rsub diff --git a/docs/source/reference/api/Series.rsubtract.rst b/docs/source/reference/api/Series.rsubtract.rst new file mode 100644 index 00000000..c124ff7b --- /dev/null +++ b/docs/source/reference/api/Series.rsubtract.rst @@ -0,0 +1,6 @@ +Series.rsubtract +====================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.rsubtract diff --git a/docs/source/reference/api/Series.rtruediv.rst b/docs/source/reference/api/Series.rtruediv.rst new file mode 100644 index 00000000..51e19ea3 --- /dev/null +++ b/docs/source/reference/api/Series.rtruediv.rst @@ -0,0 +1,6 @@ +Series.rtruediv +===================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.rtruediv diff --git a/docs/source/reference/api/Series.sample.rst b/docs/source/reference/api/Series.sample.rst new file mode 100644 index 00000000..f942f2dc --- /dev/null +++ b/docs/source/reference/api/Series.sample.rst @@ -0,0 +1,6 @@ +Series.sample +=================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.sample diff --git a/docs/source/reference/api/Series.shape.rst b/docs/source/reference/api/Series.shape.rst new file mode 100644 index 00000000..0c0e0e3f --- /dev/null +++ b/docs/source/reference/api/Series.shape.rst @@ -0,0 +1,6 @@ +Series.shape +================== + +.. currentmodule:: opensearch_py_ml + +.. autoattribute:: opensearch_py_ml.Series.shape diff --git a/docs/source/reference/api/Series.size.rst b/docs/source/reference/api/Series.size.rst new file mode 100644 index 00000000..d99a2048 --- /dev/null +++ b/docs/source/reference/api/Series.size.rst @@ -0,0 +1,6 @@ +Series.size +================= + +.. currentmodule:: opensearch_py_ml + +.. 
autoattribute:: opensearch_py_ml.Series.size diff --git a/docs/source/reference/api/Series.std.rst b/docs/source/reference/api/Series.std.rst new file mode 100644 index 00000000..cd144fa9 --- /dev/null +++ b/docs/source/reference/api/Series.std.rst @@ -0,0 +1,6 @@ +Series.std +================ + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.std diff --git a/docs/source/reference/api/Series.sub.rst b/docs/source/reference/api/Series.sub.rst new file mode 100644 index 00000000..c55a7c35 --- /dev/null +++ b/docs/source/reference/api/Series.sub.rst @@ -0,0 +1,6 @@ +Series.sub +================ + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.sub diff --git a/docs/source/reference/api/Series.subtract.rst b/docs/source/reference/api/Series.subtract.rst new file mode 100644 index 00000000..190d6082 --- /dev/null +++ b/docs/source/reference/api/Series.subtract.rst @@ -0,0 +1,6 @@ +Series.subtract +===================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.subtract diff --git a/docs/source/reference/api/Series.sum.rst b/docs/source/reference/api/Series.sum.rst new file mode 100644 index 00000000..6e850a0f --- /dev/null +++ b/docs/source/reference/api/Series.sum.rst @@ -0,0 +1,6 @@ +Series.sum +================ + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.sum diff --git a/docs/source/reference/api/Series.tail.rst b/docs/source/reference/api/Series.tail.rst new file mode 100644 index 00000000..c0412d50 --- /dev/null +++ b/docs/source/reference/api/Series.tail.rst @@ -0,0 +1,6 @@ +Series.tail +================= + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.Series.tail diff --git a/docs/source/reference/api/Series.to_numpy.rst b/docs/source/reference/api/Series.to_numpy.rst new file mode 100644 index 00000000..463db663 --- /dev/null +++ b/docs/source/reference/api/Series.to_numpy.rst @@ -0,0 +1,6 @@ +Series.to_numpy +===================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.to_numpy diff --git a/docs/source/reference/api/Series.to_pandas.rst b/docs/source/reference/api/Series.to_pandas.rst new file mode 100644 index 00000000..09793856 --- /dev/null +++ b/docs/source/reference/api/Series.to_pandas.rst @@ -0,0 +1,6 @@ +Series.to_pandas +========================= + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.to_pandas diff --git a/docs/source/reference/api/Series.to_string.rst b/docs/source/reference/api/Series.to_string.rst new file mode 100644 index 00000000..c67cd3e5 --- /dev/null +++ b/docs/source/reference/api/Series.to_string.rst @@ -0,0 +1,6 @@ +Series.to_string +====================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.to_string diff --git a/docs/source/reference/api/Series.truediv.rst b/docs/source/reference/api/Series.truediv.rst new file mode 100644 index 00000000..a6c510ba --- /dev/null +++ b/docs/source/reference/api/Series.truediv.rst @@ -0,0 +1,6 @@ +Series.truediv +==================== + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.truediv diff --git a/docs/source/reference/api/Series.unique.rst b/docs/source/reference/api/Series.unique.rst new file mode 100644 index 00000000..4c4108df --- /dev/null +++ b/docs/source/reference/api/Series.unique.rst @@ -0,0 +1,6 @@ +Series.unique +==================== + +.. currentmodule:: opensearch_py_ml + +.. 
automethod:: opensearch_py_ml.Series.unique diff --git a/docs/source/reference/api/Series.value_counts.rst b/docs/source/reference/api/Series.value_counts.rst new file mode 100644 index 00000000..43a61662 --- /dev/null +++ b/docs/source/reference/api/Series.value_counts.rst @@ -0,0 +1,6 @@ +Series.value_counts +========================= + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.value_counts diff --git a/docs/source/reference/api/Series.var.rst b/docs/source/reference/api/Series.var.rst new file mode 100644 index 00000000..3f07c82b --- /dev/null +++ b/docs/source/reference/api/Series.var.rst @@ -0,0 +1,6 @@ +Series.var +================ + +.. currentmodule:: opensearch_py_ml + +.. automethod:: opensearch_py_ml.Series.var diff --git a/docs/source/reference/api/csv_to_opensearch.rst b/docs/source/reference/api/csv_to_opensearch.rst new file mode 100644 index 00000000..56ea4f76 --- /dev/null +++ b/docs/source/reference/api/csv_to_opensearch.rst @@ -0,0 +1,6 @@ +csv_to_opensearch +================== + +.. currentmodule:: opensearch_py_ml + +.. autofunction:: opensearch_py_ml.etl.csv_to_opensearch diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.agg.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.agg.rst new file mode 100644 index 00000000..b9557489 --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.agg.rst @@ -0,0 +1,6 @@ +groupby.DataFrameGroupBy.agg +================================== + +.. currentmodule:: opensearch_py_ml.groupby + +.. automethod:: opensearch_py_ml.groupby.DataFrameGroupBy.agg diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.aggregate.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.aggregate.rst new file mode 100644 index 00000000..54637456 --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.aggregate.rst @@ -0,0 +1,6 @@ +groupby.DataFrameGroupBy.aggregate +======================================== + +.. 
currentmodule:: opensearch_py_ml.groupby + +.. automethod:: opensearch_py_ml.groupby.DataFrameGroupBy.aggregate diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.count.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.count.rst new file mode 100644 index 00000000..4eec8084 --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.count.rst @@ -0,0 +1,6 @@ +groupby.DataFrameGroupBy.count +==================================== + +.. currentmodule:: opensearch_py_ml.groupby + +.. automethod:: opensearch_py_ml.groupby.DataFrameGroupBy.count diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.mad.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.mad.rst new file mode 100644 index 00000000..99a665ad --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.mad.rst @@ -0,0 +1,6 @@ +groupby.DataFrameGroupBy.mad +================================== + +.. currentmodule:: opensearch_py_ml.groupby + +.. automethod:: opensearch_py_ml.groupby.DataFrameGroupBy.mad diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.max.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.max.rst new file mode 100644 index 00000000..9e225b01 --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.max.rst @@ -0,0 +1,6 @@ +groupby.DataFrameGroupBy.max +================================== + +.. currentmodule:: opensearch_py_ml.groupby + +.. automethod:: opensearch_py_ml.groupby.DataFrameGroupBy.max diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.mean.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.mean.rst new file mode 100644 index 00000000..77e6f771 --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.mean.rst @@ -0,0 +1,6 @@ +groupby.DataFrameGroupBy.mean +=================================== + +.. currentmodule:: opensearch_py_ml.groupby + +.. 
automethod:: opensearch_py_ml.groupby.DataFrameGroupBy.mean diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.median.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.median.rst new file mode 100644 index 00000000..723a65f8 --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.median.rst @@ -0,0 +1,6 @@ +groupby.DataFrameGroupBy.median +===================================== + +.. currentmodule:: opensearch_py_ml.groupby + +.. automethod:: opensearch_py_ml.groupby.DataFrameGroupBy.median diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.min.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.min.rst new file mode 100644 index 00000000..bb257ff0 --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.min.rst @@ -0,0 +1,6 @@ +groupby.DataFrameGroupBy.min +================================== + +.. currentmodule:: opensearch_py_ml.groupby + +.. automethod:: opensearch_py_ml.groupby.DataFrameGroupBy.min diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.nunique.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.nunique.rst new file mode 100644 index 00000000..ddc51343 --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.nunique.rst @@ -0,0 +1,6 @@ +groupby.DataFrameGroupBy.nunique +====================================== + +.. currentmodule:: opensearch_py_ml.groupby + +.. automethod:: opensearch_py_ml.groupby.DataFrameGroupBy.nunique diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.quantile.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.quantile.rst new file mode 100644 index 00000000..63f63de2 --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.quantile.rst @@ -0,0 +1,6 @@ +groupby.DataFrameGroupBy.quantile +======================================= + +.. currentmodule:: opensearch_py_ml.groupby + +.. 
automethod:: opensearch_py_ml.groupby.DataFrameGroupBy.quantile diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.rst new file mode 100644 index 00000000..9ceddc19 --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.rst @@ -0,0 +1,15 @@ +groupby.DataFrameGroupBy +============================== + +.. currentmodule:: opensearch_py_ml.groupby + +.. autoclass:: opensearch_py_ml.groupby.DataFrameGroupBy + + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + + opensearch_py_ml.DataFrame.abs + opensearch_py_ml.DataFrame.add diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.std.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.std.rst new file mode 100644 index 00000000..73cad20b --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.std.rst @@ -0,0 +1,6 @@ +groupby.DataFrameGroupBy.std +================================== + +.. currentmodule:: opensearch_py_ml.groupby + +.. automethod:: opensearch_py_ml.groupby.DataFrameGroupBy.std diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.sum.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.sum.rst new file mode 100644 index 00000000..1b73b6c6 --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.sum.rst @@ -0,0 +1,6 @@ +groupby.DataFrameGroupBy.sum +================================== + +.. currentmodule:: opensearch_py_ml.groupby + +.. automethod:: opensearch_py_ml.groupby.DataFrameGroupBy.sum diff --git a/docs/source/reference/api/groupby.DataFrameGroupBy.var.rst b/docs/source/reference/api/groupby.DataFrameGroupBy.var.rst new file mode 100644 index 00000000..39d1b7f5 --- /dev/null +++ b/docs/source/reference/api/groupby.DataFrameGroupBy.var.rst @@ -0,0 +1,6 @@ +groupby.DataFrameGroupBy.var +================================== + +.. 
currentmodule:: opensearch_py_ml.groupby + +.. automethod:: opensearch_py_ml.groupby.DataFrameGroupBy.var diff --git a/docs/source/reference/api/groupby.GroupBy.rst b/docs/source/reference/api/groupby.GroupBy.rst new file mode 100644 index 00000000..3163c2a4 --- /dev/null +++ b/docs/source/reference/api/groupby.GroupBy.rst @@ -0,0 +1,15 @@ +groupby.GroupBy +===================== + +.. currentmodule:: opensearch_py_ml.groupby + +.. autoclass:: opensearch_py_ml.groupby.GroupBy + + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + + DataFrame.abs + DataFrame.add diff --git a/docs/source/reference/api/opensearch_py_ml-DataFrame-hist-1.png b/docs/source/reference/api/opensearch_py_ml-DataFrame-hist-1.png new file mode 100644 index 00000000..a9eb2f50 Binary files /dev/null and b/docs/source/reference/api/opensearch_py_ml-DataFrame-hist-1.png differ diff --git a/docs/source/reference/api/opensearch_py_ml-Series-hist-2.png b/docs/source/reference/api/opensearch_py_ml-Series-hist-2.png new file mode 100644 index 00000000..07af4635 Binary files /dev/null and b/docs/source/reference/api/opensearch_py_ml-Series-hist-2.png differ diff --git a/docs/source/reference/api/opensearch_to_pandas.rst b/docs/source/reference/api/opensearch_to_pandas.rst new file mode 100644 index 00000000..3220cc05 --- /dev/null +++ b/docs/source/reference/api/opensearch_to_pandas.rst @@ -0,0 +1,6 @@ +opensearch_to_pandas +===================== + +.. currentmodule:: opensearch_py_ml + +.. autofunction:: opensearch_py_ml.etl.opensearch_to_pandas diff --git a/docs/source/reference/api/pandas_to_opensearch.rst b/docs/source/reference/api/pandas_to_opensearch.rst new file mode 100644 index 00000000..7916930e --- /dev/null +++ b/docs/source/reference/api/pandas_to_opensearch.rst @@ -0,0 +1,6 @@ +pandas_to_opensearch +===================== + +.. currentmodule:: opensearch_py_ml + +.. 
autofunction:: opensearch_py_ml.etl.pandas_to_opensearch diff --git a/docs/source/reference/dataframe.rst b/docs/source/reference/dataframe.rst new file mode 100644 index 00000000..ed065a11 --- /dev/null +++ b/docs/source/reference/dataframe.rst @@ -0,0 +1,143 @@ +.. _api.dataframe: + +========= +DataFrame +========= +.. currentmodule:: opensearch_py_ml + +Constructor +~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/DataFrame + +Attributes and Underlying Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/DataFrame.index + api/DataFrame.columns + api/DataFrame.dtypes + api/DataFrame.select_dtypes + api/DataFrame.values + api/DataFrame.empty + api/DataFrame.shape + api/DataFrame.ndim + api/DataFrame.size + +Indexing, Iteration +~~~~~~~~~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/DataFrame.head + api/DataFrame.keys + api/DataFrame.tail + api/DataFrame.get + api/DataFrame.query + api/DataFrame.sample + api/DataFrame.iterrows + api/DataFrame.itertuples + +Function Application, GroupBy & Window +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + OpenSearch aggregations using cardinality (``count``) are accurate + approximations using the `HyperLogLog++ algorithm`_ so results may not + be exact. + +.. _HyperLogLog++ algorithm: https://static.googleusercontent.com/media/research.google.com/fr//pubs/archive/40671.pdf + +.. toctree:: + :maxdepth: 2 + + api/DataFrame.agg + api/DataFrame.aggregate + api/DataFrame.groupby + +.. currentmodule:: opensearch_py_ml.groupby + +.. 
toctree:: + :maxdepth: 2 + + api/groupby.DataFrameGroupBy + api/groupby.DataFrameGroupBy.agg + api/groupby.DataFrameGroupBy.aggregate + api/groupby.DataFrameGroupBy.count + api/groupby.DataFrameGroupBy.mad + api/groupby.DataFrameGroupBy.max + api/groupby.DataFrameGroupBy.mean + api/groupby.DataFrameGroupBy.median + api/groupby.DataFrameGroupBy.min + api/groupby.DataFrameGroupBy.nunique + api/groupby.DataFrameGroupBy.std + api/groupby.DataFrameGroupBy.sum + api/groupby.DataFrameGroupBy.var + api/groupby.DataFrameGroupBy.quantile + +.. currentmodule:: opensearch_py_ml + +.. _api.dataframe.stats: + +Computations / Descriptive Stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/DataFrame.count + api/DataFrame.describe + api/DataFrame.info + api/DataFrame.max + api/DataFrame.mean + api/DataFrame.min + api/DataFrame.median + api/DataFrame.mad + api/DataFrame.std + api/DataFrame.var + api/DataFrame.sum + api/DataFrame.nunique + api/DataFrame.mode + api/DataFrame.quantile + api/DataFrame.idxmax + api/DataFrame.idxmin + +Reindexing / Selection / Label Manipulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/DataFrame.drop + api/DataFrame.filter + +Plotting +~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/DataFrame.hist + +Opensearch Functions +~~~~~~~~~~~~~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/DataFrame.os_info + api/DataFrame.es_match + api/DataFrame.es_query + api/DataFrame.os_dtypes + +Serialization / IO / Conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/DataFrame.info + api/DataFrame.to_numpy + api/DataFrame.to_csv + api/DataFrame.to_html + api/DataFrame.to_string + api/DataFrame.to_pandas diff --git a/docs/source/reference/general_utility_functions.rst b/docs/source/reference/general_utility_functions.rst new file mode 100644 index 00000000..98557bb8 --- /dev/null +++ b/docs/source/reference/general_utility_functions.rst @@ -0,0 +1,14 @@ +.. 
_api.general_utility_functions: + +========================= +General Utility Functions +========================= +.. currentmodule:: opensearch_py_ml + +Pandas and Opensearch +~~~~~~~~~~~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/opensearch_to_pandas + api/pandas_to_opensearch diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst new file mode 100644 index 00000000..81e17857 --- /dev/null +++ b/docs/source/reference/index.rst @@ -0,0 +1,16 @@ +.. _api: + +============= +API Reference +============= + +This page gives an overview of all public opensearch-py-ml objects, functions and +methods. All classes and functions exposed in the ``opensearch_py_ml.*`` namespace are public. + +.. toctree:: + :maxdepth: 2 + + dataframe + series + general_utility_functions + io diff --git a/docs/source/reference/indexing.rst b/docs/source/reference/indexing.rst new file mode 100644 index 00000000..9ce317d8 --- /dev/null +++ b/docs/source/reference/indexing.rst @@ -0,0 +1,17 @@ +.. _api.index: + +===== +Index +===== +.. currentmodule:: opensearch_py_ml + +**Many of these methods or variants thereof are available on the objects +that contain an index (Series/DataFrame) and those should most likely be +used before calling these methods directly.** + +Constructor +~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + opensearch_py_ml.Index diff --git a/docs/source/reference/io.rst b/docs/source/reference/io.rst new file mode 100644 index 00000000..107df750 --- /dev/null +++ b/docs/source/reference/io.rst @@ -0,0 +1,13 @@ +.. _api.io: + +============ +Input/Output +============ +.. currentmodule:: opensearch_py_ml + +Flat File +~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/csv_to_opensearch diff --git a/docs/source/reference/series.rst b/docs/source/reference/series.rst new file mode 100644 index 00000000..46fa0994 --- /dev/null +++ b/docs/source/reference/series.rst @@ -0,0 +1,122 @@ +.. _api.series: + +====== +Series +====== +.. 
currentmodule:: opensearch_py_ml + +Constructor +~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/Series + +Attributes and Underlying Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/Series.index + api/Series.dtype + api/Series.dtypes + api/Series.shape + api/Series.name + api/Series.empty + api/Series.ndim + api/Series.size + +Indexing, Iteration +~~~~~~~~~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/Series.head + api/Series.tail + api/Series.sample + +Binary Operator Functions +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/Series.add + api/Series.sub + api/Series.subtract + api/Series.mul + api/Series.multiply + api/Series.div + api/Series.divide + api/Series.truediv + api/Series.floordiv + api/Series.mod + api/Series.pow + api/Series.radd + api/Series.rsub + api/Series.rsubtract + api/Series.rmul + api/Series.rmultiply + api/Series.rdiv + api/Series.rdivide + api/Series.rtruediv + api/Series.rfloordiv + api/Series.rmod + api/Series.rpow + +Computations / Descriptive Stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/Series.describe + api/Series.max + api/Series.mean + api/Series.min + api/Series.sum + api/Series.median + api/Series.mad + api/Series.std + api/Series.var + api/Series.nunique + api/Series.unique + api/Series.value_counts + api/Series.mode + api/Series.quantile + +Reindexing / Selection / Label Manipulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/Series.rename + api/Series.isna + api/Series.notna + api/Series.isnull + api/Series.notnull + api/Series.isin + api/Series.filter + +Plotting +~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/Series.hist + +Serialization / IO / Conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. toctree:: + :maxdepth: 2 + + api/Series.to_string + api/Series.to_numpy + api/Series.to_pandas + +Opensearch Functions +~~~~~~~~~~~~~~~~~~~~~~~ +.. 
toctree:: + :maxdepth: 2 + + api/Series.os_info + api/Series.es_match + api/Series.os_dtype diff --git a/noxfile.py b/noxfile.py index 2649f60a..5ab3b7a9 100644 --- a/noxfile.py +++ b/noxfile.py @@ -23,7 +23,6 @@ # specific language governing permissions and limitations # under the License. -import os from pathlib import Path import nox @@ -119,8 +118,8 @@ def test(session, pandas_version: str): "--cov-report=term-missing", "--cov=opensearch_py_ml/", "--cov-config=setup.cfg", - # "--doctest-modules", //TODO: commenting for now. - # "--nbval", //TODO: we need to revisit this part if we need to test jupyter notebooks + "--doctest-modules", + "--nbval", ) session.run( @@ -132,45 +131,11 @@ def test(session, pandas_version: str): @nox.session(reuse_venv=True) def docs(session): # Run this so users get an error if they don't have Pandoc installed. - session.run("pandoc", "--version", external=True) session.install("-r", "docs/requirements-docs.txt") session.install(".") - # See if we have an Elasticsearch cluster active - # to rebuild the Jupyter notebooks with. 
- es_active = False - try: - from elasticsearch import ConnectionError, Elasticsearch - - try: - es = Elasticsearch("https://localhost:9200") - es.info() - if not es.indices.exists(index="flights"): - session.run("python", "-m", "tests.setup_tests") - es_active = True - except ConnectionError: - pass - except ImportError: - pass - - # Rebuild all the example notebooks inplace - if es_active: - session.install("jupyter-client", "ipykernel") - for filename in os.listdir(BASE_DIR / "docs/sphinx/examples"): - if ( - filename.endswith(".ipynb") - and filename != "introduction_to_eland_webinar.ipynb" - ): - session.run( - "jupyter", - "nbconvert", - "--to", - "notebook", - "--inplace", - "--execute", - str(BASE_DIR / "docs/sphinx/examples" / filename), - ) + session.run("python", "-m", "setup_tests") session.cd("docs") session.run("make", "clean", external=True) diff --git a/opensearch_py_ml/__init__.py b/opensearch_py_ml/__init__.py index 9679b743..25e9c7f0 100644 --- a/opensearch_py_ml/__init__.py +++ b/opensearch_py_ml/__init__.py @@ -24,12 +24,11 @@ # under the License. 
from ._version import __title__, __url__, __version__ # noqa: F401 -from .common import SortOrder +from .common import SortOrder, os_version # noqa: F401 from .dataframe import DataFrame from .etl import csv_to_opensearch, opensearch_to_pandas, pandas_to_opensearch from .index import Index from .ndframe import NDFrame -from .sagemaker_tools import make_sagemaker_prediction from .series import Series __all__ = [ @@ -41,5 +40,6 @@ "opensearch_to_pandas", "csv_to_opensearch", "SortOrder", - "make_sagemaker_prediction", ] + +# Define test files and indices diff --git a/opensearch_py_ml/common.py b/opensearch_py_ml/common.py index add99830..2e7e0f0e 100644 --- a/opensearch_py_ml/common.py +++ b/opensearch_py_ml/common.py @@ -27,6 +27,7 @@ from enum import Enum from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union, cast +import opensearchpy import pandas as pd # type: ignore from opensearchpy import OpenSearch @@ -331,3 +332,28 @@ def os_version(os_client: OpenSearch) -> Tuple[int, int, int]: else: opensearch_py_ml_os_version = os_client._os_ml_py_version # type: ignore return opensearch_py_ml_os_version + + +OPENSEARCH_HOST = "https://instance:9200" +OPENSEARCH_ADMIN_USER, OPENSEARCH_ADMIN_PASSWORD = "admin", "admin" + +# Define client to use in tests +OPENSEARCH_TEST_CLIENT = OpenSearch( + hosts=[OPENSEARCH_HOST], + http_auth=(OPENSEARCH_ADMIN_USER, OPENSEARCH_ADMIN_PASSWORD), + verify_certs=False, +) +# in github integration test, host url is: https://instance:9200 +# in development, usually host url is: https://localhost:9200 +# it's hard to remember changing the host url. 
So we wrap this in a try/except so that we don't have to keep changing this config +try: + OS_VERSION = os_version(OPENSEARCH_TEST_CLIENT) +except opensearchpy.exceptions.ConnectionError: + OPENSEARCH_HOST = "https://localhost:9200" + # Define client to use in tests + OPENSEARCH_TEST_CLIENT = OpenSearch( + hosts=[OPENSEARCH_HOST], + http_auth=(OPENSEARCH_ADMIN_USER, OPENSEARCH_ADMIN_PASSWORD), + verify_certs=False, + ) + OS_VERSION = os_version(OPENSEARCH_TEST_CLIENT) diff --git a/opensearch_py_ml/dataframe.py b/opensearch_py_ml/dataframe.py index 1a76e4f3..ec58d33b 100644 --- a/opensearch_py_ml/dataframe.py +++ b/opensearch_py_ml/dataframe.py @@ -42,6 +42,7 @@ from pandas.util._validators import validate_bool_kwarg # type: ignore import opensearch_py_ml.plotting as gfx +from opensearch_py_ml.common import OPENSEARCH_TEST_CLIENT # noqa: F401 from opensearch_py_ml.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter from opensearch_py_ml.filter import BooleanFilter from opensearch_py_ml.groupby import DataFrameGroupBy @@ -65,12 +66,9 @@ class DataFrame(NDFrame): Parameters ---------- os_client: OpenSearch client - os_index_pattern: str - OpenSearch index pattern. This can contain wildcards. (e.g. 'flights') - columns: list of str, optional - List of DataFrame columns. A subset of the OpenSearch index's fields. - os_index_field: str, optional - The OpenSearch index field to use as the DataFrame index. Defaults to _id if None is used. + os_index_pattern: str OpenSearch index pattern. This can contain wildcards. (e.g. 'flights') + columns: list of str, optional List of DataFrame columns. A subset of the OpenSearch index's fields. + os_index_field: str, optional The OpenSearch index field to use as the DataFrame index. Defaults to _id if None is used. 
See Also -------- @@ -80,7 +78,7 @@ class DataFrame(NDFrame): -------- Constructing DataFrame from an OpenSearch configuration arguments and an OpenSearch index - >>> df = ed.DataFrame('http://localhost:9200', 'flights') + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> df.head() AvgTicketPrice Cancelled ... dayOfWeek timestamp 0 841.265642 False ... 0 2018-01-01 00:00:00 @@ -94,9 +92,7 @@ class DataFrame(NDFrame): Constructing DataFrame from an OpenSearch client and an OpenSearch index - >>> from opensearchpy import OpenSearch - >>> es = OpenSearch("http://localhost:9200") - >>> df = ed.DataFrame(os_client=es, os_index_pattern='flights', columns=['AvgTicketPrice', 'Cancelled']) + >>> df = ed.DataFrame(os_client=OPENSEARCH_TEST_CLIENT, os_index_pattern='flights', columns=['AvgTicketPrice', 'Cancelled']) >>> df.head() AvgTicketPrice Cancelled 0 841.265642 False @@ -112,7 +108,7 @@ class DataFrame(NDFrame): (TODO - currently index_field must also be a field if not _id) >>> df = ed.DataFrame( - ... os_client='http://localhost:9200', + ... os_client=OPENSEARCH_TEST_CLIENT, ... os_index_pattern='flights', ... columns=['AvgTicketPrice', 'timestamp'], ... 
os_index_field='timestamp' @@ -126,13 +122,12 @@ class DataFrame(NDFrame): 2018-01-01T00:36:51 803.015200 2018-01-01 00:36:51 [5 rows x 2 columns] + """ def __init__( self, - os_client: Optional[ - Union[str, List[str], Tuple[str, ...], "OpenSearch"] - ] = None, + os_client: "OpenSearch" = None, os_index_pattern: Optional[str] = None, columns: Optional[List[str]] = None, os_index_field: Optional[str] = None, @@ -176,7 +171,7 @@ def columns(self) -> pd.Index: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights') + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> assert isinstance(df.columns, pd.Index) >>> df.columns Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName', @@ -204,7 +199,7 @@ def empty(self) -> bool: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights') + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> df.empty False """ @@ -234,7 +229,7 @@ def head(self, n: int = 5) -> "DataFrame": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['Origin', 'Dest']) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=['Origin', 'Dest']) >>> df.head(3) Origin Dest 0 Frankfurt am Main Airport Sydney Kingsford Smith International Airport @@ -269,7 +264,7 @@ def tail(self, n: int = 5) -> "DataFrame": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['Origin', 'Dest']) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=['Origin', 'Dest']) >>> df.tail() Origin \\ 13054 Pisa International Airport... 
@@ -371,7 +366,7 @@ def drop( -------- Drop a column - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce', columns=['customer_first_name', 'email', 'user']) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce', columns=['customer_first_name', 'email', 'user']) >>> df.drop(columns=['user']) customer_first_name email 0 Eddie eddie@underwood-family.zzz @@ -581,7 +576,7 @@ def count(self) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce', columns=['customer_first_name', 'geoip.city_name']) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce', columns=['customer_first_name', 'geoip.city_name']) >>> df.count() customer_first_name 4675 geoip.city_name 4094 @@ -603,7 +598,7 @@ def os_info(self): Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights') + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> df = df[(df.OriginAirportID == 'AMS') & (df.FlightDelayMin > 60)] >>> df = df[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']] >>> df = df.tail() @@ -623,7 +618,7 @@ def os_info(self): is_source_field: False Mappings: capabilities: - es_field_name is_source os_dtype es_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_es_field_name + os_field_name is_source os_dtype os_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_os_field_name timestamp timestamp True date strict_date_hour_minute_second datetime64[ns] True True False timestamp OriginAirportID OriginAirportID True keyword None object True True False OriginAirportID DestAirportID DestAirportID True keyword None object True True False DestAirportID @@ -698,7 +693,7 @@ def es_match( Examples -------- - >>> df = ed.DataFrame("http://localhost:9200", "ecommerce") + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, "ecommerce") >>> df.es_match("Men's", columns=["category"]) category currency ... type user 0 [Men's Clothing] EUR ... 
order eddie @@ -760,7 +755,7 @@ def es_query(self, query) -> "DataFrame": .. _geo-distance query documentation from Elasticsearch: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-geo-distance-query.html - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce', columns=['customer_first_name', 'geoip.city_name']) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce', columns=['customer_first_name', 'geoip.city_name']) >>> df.es_query({"bool": {"filter": {"geo_distance": {"distance": "1km", "geoip.location": [55.3, 25.3]}}}}).head() customer_first_name geoip.city_name 1 Mary Dubai @@ -836,7 +831,7 @@ def info( Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce', columns=['customer_first_name', 'geoip.city_name']) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce', columns=['customer_first_name', 'geoip.city_name']) >>> df.info() Index: 4675 entries, 0 to 4674 @@ -1372,7 +1367,7 @@ def select_dtypes(self, include=None, exclude=None) -> "DataFrame": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', ... 
columns=['AvgTicketPrice', 'Dest', 'Cancelled', 'timestamp', 'dayOfWeek']) >>> df.dtypes AvgTicketPrice float64 @@ -1413,7 +1408,7 @@ def shape(self) -> Tuple[int, int]: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce') + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce') >>> df.shape (4675, 45) """ @@ -1475,7 +1470,7 @@ def iterrows( Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['AvgTicketPrice', 'Cancelled']).head() + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=['AvgTicketPrice', 'Cancelled']).head() >>> df AvgTicketPrice Cancelled 0 841.265642 False @@ -1512,7 +1507,7 @@ def iterrows( def itertuples( self, index: bool = True, - name: Union[str, None] = "opensearch-py-ml", + name: Union[str, None] = "opensearch_py_ml", sort_index: Optional[str] = "_doc", ) -> Iterable[Tuple[Any, ...]]: """ @@ -1540,7 +1535,7 @@ def itertuples( Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['AvgTicketPrice', 'Cancelled']).head() + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=['AvgTicketPrice', 'Cancelled']).head() >>> df AvgTicketPrice Cancelled 0 841.265642 False @@ -1553,21 +1548,21 @@ def itertuples( >>> for row in df.itertuples(): ... 
print(row) - Eland(Index='0', AvgTicketPrice=841.2656419677076, Cancelled=False) - Eland(Index='1', AvgTicketPrice=882.9826615595518, Cancelled=False) - Eland(Index='2', AvgTicketPrice=190.6369038508356, Cancelled=False) - Eland(Index='3', AvgTicketPrice=181.69421554118, Cancelled=True) - Eland(Index='4', AvgTicketPrice=730.041778346198, Cancelled=False) + opensearch_py_ml(Index='0', AvgTicketPrice=841.2656419677076, Cancelled=False) + opensearch_py_ml(Index='1', AvgTicketPrice=882.9826615595518, Cancelled=False) + opensearch_py_ml(Index='2', AvgTicketPrice=190.6369038508356, Cancelled=False) + opensearch_py_ml(Index='3', AvgTicketPrice=181.69421554118, Cancelled=True) + opensearch_py_ml(Index='4', AvgTicketPrice=730.041778346198, Cancelled=False) By setting the `index` parameter to False we can remove the index as the first element of the tuple: >>> for row in df.itertuples(index=False): ... print(row) - Eland(AvgTicketPrice=841.2656419677076, Cancelled=False) - Eland(AvgTicketPrice=882.9826615595518, Cancelled=False) - Eland(AvgTicketPrice=190.6369038508356, Cancelled=False) - Eland(AvgTicketPrice=181.69421554118, Cancelled=True) - Eland(AvgTicketPrice=730.041778346198, Cancelled=False) + opensearch_py_ml(AvgTicketPrice=841.2656419677076, Cancelled=False) + opensearch_py_ml(AvgTicketPrice=882.9826615595518, Cancelled=False) + opensearch_py_ml(AvgTicketPrice=190.6369038508356, Cancelled=False) + opensearch_py_ml(AvgTicketPrice=181.69421554118, Cancelled=True) + opensearch_py_ml(AvgTicketPrice=730.041778346198, Cancelled=False) With the `name` parameter set we set a custom name for the yielded namedtuples: @@ -1636,7 +1631,7 @@ def aggregate( Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['AvgTicketPrice', 'DistanceKilometers', 'timestamp', 'DestCountry']) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=['AvgTicketPrice', 'DistanceKilometers', 'timestamp', 'DestCountry']) >>> df.aggregate(['sum', 'min', 
'std'], numeric_only=True).astype(int) AvgTicketPrice DistanceKilometers sum 8204364 92616288 @@ -1711,7 +1706,7 @@ def groupby( Examples -------- - >>> ed_flights = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> ed_flights = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> ed_flights.groupby(["DestCountry", "Cancelled"]).agg(["min", "max"], numeric_only=True) # doctest: +NORMALIZE_WHITESPACE AvgTicketPrice dayOfWeek min max min max @@ -1806,7 +1801,7 @@ def mode( Examples -------- - >>> ed_ecommerce = ed.DataFrame('http://localhost:9200', 'ecommerce') + >>> ed_ecommerce = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce') >>> ed_df = ed_ecommerce.filter(["total_quantity", "geoip.city_name", "customer_birth_date", "day_of_week", "taxful_total_price"]) >>> ed_df.mode(numeric_only=False) total_quantity geoip.city_name customer_birth_date day_of_week taxful_total_price @@ -1871,7 +1866,7 @@ def quantile( Examples -------- - >>> ed_df = ed.DataFrame('http://localhost:9200', 'flights') + >>> ed_df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> ed_flights = ed_df.filter(["AvgTicketPrice", "FlightDelayMin", "dayOfWeek", "timestamp"]) >>> ed_flights.quantile() # doctest: +SKIP AvgTicketPrice 640.387285 @@ -1914,7 +1909,7 @@ def idxmax(self, axis: int = 0) -> pd.Series: Examples -------- - >>> ed_df = ed.DataFrame('http://localhost:9200', 'flights') + >>> ed_df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> ed_flights = ed_df.filter(["AvgTicketPrice", "FlightDelayMin", "dayOfWeek", "timestamp"]) >>> ed_flights.idxmax() AvgTicketPrice 1843 @@ -1946,7 +1941,7 @@ def idxmin(self, axis: int = 0) -> pd.Series: Examples -------- - >>> ed_df = ed.DataFrame('http://localhost:9200', 'flights') + >>> ed_df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> ed_flights = 
ed_df.filter(["AvgTicketPrice", "FlightDelayMin", "dayOfWeek", "timestamp"]) >>> ed_flights.idxmin() AvgTicketPrice 5454 @@ -1982,7 +1977,7 @@ def query(self, expr) -> "DataFrame": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights') + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> df.shape (13059, 27) >>> df.query('FlightDelayMin > 60').shape @@ -2026,7 +2021,7 @@ def get( Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights') + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> df.get('Carrier') 0 Kibana Airlines 1 Logstash Airways @@ -2157,7 +2152,7 @@ def to_numpy(self) -> None: Examples -------- - >>> ed_df = ed.DataFrame('http://localhost:9200', 'flights', columns=['AvgTicketPrice', 'Carrier']).head(5) + >>> ed_df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=['AvgTicketPrice', 'Carrier']).head(5) >>> pd_df = ed.opensearch_to_pandas(ed_df) >>> print(f"type(ed_df)={type(ed_df)}\\ntype(pd_df)={type(pd_df)}") type(ed_df)= diff --git a/opensearch_py_ml/etl.py b/opensearch_py_ml/etl.py index f33d9c31..45c964f2 100644 --- a/opensearch_py_ml/etl.py +++ b/opensearch_py_ml/etl.py @@ -32,6 +32,7 @@ from opensearchpy.helpers import parallel_bulk from opensearch_py_ml import DataFrame +from opensearch_py_ml.common import OPENSEARCH_TEST_CLIENT # noqa: F401 from opensearch_py_ml.common import DEFAULT_CHUNK_SIZE, PANDAS_VERSION from opensearch_py_ml.field_mappings import FieldMappings, verify_mapping_compatibility @@ -133,7 +134,7 @@ def pandas_to_opensearch( >>> ed_df = ed.pandas_to_opensearch(pd_df, - ... 'http://localhost:9200', + ... OPENSEARCH_TEST_CLIENT, ... 'pandas_to_opensearch', ... es_if_exists="replace", ... 
es_refresh=True, @@ -260,7 +261,7 @@ def opensearch_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.Da Examples -------- - >>> ed_df = ed.DataFrame('http://localhost:9200', 'flights').head() + >>> ed_df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights').head() >>> type(ed_df) >>> ed_df @@ -290,7 +291,7 @@ def opensearch_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.Da Convert `opensearch_py_ml.DataFrame` to `pandas.DataFrame` and show progress every 10000 rows - >>> pd_df = ed.opensearch_to_pandas(ed.DataFrame('http://localhost:9200', 'flights'), show_progress=True) # doctest: +SKIP + >>> pd_df = ed.opensearch_to_pandas(ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights'), show_progress=True) # doctest: +SKIP 2020-01-29 12:43:36.572395: read 10000 rows 2020-01-29 12:43:37.309031: read 13059 rows @@ -424,7 +425,7 @@ def csv_to_opensearch( # type: ignore >>> ed.csv_to_opensearch( ... "churn.csv", - ... os_client='http://localhost:9200', + ... os_client=OPENSEARCH_TEST_CLIENT, ... os_dest_index='churn', ... es_refresh=True, ... index_col=0 diff --git a/opensearch_py_ml/groupby.py b/opensearch_py_ml/groupby.py index 32aafb93..35dbf68e 100644 --- a/opensearch_py_ml/groupby.py +++ b/opensearch_py_ml/groupby.py @@ -25,6 +25,7 @@ from typing import TYPE_CHECKING, List, Optional, Union +from opensearch_py_ml.common import OPENSEARCH_TEST_CLIENT # noqa: F401 from opensearch_py_ml.query_compiler import QueryCompiler if TYPE_CHECKING: @@ -76,7 +77,7 @@ def mean(self, numeric_only: bool = True) -> "pd.DataFrame": Examples -------- >>> df = ed.DataFrame( - ... "http://localhost:9200", "flights", + ... OPENSEARCH_TEST_CLIENT, "flights", ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] ... ) >>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +SKIP @@ -127,7 +128,7 @@ def var(self, numeric_only: bool = True) -> "pd.DataFrame": Examples -------- >>> df = ed.DataFrame( - ... 
"http://localhost:9200", "flights", + ... OPENSEARCH_TEST_CLIENT, "flights", ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] ... ) >>> df.groupby("DestCountry").var() # doctest: +NORMALIZE_WHITESPACE @@ -178,7 +179,7 @@ def std(self, numeric_only: bool = True) -> "pd.DataFrame": Examples -------- >>> df = ed.DataFrame( - ... "http://localhost:9200", "flights", + ... OPENSEARCH_TEST_CLIENT, "flights", ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"] ... ) >>> df.groupby("DestCountry").std() # doctest: +NORMALIZE_WHITESPACE @@ -229,7 +230,7 @@ def mad(self, numeric_only: bool = True) -> "pd.DataFrame": Examples -------- >>> df = ed.DataFrame( - ... "http://localhost:9200", "flights", + ... OPENSEARCH_TEST_CLIENT, "flights", ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] ... ) >>> df.groupby("DestCountry").mad() # doctest: +SKIP @@ -280,7 +281,7 @@ def median(self, numeric_only: bool = True) -> "pd.DataFrame": Examples -------- >>> df = ed.DataFrame( - ... "http://localhost:9200", "flights", + ... OPENSEARCH_TEST_CLIENT, "flights", ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] ... ) >>> df.groupby("DestCountry").median(numeric_only=False) # doctest: +SKIP @@ -331,7 +332,7 @@ def sum(self, numeric_only: bool = True) -> "pd.DataFrame": Examples -------- >>> df = ed.DataFrame( - ... "http://localhost:9200", "flights", + ... OPENSEARCH_TEST_CLIENT, "flights", ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"] ... ) >>> df.groupby("DestCountry").sum() # doctest: +NORMALIZE_WHITESPACE @@ -382,7 +383,7 @@ def min(self, numeric_only: bool = True) -> "pd.DataFrame": Examples -------- >>> df = ed.DataFrame( - ... "http://localhost:9200", "flights", + ... OPENSEARCH_TEST_CLIENT, "flights", ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] ... 
) >>> df.groupby("DestCountry").min(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE @@ -433,7 +434,7 @@ def max(self, numeric_only: bool = True) -> "pd.DataFrame": Examples -------- >>> df = ed.DataFrame( - ... "http://localhost:9200", "flights", + ... OPENSEARCH_TEST_CLIENT, "flights", ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] ... ) >>> df.groupby("DestCountry").max(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE @@ -484,7 +485,7 @@ def nunique(self) -> "pd.DataFrame": Examples -------- >>> df = ed.DataFrame( - ... "http://localhost:9200", "flights", + ... OPENSEARCH_TEST_CLIENT, "flights", ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"] ... ) >>> df.groupby("DestCountry").nunique() # doctest: +NORMALIZE_WHITESPACE @@ -534,7 +535,7 @@ def quantile( Examples -------- - >>> ed_df = ed.DataFrame('http://localhost:9200', 'flights') + >>> ed_df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> ed_flights = ed_df.filter(["AvgTicketPrice", "FlightDelayMin", "dayOfWeek", "timestamp"]) >>> ed_flights.groupby(["dayOfWeek", "Cancelled"]).quantile() # doctest: +SKIP AvgTicketPrice FlightDelayMin @@ -624,7 +625,7 @@ def aggregate( Examples -------- >>> df = ed.DataFrame( - ... "http://localhost:9200", "flights", + ... OPENSEARCH_TEST_CLIENT, "flights", ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"] ... ) >>> df.groupby("DestCountry").aggregate(["min", "max"]) # doctest: +NORMALIZE_WHITESPACE @@ -678,7 +679,7 @@ def count(self) -> "pd.DataFrame": Examples -------- >>> df = ed.DataFrame( - ... "http://localhost:9200", "flights", + ... OPENSEARCH_TEST_CLIENT, "flights", ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"] ... 
) >>> df.groupby("DestCountry").count() # doctest: +NORMALIZE_WHITESPACE diff --git a/opensearch_py_ml/ml_commons_integration/ml_common_utils.py b/opensearch_py_ml/ml_commons_integration/ml_common_utils.py index 2125e674..fec5f9be 100644 --- a/opensearch_py_ml/ml_commons_integration/ml_common_utils.py +++ b/opensearch_py_ml/ml_commons_integration/ml_common_utils.py @@ -7,4 +7,5 @@ ML_BASE_URI = "/_plugins/_ml" MODEL_UPLOAD_CHUNK_SIZE = 10_000_000 +MODEL_MAX_SIZE = 4_000_000_000 BUF_SIZE = 65536 # lets read stuff in 64kb chunks! diff --git a/opensearch_py_ml/ml_commons_integration/upload/ml_common_model_uploader.py b/opensearch_py_ml/ml_commons_integration/upload/ml_common_model_uploader.py index 35a20d59..62ae44cb 100644 --- a/opensearch_py_ml/ml_commons_integration/upload/ml_common_model_uploader.py +++ b/opensearch_py_ml/ml_commons_integration/upload/ml_common_model_uploader.py @@ -16,6 +16,7 @@ from opensearch_py_ml.ml_commons_integration.ml_common_utils import ( BUF_SIZE, ML_BASE_URI, + MODEL_MAX_SIZE, MODEL_UPLOAD_CHUNK_SIZE, ) @@ -52,6 +53,8 @@ def upload_model( @param model_meta_path string filepath of the model metadata. A json file of model metadata is expected @param isVerbose bool if isVerbose is true method will print more messages. 
""" + if os.stat(model_path).st_size > MODEL_MAX_SIZE: + raise Exception("Model file size exceeds the limit of 4GB") total_num_chunks: int = ceil( os.stat(model_path).st_size / MODEL_UPLOAD_CHUNK_SIZE diff --git a/opensearch_py_ml/ndframe.py b/opensearch_py_ml/ndframe.py index c2d90e09..0598cbdc 100644 --- a/opensearch_py_ml/ndframe.py +++ b/opensearch_py_ml/ndframe.py @@ -29,6 +29,7 @@ import pandas as pd # type: ignore +from opensearch_py_ml.common import OPENSEARCH_TEST_CLIENT # noqa: F401 from opensearch_py_ml.query_compiler import QueryCompiler if TYPE_CHECKING: @@ -107,7 +108,7 @@ def index(self) -> "Index": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights') + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> assert isinstance(df.index, ed.Index) >>> df.index.os_index_field '_id' @@ -135,7 +136,7 @@ def dtypes(self) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['Origin', 'AvgTicketPrice', 'timestamp', 'dayOfWeek']) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=['Origin', 'AvgTicketPrice', 'timestamp', 'dayOfWeek']) >>> df.dtypes Origin object AvgTicketPrice float64 @@ -157,7 +158,7 @@ def os_dtypes(self) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['Origin', 'AvgTicketPrice', 'timestamp', 'dayOfWeek']) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=['Origin', 'AvgTicketPrice', 'timestamp', 'dayOfWeek']) >>> df.os_dtypes Origin keyword AvgTicketPrice float @@ -221,7 +222,7 @@ def mean(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.mean() # doctest: +SKIP 
AvgTicketPrice 628.254 Cancelled 0.128494 @@ -270,7 +271,7 @@ def sum(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.sum() # doctest: +SKIP AvgTicketPrice 8.20436e+06 Cancelled 1678 @@ -318,7 +319,7 @@ def min(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.min() # doctest: +SKIP AvgTicketPrice 100.021 Cancelled False @@ -365,7 +366,7 @@ def var(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.var() # doctest: +SKIP AvgTicketPrice 70964.570234 Cancelled 0.111987 @@ -411,7 +412,7 @@ def std(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.std() # doctest: +SKIP AvgTicketPrice 266.407061 Cancelled 0.334664 @@ -457,7 +458,7 @@ def median(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = 
ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.median() # doctest: +SKIP AvgTicketPrice 640.363 Cancelled False @@ -506,7 +507,7 @@ def max(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.max() # doctest: +SKIP AvgTicketPrice 1199.73 Cancelled True @@ -565,7 +566,7 @@ def nunique(self) -> pd.Series: Examples -------- >>> columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user'] - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce', columns=columns) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce', columns=columns) >>> df.nunique() category 6 currency 1 @@ -591,7 +592,7 @@ def mad(self, numeric_only: bool = True) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.mad() # doctest: +SKIP AvgTicketPrice 213.35497 dayOfWeek 2.00000 @@ -636,7 +637,7 @@ def describe(self) -> pd.DataFrame: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin']) # ignoring percentiles + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=['AvgTicketPrice', 'FlightDelayMin']) # ignoring percentiles >>> df.describe() # doctest: +SKIP AvgTicketPrice 
FlightDelayMin count 13059.000000 13059.000000 diff --git a/opensearch_py_ml/plotting/_core.py b/opensearch_py_ml/plotting/_core.py index de724f2b..029864e6 100644 --- a/opensearch_py_ml/plotting/_core.py +++ b/opensearch_py_ml/plotting/_core.py @@ -23,6 +23,7 @@ # specific language governing permissions and limitations # under the License. +from opensearch_py_ml.common import OPENSEARCH_TEST_CLIENT # noqa: F401 from opensearch_py_ml.plotting._matplotlib.hist import hist_frame, hist_series @@ -51,7 +52,7 @@ def ed_hist_series( Examples -------- >>> import matplotlib.pyplot as plt - >>> df = ed.DataFrame('http://localhost:9200', 'flights') + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> df[df.OriginWeather == 'Sunny']['FlightTimeMin'].hist(alpha=0.5, density=True) # doctest: +SKIP >>> df[df.OriginWeather != 'Sunny']['FlightTimeMin'].hist(alpha=0.5, density=True) # doctest: +SKIP >>> plt.show() # doctest: +SKIP @@ -117,7 +118,7 @@ def ed_hist_frame( Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights') + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> hist = df.select_dtypes(include=[np.number]).hist(figsize=[10,10]) # doctest: +SKIP """ return hist_frame( diff --git a/opensearch_py_ml/sagemaker_tools.py b/opensearch_py_ml/sagemaker_tools.py deleted file mode 100644 index 7b4f01a1..00000000 --- a/opensearch_py_ml/sagemaker_tools.py +++ /dev/null @@ -1,73 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# The OpenSearch Contributors require contributions made to -# this file be licensed under the Apache-2.0 license or a -# compatible open source license. -# Any modifications Copyright OpenSearch Contributors. See -# GitHub history for details. 
- -from math import ceil -from typing import Any, Dict, List, Optional, Tuple - -from sagemaker.predictor import Predictor, Session - -from opensearch_py_ml import DataFrame - -DEFAULT_SAGEMAKER_UPLOAD_CHUNK_SIZE = 1000 - - -def make_sagemaker_prediction( - endpoint_name: str, - data: DataFrame, - target_column: str, - sagemaker_session: Optional[Session] = None, - column_order: Optional[List[str]] = None, - chunksize: int = None, - sort_index: Optional[str] = "_doc", -) -> Tuple[List[Any], Dict[Any, Any]]: - """ - Make a prediction on an opensearch_py_ml dataframe using a deployed SageMaker model endpoint. - - Note that predictions will be returned based on the order in which data is ordered when - ed.Dataframe.iterrows() is called on them. - - Parameters - ---------- - endpoint_name: string representing name of SageMaker endpoint - data: opensearch_py_ml DataFrame representing data to feed to SageMaker model. The dataframe must match the input datatypes - of the model and also have the correct number of columns. - target_column: column name of the dependent variable in the data. - sagemaker_session: A SageMaker Session object, used for SageMaker interactions (default: None). If not specified, - one is created using the default AWS configuration chain. - column_order: list of string values representing the proper order that the columns of independent variables should - be read into the SageMaker model. Must be a permutation of the column names of the opensearch_py_ml DataFrame. - chunksize: how large each chunk being uploaded to sagemaker should be. - sort_index: the index with which to sort the predictions by. Defaults to '_doc', an internal identifier for - Lucene that optimizes performance. 
- - Returns - ---------- - list representing the indices, dictionary representing the output of the model on input data - """ - predictor = Predictor( - endpoint=endpoint_name, - sagemaker_session=sagemaker_session, - content_type="text/csv", - ) - data = data.drop(columns=target_column) - - if column_order is not None: - data = data[column_order] - if chunksize is None: - chunksize = DEFAULT_SAGEMAKER_UPLOAD_CHUNK_SIZE - - indices = [index for index, _ in data.iterrows(sort_index=sort_index)] - - to_return = [] - - for i in range(ceil(data.shape[0] / chunksize)): - df_slice = indices[chunksize * i : min(len(indices), chunksize * (i + 1))] - to_process = data.filter(df_slice, axis=0) - preds = predictor.predict(to_process.to_csv(header=False, index=False)) - to_return.append(preds) - - return indices, to_return diff --git a/opensearch_py_ml/series.py b/opensearch_py_ml/series.py index e286dcc2..bb2c82ec 100644 --- a/opensearch_py_ml/series.py +++ b/opensearch_py_ml/series.py @@ -56,6 +56,7 @@ ArithmeticSeries, ArithmeticString, ) +from opensearch_py_ml.common import OPENSEARCH_TEST_CLIENT # noqa: F401 from opensearch_py_ml.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter from opensearch_py_ml.filter import ( BooleanFilter, @@ -110,7 +111,7 @@ class Series(NDFrame): Examples -------- - >>> ed.Series(os_client='http://localhost:9200', os_index_pattern='flights', name='Carrier') + >>> ed.Series(os_client=OPENSEARCH_TEST_CLIENT, os_index_pattern='flights', name='Carrier') 0 Kibana Airlines 1 Logstash Airways 2 Logstash Airways @@ -178,7 +179,7 @@ def shape(self) -> Tuple[int, int]: Examples -------- - >>> df = ed.Series('http://localhost:9200', 'ecommerce', name='total_quantity') + >>> df = ed.Series(OPENSEARCH_TEST_CLIENT, 'ecommerce', name='total_quantity') >>> df.shape (4675, 1) """ @@ -227,7 +228,7 @@ def rename(self, new_name: str) -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights') + >>> df = 
ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> df.Carrier 0 Kibana Airlines 1 Logstash Airways @@ -303,7 +304,7 @@ def value_counts(self, es_size: int = 10) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights') + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> df['Carrier'].value_counts() Logstash Airways 3331 JetBeats 3274 @@ -619,7 +620,7 @@ def quantile( Examples -------- - >>> ed_flights = ed.DataFrame('http://localhost:9200', 'flights') + >>> ed_flights = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') >>> ed_flights["timestamp"].quantile([.2,.5,.75]) # doctest: +SKIP 0.20 2018-01-09 04:30:57.289159912 0.50 2018-01-21 23:39:27.031627441 @@ -723,10 +724,10 @@ def mode(self, es_size: int = 10) -> pd.Series: Examples -------- - >>> ed_ecommerce = ed.DataFrame('http://localhost:9200', 'ecommerce') + >>> ed_ecommerce = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce') >>> ed_ecommerce["day_of_week"].mode() 0 Thursday - dtype: object + Name: day_of_week, dtype: object >>> ed_ecommerce["order_date"].mode() 0 2016-12-02 20:36:58 @@ -739,13 +740,13 @@ def mode(self, es_size: int = 10) -> pd.Series: 7 2016-12-15 11:38:24 8 2016-12-22 19:39:22 9 2016-12-24 06:21:36 - dtype: datetime64[ns] + Name: order_date, dtype: datetime64[ns] >>> ed_ecommerce["order_date"].mode(es_size=3) 0 2016-12-02 20:36:58 1 2016-12-04 23:44:10 2 2016-12-08 06:21:36 - dtype: datetime64[ns] + Name: order_date, dtype: datetime64[ns] """ return self._query_compiler.mode(is_dataframe=False, es_size=es_size) @@ -792,7 +793,7 @@ def es_match( Examples -------- >>> df = ed.DataFrame( - ... "http://localhost:9200", "ecommerce", + ... OPENSEARCH_TEST_CLIENT, "ecommerce", ... columns=["category", "taxful_total_price"] ... 
) >>> df[ @@ -839,7 +840,7 @@ def __add__(self, right: "Series") -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.taxful_total_price 0 36.98 1 53.98 @@ -899,7 +900,7 @@ def __truediv__(self, right: "Series") -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.taxful_total_price 0 36.98 1 53.98 @@ -938,7 +939,7 @@ def __floordiv__(self, right: "Series") -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.taxful_total_price 0 36.98 1 53.98 @@ -977,7 +978,7 @@ def __mod__(self, right: "Series") -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.taxful_total_price 0 36.98 1 53.98 @@ -1016,7 +1017,7 @@ def __mul__(self, right: "Series") -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.taxful_total_price 0 36.98 1 53.98 @@ -1055,7 +1056,7 @@ def __sub__(self, right: "Series") -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.taxful_total_price 0 36.98 1 53.98 @@ -1094,7 +1095,7 @@ def __pow__(self, right: "Series") -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.taxful_total_price 0 36.98 1 53.98 @@ -1133,7 +1134,7 @@ def __radd__(self, left: "Series") -> "Series": Examples -------- - >>> df = 
ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.taxful_total_price 0 36.98 1 53.98 @@ -1165,7 +1166,7 @@ def __rtruediv__(self, left: "Series") -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.taxful_total_price 0 36.98 1 53.98 @@ -1197,7 +1198,7 @@ def __rfloordiv__(self, left: "Series") -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.taxful_total_price 0 36.98 1 53.98 @@ -1229,7 +1230,7 @@ def __rmod__(self, left: "Series") -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.taxful_total_price 0 36.98 1 53.98 @@ -1261,7 +1262,7 @@ def __rmul__(self, left: "Series") -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.taxful_total_price 0 36.98 1 53.98 @@ -1293,7 +1294,7 @@ def __rpow__(self, left: "Series") -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.total_quantity 0 2 1 2 @@ -1325,7 +1326,7 @@ def __rsub__(self, left: "Series") -> "Series": Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'ecommerce').head(5) + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'ecommerce').head(5) >>> df.taxful_total_price 0 36.98 1 53.98 @@ -1447,7 +1448,7 @@ def max(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice'] + >>> s = 
ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice'] >>> int(s.max()) 1199 """ @@ -1471,7 +1472,7 @@ def mean(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice'] + >>> s = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice'] >>> int(s.mean()) 628 """ @@ -1495,7 +1496,7 @@ def median(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice'] + >>> s = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice'] >>> int(s.median()) 640 """ @@ -1519,7 +1520,7 @@ def min(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice'] + >>> s = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice'] >>> int(s.min()) 100 """ @@ -1543,7 +1544,7 @@ def sum(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice'] + >>> s = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice'] >>> int(s.sum()) 8204364 """ @@ -1565,7 +1566,7 @@ def nunique(self) -> pd.Series: Examples -------- - >>> s = ed.DataFrame('http://localhost:9200', 'flights')['Carrier'] + >>> s = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['Carrier'] >>> s.nunique() 4 """ @@ -1576,7 +1577,7 @@ def unique(self) -> pd.Series: """ Returns all unique values within a Series. Note that behavior is slightly different between pandas and Eland: pandas will return values in the order - they're first seen and Eland returns values in sorted order. + they're first seen and opensearch-py-ml returns values in sorted order. 
Returns ------- @@ -1605,7 +1606,7 @@ def var(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice'] + >>> s = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice'] >>> int(s.var()) 70964 """ @@ -1627,7 +1628,7 @@ def std(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice'] + >>> s = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice'] >>> int(s.std()) 266 """ @@ -1649,7 +1650,7 @@ def mad(self, numeric_only: Optional[bool] = None) -> pd.Series: Examples -------- - >>> s = ed.DataFrame('http://localhost:9200', 'flights')['AvgTicketPrice'] + >>> s = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')['AvgTicketPrice'] >>> int(s.mad()) 213 """ @@ -1677,7 +1678,7 @@ def describe(self) -> pd.Series: Examples -------- - >>> df = ed.DataFrame('http://localhost:9200', 'flights') # ignoring percentiles as they don't generate consistent results + >>> df = ed.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights') # ignoring percentiles as they don't generate consistent results >>> df.AvgTicketPrice.describe() # doctest: +SKIP count 13059.000000 mean 628.253689 @@ -1710,7 +1711,7 @@ def to_numpy(self) -> None: Examples -------- - >>> ed_s = ed.Series('http://localhost:9200', 'flights', name='Carrier').head(5) + >>> ed_s = ed.Series(OPENSEARCH_TEST_CLIENT, 'flights', name='Carrier').head(5) >>> pd_s = ed.opensearch_to_pandas(ed_s) >>> print(f"type(ed_s)={type(ed_s)}\\ntype(pd_s)={type(pd_s)}") type(ed_s)=<class 'opensearch_py_ml.series.Series'> diff --git a/requirements.txt b/requirements.txt index a65f329d..93a6a620 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,5 +5,4 @@ pandas>=1.5,<2 matplotlib>=3.6.0,<4 numpy>=1.23.3,<2 opensearch-py>=2 -sagemaker>=2.110 protobuf==3.20.* \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py index 8aedd5a7..c2243faf 100644 --- 
a/tests/__init__.py +++ b/tests/__init__.py @@ -47,7 +47,7 @@ # in development, usually host url is: https://localhost:9200 # it's hard to remember changing the host url. So applied a try catch so that we don't have to keep change this config try: - ES_VERSION = os_version(OPENSEARCH_TEST_CLIENT) + OS_VERSION = os_version(OPENSEARCH_TEST_CLIENT) except opensearchpy.exceptions.ConnectionError: OPENSEARCH_HOST = "https://localhost:9200" # Define client to use in tests @@ -56,9 +56,9 @@ http_auth=(OPENSEARCH_ADMIN_USER, OPENSEARCH_ADMIN_PASSWORD), verify_certs=False, ) - ES_VERSION = os_version(OPENSEARCH_TEST_CLIENT) + OS_VERSION = os_version(OPENSEARCH_TEST_CLIENT) -FLIGHTS_INDEX_NAME = "opensearch_dashboards_sample_data_flights" +FLIGHTS_INDEX_NAME = "flights" FLIGHTS_MAPPING = { "mappings": { "properties": { @@ -95,11 +95,11 @@ FLIGHTS_FILE_NAME = ROOT_DIR + "/flights.json.gz" FLIGHTS_DF_FILE_NAME = ROOT_DIR + "/flights_df.json.gz" -FLIGHTS_SMALL_INDEX_NAME = "opensearch_dashboards_sample_data_flights_small" +FLIGHTS_SMALL_INDEX_NAME = "flights_small" FLIGHTS_SMALL_MAPPING = FLIGHTS_MAPPING FLIGHTS_SMALL_FILE_NAME = ROOT_DIR + "/flights_small.json.gz" -ECOMMERCE_INDEX_NAME = "opensearch_dashboards_sample_data_ecommerce" +ECOMMERCE_INDEX_NAME = "ecommerce" ECOMMERCE_MAPPING = { "mappings": { "properties": { diff --git a/tests/dataframe/test_metrics_pytest.py b/tests/dataframe/test_metrics_pytest.py index ed9f5c53..00f9dfaf 100644 --- a/tests/dataframe/test_metrics_pytest.py +++ b/tests/dataframe/test_metrics_pytest.py @@ -50,7 +50,7 @@ def test_flights_metrics(self, numeric_only): for func in self.funcs: # Pandas v1.0 doesn't support mean() on datetime - # Pandas and Eland don't support sum() on datetime + # Pandas and opensearch_py_ml don't support sum() on datetime if not numeric_only: dtype_include = ( [np.number, np.datetime64]