In [1]:
# This notebook expects Modin and Ray to be installed, e.g. via `pip install modin[ray]`.
# Alternatively, install everything via conda, for example:
#     conda create -n test-modin-intel -c intel/label/test -c conda-forge modin=0.8.intel jupyter
# then activate the new environment and run Jupyter from it.

# NOTE: this is a special version of the notebook that demonstrates the cloud-cluster functionality.
# It requires extra packages: `pip install cloudpickle rpyc`.
# If your environment needs a proxy for SSH, expose it via the MODIN_SOCKS_PROXY environment variable;
# note that this requires ray >= 0.8.7 to work.
import modin.experimental.pandas as pd
from modin.experimental.cloud import create_cluster

Please note that some of these APIs deviate from pandas in order to provide improved performance.


In [2]:
# Column labels for the trips CSV, listed in file order (51 entries in total).
columns_names = [
    # trip, timing and coordinate fields
    "trip_id",
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
    "store_and_fwd_flag",
    "rate_code_id",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "passenger_count",
    "trip_distance",
    # fare and payment fields
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "ehail_fee",
    "improvement_surcharge",
    "total_amount",
    "payment_type",
    "trip_type",
    "pickup",
    "dropoff",
    "cab_type",
    # weather fields
    "precipitation",
    "snow_depth",
    "snowfall",
    "max_temperature",
    "min_temperature",
    "average_wind_speed",
    # pickup_* location fields
    "pickup_nyct2010_gid",
    "pickup_ctlabel",
    "pickup_borocode",
    "pickup_boroname",
    "pickup_ct2010",
    "pickup_boroct2010",
    "pickup_cdeligibil",
    "pickup_ntacode",
    "pickup_ntaname",
    "pickup_puma",
    # dropoff_* location fields
    "dropoff_nyct2010_gid",
    "dropoff_ctlabel",
    "dropoff_borocode",
    "dropoff_boroname",
    "dropoff_ct2010",
    "dropoff_boroct2010",
    "dropoff_cdeligibil",
    "dropoff_ntacode",
    "dropoff_ntaname",
    "dropoff_puma",
]

# Columns handed to the CSV reader's `parse_dates` argument.
parse_dates = ["pickup_datetime", "dropoff_datetime"]

In [3]:
# Open a cluster context through `create_cluster` (provider "aws"; the second
# positional argument is presumably the path to the credentials file - verify
# against the modin.experimental.cloud API) and bind it to `remote` so that
# later cells can re-enter it.
with create_cluster(
    "aws",
    "../../../aws_credentials",
    cluster_name="rayscale-test",
    region="eu-north-1",
    zone="eu-north-1b",
    image="ami-00e1e82d7d4ca80d3",
) as remote:
    # The CSV is read with header=None, so the column labels are supplied
    # explicitly through `names`.
    df = pd.read_csv(
        "https://modin-datasets.s3.amazonaws.com/trips_data.csv",
        names=columns_names,
        header=None,
        parse_dates=parse_dates,
    )

Cluster configuration valid.

Cluster: rayscale-test
Bootstraping AWS config
AWS config
  IAM Profile: ray-autoscaler-v1
  EC2 Key pair (head & workers): ray-autoscaler_4_eu-north-1
  VPC Subnets (head & workers): subnet-0995b6216dbfd237a
  EC2 Security groups (head & workers): sg-071a54090e55d92f3
  EC2 AMI (head & workers): ami-00e1e82d7d4ca80d3

No head node found. Launching a new cluster. Confirm [y/N]: y [automatic, due to --yes]

Acquiring an up-to-date head node
  Launching 1 nodes [subnet_id=subnet-0995b6216dbfd237a]
    Launched instance i-0fe38c76768f10015 [state=pending, info=pending]
  Launched a new head node
  Fetching the new head node
  
<1/1> Setting up head node
  Prepared bootstrap config
  New status: waiting-for-ssh
  [1/6] Waiting for SSH to become available
    Running `uptime` as a test.
    Waiting for IP
      Not yet available, retrying in 10 seconds
      Received: 13.48.10.50
    Running `uptime`
    Success.
  Updating cluster configuration. [hash=53d91b29

Exception in thread Thread-6:
Traceback (most recent call last):
  File "/localdisk/vnlitvin/miniconda3/envs/test-modin-intel/lib/python3.7/site-packages/ray/autoscaler/command_runner.py", line 385, in start_process
    self.process_runner.check_call(final_cmd)
  File "/localdisk/vnlitvin/miniconda3/envs/test-modin-intel/lib/python3.7/subprocess.py", line 363, in check_call
    raise CalledProcessError(retcode, cmd)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/localdisk/vnlitvin/miniconda3/envs/test-modin-intel/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/localdisk/vnlitvin/miniconda3/envs/test-modin-intel/lib/python3.7/site-packages/ray/autoscaler/updater.py", line 74, in run
    self.do_update()
  File "/localdisk/vnlitvin/miniconda3/envs/test-modin-intel/lib/python3.7/site-packages/ray/autoscaler/updater.py", line 300, in do_update
    self.cmd_runner.run(cmd)
  File "/localdis

  Failed to setup head node.


Cannot spawn cluster:
Traceback (most recent call last):
  File "/localdisk/vnlitvin/modin/modin/experimental/cloud/rayscale.py", line 162, in __do_spawn
    verbose=1,
  File "/localdisk/vnlitvin/miniconda3/envs/test-modin-intel/lib/python3.7/site-packages/ray/autoscaler/commands.py", line 178, in create_or_update_cluster
    override_cluster_name)
  File "/localdisk/vnlitvin/miniconda3/envs/test-modin-intel/lib/python3.7/site-packages/ray/autoscaler/commands.py", line 628, in get_or_create_head_node
    cli_logger.abort("Failed to setup head node.")
  File "/localdisk/vnlitvin/miniconda3/envs/test-modin-intel/lib/python3.7/site-packages/ray/autoscaler/cli_logger.py", line 343, in abort
    raise SilentClickException("Exiting due to cli_logger.abort()")
ray.autoscaler.cli_logger.SilentClickException: Exiting due to cli_logger.abort()



CannotSpawnCluster: cause: Exiting due to cli_logger.abort()
Cannot spawn cluster

In [4]:
# Re-enter the cluster context bound to `remote` and print the frame `df`.
# NOTE(review): both `remote` and `df` are only defined if the earlier
# `create_cluster` cell completed; otherwise this cell raises NameError.
with remote:
    print(df)

NameError: name 'remote' is not defined

In [5]:
def q1(df):
    """Count the non-null ``cab_type`` entries for each distinct ``cab_type``.

    Returns a Series indexed by ``cab_type`` whose values are the counts.
    """
    by_cab_type = df.groupby("cab_type")
    cab_type_counts = by_cab_type["cab_type"].count()
    return cab_type_counts
def q2(df):
    """Return the mean ``total_amount`` for each ``passenger_count`` value.

    Parameters
    ----------
    df : DataFrame
        Frame containing at least the ``passenger_count`` and
        ``total_amount`` columns.

    Returns
    -------
    DataFrame
        Two columns, ``passenger_count`` and ``total_amount``, with one row
        per distinct ``passenger_count`` (the grouping key is kept as a
        regular column because of ``as_index=False``).
    """
    # Select the aggregated column *before* calling mean(): averaging every
    # column and discarding all but one afterwards does needless work and
    # raises on non-numeric columns in pandas versions where mean() no longer
    # silently drops them.
    return df.groupby("passenger_count", as_index=False)["total_amount"].mean()
def q3(df):
    """Count rows per (``passenger_count``, ``pickup_datetime``) pair.

    Returns a DataFrame with the two key columns plus a column labelled ``0``
    holding the group sizes.
    """
    group_keys = ["passenger_count", "pickup_datetime"]
    group_sizes = df.groupby(group_keys).size()
    return group_sizes.reset_index()
def q4(df):
    """Count rows per (passenger count, pickup year, integer trip distance).

    A three-column frame is built from ``df``: ``passenger_count`` as is,
    ``pickup_datetime`` reduced to its year, and ``trip_distance`` cast to
    ``int64``. Rows are grouped on all three columns and the group sizes land
    in a column labelled ``0``. The result is ordered by year ascending, then
    by group size descending.
    """
    group_keys = ["passenger_count", "pickup_datetime", "trip_distance"]
    trips = pd.DataFrame(
        {
            "passenger_count": df["passenger_count"],
            "pickup_datetime": df["pickup_datetime"].dt.year,
            "trip_distance": df["trip_distance"].astype("int64"),
        }
    )
    group_sizes = trips.groupby(group_keys).size().reset_index()
    return group_sizes.sort_values(
        by=["pickup_datetime", 0],
        ascending=[True, False],
    )

In [6]:
# Re-enter the cluster context bound to `remote`, call each query function
# (q1..q4) on `df`, and print what it returns.
# NOTE(review): relies on `remote` and `df` from the earlier `create_cluster`
# cell; if that cell failed, neither name exists and this cell cannot run.
with remote:
    for query in (q1, q2, q3, q4):
        print(query(df))

10000