In [None]:
# =============================================================
# Copyright © 2020 Intel Corporation
# 
# SPDX-License-Identifier: MIT
# =============================================================

# Modin Getting Started Example for Distributed Pandas

## Importing and Organizing Data

In this example we will be generating a **synthetic dataset** and **demonstrating stock Pandas operations running with Modin**.

Let's start by **importing** all the necessary packages and modules.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time

In [None]:
# ****** Do not change the code in this cell! It verifies that the notebook is being run correctly! ******

def verify_and_print_times(pandas_time, modin_time):
    """Report which of two measured wall-clock times was faster.

    Parameters
    ----------
    pandas_time : float
        Elapsed seconds measured for the stock pandas operation.
    modin_time : float
        Elapsed seconds measured for the same operation run with Modin.

    Prints a one-line speedup message when Modin was faster. Otherwise prints
    a troubleshooting message followed by whatever Modin configuration can be
    read from the installed Modin version. Returns ``None`` in both cases.
    """

    def _ratio(slower, faster):
        # A coarse clock can report an elapsed time of exactly 0.0 for a very
        # fast operation; dividing by it would raise ZeroDivisionError.
        if faster > 0:
            return slower / faster
        return float("inf") if slower > 0 else 1.0

    if modin_time < pandas_time:
        print(f"Modin was {_ratio(pandas_time, modin_time):.2f}X faster than stock pandas!")
        return
    print(
        f"Oops, stock pandas appears to be {_ratio(modin_time, pandas_time):.2f}X faster than Modin in this case. "
        "This is unlikely but could happen sometimes on certain machines/environments/datasets. "
        "One of the most probable reasons is the excessive amount of partitions being assigned to a single worker. "
        "You may visit Modin's optimization guide in order to learn more about such cases and how to fix them: "
        "\nhttps://modin.readthedocs.io/en/latest/usage_guide/optimization_notes/index.html\n\n"
        "But first, verify that you're using the latest Modin version, also, try to use different executions, "
        "for basic usage we recommend non-experimental 'PandasOnRay'.\n"
        "Current configuration is:"
    )
    # The helper that reports the execution was renamed between Modin
    # releases, so try the newer name first and fall back to the older one.
    try:
        from modin.utils import get_current_execution

        execution = get_current_execution()
    except ImportError:
        # for modin version < 0.12.0
        try:
            from modin.utils import get_current_backend

            execution = get_current_backend()
        except ImportError:
            # for modin versions < 0.8.1
            execution = (
                "Can't deduce the current execution, your Modin version is too old!"
            )
    print(f"\tExecution: {execution}")
    try:
        import modin.config as cfg

        print(
            f"\tIs experimental: {cfg.IsExperimental.get()}\n"
            f"\tNumber of CPUs to utilize by Modin (check that Modin uses all CPUs on your machine): {cfg.CpuCount.get()}\n"
            f"\tIs in debug mode (debug mode may perform slower): {cfg.IsDebug.get()}"
        )
    except (ImportError, AttributeError):
        # for modin versions < 0.8.2
        print("\tCan't deduce Modin configuration, your Modin version is too old!")
    import modin

    print(f"\tModin version: {modin.__version__}")

## How to Use Modin

We will import **stock pandas as `pandas`** and **Modin as `pd`** so that the two libraries are easy to tell apart in the code below. As you can see, importing Modin is simple and **does not require any additional steps.**

In [None]:
import pandas

**Note: Uncomment the cell below if you are running on Intel oneAPI DevCloud.**

In [None]:
# import ray
# ray.shutdown()
# ray.init(_memory=16000 * 1024 * 1024, object_store_memory=500 * 1024 * 1024,_driver_object_store_memory=500 * 1024 * 1024)

In [None]:
import modin.pandas as pd

We will now **generate a synthetic dataset** using NumPy to use with Modin and save it to a CSV.

In [None]:
# Build a 2**18 x 2**8 table of random integers drawn from [100, 10000).
n_rows, n_cols = 2**18, 2**8
array = np.random.randint(100, 10000, size=(n_rows, n_cols))

# Write the table to disk as comma-separated text (no `fmt` is passed, so
# np.savetxt uses its default number format).
np.savetxt("foo.csv", array, delimiter=",")

Now we will read the CSV file into a dataframe and display the first five rows.
For **stock pandas, the dataframe is stored as `pandas_df`** and for **Modin, the same data is stored as `modin_df`**.
Let's try running the following cell with stock pandas first.

In [None]:
%%time
pandas_df = pandas.read_csv("foo.csv", names=["col{}".format(i) for i in range(256)])
pandas_df.head()

Now let's run the same code, but use **Modin instead of stock Pandas.**

**Note the speedup!**

In [None]:
%%time
modin_df=pd.read_csv("foo.csv", names=["col{}".format(i) for i in range(256)])
modin_df.head()

Let's now **visualize** this speedup from Modin with a plot!

In [None]:
def plotter(outputdict):
    """Show a bar chart of runtimes.

    ``outputdict`` maps a label (drawn on the x axis) to a runtime in seconds
    (the bar height). The figure is displayed immediately; nothing is returned.
    """
    labels = outputdict.keys()
    runtimes = outputdict.values()

    plt.figure(figsize=(10, 5))
    plt.bar(labels, runtimes, width=0.4, color='blue')
    plt.ylabel("Runtime(seconds)")
    plt.xlabel("Python Package")
    plt.show()

In [None]:
# Time the same CSV read with stock pandas and with Modin, then report and plot.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump when the system clock is adjusted.
# The column names are built once, outside the timed regions.
column_names = [f"col{i}" for i in range(256)]

t0 = time.perf_counter()
pandas_df = pandas.read_csv("foo.csv", names=column_names)
pandas_time = time.perf_counter() - t0

t1 = time.perf_counter()
modin_df = pd.read_csv("foo.csv", names=column_names)
modin_time = time.perf_counter() - t1

print("Pandas Time(seconds):",pandas_time,"\nModin Time(seconds):",modin_time)
verify_and_print_times(pandas_time, modin_time)
outputDict={"Pandas":pandas_time,"Modin":modin_time}
plotter(outputDict)

## Other DataFrame Function Performance Example
We will now show the speedup in performance from Modin compared to stock Pandas with a few common functions.

Like before, **`pandas_df` is for stock Pandas**, **`modin_df` is for Modin**.

### `df.mean()`

In [None]:
# Column-wise mean with stock pandas.
# perf_counter() is monotonic and high-resolution, so very short runs are not
# reported as 0.0 the way they can be with time.time().
t2 = time.perf_counter()
pandas_df.mean(axis=0)
pandas_time = time.perf_counter() - t2
print(" stock Pandas wall time for completion in seconds:",pandas_time)

In [None]:
# Column-wise mean with Modin.
# perf_counter() is monotonic and high-resolution, so very short runs are not
# reported as 0.0 the way they can be with time.time().
# NOTE(review): some Modin engines may run operations asynchronously, in which
# case this call could return before the result is computed. Confirm whether
# the result must be materialised for a fair timing.
t3 = time.perf_counter()
modin_df.mean(axis=0)
modin_time = time.perf_counter() - t3
print("Modin wall time for completion in seconds:",modin_time)

In [None]:
verify_and_print_times(pandas_time, modin_time)

### `df.applymap`

In [None]:
# Element-wise apply with stock pandas: add 1 to every value.
# The result is printed inside the timed region, so rendering is included.
# NOTE(review): recent pandas releases deprecate `applymap` in favour of
# `DataFrame.map`. Confirm against the pandas version this sample targets.
t6 = time.perf_counter()  # monotonic, high-resolution interval clock
print(pandas_df.applymap(lambda x: x + 1))
pandas_time = time.perf_counter() - t6
print(" stock Pandas wall time for completion in seconds:",pandas_time)

In [None]:
# Element-wise apply with Modin: add 1 to every value.
# The result is printed inside the timed region, so rendering is included.
t7 = time.perf_counter()  # monotonic, high-resolution interval clock
print(modin_df.applymap(lambda x: x + 1))
modin_time = time.perf_counter() - t7
print("Modin wall time for completion in seconds:",modin_time)

In [None]:
verify_and_print_times(pandas_time, modin_time)

### `pd.concat([df, df])`

In [None]:
# Stack the stock pandas dataframe on top of itself (row-wise concat).
# The result is printed inside the timed region, so rendering is included.
t8 = time.perf_counter()  # monotonic, high-resolution interval clock
print(pandas.concat([pandas_df, pandas_df], axis=0))
pandas_time = time.perf_counter() - t8
print("stock Pandas wall time for completion in seconds:",pandas_time)

In [None]:
# Stack the Modin dataframe on top of itself (row-wise concat).
# The result is printed inside the timed region, so rendering is included.
t9 = time.perf_counter()  # monotonic, high-resolution interval clock
print(pd.concat([modin_df, modin_df], axis=0))
modin_time = time.perf_counter() - t9
print("Modin wall time for completion in seconds:",modin_time)

In [None]:
verify_and_print_times(pandas_time, modin_time)

## Modin Coverage Examples 
The Modin package supports a large variety of Pandas functions.
Here are some examples:

### Count

In [None]:
# Per-column count on the Modin dataframe (pandas `DataFrame.count` API).
modin_df.count()

### Filter

In [None]:
# Keep only the columns (axis=1) whose name matches the regex '0$',
# i.e. whose name ends in "0".
modin_df.filter(regex='0$', axis=1)

### iloc

In [None]:
# Positional indexing examples. A notebook cell only renders its last bare
# expression, so the earlier results are passed to display() explicitly;
# otherwise they would be computed and silently discarded.
display(modin_df.iloc[0])      # first row
display(modin_df.iloc[-1])     # last row
display(modin_df.iloc[:,0])    # first column
modin_df.iloc[:,-1]            # last column

### Series

In [None]:
# Build a Modin Series of five random normal values labelled 'a'..'e'.
s = pd.Series(
    np.random.randn(5),
    index=list("abcde"),
)

### DataFrame to NumPy Array

In [None]:
# Convert the whole Modin dataframe to a NumPy array.
modin_df.to_numpy()

### Series to NumPy Array

In [None]:
# A two-element, timezone-aware (CET) datetime Series converted to NumPy twice.
ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
# Only the last bare expression of a cell is rendered, so the first conversion
# is passed to display() explicitly; otherwise its result would be discarded.
display(ser.to_numpy(dtype=object))
ser.to_numpy(dtype="datetime64[ns]")

### Set Options

In [None]:
# Switch off both optional compute accelerators through the options API.
for option_name in ('compute.use_bottleneck', 'compute.use_numexpr'):
    pd.set_option(option_name, False)

### Unique Function for Series

In [None]:
# Distinct values of a small Series that contains a duplicate (3 appears twice).
pd.unique(pd.Series([2, 1, 3, 3]))

In [None]:
# Completion marker, printed once every cell above has run.
# NOTE(review): the exact text (including its spelling) is presumably matched
# by automated sample checks. Confirm before editing the string.
print("[CODE_SAMPLE_COMPLETED_SUCCESFULLY]")