# Exploring MODIN as an alternative to Pandas

#### Installing MODIN

In [None]:
!pip install "modin[all]"  # Install Modin together with all of its optional engine extras

#### Importing libraries

In [34]:
# `time` supplies the clock used by every timing cell in this notebook.
import time

import os
# Choose the Modin execution engine through the MODIN_ENGINE environment
# variable. The assignment is placed ahead of the `import modin.pandas` line
# below. The alternative engines are kept commented out for reference; only
# one MODIN_ENGINE assignment is active.
# os.environ["MODIN_ENGINE"] = "unidist" # Modin will use Unidist
# os.environ["UNIDIST_BACKEND"] = "mpi" # Unidist will use MPI backend
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
# os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask
import modin.pandas as md

# Plain pandas, used as the baseline in the comparisons.
import pandas as pd

#### Loading the Dataset

MODIN

In [47]:
def read_data():
    """Read the ratings and movie CSV files with Modin and time each stage.

    The two ratings files and the movie-info file are read first (timed as
    one stage), then the two ratings frames are concatenated (timed as a
    second stage). Both durations are printed.

    Returns:
        tuple: ``(modin_ratings, modin_movies, read_m, concat_m)`` -- the
        concatenated ratings frame, the movie-info frame, the elapsed seconds
        for the three reads, and the elapsed seconds for the concat.
    """
    # perf_counter() is the clock meant for measuring short intervals; unlike
    # time.time() it is monotonic and uses the highest available resolution.
    # NOTE(review): Modin's engines may run work asynchronously, so the timer
    # could stop before the data is fully materialised -- confirm against
    # Modin's benchmarking guidance before trusting these numbers.
    read_started = time.perf_counter()
    ratings_part1 = md.read_csv('data/data_archive/data_v2/ratings_v2_1.csv')
    ratings_part2 = md.read_csv('data/data_archive/data_v2/ratings_v2_2.csv')
    movies = md.read_csv('data/data_archive/data_v2/movie_info_v2.csv')
    read_m = time.perf_counter() - read_started
    print("Reading time:", read_m, "seconds")

    concat_started = time.perf_counter()
    ratings = md.concat([ratings_part1, ratings_part2])
    concat_m = time.perf_counter() - concat_started
    print("Concat time:", concat_m, "seconds")

    return ratings, movies, read_m, concat_m


modin_ratings, modin_movies, read_m, concat_m = read_data()

Reading time: 0.7391571998596191 seconds
Concat time: 0.008408069610595703 seconds


PANDAS

In [49]:
# NOTE: this cell reuses the name `read_data`; once it runs, it replaces the
# Modin version defined in the earlier cell.
def read_data():
    """Read the ratings and movie CSV files with pandas and time each stage.

    The two ratings files and the movie-info file are read first (timed as
    one stage), then the two ratings frames are concatenated (timed as a
    second stage). Both durations are printed.

    Returns:
        tuple: ``(pd_ratings, pd_movies, read_p, concat_p)`` -- the
        concatenated ratings frame, the movie-info frame, the elapsed seconds
        for the three reads, and the elapsed seconds for the concat.
    """
    # perf_counter() is the clock meant for measuring short intervals; unlike
    # time.time() it is monotonic and uses the highest available resolution.
    read_started = time.perf_counter()
    ratings_part1 = pd.read_csv('data/data_archive/data_v2/ratings_v2_1.csv')
    ratings_part2 = pd.read_csv('data/data_archive/data_v2/ratings_v2_2.csv')
    movies = pd.read_csv('data/data_archive/data_v2/movie_info_v2.csv')
    read_p = time.perf_counter() - read_started
    print("Reading time:", read_p, "seconds")

    concat_started = time.perf_counter()
    ratings = pd.concat([ratings_part1, ratings_part2])
    concat_p = time.perf_counter() - concat_started
    print("Concat time:", concat_p, "seconds")

    return ratings, movies, read_p, concat_p


pd_ratings, pd_movies, read_p, concat_p = read_data()

Reading time: 1.5362989902496338 seconds
Concat time: 0.10606575012207031 seconds


#### Apply function

MODIN

In [51]:
def round1():
    """Time applying the built-in ``round`` to the Modin ``rate`` column.

    Reads the module-level ``modin_ratings`` frame, applies ``round`` to each
    value of its ``'rate'`` column via ``.apply`` and prints the elapsed time.

    Returns:
        tuple: ``(rounded, round_m)`` -- the result of the apply call and the
        elapsed seconds.
    """
    # perf_counter() is monotonic and high-resolution, which suits a
    # sub-second measurement better than the wall clock from time.time().
    # NOTE(review): Modin may evaluate asynchronously, so this may under-report
    # the real cost -- confirm against Modin's benchmarking guidance.
    started = time.perf_counter()
    rounded = modin_ratings['rate'].apply(round)
    round_m = time.perf_counter() - started
    print("Rounding time: ", round_m, "seconds")
    return rounded, round_m


ratings_per_movie_md, round_m = round1()

Rounding time:  0.27428483963012695 seconds


PANDAS

In [52]:
# NOTE: this cell reuses the name `round1`; once it runs, it replaces the
# Modin version defined in the earlier cell.
def round1():
    """Time applying the built-in ``round`` to the pandas ``rate`` column.

    Reads the module-level ``pd_ratings`` frame, applies ``round`` to each
    value of its ``'rate'`` column via ``.apply`` and prints the elapsed time.

    Returns:
        tuple: ``(rounded, round_p)`` -- the result of the apply call and the
        elapsed seconds.
    """
    # perf_counter() is monotonic and high-resolution, which suits a
    # sub-second measurement better than the wall clock from time.time().
    started = time.perf_counter()
    rounded = pd_ratings['rate'].apply(round)
    round_p = time.perf_counter() - started
    print("Rounding time: ", round_p, "seconds")
    return rounded, round_p


ratings_per_movie_pd, round_p = round1()

Rounding time:  0.3562281131744385 seconds


#### Merging ratings and movie dataframes

MODIN

In [59]:
def merging():
    """Time an inner merge of the Modin ratings and movie frames.

    Joins the module-level ``modin_ratings`` and ``modin_movies`` frames on
    their ``movie_id`` columns and prints the elapsed time.

    Returns:
        tuple: ``(merged_md, merge_m)`` -- the merged frame and the elapsed
        seconds.
    """
    # perf_counter() is monotonic and high-resolution, which suits a
    # sub-second measurement better than the wall clock from time.time().
    # NOTE(review): Modin may evaluate asynchronously, so this may under-report
    # the real cost -- confirm against Modin's benchmarking guidance.
    started = time.perf_counter()
    merged = md.merge(
        modin_ratings,
        modin_movies,
        how='inner',
        left_on='movie_id',
        right_on='movie_id',
    )
    merge_m = time.perf_counter() - started
    # Label the measurement as a merge; this step does not read any file.
    print("Merge time:", merge_m, "seconds")
    return merged, merge_m


merged_md, merge_m = merging()

Reading time:  0.10237002372741699 seconds


PANDAS

In [60]:
# NOTE: this cell reuses the name `merging`; once it runs, it replaces the
# Modin version defined in the earlier cell.
def merging():
    """Time an inner merge of the pandas ratings and movie frames.

    Joins the module-level ``pd_ratings`` and ``pd_movies`` frames on their
    ``movie_id`` columns and prints the elapsed time.

    Returns:
        tuple: ``(merged_pd, merge_p)`` -- the merged frame and the elapsed
        seconds.
    """
    # perf_counter() is monotonic and high-resolution, which suits a
    # sub-second measurement better than the wall clock from time.time().
    started = time.perf_counter()
    merged = pd.merge(
        pd_ratings,
        pd_movies,
        how='inner',
        left_on='movie_id',
        right_on='movie_id',
    )
    merge_p = time.perf_counter() - started
    # Label the measurement as a merge; this step does not read any file.
    print("Merge time:", merge_p, "seconds")
    return merged, merge_p


merged_pd, merge_p = merging()

Reading time:  1.0103511810302734 seconds


#### Get mean of Ratings

MODIN

In [77]:
def get_mean():
    """Time computing the mean of the ``rate`` column of the merged Modin frame.

    Reads the module-level ``merged_md`` frame, calls ``.mean()`` on its
    ``'rate'`` column and prints the elapsed time.

    Returns:
        tuple: ``(mean_rate_md, mean_m)`` -- the value returned by ``.mean()``
        and the elapsed seconds.
    """
    # perf_counter() is monotonic and high-resolution, which suits a
    # sub-second measurement better than the wall clock from time.time().
    started = time.perf_counter()
    mean_rate = merged_md['rate'].mean()
    mean_m = time.perf_counter() - started
    print("Mean time:", mean_m, "seconds")
    return mean_rate, mean_m


mean_rate_md, mean_m = get_mean()

Mean time: 0.23832416534423828 seconds


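PANDAS

(This notebook has no pandas cell for the mean, so the mean timing is not part of the comparison table below.)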

#### Comparison Statistics

In [79]:
# Elapsed seconds for each benchmarked step, stored as [pandas, modin] pairs.
stats = {
    'Reading time': [read_p, read_m],
    'Concat time': [concat_p, concat_m],
    'Apply time': [round_p, round_m],
    'Merge time': [merge_p, merge_m],
}

# One left-aligned, fixed-width row template shared by the header and the
# data rows, so the three columns line up.
row_layout = "{:<15} {:<30} {:<30}"
print(row_layout.format('Metric', 'Pandas', 'Modin'))
for metric, (pandas_secs, modin_secs) in stats.items():
    print(row_layout.format(metric, pandas_secs, modin_secs))

Metric          Pandas                         Modin                         
Reading time    1.5362989902496338             0.7391571998596191            
Concat time     0.10606575012207031            0.008408069610595703          
Apply time      0.3562281131744385             0.27428483963012695           
Merge time      1.0103511810302734             0.10237002372741699           
