In [1]:
# Last updated 12/20/19

# This script loads 2 CUDA and 2 Pandas data frames, then concatenates each pair, timing every step and comparing
# the CUDA run time to the Pandas run time.

# In the comments, the term 'process' is used to refer to executing a method.

# Imports and setting variables
import cudf
from cudf import io
import matplotlib.pyplot as plt
from matplotlib import rcParams as rcp
import numpy as np
import pandas as pd
import time

# Configuration

# Directory that holds the input csv files
data_path = '/home/pace/mimiciii/'

# Available input files:
#   MICROBIOLOGYEVENTS.csv
#   MICROBIOLOGYEVENTS_5_Million.csv
#   MICROBIOLOGYEVENTS_10_Million.csv
#   MICROBIOLOGYEVENTS_20_Million.csv
#   MICROBIOLOGYEVENTS_40_Million.csv
# input_file_name   -> the file whose data frame has the 2nd data frame concatenated onto it
# input_concat_file -> the file whose data frame is concatenated onto the first data frame
input_file_name = 'MICROBIOLOGYEVENTS.csv'
input_concat_file = 'MICROBIOLOGYEVENTS_5_Million.csv'

# Per-process result accumulators, kept so the values can be copied and pasted elsewhere when several
# experiments are run against the same input file
gdf_results_list, pdf_results_list, number_of_rows_list = [], [], []

# Column names shared by both input files
col_names = [
    'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE',
    'CHARTTIME', 'SPEC_ITEMID', 'SPEC_TYPE_DESC', 'ORG_ITEMID',
    'ORG_NAME', 'ISOLATE_NUM', 'AB_ITEMID', 'AB_NAME',
    'DILUTION_TEXT', 'DILUTION_COMPARISON', 'DILUTION_VALUE', 'INTERPRETATION',
]

In [2]:
# Functions

def get_start_time(print_text, file_name, df_type):
  """Announce that a process is starting and return the current time.

  Args:
    print_text: Word describing the process, inserted into the message (e.g. 'loading').
    file_name: Name of the file the process works on; converted with str() for display.
    df_type: 'GDF' selects the CUDA dataframe message, 'PDF' the Pandas dataframe
      message. Any other value prints nothing.

  Returns:
    The value of time.time(), read after the message (if any) has been printed.
  """
  # Pick the message template that matches the dataframe type
  if df_type == 'GDF':
    template = 'Start {0} the CUDA dataframe for {1}.'
  elif df_type == 'PDF':
    template = 'Start {0} the Pandas dataframe for {1}.'
  else:
    template = None

  if template is not None:
    print(template.format(print_text, str(file_name)))
  return time.time()

def calculate_total_time_and_print_message(start_time, print_text, file_name, num_rows, num_cols, df_type):
  """Compute how long a process took, print a completion report, and return the duration.

  Args:
    start_time: Value previously obtained from time.time() when the process began.
    print_text: Word describing the process, inserted into the message (e.g. 'loading').
    file_name: Name of the file the process worked on; converted with str() for display.
    num_rows: Row count of the resulting dataframe; converted with str() for display.
    num_cols: Column count of the resulting dataframe; converted with str() for display.
    df_type: 'GDF' selects the CUDA dataframe messages, 'PDF' the Pandas dataframe
      messages. Any other value prints nothing.

  Returns:
    Elapsed time in seconds (time.time() at entry to this function minus start_time).
  """
  # Read the clock first so that message formatting is not counted in the run time
  total_time = time.time() - start_time

  if df_type == 'GDF':
    finished_template = 'Finished {0} the CUDA dataframe for {1} in {2:0.4f} seconds.'
    shape_template = 'The CUDA dataframe has {0} rows & {1} columns.\n'
  elif df_type == 'PDF':
    finished_template = 'Finished {0} the Pandas dataframe for {1} in {2:0.4f} seconds.'
    shape_template = 'The Pandas dataframe has {0} rows & {1} columns.\n'
  else:
    # Unrecognised dataframe type: nothing is printed, the duration is still returned
    return total_time

  print(finished_template.format(print_text, str(file_name), total_time))
  print(shape_template.format(str(num_rows), str(num_cols)))
  return total_time

def print_comparison(print_text, gdf_time, pdf_time):
  """Print which dataframe library was faster and by what factor.

  Args:
    print_text: Past-tense word describing the process, inserted into the message
      (e.g. 'loaded').
    gdf_time: Run time in seconds measured for the CUDA dataframe.
    pdf_time: Run time in seconds measured for the Pandas dataframe.

  Returns:
    None. The result is written to stdout.

  Notes:
    Equal timings are reported as a tie instead of naming Pandas as '1.00x faster'.
    A faster time of zero (which a coarse clock can produce for very short runs) is
    reported as an 'inf' speed-up instead of raising ZeroDivisionError.
  """
  if gdf_time == pdf_time:
    # Covers the case where both timings are 0.0, which would otherwise divide 0 by 0
    print('*** The CUDA and Pandas dataframes {0} in the same amount of time. ***\n\n'.format(print_text))
  elif gdf_time < pdf_time:
    # Guard the division: the faster time is the denominator and may be zero
    speedup = pdf_time / gdf_time if gdf_time > 0 else float('inf')
    print('*** The CUDA dataframe {0} {1:0.2f}x faster than the Pandas data frame. ***\n\n'.format(print_text,
                                                                                                   speedup))
  else:
    speedup = gdf_time / pdf_time if pdf_time > 0 else float('inf')
    print('*** The Pandas dataframe {0} {1:0.2f}x faster than the CUDA data frame. ***\n\n'.format(print_text,
                                                                                                   speedup))

In [3]:
# Build a CUDA (cudf) and a Pandas data frame from each MICROBIOLOGYEVENTS csv file, timing every read

# Explicit column types handed to cudf.read_csv via dtype=; the pd.read_csv calls below pass no dtype
col_types = ['int', 'int', 'int', 'str',
             'str', 'int', 'str', 'int',
             'str', 'int', 'int', 'str',
             'str', 'str', 'float', 'str']

print_text = 'loading'
final_text = 'loaded'

# ---------------------------------------------------------------------------
# Input file
# ---------------------------------------------------------------------------

# GDF (CUDA data frame): announce the start, read the csv, then report elapsed time and shape
start_time_gdf_input = get_start_time(print_text=print_text, file_name=input_file_name, df_type='GDF')

MICROBIOLOGYEVENTS_gdf = cudf.read_csv(
    data_path + input_file_name,
    delimiter=',',
    names=col_names,
    dtype=col_types,
    skiprows=1,
)

gdf_total_input_time_process_time = calculate_total_time_and_print_message(
    start_time=start_time_gdf_input,
    print_text=print_text,
    file_name=input_file_name,
    num_rows=len(MICROBIOLOGYEVENTS_gdf),
    num_cols=len(MICROBIOLOGYEVENTS_gdf.columns),
    df_type='GDF',
)

# PDF (Pandas data frame): same sequence for the same file
start_time_pdf_input = get_start_time(print_text=print_text, file_name=input_file_name, df_type='PDF')

MICROBIOLOGYEVENTS_pdf = pd.read_csv(
    data_path + input_file_name,
    names=col_names,
    skiprows=1,
)

pdf_total_input_time_process_time = calculate_total_time_and_print_message(
    start_time=start_time_pdf_input,
    print_text=print_text,
    file_name=input_file_name,
    num_rows=len(MICROBIOLOGYEVENTS_pdf),
    num_cols=len(MICROBIOLOGYEVENTS_pdf.columns),
    df_type='PDF',
)

# Report which library loaded the input file faster
print_comparison(final_text, gdf_total_input_time_process_time, pdf_total_input_time_process_time)


# ---------------------------------------------------------------------------
# Concat file
# ---------------------------------------------------------------------------

# GDF (CUDA data frame) for the file that will be concatenated onto the input data frame
start_time_gdf_concat = get_start_time(print_text=print_text, file_name=input_concat_file, df_type='GDF')

MICROBIOLOGYEVENTS_concat_gdf = cudf.read_csv(
    data_path + input_concat_file,
    delimiter=',',
    names=col_names,
    dtype=col_types,
    skiprows=1,
)

gdf_total_concat_time_process_time = calculate_total_time_and_print_message(
    start_time=start_time_gdf_concat,
    print_text=print_text,
    file_name=input_concat_file,
    num_rows=len(MICROBIOLOGYEVENTS_concat_gdf),
    num_cols=len(MICROBIOLOGYEVENTS_concat_gdf.columns),
    df_type='GDF',
)

# PDF (Pandas data frame) for the same concat file
start_time_pdf_concat = get_start_time(print_text=print_text, file_name=input_concat_file, df_type='PDF')

MICROBIOLOGYEVENTS_concat_pdf = pd.read_csv(
    data_path + input_concat_file,
    names=col_names,
    skiprows=1,
)

pdf_total_concat_time_process_time = calculate_total_time_and_print_message(
    start_time=start_time_pdf_concat,
    print_text=print_text,
    file_name=input_concat_file,
    num_rows=len(MICROBIOLOGYEVENTS_concat_pdf),
    num_cols=len(MICROBIOLOGYEVENTS_concat_pdf.columns),
    df_type='PDF',
)

# Report which library loaded the concat file faster
print_comparison(final_text, gdf_total_concat_time_process_time, pdf_total_concat_time_process_time)


Start loading the CUDA dataframe for MICROBIOLOGYEVENTS.csv.
Finished loading the CUDA dataframe for MICROBIOLOGYEVENTS.csv in 0.2131 seconds.
The CUDA dataframe has 631726 rows & 16 columns.

Start loading the Pandas dataframe for MICROBIOLOGYEVENTS.csv.
Finished loading the Pandas dataframe for MICROBIOLOGYEVENTS.csv in 0.8940 seconds.
The Pandas dataframe has 631726 rows & 16 columns.

*** The CUDA dataframe loaded 4.19x faster than the Pandas data frame. ***


Start loading the CUDA dataframe for MICROBIOLOGYEVENTS_5_Million.csv.
Finished loading the CUDA dataframe for MICROBIOLOGYEVENTS_5_Million.csv in 0.6673 seconds.
The CUDA dataframe has 5000000 rows & 16 columns.

Start loading the Pandas dataframe for MICROBIOLOGYEVENTS_5_Million.csv.
Finished loading the Pandas dataframe for MICROBIOLOGYEVENTS_5_Million.csv in 7.0800 seconds.
The Pandas dataframe has 5000000 rows & 16 columns.

*** The CUDA dataframe loaded 10.61x faster than the Pandas data frame. ***




In [4]:
# Concatenate the concat-file data frame onto the input-file data frame, once with cudf and once with Pandas,
# timing each concatenation

print_text = 'concating'
final_text = 'concated'

# GDF (CUDA data frame): announce the start, concatenate, then report elapsed time and shape
start_time = get_start_time(print_text=print_text, file_name=input_file_name, df_type='GDF')

concat_gdf = cudf.concat([MICROBIOLOGYEVENTS_gdf, MICROBIOLOGYEVENTS_concat_gdf])

gdf_total_time_process_time = calculate_total_time_and_print_message(
    start_time=start_time,
    print_text=print_text,
    file_name=input_file_name,
    num_rows=len(concat_gdf),
    num_cols=len(concat_gdf.columns),
    df_type='GDF',
)

# PDF (Pandas data frame): same sequence; start_time is deliberately reused for the new measurement
start_time = get_start_time(print_text=print_text, file_name=input_file_name, df_type='PDF')

concat_pdf = pd.concat([MICROBIOLOGYEVENTS_pdf, MICROBIOLOGYEVENTS_concat_pdf])

pdf_total_time_process_time = calculate_total_time_and_print_message(
    start_time=start_time,
    print_text=print_text,
    file_name=input_file_name,
    num_rows=len(concat_pdf),
    num_cols=len(concat_pdf.columns),
    df_type='PDF',
)

# Report which library concatenated faster
print_comparison(final_text, gdf_total_time_process_time, pdf_total_time_process_time)

Start concating the CUDA dataframe for MICROBIOLOGYEVENTS.csv.
Finished concating the CUDA dataframe for MICROBIOLOGYEVENTS.csv in 0.2293 seconds.
The CUDA dataframe has 5631726 rows & 16 columns.

Start concating the Pandas dataframe for MICROBIOLOGYEVENTS.csv.
Finished concating the Pandas dataframe for MICROBIOLOGYEVENTS.csv in 0.9825 seconds.
The Pandas dataframe has 5631726 rows & 16 columns.

*** The CUDA dataframe concated 4.28x faster than the Pandas data frame. ***


