In [1]:
# Last updated 12/20/19
# RAPIDS Benchmarking

# This script performs the following methods on both CUDA and Pandas data frames:
# 1. Creating data frames and populating with data from the MICROBIOLOGYEVENTS csv input files
# 2. Getting count of unique values in a column (SPEC_ITEMID)
# 3. Counting the number of rows for each unique value in a column (SPEC_ITEMID)
# 4. Selecting n smallest and largest values for a column (SUBJECT_ID)
# 5. Selecting specific rows by index
# 6. Sorting values based on a column (CHARTDATE)
# 7. Adding a column (NEW_BLANK_COLUMN)
# 8. Adding a new column with calculated value (HADM_ID_times_2 = HADM_ID * 2)
# 9. Dropping a column (SPEC_ITEMID)

# In the comments, the term 'process' is used to refer to executing a method.

# Imports and setting variables

import cudf
from cudf import io
import matplotlib.pyplot as plt
from matplotlib import rcParams as rcp
import numpy as np
import pandas as pd
import time

# Set the variables
data_path = '/home/pace/mimiciii/'
# File names to use as input are:
# MICROBIOLOGYEVENTS.csv, MICROBIOLOGYEVENTS_5_Million.csv, MICROBIOLOGYEVENTS_10_Million.csv, 
# MICROBIOLOGYEVENTS_20_Million.csv, MICROBIOLOGYEVENTS_40_Million.csv
input_file_name = 'MICROBIOLOGYEVENTS.csv'

# Set values to determine if tests should be run.  Y indicates run, N indicates do not run
# This is not completely necessary, but allows you to only run the benchmarks you are interested
# in for a particular experiment
unique_count = 'Y' # Count of unique values in column SPEC_ITEMID
number_of_rows_with_unique_values_for_the_column = 'Y' # Number of rows in column SPEC_ITEMID that have a unique value
select_n_largest_and_smallest = 'Y' # n smallest and largest values for column SUBJECT_ID
n_smallest_and_largest_values = 5 # of smallest and largest values to select
select_specific_rows_by_index = 'Y' # specific rows by row index
run_sort = 'Y' # Sort based on column CHARTDATE
add_column = 'Y' # Add blank column named NEW_BLANK_COLUMN
add_column_calculated_value = 'Y' # Add new column and populate with values of HADM_ID * 2
drop_column = 'Y' # Drop SPEC_ITEMID


# Lists that save the values of each process so they can easily be copied and pasted elsewhere when running multiple
# experiments on the same input file
gdf_results_list = []
pdf_results_list = []

# Specify the column names for the input file
col_names = ['ROW_ID','SUBJECT_ID','HADM_ID','CHARTDATE','CHARTTIME','SPEC_ITEMID',
             'SPEC_TYPE_DESC','ORG_ITEMID','ORG_NAME','ISOLATE_NUM','AB_ITEMID','AB_NAME',
             'DILUTION_TEXT','DILUTION_COMPARISON','DILUTION_VALUE','INTERPRETATION']

In [2]:
# Functions

# Get the start time for a process and display message that the process has started
def get_start_time(print_text, input_file_name, df_type):
  if (df_type == 'GDF'):
    print('Start {0} the CUDA dataframe for {1}.'.format(print_text, str(input_file_name)))
  if (df_type == 'PDF'):
    print('Start {0} the Pandas dataframe for {1}.'.format(print_text, str(input_file_name)))
  return time.time()

# Calculate the total run time of the process then print a message stating process is 
# finished and display run time
def calculate_total_time_and_print_message(start_time, print_text, input_file_name, num_rows, num_cols, df_type):
  end_time = time.time()
  total_time = end_time - start_time
  # Print message for GDF
  if (df_type == 'GDF'):
    print('Finished {0} the CUDA dataframe for {1} in {2:0.4f} seconds.'.format(print_text, 
                                                                           str(input_file_name), total_time))
    print('The CUDA dataframe has {0} rows & {1} columns.\n'.format(str(num_rows), str(num_cols)))
  # Print message for PDF
  if (df_type == 'PDF'):
    print('Finished {0} the Pandas dataframe for {1} in {2:0.4f} seconds.'.format(print_text, 
                                                                           str(input_file_name), total_time))
    print('The Pandas dataframe has {0} rows & {1} columns.\n'.format(str(num_rows), str(num_cols)))
  # Return the total time
  return total_time

# Print the comparison of the GDF to the PDF
def print_comparison(print_text, gdf_time, pdf_time):
  if(gdf_time < pdf_time):
    print('*** The CUDA dataframe {0} {1:0.2f}x faster than the Pandas data frame. ***\n\n'.format(print_text, 
                                                                                                   (pdf_time/gdf_time)))
  else:
    print('*** The Pandas dataframe {0} {1:0.2f}x faster than the CUDA data frame. ***\n\n'.format(print_text, 
                                                                                                   (gdf_time/pdf_time)))

In [3]:
# Create CUDA and Pandas data frames and populate with data from MICROBIOLOGYEVENTS csv files

# Create GDF (CUDA dataframes) for MICROBIOLOGYEVENTS
print_text = 'loading'
final_text = 'loaded'
# Set start time for process and display message
start_time = get_start_time(print_text, input_file_name, 'GDF')

# Column types have to be set for gdf.  It does not automatically figure them out like Pandas does
col_types = ['int', 'int', 'int', 'str', 'str', 'int', 'str', 'int', 'str', 'int', 'int', 'str', 'str', 
             'str', 'float', 'str']
# Create the CUDA data frame
MICROBIOLOGYEVENTS_gdf = cudf.read_csv(str(data_path + input_file_name), delimiter=',',  
                                       names=col_names, dtype=col_types, skiprows=1)
 
# Calculate total process time and print the message
gdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text, 
                                                                 input_file_name, str(len(MICROBIOLOGYEVENTS_gdf)), 
                                                                 str(len(MICROBIOLOGYEVENTS_gdf.columns)), 'GDF')

# Add the time to the list that keeps the results
gdf_results_list.append(gdf_total_time_process_time)  

# Create PDF (Pandas dataframe) for MICROBIOLOGYEVENTS
# Set start time for process and display message
start_time = get_start_time(print_text, input_file_name, 'PDF')

# Create the Pandas data frame
MICROBIOLOGYEVENTS_pdf = pd.read_csv(str(data_path + input_file_name), names=col_names, skiprows=1)

# Calculate total process time and print the message
pdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text, 
                                                                 input_file_name, str(len(MICROBIOLOGYEVENTS_pdf)), 
                                                                 str(len(MICROBIOLOGYEVENTS_pdf.columns)), 'PDF')

# Add the time to the list that keeps the results
pdf_results_list.append(pdf_total_time_process_time)

# Print comparison
print_comparison(final_text, gdf_total_time_process_time, pdf_total_time_process_time)

Start loading the CUDA dataframe for MICROBIOLOGYEVENTS.csv.
Finished loading the CUDA dataframe for MICROBIOLOGYEVENTS.csv in 0.2132 seconds.
The CUDA dataframe has 631726 rows & 16 columns.

Start loading the Pandas dataframe for MICROBIOLOGYEVENTS.csv.
Finished loading the Pandas dataframe for MICROBIOLOGYEVENTS.csv in 0.9017 seconds.
The Pandas dataframe has 631726 rows & 16 columns.

*** The CUDA dataframe loaded 4.23x faster than the Pandas data frame. ***




In [4]:
# Get count of unique values in a column (SPEC_ITEMID)

if unique_count == 'Y':

  print_text = 'calculating count of unique values in a column'  
  final_text = 'calculated count of unique values in a column'

  # Calculate count of unique values in a column (SPEC_ITEMID) from CUDA dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'GDF')

  # Get the count of unique values for the column
  MICROBIOLOGYEVENTS_gdf['SPEC_ITEMID'].nunique()

  # Calculate total process time and print the message
  gdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text, 
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_gdf)), 
                                                                       str(len(MICROBIOLOGYEVENTS_gdf.columns)), 'GDF')

  # Add the time to the list that keeps the results
  gdf_results_list.append(gdf_total_time_process_time) 
        
  # Calculate count of unique values in a column (SPEC_ITEMID) from Pandas dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'PDF')

  # Get the count of unique values for the column
  MICROBIOLOGYEVENTS_pdf['SPEC_ITEMID'].nunique()

  # Calculate total process time and print the message
  pdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text, 
                                                                 input_file_name, str(len(MICROBIOLOGYEVENTS_pdf)), 
                                                                 str(len(MICROBIOLOGYEVENTS_pdf.columns)), 'PDF')

  # Add the time to the list that keeps the results
  pdf_results_list.append(pdf_total_time_process_time)

  # Print comparison
  print_comparison(final_text, gdf_total_time_process_time, 
                   pdf_total_time_process_time)

Start calculating count of unique values in a column the CUDA dataframe for MICROBIOLOGYEVENTS.csv.
Finished calculating count of unique values in a column the CUDA dataframe for MICROBIOLOGYEVENTS.csv in 1.0248 seconds.
The CUDA dataframe has 631726 rows & 16 columns.

Start calculating count of unique values in a column the Pandas dataframe for MICROBIOLOGYEVENTS.csv.
Finished calculating count of unique values in a column the Pandas dataframe for MICROBIOLOGYEVENTS.csv in 0.0035 seconds.
The Pandas dataframe has 631726 rows & 16 columns.

*** The Pandas dataframe calculated count of unique values in a column 291.30x faster than the CUDA data frame. ***




In [5]:
# Select number of rows with unique values for a column (SPEC_ITEMID)

if number_of_rows_with_unique_values_for_the_column == 'Y':
    
  print_text = 'selecting number of rows with unique values for a column'
  final_text = 'selected number of rows with unique values for a column'

  # Select number of rows with unique values for a column (SPEC_ITEMID) from CUDA dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'GDF')

  # Get the number of rows with unique values for a column
  MICROBIOLOGYEVENTS_gdf['SPEC_ITEMID'].value_counts()

  # Calculate total process time and print the message
  gdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text, 
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_gdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_gdf.columns)), 'GDF')

  # Add the time to the list that keeps the results
  gdf_results_list.append(gdf_total_time_process_time)  

  # Select number of rows with unique values for a column (SPEC_ITEMID) from Pandas dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'PDF')

  # Get the number of rows with unique values for a column
  MICROBIOLOGYEVENTS_pdf['SPEC_ITEMID'].value_counts()

  # Calculate total process time and print the message
  pdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text,
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_pdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_pdf.columns)), 'PDF')

  # Add the time to the list that keeps the results
  pdf_results_list.append(pdf_total_time_process_time)

  # Print comparison
  print_comparison(final_text, gdf_total_time_process_time, pdf_total_time_process_time)

Start selecting number of rows with unique values for a column the CUDA dataframe for MICROBIOLOGYEVENTS.csv.
Finished selecting number of rows with unique values for a column the CUDA dataframe for MICROBIOLOGYEVENTS.csv in 0.3438 seconds.
The CUDA dataframe has 631726 rows & 16 columns.

Start selecting number of rows with unique values for a column the Pandas dataframe for MICROBIOLOGYEVENTS.csv.
Finished selecting number of rows with unique values for a column the Pandas dataframe for MICROBIOLOGYEVENTS.csv in 0.0085 seconds.
The Pandas dataframe has 631726 rows & 16 columns.

*** The Pandas dataframe selected number of rows with unique values for a column 40.34x faster than the CUDA data frame. ***




In [6]:
# Select n smallest and largest values for a column (SUBJECT_ID)

if select_n_largest_and_smallest == 'Y':
    
  print_text = 'selecting {0} smallest and largest values for a column'.format(str(n_smallest_and_largest_values))
  final_text = 'selected {0} smallest and largest values for a column'.format(str(n_smallest_and_largest_values))

  # Select n smallest and largest values for a column (SUBJECT_ID) from CUDA dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'GDF')

  # Select n smallest and largest values for a column
  MICROBIOLOGYEVENTS_gdf.nsmallest(n_smallest_and_largest_values, 'SUBJECT_ID')
  MICROBIOLOGYEVENTS_gdf.nlargest(n_smallest_and_largest_values, 'SUBJECT_ID')

  # Calculate total process time and print the message
  gdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text, 
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_gdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_gdf.columns)), 'GDF')

  # Add the time to the list that keeps the results
  gdf_results_list.append(gdf_total_time_process_time)  
        
  # Select n smallest and largest values for a column (SUBJECT_ID) from Pandas dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'PDF')

  # Select n smallest and largest values for a column
  MICROBIOLOGYEVENTS_pdf.nsmallest(n_smallest_and_largest_values, 'SUBJECT_ID')
  MICROBIOLOGYEVENTS_pdf.nlargest(n_smallest_and_largest_values, 'SUBJECT_ID')

  # Calculate total process time and print the message
  pdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text,
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_pdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_pdf.columns)), 'PDF')

  # Add the time to the list that keeps the results
  pdf_results_list.append(pdf_total_time_process_time)

  # Print comparison
  print_comparison(final_text, gdf_total_time_process_time, pdf_total_time_process_time)

Start selecting 5 smallest and largest values for a column the CUDA dataframe for MICROBIOLOGYEVENTS.csv.
Finished selecting 5 smallest and largest values for a column the CUDA dataframe for MICROBIOLOGYEVENTS.csv in 1.2422 seconds.
The CUDA dataframe has 631726 rows & 16 columns.

Start selecting 5 smallest and largest values for a column the Pandas dataframe for MICROBIOLOGYEVENTS.csv.
Finished selecting 5 smallest and largest values for a column the Pandas dataframe for MICROBIOLOGYEVENTS.csv in 0.1027 seconds.
The Pandas dataframe has 631726 rows & 16 columns.

*** The Pandas dataframe selected 5 smallest and largest values for a column 12.09x faster than the CUDA data frame. ***




In [7]:
# Select specific rows by index

if select_specific_rows_by_index == 'Y':

  print_text = 'selecting specific rows by index'
  final_text = 'selected specific rows by index'

  # Select rows from CUDA dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'GDF')

  # Select specific rows by index
  MICROBIOLOGYEVENTS_gdf.loc[1000002:1000005]

  # Calculate total process time and print the message
  gdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text, 
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_gdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_gdf.columns)), 'GDF')

  # Add the time to the list that keeps the results
  gdf_results_list.append(gdf_total_time_process_time)  

  # Select rows from Pandas dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'PDF')

  # Select specific rows by index
  MICROBIOLOGYEVENTS_pdf.loc[1000002:1000005]

  # Calculate total process time and print the message
  pdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text,
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_pdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_pdf.columns)), 'PDF')

  # Add the time to the list that keeps the results
  pdf_results_list.append(pdf_total_time_process_time)

  # Print comparison
  print_comparison(final_text, gdf_total_time_process_time, pdf_total_time_process_time)

Start selecting specific rows by index the CUDA dataframe for MICROBIOLOGYEVENTS.csv.
Finished selecting specific rows by index the CUDA dataframe for MICROBIOLOGYEVENTS.csv in 0.0151 seconds.
The CUDA dataframe has 631726 rows & 16 columns.

Start selecting specific rows by index the Pandas dataframe for MICROBIOLOGYEVENTS.csv.
Finished selecting specific rows by index the Pandas dataframe for MICROBIOLOGYEVENTS.csv in 0.0007 seconds.
The Pandas dataframe has 631726 rows & 16 columns.

*** The Pandas dataframe selected specific rows by index 20.57x faster than the CUDA data frame. ***




In [8]:
# Sort values based on a column (CHARTDATE)

if run_sort == 'Y':

  print_text = 'sorting values based on a column'
  final_text = 'sorted values based on a column'
    
  # Sort CUDA dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'GDF')

  # Sort
  MICROBIOLOGYEVENTS_gdf.sort_values('CHARTDATE', ascending=True)

  # Calculate total process time and print the message
  gdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text, 
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_gdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_gdf.columns)), 'GDF')

  # Add the time to the list that keeps the results
  gdf_results_list.append(gdf_total_time_process_time)  
      
  # Sort Pandas dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'PDF')

  # Sort
  MICROBIOLOGYEVENTS_pdf.sort_values('CHARTDATE', ascending=True)

  # Calculate total process time and print the message
  pdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text,
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_pdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_pdf.columns)), 'PDF')

  # Add the time to the list that keeps the results
  pdf_results_list.append(pdf_total_time_process_time)

  # Print comparison
  print_comparison(final_text, gdf_total_time_process_time, pdf_total_time_process_time)

Start sorting values based on a column the CUDA dataframe for MICROBIOLOGYEVENTS.csv.
Finished sorting values based on a column the CUDA dataframe for MICROBIOLOGYEVENTS.csv in 0.3669 seconds.
The CUDA dataframe has 631726 rows & 16 columns.

Start sorting values based on a column the Pandas dataframe for MICROBIOLOGYEVENTS.csv.
Finished sorting values based on a column the Pandas dataframe for MICROBIOLOGYEVENTS.csv in 0.7629 seconds.
The Pandas dataframe has 631726 rows & 16 columns.

*** The CUDA dataframe sorted values based on a column 2.08x faster than the Pandas data frame. ***




In [9]:
# Add a column (NEW_BLANK_COLUMN)

if add_column == 'Y':

  print_text = 'adding column'
  final_text = 'added column'
    
  # Add column to CUDA dataframe (NEW_BLANK_COLUMN)
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'GDF')

  # Create the new column
  MICROBIOLOGYEVENTS_gdf['NEW_BLANK_COLUMN'] = ''

  # Calculate total process time and print the message
  gdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text, 
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_gdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_gdf.columns)), 'GDF')

  # Add the time to the list that keeps the results
  gdf_results_list.append(gdf_total_time_process_time)     
    
  # Add column to Pandas dataframe (NEW_BLANK_COLUMN)
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'PDF')

  # Create the new column
  MICROBIOLOGYEVENTS_pdf['NEW_BLANK_COLUMN'] = ''

  # Calculate total process time and print the message
  pdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text,
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_pdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_pdf.columns)), 'PDF')

  # Add the time to the list that keeps the results
  pdf_results_list.append(pdf_total_time_process_time)

  # Print comparison
  print_comparison(final_text, gdf_total_time_process_time, pdf_total_time_process_time)

Start adding column the CUDA dataframe for MICROBIOLOGYEVENTS.csv.
Finished adding column the CUDA dataframe for MICROBIOLOGYEVENTS.csv in 0.0088 seconds.
The CUDA dataframe has 631726 rows & 17 columns.

Start adding column the Pandas dataframe for MICROBIOLOGYEVENTS.csv.
Finished adding column the Pandas dataframe for MICROBIOLOGYEVENTS.csv in 0.0041 seconds.
The Pandas dataframe has 631726 rows & 17 columns.

*** The Pandas dataframe added column 2.14x faster than the CUDA data frame. ***




In [10]:
# Add new column with calculated value (HADM_ID * 2)

if add_column_calculated_value == 'Y':

  print_text = 'adding a new column with calculated value'
  final_text = 'added a new column with calculated value'

  # Add column with calculated value in CUDA dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'GDF')

  # Add the column
  MICROBIOLOGYEVENTS_gdf.add_column('HADM_ID_times_2', MICROBIOLOGYEVENTS_gdf['HADM_ID'] * 2)


  # Calculate total process time and print the message
  gdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text, 
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_gdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_gdf.columns)), 'GDF')

  # Add the time to the list that keeps the results
  gdf_results_list.append(gdf_total_time_process_time)  
    
    
  # Add column with calculated value in Pandas dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'PDF')

  # Add the column
  MICROBIOLOGYEVENTS_pdf['HADM_ID_times_2'] = MICROBIOLOGYEVENTS_pdf['HADM_ID'] * 2

  # Calculate total process time and print the message
  pdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text,
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_pdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_pdf.columns)), 'PDF')

  # Add the time to the list that keeps the results
  pdf_results_list.append(pdf_total_time_process_time)

  # Print comparison
  print_comparison(final_text, gdf_total_time_process_time, pdf_total_time_process_time)

Start adding a new column with calculated value the CUDA dataframe for MICROBIOLOGYEVENTS.csv.
Finished adding a new column with calculated value the CUDA dataframe for MICROBIOLOGYEVENTS.csv in 0.0023 seconds.
The CUDA dataframe has 631726 rows & 18 columns.

Start adding a new column with calculated value the Pandas dataframe for MICROBIOLOGYEVENTS.csv.
Finished adding a new column with calculated value the Pandas dataframe for MICROBIOLOGYEVENTS.csv in 0.0040 seconds.
The Pandas dataframe has 631726 rows & 18 columns.

*** The CUDA dataframe added a new column with calculated value 1.74x faster than the Pandas data frame. ***




In [11]:
# Drop column (SPEC_ITEMID)

if drop_column == 'Y':

  print_text = 'dropping a column'
  final_text = 'dropped a column'

  # Drop column in CUDA dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'GDF')

  # Drop the column
  MICROBIOLOGYEVENTS_gdf.drop_column('SPEC_ITEMID')

  # Calculate total process time and print the message
  gdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text, 
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_gdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_gdf.columns)), 'GDF')

  # Add the time to the list that keeps the results
  gdf_results_list.append(gdf_total_time_process_time)  
    
  # Drop column in CUDA dataframe
  # Set start time for process and display message
  start_time = get_start_time(print_text, input_file_name, 'PDF')

  # Drop the column
  MICROBIOLOGYEVENTS_pdf.drop(columns=['SPEC_ITEMID'])

  # Calculate total process time and print the message
  pdf_total_time_process_time = calculate_total_time_and_print_message(start_time, print_text,
                                                                       input_file_name, str(len(MICROBIOLOGYEVENTS_pdf)),
                                                                       str(len(MICROBIOLOGYEVENTS_pdf.columns)), 'PDF')

  # Add the time to the list that keeps the results
  pdf_results_list.append(pdf_total_time_process_time)

  # Print comparison
  print_comparison(final_text, gdf_total_time_process_time, pdf_total_time_process_time)

Start dropping a column the CUDA dataframe for MICROBIOLOGYEVENTS.csv.
Finished dropping a column the CUDA dataframe for MICROBIOLOGYEVENTS.csv in 0.0006 seconds.
The CUDA dataframe has 631726 rows & 17 columns.

Start dropping a column the Pandas dataframe for MICROBIOLOGYEVENTS.csv.
Finished dropping a column the Pandas dataframe for MICROBIOLOGYEVENTS.csv in 0.1575 seconds.
The Pandas dataframe has 631726 rows & 18 columns.

*** The CUDA dataframe dropped a column 260.55x faster than the Pandas data frame. ***




In [12]:
'''
# Get all of the results and show them. This is used mainly when performing multiple tests to get
# average values
for i in range(0, len(gdf_results_list)):
  print('%1.4f,%1.4f,%1.4f' % (gdf_results_list[i], pdf_results_list[i], pdf_results_list[i]/gdf_results_list[i]))

!nvidia-smi --query-gpu=memory.used -i 0 --format=csv --gpu_id=0
'''

"\n# Get all of the results and show them. This is used mainly when performing multiple tests to get\n# average values\nfor i in range(0, len(gdf_results_list)):\n  print('%1.4f,%1.4f,%1.4f' % (gdf_results_list[i], pdf_results_list[i], pdf_results_list[i]/gdf_results_list[i]))\n\n!nvidia-smi --query-gpu=memory.used -i 0 --format=csv --gpu_id=0\n"