<a href="https://colab.research.google.com/github/perlatomdpi/GPU-Machine-Learning-Algorithms/blob/main/Accellerated_Data_Manipulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **GPU Accelerated Data Manipulation**
Magic commands: <br>
1. `%time` and `%%time`, which print summary information about how long it took to run a single line or an entire cell, respectively. <br>
2. `%load` which will load the contents of a given file into the cell. 

In [None]:
from time import sleep

In [None]:
# CHECK GPU
#==============================================================================
# In Colab: Runtime -> Change runtime type -> GPU.
# The attached GPU has to be RAPIDS compatible;
# if it is not, terminate the session and restart it.
# nvidia-smi reports the NVIDIA GPU(s) visible to this runtime.
!nvidia-smi

In [None]:
#==============================================================================
# INSTALL RAPIDS
#==============================================================================
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh stable

import sys, os

dist_package_index = sys.path.index('/usr/local/lib/python3.6/dist-packages')
sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.6/site-packages'] + sys.path[dist_package_index:]
sys.path
exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

In [None]:
#==============================================================================
# LOAD PACKAGES
#==============================================================================
# Import cuDF and CuPy for GPU-accelerated dataframes and math operations, plus the CPU libraries Pandas and NumPy
import cudf
import cupy as cp

import pandas as pd
import numpy as np

In [None]:
# Read data: load the CSV into a cuDF DataFrame, timing the load with %time
%time gdf = cudf.read_csv('./data/my_data.csv')
# Last expression of the cell, so the shape of the loaded frame is displayed
gdf.shape

In [None]:
# Filter data: keep only the rows whose 'county' column equals 'ROME' (timed with %time)
%time rome_residents = gdf.loc[gdf['county'] == 'ROME']

# **Convert data type**



In [None]:
# Each assignment below overwrites the corresponding column of gdf.
%time gdf['age'] = gdf['age'].astype('float32') # Cast the age column to float32
%time gdf['name'] = gdf['name'].str.title()     # Convert names to title case

gdf['lat'] = gdf['lat'].astype('float32')       # Cast the coordinate columns to float32
gdf['long'] = gdf['long'].astype('float32')

# **Data Subsetting**

In [None]:
# loc will include every value it is passed whereas
gdf.loc[100:200]

# loc with Boolean Selection
%time e_names = gdf.loc[gdf['name'].str.startswith('A')]

# cuDF: 
# name that start with A and end with w - slower cuDF solution
time ed_names = gdf.loc[np.logical_and(gdf['name'].str.startswith('E'), gdf['name'].str.endswith('d'))]

# cuPY: 
# name that start with A and end with w - faster cuPY solutiion
%time ed_names = gdf.loc[cp.logical_and(gdf['name'].str.startswith('A'), gdf['name'].str.endswith('w'))]

# **Identify the latitude**


In [None]:
# Select the rows whose 'county' column equals 'Italy', then take the maximum
# 'lat' value among them. Only the latitude itself is computed here; the row
# (person) holding that value is not looked up.
# NOTE(review): the variable names refer to Sunderland while the filter value
# is 'Italy' — confirm which one is intended.
sunderland_residents = gdf.loc[gdf['county'] == 'Italy']
northmost_sunderland_lat = sunderland_residents['lat'].max()

# **Grouping and Sorting**

In [None]:
# Record grouping
%%time
counties = gdf[['county', 'age']].groupby(['county'])
avg_ages = counties.mean()
print(avg_ages[:5])

In [None]:
# Sorting
%time gdf_names = gdf['name'].sort_values()
print(gdf_names[:5]) # yes, "A" is an infrequent but correct given name in the UK, according to census data
print(gdf_names[-5:])

# **Example**
Which names are associated with the lowest average age, and how many people have those names? Using the `mean` and `count` methods on the data grouped by name, identify the three names with the lowest mean age and their counts.

In [None]:
# Group the (name, age) columns by name so per-name statistics can be taken
name_groups = gdf[['name', 'age']].groupby('name')

# Per-name mean age and per-name number of age records
name_ages = name_groups['age'].mean()
name_counts = name_groups['age'].count()

# Combine both statistics into a single frame indexed by name
ages_counts = cudf.DataFrame({'mean_age': name_ages, 'count': name_counts})

# Lowest mean age first; the first three rows answer the question
ages_counts = ages_counts.sort_values('mean_age')
ages_counts.head(3)