# An intro to cuDF 
cuDF is a Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.

# Download data

In [None]:
%%script echo skipping
%%capture
%%bash
wget "https://bsql.s3.amazonaws.com/data/rapids_intro/movies.csv"

# Load libraries

In [1]:
import os
import time

import pandas as pd
import numpy as np

import cupy as cp
import cudf as cd

import s3fs

# Import data from csv

### **movies_pdf** is our Pandas DF

In [2]:
movies_pdf = pd.read_csv("movies.csv")

### **movies_cdf** is our cuDF

In [3]:
movies_cdf = cd.read_csv("movies.csv")

# Gather dataset statistics

In [4]:
print(movies_pdf.shape)
print(movies_pdf.ndim)
print(len(movies_pdf))

(4916, 28)
2
4916


In [5]:
print(movies_cdf.shape)
print(movies_cdf.ndim)
print(len(movies_cdf))

(4916, 28)
2
4916


# Explore Data

In [6]:
movies_pdf.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [7]:
movies_cdf.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [8]:
movies_pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4916 entries, 0 to 4915
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      4897 non-null   object 
 1   director_name              4814 non-null   object 
 2   num_critic_for_reviews     4867 non-null   float64
 3   duration                   4901 non-null   float64
 4   director_facebook_likes    4814 non-null   float64
 5   actor_3_facebook_likes     4893 non-null   float64
 6   actor_2_name               4903 non-null   object 
 7   actor_1_facebook_likes     4909 non-null   float64
 8   gross                      4054 non-null   float64
 9   genres                     4916 non-null   object 
 10  actor_1_name               4909 non-null   object 
 11  movie_title                4916 non-null   object 
 12  num_voted_users            4916 non-null   int64  
 13  cast_total_facebook_likes  4916 non-null   int64

In [9]:
movies_cdf.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 4916 entries, 0 to 4915
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   color                      4897 non-null   object
 1   director_name              4814 non-null   object
 2   num_critic_for_reviews     4867 non-null   float64
 3   duration                   4901 non-null   float64
 4   director_facebook_likes    4814 non-null   float64
 5   actor_3_facebook_likes     4893 non-null   float64
 6   actor_2_name               4903 non-null   object
 7   actor_1_facebook_likes     4909 non-null   float64
 8   gross                      4054 non-null   float64
 9   genres                     4916 non-null   object
 10  actor_1_name               4909 non-null   object
 11  movie_title                4916 non-null   object
 12  num_voted_users            4916 non-null   int64
 13  cast_total_facebook_likes  4916 non-null   int64
 14  acto

In [10]:
movies_pdf.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [11]:
movies_cdf.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

# Select subsets of the dataframe

### Select fields that are continuous data only

In [12]:
movies_pdf.select_dtypes(include="number").head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


In [13]:
movies_cdf.select_dtypes(include="number").head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


### Select fields that are floats only

In [14]:
movies_pdf.select_dtypes(include="float").head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35
4,,,131.0,,131.0,,0.0,,,,12.0,7.1,


In [15]:
movies_cdf.select_dtypes(include="float").head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35
4,,,131.0,,131.0,,0.0,,,,12.0,7.1,


### Select fields that are discrete values

In [16]:
movies_pdf.select_dtypes(include="object").head()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating
0,Color,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,Wes Studi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13
1,Color,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13
2,Color,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13
3,Color,Christopher Nolan,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,Joseph Gordon-Levitt,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13
4,,Doug Walker,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,


In [17]:
movies_cdf.select_dtypes(include="object").head()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating
0,Color,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,Wes Studi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13
1,Color,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13
2,Color,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13
3,Color,Christopher Nolan,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,Joseph Gordon-Levitt,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13
4,,Doug Walker,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,


# Data Analysis

### Summary statistics for all continuous data fields

In [18]:
movies_pdf.select_dtypes(include="number").describe()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4867.0,4901.0,4814.0,4893.0,4909.0,4054.0,4916.0,4916.0,4903.0,4895.0,4432.0,4810.0,4903.0,4916.0,4590.0,4916.0
mean,137.988905,107.090798,691.014541,631.276313,6494.488491,47644510.0,82644.92,9579.815907,1.37732,267.668846,36547490.0,2002.447609,1621.923516,6.437429,2.222349,7348.294142
std,120.239379,25.286015,2832.954125,1625.874802,15106.986884,67372550.0,138322.2,18164.31699,2.023826,372.934839,100242700.0,12.453977,4011.299523,1.127802,1.40294,19206.016458
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,49.0,93.0,7.0,132.0,607.0,5019656.0,8361.75,1394.75,0.0,64.0,6000000.0,1999.0,277.0,5.8,1.85,0.0
50%,108.0,103.0,48.0,366.0,982.0,25043960.0,33132.5,3049.0,1.0,153.0,19850000.0,2005.0,593.0,6.6,2.35,159.0
75%,191.0,118.0,189.75,633.0,11000.0,61108410.0,93772.75,13616.75,2.0,320.5,43000000.0,2011.0,912.0,7.2,2.35,2000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,4200000000.0,2016.0,137000.0,9.5,16.0,349000.0


In [19]:
movies_cdf.select_dtypes(include="number").describe()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4867.0,4901.0,4814.0,4893.0,4909.0,4054.0,4916.0,4916.0,4903.0,4895.0,4432.0,4810.0,4903.0,4916.0,4590.0,4916.0
mean,137.988905,107.090798,691.014541,631.276313,6494.488491,47644510.0,82644.92,9579.815907,1.37732,267.668846,36547490.0,2002.447609,1621.923516,6.437429,2.222349,7348.294142
std,120.239379,25.286015,2832.954125,1625.874802,15106.986884,67372550.0,138322.2,18164.31699,2.023826,372.934839,100242700.0,12.453977,4011.299523,1.127802,1.40294,19206.016458
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,49.0,93.0,7.0,132.0,607.0,5019656.0,8361.75,1394.75,0.0,64.0,6000000.0,1999.0,277.0,5.8,1.85,0.0
50%,108.0,103.0,48.0,366.0,982.0,25043960.0,33132.5,3049.0,1.0,153.0,19850000.0,2005.0,593.0,6.6,2.35,159.0
75%,191.0,118.0,189.75,633.0,11000.0,61108410.0,93772.75,13616.75,2.0,320.5,43000000.0,2011.0,912.0,7.2,2.35,2000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,4200000000.0,2016.0,137000.0,9.5,16.0,349000.0


### Summary statistics for all discrete value fields

In [20]:
movies_pdf.select_dtypes(include="object").describe()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating
count,4897,4814,4903,4916,4909,4916,4893,4764,4916,4904,4911,4616
unique,2,2397,3030,914,2095,4916,3519,4756,4916,47,65,18
top,Color,Steven Spielberg,Morgan Freeman,Drama,Robert De Niro,Silent Hill,Steve Coogan,based on novel,http://www.imdb.com/title/tt1663662/?ref_=fn_t...,English,USA,R
freq,4693,26,18,233,48,1,8,4,1,4582,3710,2067


In [21]:
movies_cdf.select_dtypes(include="object").describe()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating
count,4897,4814,4903,4916,4909,4916,4893,4764,4916,4904,4911,4616
unique,3,2398,3031,914,2096,4916,3520,4757,4916,48,66,19
top,Color,Steven Spielberg,Morgan Freeman,Drama,Robert De Niro,May,Steve Coogan,based on novel,http://www.imdb.com/title/tt2814362/?ref_=fn_t...,English,USA,R
freq,4693,26,18,233,48,1,8,4,1,4582,3710,2067


### Transpose the cuDF describe results

(this can be done in Pandas too)

In [22]:
movies_pdf.select_dtypes(include="number").describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_critic_for_reviews,4867.0,137.9889,120.2394,1.0,49.0,108.0,191.0,813.0
duration,4901.0,107.0908,25.28602,7.0,93.0,103.0,118.0,511.0
director_facebook_likes,4814.0,691.0145,2832.954,0.0,7.0,48.0,189.75,23000.0
actor_3_facebook_likes,4893.0,631.2763,1625.875,0.0,132.0,366.0,633.0,23000.0
actor_1_facebook_likes,4909.0,6494.488,15106.99,0.0,607.0,982.0,11000.0,640000.0
gross,4054.0,47644510.0,67372550.0,162.0,5019656.25,25043962.0,61108412.75,760505800.0
num_voted_users,4916.0,82644.92,138322.2,5.0,8361.75,33132.5,93772.75,1689764.0
cast_total_facebook_likes,4916.0,9579.816,18164.32,0.0,1394.75,3049.0,13616.75,656730.0
facenumber_in_poster,4903.0,1.37732,2.023826,0.0,0.0,1.0,2.0,43.0
num_user_for_reviews,4895.0,267.6688,372.9348,1.0,64.0,153.0,320.5,5060.0


In [23]:
movies_cdf.select_dtypes(include="number").describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_critic_for_reviews,4867.0,137.9889,120.2394,1.0,49.0,108.0,191.0,813.0
duration,4901.0,107.0908,25.28601,7.0,93.0,103.0,118.0,511.0
director_facebook_likes,4814.0,691.0145,2832.954,0.0,7.0,48.0,189.75,23000.0
actor_3_facebook_likes,4893.0,631.2763,1625.875,0.0,132.0,366.0,633.0,23000.0
actor_1_facebook_likes,4909.0,6494.488,15106.99,0.0,607.0,982.0,11000.0,640000.0
gross,4054.0,47644510.0,67372550.0,162.0,5019656.25,25043962.0,61108412.75,760505800.0
num_voted_users,4916.0,82644.92,138322.2,5.0,8361.75,33132.5,93772.75,1689764.0
cast_total_facebook_likes,4916.0,9579.816,18164.32,0.0,1394.75,3049.0,13616.75,656730.0
facenumber_in_poster,4903.0,1.37732,2.023826,0.0,0.0,1.0,2.0,43.0
num_user_for_reviews,4895.0,267.6688,372.9348,1.0,64.0,153.0,320.5,5060.0


### Covaraiance calculation of two continuous variables

In [24]:
movies_pdf.movie_facebook_likes.cov(movies_pdf.actor_3_facebook_likes)

8481703.75419313

In [25]:
movies_cdf.movie_facebook_likes.cov(movies_cdf.actor_3_facebook_likes)

8481703.754193125

### Pearson correlation of two continuous variables

In [26]:
movies_pdf.movie_facebook_likes.corr(movies_pdf.actor_3_facebook_likes)

0.27107033354561877

In [27]:
movies_cdf.movie_facebook_likes.corr(movies_cdf.actor_3_facebook_likes)

0.27107033354561866

### Groupby
Analyze the gross amounts generated by the two main actors

In [28]:
movies_pdf[['actor_1_name','actor_2_name','gross']].groupby(['actor_1_name','actor_2_name']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,gross
actor_1_name,actor_2_name,Unnamed: 2_level_1
50 Cent,Bill Duke,30981850.0
A.J. Buckley,Jack McGraw,123070338.0
Aaliyah,Lena Olin,30307804.0
Aasif Mandvi,Dequina Moore,13214030.0
Abbie Cornish,Gabourey Sidibe,14989761.0
...,...,...
Zoë Kravitz,Lily Rabe,43097652.0
Zuhair Haddad,Harry Lennix,0.0
Álex Angulo,Germán Alcarazu,0.0
Ólafur Darri Ólafsson,Ingvar Eggert Sigurðsson,0.0


In [29]:
movies_cdf[['actor_1_name','actor_2_name','gross']].groupby(['actor_1_name','actor_2_name']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,gross
actor_1_name,actor_2_name,Unnamed: 2_level_1
Amir Talai,Anne Archer,191309.0
Gerard Butler,Logan Lerman,20488579.0
Al Pacino,F. Murray Abraham,44700000.0
Emma Stone,Katharine McPhee,48237389.0
Alyson Stoner,Stephen Boss,42385520.0
...,...,...
Ian Gamazon,Edgar Tancangco,70071.0
Adam Goldberg,Meat Loaf,676698.0
Barry Corbin,Nia Long,4692814.0
Julia Roberts,Christopher Meloni,152149590.0


# Data Preparation

Genres has multiple values that are combinations of several genres.  For example: `Action|Adventure|Comedy|Fantasy|Sci-Fi`

In [30]:
print('There are ' + str(len(movies_pdf.genres.unique())) + ' genre combinations in the genres field')
print('Examples:\n', movies_pdf.genres.unique()[:10])

There are 914 genre combinations in the genres field
Examples:
 ['Action|Adventure|Fantasy|Sci-Fi' 'Action|Adventure|Fantasy'
 'Action|Adventure|Thriller' 'Action|Thriller' 'Documentary'
 'Action|Adventure|Sci-Fi' 'Action|Adventure|Romance'
 'Adventure|Animation|Comedy|Family|Fantasy|Musical|Romance'
 'Adventure|Family|Fantasy|Mystery' 'Action|Adventure']


## Splitting the Genre column using Pandas

In [37]:
genres_pdf = movies_pdf.join(movies_pdf.genres.str.split('|', expand=True).add_prefix('genre_'))
genres_pdf.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,aspect_ratio,movie_facebook_likes,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,1.78,33000,Action,Adventure,Fantasy,Sci-Fi,,,,
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,2.35,0,Action,Adventure,Fantasy,,,,,
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,2.35,85000,Action,Adventure,Thriller,,,,,
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2.35,164000,Action,Thriller,,,,,,
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,0,Documentary,,,,,,,


## Splitting the Genre column using cuDF
cuDF does not have the add_prefix() option when splitting a column.  Here is one way to rename your columns in cuDF.

In [32]:
genre_fields = len(movies_cdf.genres.str.split('|', expand=True).columns)
print('There will be ' + str(genre_fields) + ' new columns that will be added into our dataframe\n')
genres_cdf = movies_cdf.join(movies_cdf.genres.str.split('|', expand=True))
genres_cdf.head()

There will be 8 new columns that will be added into our dataframe



Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,aspect_ratio,movie_facebook_likes,0,1,2,3,4,5,6,7
3488,Color,Rachel Perkins,33.0,88.0,3.0,38.0,Magda Szubanski,46.0,110029.0,Comedy|Drama|Musical,...,2.35,479,Comedy,Drama,Musical,,,,,
3504,Color,Hans Petter Moland,28.0,106.0,19.0,0.0,Sara-Marie Maltha,844.0,64148.0,Drama,...,1.85,260,Drama,,,,,,,
3496,Color,Robert Wise,57.0,115.0,338.0,45.0,Paula Kelly,116.0,,Sci-Fi|Thriller,...,2.35,0,Sci-Fi,Thriller,,,,,,
3512,Color,,10.0,55.0,,5.0,Fortunato Cerlino,18.0,,Crime|Drama|Thriller,...,1.85,0,Crime,Drama,Thriller,,,,,
3489,Color,John Singleton,64.0,112.0,309.0,15.0,Lloyd Avery II,27.0,57504069.0,Crime|Drama,...,1.85,0,Crime,Drama,,,,,,


**The new column names are assigned numbers and are not strings**

In [33]:
col_numbers = genres_cdf.columns[-genre_fields:].to_list()
print(col_numbers)

[0, 1, 2, 3, 4, 5, 6, 7]


**Need to convert them to strings and give them a prefix and convert both lists into a dictionary to rename the new columns in the cuDF**

In [34]:
new_col_names = ['genre_' + str(x) for x in col_numbers] 
print(new_col_names)

new_col_dict = dict(zip(col_numbers, new_col_names))
print(new_col_dict)

['genre_0', 'genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5', 'genre_6', 'genre_7']
{0: 'genre_0', 1: 'genre_1', 2: 'genre_2', 3: 'genre_3', 4: 'genre_4', 5: 'genre_5', 6: 'genre_6', 7: 'genre_7'}


In [35]:
genres_cdf = genres_cdf.rename(columns=new_col_dict)

In [36]:
genres_cdf.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,aspect_ratio,movie_facebook_likes,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7
3488,Color,Rachel Perkins,33.0,88.0,3.0,38.0,Magda Szubanski,46.0,110029.0,Comedy|Drama|Musical,...,2.35,479,Comedy,Drama,Musical,,,,,
3504,Color,Hans Petter Moland,28.0,106.0,19.0,0.0,Sara-Marie Maltha,844.0,64148.0,Drama,...,1.85,260,Drama,,,,,,,
3496,Color,Robert Wise,57.0,115.0,338.0,45.0,Paula Kelly,116.0,,Sci-Fi|Thriller,...,2.35,0,Sci-Fi,Thriller,,,,,,
3512,Color,,10.0,55.0,,5.0,Fortunato Cerlino,18.0,,Crime|Drama|Thriller,...,1.85,0,Crime,Drama,Thriller,,,,,
3489,Color,John Singleton,64.0,112.0,309.0,15.0,Lloyd Avery II,27.0,57504069.0,Crime|Drama,...,1.85,0,Crime,Drama,,,,,,


# One Hot Encoding

## OHE using Pandas
Applied on the genre_0 column

In [38]:
pd_ohe = pd.get_dummies(genres_pdf.genre_0, prefix='genre_0')
df = pd.concat([genres_pdf, pd_ohe], axis=1)
df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,genre_0_Game-Show,genre_0_History,genre_0_Horror,genre_0_Music,genre_0_Musical,genre_0_Mystery,genre_0_Romance,genre_0_Sci-Fi,genre_0_Thriller,genre_0_Western
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,0,0,0,0,0,0,0,0,0,0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,0,0,0,0,0,0,0,0,0,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,0,0,0,0,0,0,0,0,0,0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,0,0,0,0,0,0,0,0,0,0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,0,0,0,0,0,0,0,0,0,0


## OHE using cuDF
Applied on the genre_0 column

In [39]:
cdf = cd.get_dummies(genres_cdf, prefix='genre_0', columns=['genre_0'])
cdf.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,genre_0_Game-Show,genre_0_History,genre_0_Horror,genre_0_Music,genre_0_Musical,genre_0_Mystery,genre_0_Romance,genre_0_Sci-Fi,genre_0_Thriller,genre_0_Western
3488,Color,Rachel Perkins,33.0,88.0,3.0,38.0,Magda Szubanski,46.0,110029.0,Comedy|Drama|Musical,...,0,0,0,0,0,0,0,0,0,0
3504,Color,Hans Petter Moland,28.0,106.0,19.0,0.0,Sara-Marie Maltha,844.0,64148.0,Drama,...,0,0,0,0,0,0,0,0,0,0
3496,Color,Robert Wise,57.0,115.0,338.0,45.0,Paula Kelly,116.0,,Sci-Fi|Thriller,...,0,0,0,0,0,0,0,1,0,0
3512,Color,,10.0,55.0,,5.0,Fortunato Cerlino,18.0,,Crime|Drama|Thriller,...,0,0,0,0,0,0,0,0,0,0
3489,Color,John Singleton,64.0,112.0,309.0,15.0,Lloyd Avery II,27.0,57504069.0,Crime|Drama,...,0,0,0,0,0,0,0,0,0,0
