# Create Parquet with Iris Data Set

In [7]:
# Render matplotlib figures inline in the notebook's cell output.
# NOTE(review): no plotting call appears in the visible cells — confirm this magic is still needed.
%matplotlib inline

In [8]:
import os
import json
import numpy as np
import pandas as pd
import dask
import dask.dataframe as dd
import pyarrow as pa
import pyarrow.parquet as pq

from IPython.display import JSON

In [9]:
# Base HDFS URI used as the prefix for output paths in this notebook.
# It has no trailing slash, so a "/" must be added when appending a sub-path.
HDFS_HOME = "hdfs://node-master:54310/user/hadoop"

In [4]:
# Report the name and version of each core library, one "name version" pair per line.
print("\n".join(f"{lib.__name__} {lib.__version__}" for lib in (pd, dask, pa, np)))

pandas 1.0.5
dask 2.30.0
pyarrow 3.0.0
numpy 1.18.5


In [37]:
!hdfs dfs -rm -r iris_parquet/iris.parq

Deleted iris_parquet/iris.parq


In [38]:
# Write the Iris CSV to HDFS as a single snappy-compressed Parquet file
# using pandas with the pyarrow engine.

# Host and port repeat the authority part of HDFS_HOME.
# NOTE(review): pyarrow documents pa.hdfs.connect as deprecated in favour of
# pyarrow.fs.HadoopFileSystem — consider migrating.
hdfs = pa.hdfs.connect('node-master', port=54310)

# Explicit Arrow schema so the Parquet column types line up with the Hive
# table definition used later (DOUBLE x4, STRING, DATE) instead of relying on
# type inference from the DataFrame.
schema = pa.schema([
    pa.field('sepal_length', pa.float64()),
    pa.field('sepal_width', pa.float64()),
    pa.field('petal_length', pa.float64()),
    pa.field('petal_width', pa.float64()),
    pa.field('class', pa.string()),
    pa.field('date_test', pa.date32()),
])

# Column names for the CSV; they are passed via names=, so read_csv does not
# take them from the file itself.
columns = [
  'sepal_length',
  'sepal_width',
  'petal_length',
  'petal_width',
  'class'
]

df = pd.read_csv("data/iris.data", names=columns)

# Synthetic test column: one consecutive calendar day per row starting at
# 2020-01-01, converted from timestamps to datetime.date objects so it is
# stored as a date (date32) rather than a timestamp.
df['date_test'] = pd.date_range("2020-01-01", periods=len(df))
df['date_test'] = df['date_test'].dt.date

df.info(memory_usage='deep')

# Relative HDFS path — presumably resolved against the connecting user's HDFS
# home directory; verify against the Hive table LOCATION.
with hdfs.open("iris_parquet/iris.parq", "wb") as f:
    df.to_parquet(f,
                  engine='pyarrow',
                  compression='snappy',
                  schema=schema,  # enforce the declared types when building the Arrow table
                  index=False)    # do not write the RangeIndex as an extra column

  """Entry point for launching an IPython kernel.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
 5   date_test     150 non-null    object 
dtypes: float64(4), object(2)
memory usage: 21.0 KB


# Load to Hive

In [39]:
%%bash
# List the tables currently registered in Hive.
# stderr is redirected to /dev/null so only the query result is shown.
hive -e "SHOW TABLES;" 2> /dev/null

gdelt_parquet
gdelt_parquet_2020
iris_csv
iris_parquet
ne_10_states_provinces_parquet
ne_110_countries_parquet


In [40]:
%%bash
# (Re)create the Hive table definition for the Iris Parquet data:
#   1. set parquet.compression to SNAPPY for this session,
#   2. drop any existing iris_parquet table definition,
#   3. declare an EXTERNAL table with four DOUBLE measurements, a STRING class
#      and a DATE column, stored as Parquet at the given HDFS directory.
# stderr is redirected to /dev/null so only query output is shown.
hive -e 'set parquet.compression=SNAPPY;
DROP TABLE IF EXISTS iris_parquet;
CREATE EXTERNAL TABLE iris_parquet (
    sepal_length DOUBLE,
    sepal_width  DOUBLE,
    petal_length DOUBLE,
    petal_width  DOUBLE,
    class        STRING,
    date_test    DATE
) 
STORED AS PARQUET
LOCATION "hdfs://node-master:54310/user/hadoop/iris_parquet";' 2> /dev/null

In [41]:
%%bash
# Show the column names and types Hive has recorded for iris_parquet
# (stderr discarded) to confirm the table definition.
hive -e 'DESCRIBE iris_parquet;' 2> /dev/null

sepal_length        	double              	                    
sepal_width         	double              	                    
petal_length        	double              	                    
petal_width         	double              	                    
class               	string              	                    
date_test           	date                	                    


In [42]:
%%bash
# Read back the first five rows through Hive (stderr discarded) as a sanity
# check that the Parquet data is readable via the table.
hive -e 'SELECT * FROM iris_parquet LIMIT 5;' 2> /dev/null

5.1	3.5	1.4	0.2	Iris-setosa	2020-01-01
4.9	3.0	1.4	0.2	Iris-setosa	2020-01-02
4.7	3.2	1.3	0.2	Iris-setosa	2020-01-03
4.6	3.1	1.5	0.2	Iris-setosa	2020-01-04
5.0	3.6	1.4	0.2	Iris-setosa	2020-01-05


# Dask Parquet

- [Best Practices](https://docs.dask.org/en/latest/dataframe-best-practices.html)
- [Remote Data](https://docs.dask.org/en/latest/remote-data-services.html)

In [20]:
!hdfs dfs -rm -r iris_parquet/iris.parq

In [22]:
# Write the Iris CSV to HDFS as a snappy-compressed Parquet dataset using Dask.

# HDFS_HOME has no trailing slash, so the separator must be added explicitly;
# plain concatenation would produce ".../user/hadoopiris_parquet" and write
# outside the directory the Hive table points at.
dst_filepath = f"{HDFS_HOME}/iris_parquet"

# Explicit Arrow schema for the five CSV columns.
# NOTE: unlike the pandas writer earlier in the notebook, no date_test column
# is produced here, although the Hive table definition declares one.
schema = pa.schema([
    pa.field('sepal_length', pa.float64()),
    pa.field('sepal_width', pa.float64()),
    pa.field('petal_length', pa.float64()),
    pa.field('petal_width', pa.float64()),
    pa.field('class', pa.string())
])

# Column names for the CSV; they are passed via names=, so read_csv does not
# take them from the file itself.
columns = [
  'sepal_length',
  'sepal_width',
  'petal_length',
  'petal_width',
  'class'
]

ddf = dd.read_csv("data/iris.data", names=columns)
ddf.to_parquet(dst_filepath,
               engine='pyarrow',
               schema=schema,
               compression='snappy',
               write_index=False)  # match index=False used by the pandas writer