# Input/Output

## Data

In [1]:
import pandas as pd
from random import randint, choice

def get_record(idx, n_cols):
    """Build one synthetic row as a tuple.

    Args:
        idx: Value stored in the first position of the row.
        n_cols: How many random integer values to append.

    Returns:
        A tuple ``(idx, gender, v_0, ..., v_{n_cols - 1})`` where ``gender``
        is ``'male'`` or ``'female'`` and each ``v_i`` is a random integer
        between 1 and 100 inclusive.
    """
    # Draw the gender first, then the values, so the order of random draws
    # (and therefore any seeded sequence) is gender -> values.
    gender = choice(['male', 'female'])
    values = tuple(randint(1, 100) for _ in range(n_cols))
    return (idx, gender) + values

# Shape of the synthetic dataset: number of random value columns and of rows.
n_cols = 10
n_rows = 10

# One record per row id; range() already yields the index, so enumerate() is not needed.
data = [get_record(i, n_cols) for i in range(n_rows)]
# Column names follow the record layout: id, gender, then x0 .. x{n_cols - 1}.
columns = ['id', 'gender'] + [f'x{i}' for i in range(n_cols)]

# Create the Spark DataFrame through the SparkSession (`spark`), the same entry
# point the reading cells use, instead of the legacy `sqlContext`.
df = spark.createDataFrame(pd.DataFrame(data, columns=columns))

In [2]:
# Print the DataFrame's rows as a text table to check the generated data.
df.show()

+---+------+---+---+---+---+---+---+---+---+---+---+
| id|gender| x0| x1| x2| x3| x4| x5| x6| x7| x8| x9|
+---+------+---+---+---+---+---+---+---+---+---+---+
|  0|female| 65| 16| 65| 64| 81| 15| 90| 52| 54| 26|
|  1|  male| 61| 50| 83|100| 49| 62| 70|  5| 47| 44|
|  2|  male| 28| 54| 84|  9| 64| 82| 60| 86| 81| 41|
|  3|female| 74| 82| 77| 16| 90| 18| 20| 95| 31| 15|
|  4|female| 26| 64| 68| 96| 35| 36| 33| 59|  3|  1|
|  5|female|  8| 74| 60| 85| 64| 74| 68| 76| 70| 54|
|  6|  male| 36| 12| 12| 48| 58| 71| 17| 15| 41| 88|
|  7|female| 19| 31| 35| 57| 54| 65| 78| 49| 32| 21|
|  8|  male| 17| 90| 47| 86| 57| 59|  7| 24| 51| 43|
|  9|  male| 51| 26| 38| 39| 39| 95| 13| 27| 27| 63|
+---+------+---+---+---+---+---+---+---+---+---+---+



In [3]:
# Print each column's name, type and nullability as inferred from the pandas frame.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- x0: long (nullable = true)
 |-- x1: long (nullable = true)
 |-- x2: long (nullable = true)
 |-- x3: long (nullable = true)
 |-- x4: long (nullable = true)
 |-- x5: long (nullable = true)
 |-- x6: long (nullable = true)
 |-- x7: long (nullable = true)
 |-- x8: long (nullable = true)
 |-- x9: long (nullable = true)



## Writing data

### CSV

In [4]:
# Write the DataFrame as CSV with a header row, replacing any existing output.
# Uses the built-in 'csv' source (the same name the CSV reading cell uses)
# rather than the legacy external package name 'com.databricks.spark.csv'.
# The target path becomes a directory holding one part file per partition.
df.write\
    .format('csv')\
    .mode('overwrite')\
    .option('header', 'true')\
    .save('/user/root/data.csv')

In [5]:
%%sh
# List the CSV output directory and print only field 8 of each listing line,
# which holds the entry's path; lines with fewer fields print as blank.
hdfs dfs -ls /user/root/data.csv | awk '{print $8}'


/user/root/data.csv/_SUCCESS
/user/root/data.csv/part-00000-88b1edd1-a941-495c-bdc2-2252437d637b-c000.csv
/user/root/data.csv/part-00001-88b1edd1-a941-495c-bdc2-2252437d637b-c000.csv


### As one CSV file

In [6]:
# Collapse the data into a single partition before writing so the output
# directory contains exactly one CSV part file.
# Uses the built-in 'csv' source (the same name the CSV reading cell uses)
# rather than the legacy external package name 'com.databricks.spark.csv'.
df.repartition(1).write\
    .format('csv')\
    .mode('overwrite')\
    .option('header', 'true')\
    .save('/user/root/data.csv')

In [7]:
%%sh
# List the CSV output directory again and print only field 8 of each listing
# line (the entry's path) to see how many part files were written.
hdfs dfs -ls /user/root/data.csv | awk '{print $8}'


/user/root/data.csv/_SUCCESS
/user/root/data.csv/part-00000-c024ed77-4edd-4606-93f2-69dc259460f9-c000.csv


### JSON

In [8]:
# Save the DataFrame in JSON format, overwriting whatever is already stored
# at the destination path.
(
    df.write
    .format('json')
    .mode('overwrite')
    .save('/user/root/data.json')
)

In [9]:
%%sh
# List the JSON output directory and print only field 8 of each listing line,
# which holds the entry's path.
hdfs dfs -ls /user/root/data.json | awk '{print $8}'


/user/root/data.json/_SUCCESS
/user/root/data.json/part-00000-1e8a9501-865f-4dd9-afd3-0818d3b1c3ab-c000.json
/user/root/data.json/part-00001-1e8a9501-865f-4dd9-afd3-0818d3b1c3ab-c000.json


### Parquet

In [10]:
# Save the DataFrame in Parquet format, overwriting whatever is already
# stored at the destination path.
(
    df.write
    .format('parquet')
    .mode('overwrite')
    .save('/user/root/data.parquet')
)

In [11]:
%%sh
# List the Parquet output directory and print only field 8 of each listing
# line, which holds the entry's path.
hdfs dfs -ls /user/root/data.parquet | awk '{print $8}'


/user/root/data.parquet/_SUCCESS
/user/root/data.parquet/part-00000-b9959723-1f8a-46f5-b1f7-f9c8c13b82e9-c000.snappy.parquet
/user/root/data.parquet/part-00001-b9959723-1f8a-46f5-b1f7-f9c8c13b82e9-c000.snappy.parquet


### Parquet with partitions

In [12]:
# Save the DataFrame as Parquet split by the `gender` column: each distinct
# value gets its own `gender=<value>` subdirectory. This targets the same path
# as the unpartitioned Parquet write, so 'overwrite' replaces that output.
(
    df.write
    .format('parquet')
    .mode('overwrite')
    .partitionBy('gender')
    .save('/user/root/data.parquet')
)

In [13]:
%%sh
# List the top level of the partitioned Parquet output and print only field 8
# of each listing line (the entry's path) to show the partition directories.
hdfs dfs -ls /user/root/data.parquet | awk '{print $8}'


/user/root/data.parquet/_SUCCESS
/user/root/data.parquet/gender=female
/user/root/data.parquet/gender=male


In [14]:
%%sh
# List the part files inside the gender=female partition directory, printing
# only field 8 of each listing line (the entry's path).
hdfs dfs -ls /user/root/data.parquet/gender=female | awk '{print $8}'


/user/root/data.parquet/gender=female/part-00000-cc86f798-414c-47d5-a890-e8c04e2462dd.c000.snappy.parquet
/user/root/data.parquet/gender=female/part-00001-cc86f798-414c-47d5-a890-e8c04e2462dd.c000.snappy.parquet


In [15]:
%%sh
# List the part files inside the gender=male partition directory, printing
# only field 8 of each listing line (the entry's path).
hdfs dfs -ls /user/root/data.parquet/gender=male | awk '{print $8}'


/user/root/data.parquet/gender=male/part-00000-cc86f798-414c-47d5-a890-e8c04e2462dd.c000.snappy.parquet
/user/root/data.parquet/gender=male/part-00001-cc86f798-414c-47d5-a890-e8c04e2462dd.c000.snappy.parquet


### ORC

In [16]:
# Save the DataFrame in ORC format, overwriting whatever is already stored
# at the destination path.
(
    df.write
    .format('orc')
    .mode('overwrite')
    .save('/user/root/data.orc')
)

In [17]:
%%sh
# List the ORC output directory and print only field 8 of each listing line,
# which holds the entry's path.
hdfs dfs -ls /user/root/data.orc | awk '{print $8}'


/user/root/data.orc/_SUCCESS
/user/root/data.orc/part-00000-4fc3d3d7-061c-404c-99d4-0c6d6ee063d0-c000.snappy.orc
/user/root/data.orc/part-00001-4fc3d3d7-061c-404c-99d4-0c6d6ee063d0-c000.snappy.orc


## Reading data

### Whole text file

In [18]:
# wholeTextFiles yields one (path, full file content) pair per matched file.
# NOTE(review): the glob matches *.csv at the HDFS root, not the
# /user/root/data.csv directory written earlier in this notebook; confirm
# that such a file exists before running this cell.
pair_rdd = sc.wholeTextFiles('hdfs://localhost/*.csv')

# first() fetches a single pair instead of collecting every matched file's
# full content to the driver just to look at one of them.
item = pair_rdd.first()

# Show the file path, then only the first 90 characters of its content.
print(item[0])
print(item[1][0:90])

hdfs://localhost/data.csv
x0,x1,x2,x3,x4,x5,x6,x7,x8,x9
14,22,25,63,47,52,13,14,23,27
35,80,38,28,73,69,21,16,76,53



### Text file, line by line

In [19]:
# textFile yields one RDD element per line of the file.
# NOTE(review): this reads data.csv at the HDFS root, not the
# /user/root/data.csv directory written earlier in this notebook.
rdd = sc.textFile('hdfs://localhost/data.csv')
# Bring only the first five lines back to the driver for display.
rdd.take(5)

['x0,x1,x2,x3,x4,x5,x6,x7,x8,x9',
 '14,22,25,63,47,52,13,14,23,27',
 '35,80,38,28,73,69,21,16,76,53',
 '46,37,46,55,78,68,61,62,81,82',
 '19,12,45,50,71,63,94,7,10,77']

### CSV

In [20]:
# Load the CSV output back into a DataFrame and display it. The first line of
# each file is treated as the header, and column types are inferred from the
# data instead of defaulting to strings.
(
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('/user/root/data.csv')
    .show()
)

+---+------+---+---+---+---+---+---+---+---+---+---+
| id|gender| x0| x1| x2| x3| x4| x5| x6| x7| x8| x9|
+---+------+---+---+---+---+---+---+---+---+---+---+
|  0|female| 65| 16| 65| 64| 81| 15| 90| 52| 54| 26|
|  1|  male| 61| 50| 83|100| 49| 62| 70|  5| 47| 44|
|  2|  male| 28| 54| 84|  9| 64| 82| 60| 86| 81| 41|
|  3|female| 74| 82| 77| 16| 90| 18| 20| 95| 31| 15|
|  4|female| 26| 64| 68| 96| 35| 36| 33| 59|  3|  1|
|  5|female|  8| 74| 60| 85| 64| 74| 68| 76| 70| 54|
|  6|  male| 36| 12| 12| 48| 58| 71| 17| 15| 41| 88|
|  7|female| 19| 31| 35| 57| 54| 65| 78| 49| 32| 21|
|  8|  male| 17| 90| 47| 86| 57| 59|  7| 24| 51| 43|
|  9|  male| 51| 26| 38| 39| 39| 95| 13| 27| 27| 63|
+---+------+---+---+---+---+---+---+---+---+---+---+



### JSON

In [21]:
# Load the JSON output back into a DataFrame and display it.
# The JSON reader always infers the schema from the data, so no 'inferSchema'
# option is needed (it is a CSV reader option and has no effect here).
# Inferred JSON columns come back ordered by name, so `gender` precedes `id`.
spark.read.format('json')\
    .load('/user/root/data.json')\
    .show()

+------+---+---+---+---+---+---+---+---+---+---+---+
|gender| id| x0| x1| x2| x3| x4| x5| x6| x7| x8| x9|
+------+---+---+---+---+---+---+---+---+---+---+---+
|female|  0| 65| 16| 65| 64| 81| 15| 90| 52| 54| 26|
|  male|  1| 61| 50| 83|100| 49| 62| 70|  5| 47| 44|
|  male|  2| 28| 54| 84|  9| 64| 82| 60| 86| 81| 41|
|female|  3| 74| 82| 77| 16| 90| 18| 20| 95| 31| 15|
|female|  4| 26| 64| 68| 96| 35| 36| 33| 59|  3|  1|
|female|  5|  8| 74| 60| 85| 64| 74| 68| 76| 70| 54|
|  male|  6| 36| 12| 12| 48| 58| 71| 17| 15| 41| 88|
|female|  7| 19| 31| 35| 57| 54| 65| 78| 49| 32| 21|
|  male|  8| 17| 90| 47| 86| 57| 59|  7| 24| 51| 43|
|  male|  9| 51| 26| 38| 39| 39| 95| 13| 27| 27| 63|
+------+---+---+---+---+---+---+---+---+---+---+---+



### Parquet

In [22]:
# Read the Parquet dataset (last written partitioned by `gender`) and display it;
# the partition column `gender` appears as the last column of the result.
spark.read.parquet('/user/root/data.parquet').show()

+---+---+---+---+---+---+---+---+---+---+---+------+
| id| x0| x1| x2| x3| x4| x5| x6| x7| x8| x9|gender|
+---+---+---+---+---+---+---+---+---+---+---+------+
|  6| 36| 12| 12| 48| 58| 71| 17| 15| 41| 88|  male|
|  8| 17| 90| 47| 86| 57| 59|  7| 24| 51| 43|  male|
|  9| 51| 26| 38| 39| 39| 95| 13| 27| 27| 63|  male|
|  0| 65| 16| 65| 64| 81| 15| 90| 52| 54| 26|female|
|  3| 74| 82| 77| 16| 90| 18| 20| 95| 31| 15|female|
|  4| 26| 64| 68| 96| 35| 36| 33| 59|  3|  1|female|
|  5|  8| 74| 60| 85| 64| 74| 68| 76| 70| 54|female|
|  7| 19| 31| 35| 57| 54| 65| 78| 49| 32| 21|female|
|  1| 61| 50| 83|100| 49| 62| 70|  5| 47| 44|  male|
|  2| 28| 54| 84|  9| 64| 82| 60| 86| 81| 41|  male|
+---+---+---+---+---+---+---+---+---+---+---+------+



### ORC

In [23]:
# Read the ORC output back into a DataFrame and display it.
spark.read.orc('/user/root/data.orc').show()

+---+------+---+---+---+---+---+---+---+---+---+---+
| id|gender| x0| x1| x2| x3| x4| x5| x6| x7| x8| x9|
+---+------+---+---+---+---+---+---+---+---+---+---+
|  0|female| 65| 16| 65| 64| 81| 15| 90| 52| 54| 26|
|  1|  male| 61| 50| 83|100| 49| 62| 70|  5| 47| 44|
|  2|  male| 28| 54| 84|  9| 64| 82| 60| 86| 81| 41|
|  3|female| 74| 82| 77| 16| 90| 18| 20| 95| 31| 15|
|  4|female| 26| 64| 68| 96| 35| 36| 33| 59|  3|  1|
|  5|female|  8| 74| 60| 85| 64| 74| 68| 76| 70| 54|
|  6|  male| 36| 12| 12| 48| 58| 71| 17| 15| 41| 88|
|  7|female| 19| 31| 35| 57| 54| 65| 78| 49| 32| 21|
|  8|  male| 17| 90| 47| 86| 57| 59|  7| 24| 51| 43|
|  9|  male| 51| 26| 38| 39| 39| 95| 13| 27| 27| 63|
+---+------+---+---+---+---+---+---+---+---+---+---+

