In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('c8').getOrCreate()

In [3]:
# writing csv files

In [4]:
csvFile = spark.read.format('csv')\
.option("header","true")\
.option("mode","FAILFAST")\
.option("inferSchema","true")\
.load('flight-data/csv/2010-summary.csv')

In [6]:
# for instance, we can take out CSV file and write it out as a TSV file quite easily.

In [7]:
csvFile.write.format("csv").mode("overwrite").option("sep", "\t")\
.save("/tmp/my-tsv-file.tsv")

In [8]:
# reading json files

In [9]:
spark.read.format('json').option("mode","FAILFAST")\
.option("inferSchema","true")\
.load('flight-data/json/2010-summary.json').show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [10]:
#writing json files

In [11]:
csvFile.write.format("json").mode("overwrite").save("/tmp/my-json-file.json")

In [12]:
# reading parquet files

In [13]:
spark.read.format('parquet')\
.load('flight-data/parquet/2010-summary.parquet').show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [14]:
# writing parquet files

In [15]:
csvFile.write.format('parquet').mode("overwrite").save('/tmp/my-parquet-file.parquet')

In [16]:
# read orc files

In [17]:
spark.read.format('orc').load('flight-data/orc/2010-summary.orc').show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [18]:
# writing orc files

In [19]:
csvFile.write.format('orc').mode("overwrite").save('/tmp/my-json-file.orc')

In [20]:
# reading from sql databases

In [21]:
driver = "org.sqlite.JDBC"

In [22]:
path = '/flight-data/jdbc/my-sqlite.db'

In [23]:
url = "jdbc:sqlite:" + path

In [24]:
tablename = "flight_info"

In [25]:
dbDataFrame = spark.read.format("jdbc").option("url",url)\
.option("dbtable", tablename).option("driver", driver).load()

Py4JJavaError: An error occurred while calling o174.load.
: java.lang.ClassNotFoundException: org.sqlite.JDBC
	at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:38)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$6.apply(JDBCOptions.scala:78)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$6.apply(JDBCOptions.scala:78)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:78)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:34)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:32)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:330)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:152)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:125)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [26]:
# skipping sql read and write

In [27]:
# reading text files

In [28]:
spark.read.text('flight-data/csv/2010-summary.csv')\
.selectExpr("split(value, ',') as rows").show()

+--------------------+
|                rows|
+--------------------+
|[DEST_COUNTRY_NAM...|
|[United States, R...|
|[United States, I...|
|[United States, I...|
|[Egypt, United St...|
|[Equatorial Guine...|
|[United States, S...|
|[United States, G...|
|[Costa Rica, Unit...|
|[Senegal, United ...|
|[United States, M...|
|[Guyana, United S...|
|[United States, S...|
|[Malta, United St...|
|[Bolivia, United ...|
|[Anguilla, United...|
|[Turks and Caicos...|
|[United States, A...|
|[Saint Vincent an...|
|[Italy, United St...|
+--------------------+
only showing top 20 rows



In [29]:
# writing text files

In [30]:
csvFile.select("DEST_COUNTRY_NAME").write.text("/tmp/simple-text-file.txt")

In [31]:
#partition writing text files

In [32]:
csvFile.limit(10).select("DEST_COUNTRY_NAME","count")\
.write.partitionBy("count").text("/tmp/five-csv-files2py.csv")

In [33]:
#writing data in parallel

In [34]:
csvFile.repartition(5).write.format("csv").save("/tmp/multiple.csv")

In [35]:
# partition

In [36]:
csvFile.limit(10).write.mode("overwrite").partitionBy("DEST_COUNTRY_NAME")\
.save("/tmp/partitioned-files.parquet")