In [30]:
from pyspark.sql import SparkSession

In [31]:
spark = SparkSession \
                    .builder \
                    .master("local") \
                    .appName("FirstSparkApp") \
                    .getOrCreate()

In [32]:
spark.sparkContext.getConf().getAll()

[('spark.master', 'local'),
 ('spark.driver.host', 'host.docker.internal'),
 ('spark.app.id', 'local-1571639740569'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.app.name', 'FirstSparkApp'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.port', '50887'),
 ('spark.eventLog.dir', 'file:/tmp/spark-events'),
 ('spark.ui.showConsoleProgress', 'true')]

In [33]:
spark

In [7]:
path = r".\search.json"

In [8]:
search = spark.read.json(path, multiLine=True)

In [15]:
type(search)

pyspark.sql.dataframe.DataFrame

In [9]:
search.printSchema()

root
 |-- designation: string (nullable = true)
 |-- discovery_date: string (nullable = true)
 |-- h_mag: string (nullable = true)
 |-- i_deg: string (nullable = true)
 |-- moid_au: string (nullable = true)
 |-- orbit_class: string (nullable = true)
 |-- period_yr: string (nullable = true)
 |-- pha: string (nullable = true)
 |-- q_au_1: string (nullable = true)
 |-- q_au_2: string (nullable = true)



In [10]:
search.describe()

DataFrame[summary: string, designation: string, discovery_date: string, h_mag: string, i_deg: string, moid_au: string, orbit_class: string, period_yr: string, pha: string, q_au_1: string, q_au_2: string]

In [11]:
search.show(n=1)

+------------------+--------------------+-----+-----+-------+-----------+---------+---+------+------+
|       designation|      discovery_date|h_mag|i_deg|moid_au|orbit_class|period_yr|pha|q_au_1|q_au_2|
+------------------+--------------------+-----+-----+-------+-----------+---------+---+------+------+
|419880 (2011 AH37)|2011-01-07T00:00:...| 19.7| 9.65|  0.035|     Apollo|     4.06|  Y|  0.84|  4.26|
+------------------+--------------------+-----+-----+-------+-----------+---------+---+------+------+
only showing top 1 row



In [12]:
search.take(5)

[Row(designation='419880 (2011 AH37)', discovery_date='2011-01-07T00:00:00.000', h_mag='19.7', i_deg='9.65', moid_au='0.035', orbit_class='Apollo', period_yr='4.06', pha='Y', q_au_1='0.84', q_au_2='4.26'),
 Row(designation='419624 (2010 SO16)', discovery_date='2010-09-17T00:00:00.000', h_mag='20.5', i_deg='14.52', moid_au='0.028', orbit_class='Apollo', period_yr='1', pha='Y', q_au_1='0.93', q_au_2='1.08'),
 Row(designation='414772 (2010 OC103)', discovery_date='2010-07-28T00:00:00.000', h_mag='19', i_deg='23.11', moid_au='0.333', orbit_class='Apollo', period_yr='1.31', pha='N', q_au_1='0.39', q_au_2='2'),
 Row(designation='414746 (2010 EH20)', discovery_date='2010-03-06T00:00:00.000', h_mag='18', i_deg='23.89', moid_au='0.268', orbit_class='Amor', period_yr='4.24', pha='N', q_au_1='1.25', q_au_2='3.99'),
 Row(designation='407324 (2010 OB101)', discovery_date='2010-07-18T00:00:00.000', h_mag='20.7', i_deg='9.12', moid_au='0.111', orbit_class='Apollo', period_yr='2.06', pha='N', q_au_1='

In [19]:
out_path = r".\search_log.csv"

In [20]:
search.write.save(out_path, format = "csv", header = True)

In [21]:
search_csv = spark.read.csv(out_path, header = True)

In [23]:
type(search_csv)

pyspark.sql.dataframe.DataFrame

In [24]:
search_csv.printSchema()

root
 |-- designation: string (nullable = true)
 |-- discovery_date: string (nullable = true)
 |-- h_mag: string (nullable = true)
 |-- i_deg: string (nullable = true)
 |-- moid_au: string (nullable = true)
 |-- orbit_class: string (nullable = true)
 |-- period_yr: string (nullable = true)
 |-- pha: string (nullable = true)
 |-- q_au_1: string (nullable = true)
 |-- q_au_2: string (nullable = true)



In [22]:
search_csv.show(n=2)

+------------------+--------------------+-----+-----+-------+-----------+---------+---+------+------+
|       designation|      discovery_date|h_mag|i_deg|moid_au|orbit_class|period_yr|pha|q_au_1|q_au_2|
+------------------+--------------------+-----+-----+-------+-----------+---------+---+------+------+
|419880 (2011 AH37)|2011-01-07T00:00:...| 19.7| 9.65|  0.035|     Apollo|     4.06|  Y|  0.84|  4.26|
|419624 (2010 SO16)|2010-09-17T00:00:...| 20.5|14.52|  0.028|     Apollo|        1|  Y|  0.93|  1.08|
+------------------+--------------------+-----+-----+-------+-----------+---------+---+------+------+
only showing top 2 rows



In [27]:
search_csv.select("orbit_class").show()

+-----------+
|orbit_class|
+-----------+
|     Apollo|
|     Apollo|
|     Apollo|
|       Amor|
|     Apollo|
|       Aten|
|     Apollo|
|     Apollo|
|     Apollo|
|     Apollo|
|       Aten|
|       Aten|
|     Apollo|
|       Amor|
|       Amor|
|     Apollo|
|       Amor|
|       Aten|
|     Apollo|
|     Apollo|
+-----------+
only showing top 20 rows



In [28]:
search_csv.take(1)

[Row(designation='419880 (2011 AH37)', discovery_date='2011-01-07T00:00:00.000', h_mag='19.7', i_deg='9.65', moid_au='0.035', orbit_class='Apollo', period_yr='4.06', pha='Y', q_au_1='0.84', q_au_2='4.26')]

In [37]:
spark.stop()