In [1]:
from pyspark.sql import SparkSession

In [2]:
# HDFS URI passed to the session builder as `spark.sql.warehouse.dir`.
warehouse: str = "hdfs://sandbox-hdp.hortonworks.com:8020/api-transilien"

In [3]:
# Build (or reuse, via getOrCreate) a Hive-enabled Spark session on YARN.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("MS-SIO-HADOOP-PROJECT-SPARK-SQL-TMP")
    # Warehouse directory comes from the `warehouse` variable defined earlier.
    .config("spark.sql.warehouse.dir", warehouse)
    # Thrift endpoint of the Hive metastore on the sandbox host.
    .config("hive.metastore.uris", "thrift://sandbox-hdp.hortonworks.com:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Load the stations CSV from the local filesystem: comma-separated, first line
# is the header, and column types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("file:/root/ms-sio-hdp/api-transilien/transilien_line_l_stations_by_code.csv")
)

In [5]:
# Print up to 200 rows without truncating long cell values.
df.show(n=200, truncate=False)

+--------+---------------------------------------+-------------+-------------+
|station |label                                  |latitude     |longitude    |
+--------+---------------------------------------+-------------+-------------+
|87334482|NEUVILLE UNIVERSITE                    |49.0141135729|2.07888615344|
|87366922|SAINT-GERMAIN EN LAYE BEL AIR FOURQUEUX|48.8950238658|2.07194236158|
|87381111|PONT CARDINET                          |48.8875699026|2.31401853193|
|87381129|CLICHY LEVALLOIS                       |48.8969214742|2.29837992753|
|87381137|ASNIERES SUR SEINE                     |48.9057768994|2.28332241877|
|87381459|CONFLANS FIN D'OISE                    |48.9891848652|2.07455911528|
|87381657|ACHERES VILLE                          |48.9700946335|2.07739969014|
|87381905|CERGY PREFECTURE                       |49.0365179881|2.07971720177|
|87382002|BECON LES BRUYERES                     |48.9055805862|2.26857173893|
|87382200|COURBEVOIE                             |48

In [6]:
# HDFS URI used as the storage location of the `transilien` database.
db_location: str = "hdfs://sandbox-hdp.hortonworks.com:8020/api-transilien"

In [7]:
# Create the `transilien` database at `db_location` unless it already exists.
spark.sql(
    f'create database if not exists transilien location "{db_location}"'
)

DataFrame[]

In [8]:
# List the databases known to the session catalog; `transilien` should appear.
spark.catalog.listDatabases()

[Database(name='default', description='Default Hive database', locationUri='hdfs://sandbox-hdp.hortonworks.com:8020/apps/hive/warehouse'),
 Database(name='foodmart', description='', locationUri='hdfs://sandbox-hdp.hortonworks.com:8020/apps/hive/warehouse/foodmart.db'),
 Database(name='transilien', description='', locationUri='hdfs://sandbox-hdp.hortonworks.com:8020/api-transilien')]

In [9]:
# Make `transilien` the session's current database so that unqualified table
# names in later statements resolve against it.
spark.sql("use transilien")

DataFrame[]

In [10]:
# Persist the stations DataFrame as a table, replacing any existing copy.
# The table name is qualified with its database so the overwrite always lands
# in `transilien`, even when the session's current database is something else
# (an unqualified name is resolved against the current database, which would
# silently write to `default` if the `use transilien` cell had not been run).
df.write.mode("overwrite").saveAsTable("transilien.sncf_stations")

In [11]:
# List the tables of the current database to confirm `sncf_stations` exists.
spark.catalog.listTables()

[Table(name='averagewaitingtime', database='transilien', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='sncf_stations', database='transilien', description=None, tableType='MANAGED', isTemporary=False)]

In [12]:
# Read the saved table back through SQL and print up to 100 untruncated rows.
(
    spark
    .sql("SELECT * FROM sncf_stations")
    .show(n=100, truncate=False)
)

+--------+---------------------------------------+-------------+-------------+
|station |label                                  |latitude     |longitude    |
+--------+---------------------------------------+-------------+-------------+
|87334482|NEUVILLE UNIVERSITE                    |49.0141135729|2.07888615344|
|87366922|SAINT-GERMAIN EN LAYE BEL AIR FOURQUEUX|48.8950238658|2.07194236158|
|87381111|PONT CARDINET                          |48.8875699026|2.31401853193|
|87381129|CLICHY LEVALLOIS                       |48.8969214742|2.29837992753|
|87381137|ASNIERES SUR SEINE                     |48.9057768994|2.28332241877|
|87381459|CONFLANS FIN D'OISE                    |48.9891848652|2.07455911528|
|87381657|ACHERES VILLE                          |48.9700946335|2.07739969014|
|87381905|CERGY PREFECTURE                       |49.0365179881|2.07971720177|
|87382002|BECON LES BRUYERES                     |48.9055805862|2.26857173893|
|87382200|COURBEVOIE                             |48

In [14]:
# Count the rows stored in the table.
(
    spark
    .sql("SELECT * FROM sncf_stations")
    .count()
)

40

In [13]:
# Station code and label for every station whose latitude is below 49,
# ordered by station code; show() prints the first 20 rows with long labels
# truncated.
(
    spark
    .sql("SELECT station, label FROM sncf_stations WHERE latitude < 49 ORDER BY station")
    .show()
)

+--------+--------------------+
| station|               label|
+--------+--------------------+
|87366922|SAINT-GERMAIN EN ...|
|87381111|       PONT CARDINET|
|87381129|    CLICHY LEVALLOIS|
|87381137|  ASNIERES SUR SEINE|
|87381459| CONFLANS FIN D'OISE|
|87381657|       ACHERES VILLE|
|87382002|  BECON LES BRUYERES|
|87382200|          COURBEVOIE|
|87382218|          LA DEFENSE|
|87382259|GARCHES MARNES LA...|
|87382267|          VAUCRESSON|
|87382333|CHAVILLE RIVE DROITE|
|87382341|SEVRES VILLE D'AVRAY|
|87382358|         SAINT-CLOUD|
|87382366|         LE VAL D'OR|
|87382374|SURESNES MONT VAL...|
|87382382|             PUTEAUX|
|87382432|LA CELLE SAINT-CLOUD|
|87382440|            BOUGIVAL|
|87382457|        LOUVECIENNES|
+--------+--------------------+
only showing top 20 rows



In [None]:
# Stop the Spark session once the notebook is finished with it.
spark.stop()