In [1]:
import requests
from hdfs import InsecureClient
from pyspark.sql import SparkSession

# Pipeline for this cell: download the CSV over HTTPS, stage it on the local
# filesystem, upload it to HDFS, then read it back through Spark and run a
# small SQL query against it.

# Download file:
url = "https://msi.nga.mil/api/publications/download?type=view&key=16920959/SFH00000/UpdatedPub150.csv"
local_csv_path = "/tmp/UpdatedPub150.csv"
# NOTE(review): verify=False disables TLS certificate verification, so the
# download is open to man-in-the-middle tampering. Presumably the server's
# certificate chain is not in the default trust store -- confirm, and prefer
# verify="/path/to/ca-bundle.pem" over turning verification off.
# The timeout keeps a stalled server from hanging the cell forever.
response = requests.get(url, verify=False, timeout=60)
# Fail here on a 4xx/5xx instead of saving an error page as the "CSV" and
# overwriting the good copy in HDFS with it.
response.raise_for_status()

with open(local_csv_path, 'wb') as file:
    file.write(response.content)

# Connect to HDFS (HTTP endpoint of the namenode):
hdfs_client = InsecureClient('http://hdfs-namenode:9870', user='root')

# Upload the file to HDFS, replacing any previous copy:
hdfs_path = '/data/UpdatedPub150.csv'
hdfs_client.upload(hdfs_path, local_csv_path, overwrite=True)

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Read CSV with PySpark") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "1") \
    .config("spark.cores.max", "1") \
    .config("spark.driver.memory", "2g") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

# Define the path to the CSV file in HDFS (same file as hdfs_path above,
# addressed through the hdfs:// scheme that Spark reads from):
hdfs_csv_path = "hdfs://hdfs-namenode:8020/data/UpdatedPub150.csv"

# Read the CSV file into a DataFrame; the first row supplies column names and
# column types are inferred from the data.
df = spark.read.csv(hdfs_csv_path, header=True, inferSchema=True)

# Create a temporary SQL view
df.createOrReplaceTempView("temporary_view")

# Query the temporary view using SQL (backticks are required because the
# column names contain spaces)
query_result = spark.sql("SELECT `Main Port Name`, `Latitude`, `Longitude` FROM temporary_view LIMIT 10")

# Show the result of the query
query_result.show()

# Optional: Print the schema of the DataFrame
df.printSchema()



+--------------------+----------+----------+
|      Main Port Name|  Latitude| Longitude|
+--------------------+----------+----------+
|              Maurer| 40.533333|    -74.25|
|             Iharana|    -13.35|      50.0|
|             Andoany|     -13.4|      48.3|
|         Chake Chake|     -5.25| 39.766667|
|   Mjimwema Terminal| -6.816667| 39.366667|
|      Delta Terminal|     36.85| 36.166667|
|  Cinta Oil Terminal|      -5.5|106.233333|
|        Europa Point| 36.133333|     -5.35|
|          New Harbor| 43.866667|-69.483333|
|Dtse / Gegua Oil ...|-22.816667|    -43.15|
+--------------------+----------+----------+

root
 |-- OID_: double (nullable = true)
 |-- World Port Index Number: double (nullable = true)
 |-- Region Name: string (nullable = true)
 |-- Main Port Name: string (nullable = true)
 |-- Alternate Port Name: string (nullable = true)
 |-- UN/LOCODE: string (nullable = true)
 |-- Country Code: string (nullable = true)
 |-- World Water Body: string (nullable = true)
