<a href="https://colab.research.google.com/github/parkrye/Python/blob/main/202210_Bigdata/SparkSQL_%EC%8B%A4%EC%8A%B5_05_%EC%B5%9C%EC%A0%80_%EC%98%A8%EB%8F%84_%EA%B5%AC%ED%95%98%EA%B8%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Reuse the active Spark session if one exists; otherwise start one
# registered under the application name "MinTemperatures".
spark = (
    SparkSession.builder
    .appName("MinTemperatures")
    .getOrCreate()
)

스키마 정의하기

In [None]:
# Explicit schema for the header-less CSV; every column is nullable.
#   stationID    - station identifier, kept as a string
#   date         - read as an integer (e.g. 18000101 in the sample output)
#   measure_type - record kind such as TMAX / TMIN / PRCP
#   temperature  - measured value as a float; a later cell scales it by 0.1
#                  before applying the Celsius-to-Fahrenheit formula, so it is
#                  presumably tenths of a degree Celsius (confirm with the
#                  data source)
schema = (
    StructType()
    .add("stationID", StringType(), True)
    .add("date", IntegerType(), True)
    .add("measure_type", StringType(), True)
    .add("temperature", FloatType(), True)
)

In [None]:
# Location of the input data. The directory is a Windows-style local path,
# so adjust it when running on another machine.
filename = "1800.csv"
directory = "C:\\Users\\mhso_lec\\study_notebook\\data"

In [None]:
# Load the CSV from the local file system with the predefined schema
# (no schema inference), then print the resulting column types.
df = spark.read.csv(
    f"file:///{directory}\\{filename}",
    schema=schema,
)
df.printSchema()

root
 |-- stationID: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: float (nullable = true)



In [None]:
# Preview the loaded rows (20 is the default row count of show()).
df.show(n=20)

+-----------+--------+------------+-----------+
|  stationID|    date|measure_type|temperature|
+-----------+--------+------------+-----------+
|ITE00100554|18000101|        TMAX|      -75.0|
|ITE00100554|18000101|        TMIN|     -148.0|
|GM000010962|18000101|        PRCP|        0.0|
|EZE00100082|18000101|        TMAX|      -86.0|
|EZE00100082|18000101|        TMIN|     -135.0|
|ITE00100554|18000102|        TMAX|      -60.0|
|ITE00100554|18000102|        TMIN|     -125.0|
|GM000010962|18000102|        PRCP|        0.0|
|EZE00100082|18000102|        TMAX|      -44.0|
|EZE00100082|18000102|        TMIN|     -130.0|
|ITE00100554|18000103|        TMAX|      -23.0|
|ITE00100554|18000103|        TMIN|      -46.0|
|GM000010962|18000103|        PRCP|        4.0|
|EZE00100082|18000103|        TMAX|      -10.0|
|EZE00100082|18000103|        TMIN|      -73.0|
|ITE00100554|18000104|        TMAX|        0.0|
|ITE00100554|18000104|        TMIN|      -13.0|
|GM000010962|18000104|        PRCP|     

In [None]:
# Find the minimum temperatures: keep only the TMIN records.
minTemps = df.where(func.col("measure_type") == "TMIN")
minTemps.show()

+-----------+--------+------------+-----------+
|  stationID|    date|measure_type|temperature|
+-----------+--------+------------+-----------+
|ITE00100554|18000101|        TMIN|     -148.0|
|EZE00100082|18000101|        TMIN|     -135.0|
|ITE00100554|18000102|        TMIN|     -125.0|
|EZE00100082|18000102|        TMIN|     -130.0|
|ITE00100554|18000103|        TMIN|      -46.0|
|EZE00100082|18000103|        TMIN|      -73.0|
|ITE00100554|18000104|        TMIN|      -13.0|
|EZE00100082|18000104|        TMIN|      -74.0|
|ITE00100554|18000105|        TMIN|       -6.0|
|EZE00100082|18000105|        TMIN|      -58.0|
|ITE00100554|18000106|        TMIN|       13.0|
|EZE00100082|18000106|        TMIN|      -57.0|
|ITE00100554|18000107|        TMIN|       10.0|
|EZE00100082|18000107|        TMIN|      -50.0|
|ITE00100554|18000108|        TMIN|       14.0|
|EZE00100082|18000108|        TMIN|      -31.0|
|ITE00100554|18000109|        TMIN|       23.0|
|EZE00100082|18000109|        TMIN|      -46.0|

In [None]:
# Reduce the data for optimization: keep only the columns needed downstream,
# to avoid shuffling more data than necessary.
stationTemps = minTemps.select(func.col("stationID"), func.col("temperature"))
stationTemps.show()

+-----------+-----------+
|  stationID|temperature|
+-----------+-----------+
|ITE00100554|     -148.0|
|EZE00100082|     -135.0|
|ITE00100554|     -125.0|
|EZE00100082|     -130.0|
|ITE00100554|      -46.0|
|EZE00100082|      -73.0|
|ITE00100554|      -13.0|
|EZE00100082|      -74.0|
|ITE00100554|       -6.0|
|EZE00100082|      -58.0|
|ITE00100554|       13.0|
|EZE00100082|      -57.0|
|ITE00100554|       10.0|
|EZE00100082|      -50.0|
|ITE00100554|       14.0|
|EZE00100082|      -31.0|
|ITE00100554|       23.0|
|EZE00100082|      -46.0|
|ITE00100554|       31.0|
|EZE00100082|      -75.0|
+-----------+-----------+
only showing top 20 rows



In [None]:
# Lowest recorded value per station. The aggregate column is named
# "min(temperature)", which the following cell refers to by that name.
minTempsByStation = stationTemps.groupBy("stationID").agg(func.min("temperature"))
minTempsByStation.show()

+-----------+----------------+
|  stationID|min(temperature)|
+-----------+----------------+
|ITE00100554|          -148.0|
|EZE00100082|          -135.0|
+-----------+----------------+



In [None]:
# withColumn 함수를 사용해서 컬럼을 하나 추가
# Use withColumn to add one derived column: scale the per-station minimum
# by 0.1, apply the Celsius-to-Fahrenheit formula (x * 9/5 + 32), and round
# to 2 decimal places.
minTempsByStationF = minTempsByStation.withColumn(
    "temperature",
    func.round(
        minTempsByStation["min(temperature)"] * 0.1 * (9.0 / 5.0) + 32.0,
        2,
    ),
)
minTempsByStationF.show()

+-----------+----------------+-----------+
|  stationID|min(temperature)|temperature|
+-----------+----------------+-----------+
|ITE00100554|          -148.0|       5.36|
|EZE00100082|          -135.0|        7.7|
+-----------+----------------+-----------+



In [None]:
# Show only the station and the converted temperature by dropping the raw
# aggregate column.
minTempsByStationF.drop("min(temperature)").show()

+-----------+-----------+
|  stationID|temperature|
+-----------+-----------+
|ITE00100554|       5.36|
|EZE00100082|        7.7|
+-----------+-----------+



In [None]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()