## Data download

In [1]:
import requests

In [3]:
# Location of the raw weather CSV to download.
url = 'https://pastebin.com/raw/TPXFpyYK'
# The timeout keeps this cell from hanging indefinitely on an unresponsive host.
req = requests.get(url, allow_redirects=True, timeout=30)
# Fail loudly on a 4xx/5xx response instead of saving an error page as the dataset.
req.raise_for_status()

# The context manager guarantees the file is flushed and closed before
# any later cell reads it.
with open('weather.csv', 'wb') as weather_file:
    bytes_written = weather_file.write(req.content)

# Last expression of the cell: display the number of bytes saved.
bytes_written

48301

## Data processing

In [4]:
import os
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

# Interpreter name PySpark should use when launching Python processes.
os.environ["PYSPARK_PYTHON"] = "python3"

# Path of the CSV file that the Spark job reads.
WEATHER_FILE = "weather.csv"

In [6]:
# Build a session in local mode ("local[*]"), or reuse one that already exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Extra Task")
    .getOrCreate()
)

In [37]:
# Read the CSV (first row is a header, column types are inferred) and
# rename its two columns to "date" and "temp".
df = (
    spark.read
    .csv(
        WEATHER_FILE,
        inferSchema=True,
        header=True,
    )
    .toDF("date", "temp")
)
df.show(5)

+----------+----+
|      date|temp|
+----------+----+
|2008-01-01|   0|
|2008-01-02|  -5|
|2008-01-03| -11|
|2008-01-04| -11|
|2008-01-05| -12|
+----------+----+
only showing top 5 rows



In [38]:
# Replace the full date with its year and month parts; keep the temperature.
df = df.select(
    F.year("date").alias("year"),
    F.month("date").alias("month"),
    df["temp"],
)

df.show(5)

+----+-----+----+
|year|month|temp|
+----+-----+----+
|2008|    1|   0|
|2008|    1|  -5|
|2008|    1| -11|
|2008|    1| -11|
|2008|    1| -12|
+----+-----+----+
only showing top 5 rows



In [39]:
# Average temperature for every (year, month) pair. The aggregate is left
# unaliased, so the result column is named "avg(temp)".
df = (
    df
    .groupBy("year", "month")
    .agg(F.avg("temp"))
)

df.show(5)

+----+-----+--------------------+
|year|month|           avg(temp)|
+----+-----+--------------------+
|2012|   10|    7.67741935483871|
|2010|    7|   27.06451612903226|
|2010|   12|  -7.935483870967742|
|2015|    2|-0.21428571428571427|
|2008|    8|  17.419354838709676|
+----+-----+--------------------+
only showing top 5 rows



In [40]:
# For each year, take the variance of its monthly average temperatures,
# then keep only the single year with the largest variance.
df = (
    df
    .groupBy("year")
    .agg(F.variance("avg(temp)").alias("month_temp_var"))
    .orderBy("month_temp_var", ascending=False)
    .limit(1)
)

df.show()

+----+------------------+
|year|    month_temp_var|
+----+------------------+
|2010|158.22513898260243|
+----+------------------+

