In [2]:
import findspark 
findspark.init() 
import pyspark # only run after findspark.init() 
from pyspark.sql import SparkSession 
import pyspark.sql.functions as F
# Start a Spark session, or reuse the one already running in this kernel.
spark = SparkSession.builder.getOrCreate()

# Smoke test: run a one-row SQL query and print the result.
df = spark.sql('''select 'spark' as hello ''')
df.show(n=20, truncate=True)

+-----+
|hello|
+-----+
|spark|
+-----+



In [3]:
# Load the 10-minute temperature readings from the semicolon-delimited text file.
# Schema inference is switched off, so every column is read as a string.
df_weather = (
    spark.read
    .option("header", "true")
    .option("delimiter", ";")
    .option("inferSchema", "false")
    .csv('wetter.txt')
)
df_weather.printSchema()

root
 |-- STATIONS_ID: string (nullable = true)
 |-- MESS_DATUM: string (nullable = true)
 |-- QN: string (nullable = true)
 |-- PP_10: string (nullable = true)
 |-- TT_10: string (nullable = true)
 |-- TM5_10: string (nullable = true)
 |-- RF_10: string (nullable = true)
 |-- TD_10: string (nullable = true)



In [4]:
# Preview the first 20 rows of the weather data frame.
df_weather.show(20)

+-----------+------------+-----+-------+------+------+------+------+
|STATIONS_ID|  MESS_DATUM|   QN|  PP_10| TT_10|TM5_10| RF_10| TD_10|
+-----------+------------+-----+-------+------+------+------+------+
|          3|199304281230|    1|  987.3|  24.9|  28.4|  23.0|   2.4|
|          3|199304281240|    1|  987.2|  24.9|  28.6|  21.0|   1.2|
|          3|199304281250|    1|  987.2|  25.5|  28.7|  20.0|   0.7|
|          3|199304281300|    1|  987.0|  25.8|  28.8|  20.0|   1.0|
|          3|199304281310|    1|  986.9|  25.8|  29.6|  20.0|   0.9|
|          3|199304281320|    1|  986.7|  25.7|  29.7|  19.0|   0.2|
|          3|199304281330|    1|  986.8|  26.0|  29.8|  20.0|   1.5|
|          3|199304281340|    1|  986.8|  26.1|  29.7|  18.0|   0.2|
|          3|199304281350|    1|  986.7|  27.0|  29.7|  19.0|   1.4|
|          3|199304281400|    1|  986.6|  26.5|  29.2|  17.0|  -0.4|
|          3|199304281410|    1|  986.5|  26.8|  30.1|  19.0|   1.6|
|          3|199304281420|    1|  

In [29]:
# German farmers' rule under test: "If July brings blazing heat, September will turn out well too."
# First half of the rule: keep the timestamp and TT_10 temperature for 1 July to 31 July 1993.
# MESS_DATUM is a fixed-width yyyyMMddHHmm string, so an inclusive string range selects the month.
df_july = (
    df_weather
    .filter(F.col("MESS_DATUM").between('199307010000', '199307312350'))
    .select("MESS_DATUM", "TT_10")
)
df_july.show(200)

+------------+------+
|  MESS_DATUM| TT_10|
+------------+------+
|199307010100|  17.6|
|199307010110|  17.5|
|199307010120|  17.3|
|199307010130|  17.1|
|199307010140|  16.9|
|199307010150|  16.7|
|199307010200|  16.5|
|199307010210|  16.3|
|199307010220|  16.2|
|199307010230|  16.0|
|199307010240|  16.0|
|199307010250|  15.9|
|199307010300|  15.7|
|199307010310|  15.5|
|199307010320|  15.4|
|199307010330|  15.3|
|199307010340|  15.3|
|199307010350|  15.1|
|199307010400|  15.1|
|199307010410|  15.0|
|199307010420|  14.9|
|199307010430|  15.0|
|199307010440|  14.9|
|199307010450|  14.7|
|199307010500|  14.8|
|199307010510|  14.8|
|199307010520|  14.7|
|199307010530|  14.7|
|199307010540|  14.8|
|199307010550|  14.9|
|199307010600|  15.0|
|199307010610|  15.3|
|199307010620|  15.7|
|199307010630|  15.7|
|199307010640|  16.1|
|199307010650|  16.3|
|199307010700|  16.7|
|199307010710|  17.4|
|199307010720|  17.8|
|199307010730|  18.5|
|199307010740|  19.2|
|199307010750|  19.4|
|199307010

In [30]:
# Drop the rows whose temperature reading is the missing-value marker (-999).
df_weather = df_weather.where(F.col("TT_10") != -999.0)

In [32]:
# Show the average temperature in July.
# df_july was selected from df_weather before the -999 missing-value rows were
# filtered out of df_weather, so the sentinel has to be excluded here; otherwise
# any missing reading in July would drag the mean down.
# The column is referenced by name on df_july itself (not via df_weather, which
# has been reassigned since) and cast explicitly because it was loaded as a string.
tt_july = F.col("TT_10").cast("double")
df_july_avg = (
    df_july
    .filter(tt_july != -999.0)
    .groupBy()
    .agg(F.avg(tt_july).alias('TT_july_avg'))
)
df_july_avg.show()

+------------------+
|       TT_july_avg|
+------------------+
|16.856591062205286|
+------------------+



In [33]:
# Second half of the rule: keep the timestamp and TT_10 temperature for
# 1 September to 30 September 1993 (inclusive range on the yyyyMMddHHmm string).
df_september = (
    df_weather
    .filter(F.col("MESS_DATUM").between('199309010000', '199309302350'))
    .select("MESS_DATUM", "TT_10")
)
df_september.show(200)

+------------+------+
|  MESS_DATUM| TT_10|
+------------+------+
|199309010100|  11.2|
|199309010110|  11.0|
|199309010120|  10.9|
|199309010130|  10.8|
|199309010140|  10.8|
|199309010150|  10.7|
|199309010200|  10.6|
|199309010210|  10.5|
|199309010220|  10.4|
|199309010230|  10.3|
|199309010240|  10.3|
|199309010250|  10.3|
|199309010300|  10.4|
|199309010310|  10.4|
|199309010320|  10.4|
|199309010330|  10.5|
|199309010340|  10.5|
|199309010350|  10.5|
|199309010400|  10.5|
|199309010410|  10.4|
|199309010420|  10.4|
|199309010430|  10.3|
|199309010440|  10.3|
|199309010450|  10.3|
|199309010500|  10.3|
|199309010510|  10.3|
|199309010520|  10.3|
|199309010530|  10.3|
|199309010540|  10.3|
|199309010550|  10.3|
|199309010600|  10.3|
|199309010610|  10.4|
|199309010620|  10.4|
|199309010630|  10.3|
|199309010640|  10.4|
|199309010650|  10.4|
|199309010700|  10.5|
|199309010710|  10.5|
|199309010720|  10.5|
|199309010730|  10.6|
|199309010740|  10.6|
|199309010750|  10.7|
|199309010

In [34]:
# Show the average temperature in September (a single-row aggregate over the whole frame).
df_september_avg = df_september.agg(
    F.avg("TT_10").alias('TT_september_avg')
)
df_september_avg.show()

+------------------+
|  TT_september_avg|
+------------------+
|13.211925754060312|
+------------------+

