# Connecting to Postgres
This notebook shows how to pass JDBC driver and connect to our Postgres

In [61]:
import findspark

findspark.add_jars('/app/postgresql-42.1.4.jar')
findspark.init()

In [62]:
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("pyspark-postgres")
    .config("spark.driver.memory", "512m")
    .config("spark.driver.cores", "1")
    .config("spark.executor.memory", "512m")
    .config("spark.executor.cores", "1")
    .config("spark.sql.shuffle.partitions", "2")
    .getOrCreate()
)

In [63]:
df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://postgres/workshop") \
    .option("dbtable", "workshop.weather_hourly") \
    .option("user", "workshop") \
    .option("password", "w0rkzh0p") \
    .option("driver", "org.postgresql.Driver") \
    .load()

In [64]:
df.printSchema()

root
 |-- dt: integer (nullable = true)
 |-- province: string (nullable = true)
 |-- temp: float (nullable = true)
 |-- weather: string (nullable = true)



In [None]:
from pyspark.sql.functions import desc

In [82]:
def get_main_weather(x):
    print(x)
    return x.weather.split('-')[0]

In [None]:
result_schema=StructType(
[
    StructField("weather",StringType(), False),
]

In [83]:
mainWeathers = df.rdd.map(lambda x: get_main_weather(x)).toDF(result_schema)

TypeError: Can not infer schema for type: <class 'str'>

In [84]:
mainWeathers

PythonRDD[41] at RDD at PythonRDD.scala:53

In [66]:
df.groupBy('weather').count().orderBy(desc('count')).show(10)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+-----+
|             weather|count|
+--------------------+-----+
|Clouds-overcast c...|   29|
|     Rain-light rain|   10|
|Clouds-broken clouds|    6|
|Rain-heavy intens...|    2|
|  Rain-moderate rain|    1|
+--------------------+-----+



                                                                                

In [None]:
from datetime import datetime
df = df.toPandas()
df['dt'] = df['dt'].apply(lambda x : datetime.fromtimestamp(x))

In [None]:
df.plot(x='dt')

In [None]:
spark.stop()