In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F

In [2]:
# Reuse the active Spark session when one exists; otherwise start a new one
# under this application name.
app_name = "spark-review"
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

In [3]:
# Location of the ratings-and-sentiments CSV export.
rs_file = "/FileStore/tables/ratings_and_sentiments.csv"

# The first row holds the column names; column types are inferred from the data.
df_rs = (
    spark.read
    .format("com.databricks.spark.csv")
    .options(header='true', inferSchema="true")
    .load(rs_file)
)

# Print the inferred schema so the column names and types can be checked.
df_rs.printSchema()

In [4]:
import os

import boto3

# Which secret to fetch, and from which AWS region.
secret_name = "ut/postgres/db"
region_name = "us-east-2"

# Never paste AWS keys into the notebook. They are read from the standard AWS
# environment variables instead. Any variable that is unset yields None, and
# when all of them are None boto3 falls back to its default credential chain
# (shared credentials file, instance profile, ...).
access_key = os.environ.get("AWS_ACCESS_KEY_ID")
secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
# Temporary credentials are only valid together with their session token, so
# forward it as well when it is present.
session_token = os.environ.get("AWS_SESSION_TOKEN")

session = boto3.session.Session(
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    aws_session_token=session_token,
    region_name=region_name,
)
client = session.client('secretsmanager')

# Response from Secrets Manager; a later cell reads its 'SecretString' entry.
# Do not display this value in cell output: it carries the database credentials.
secret_value = client.get_secret_value(SecretId=secret_name)

In [5]:
import json
def get_connection(secret_value):
  """Parse the JSON text stored under 'SecretString' in a secret response.

  Args:
    secret_value: Mapping that holds a JSON document as text under the
      'SecretString' key.

  Returns:
    The Python object decoded from that JSON text.

  Raises:
    KeyError: If 'SecretString' is not present in ``secret_value``.
    json.JSONDecodeError: If the stored text is not valid JSON.
  """
  secret_string = secret_value['SecretString']
  return json.loads(secret_string)
# get_connection(secret_value)

In [6]:
# Decode the fetched secret into a dict of connection settings.
connection = get_connection(secret_value)

# Postgres connection pieces: host, port and login come from the secret,
# while the dialect and database name are fixed here.
dialect = "postgresql"
jdbcDatabase = "postgres"
jdbcHostname = connection['host']
jdbcPort = connection['port']
jdbcUsername = connection['username']
jdbcPassword = connection['password']

jdbcUrl = f"jdbc:{dialect}://{jdbcHostname}:{jdbcPort}/{jdbcDatabase}"

# Properties handed to the JDBC write further down.
# For MySQL the driver would be com.mysql.jdbc.Driver.
connectionProperties = dict(
  user=jdbcUsername,
  password=jdbcPassword,
  driver="org.postgresql.Driver",
)

In [7]:
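# Reference DDL for the review_ratings table that this notebook writes via JDBC:
#
# CREATE TABLE review_ratings (
#   label TEXT,  -- This will be the review text
#   ratings INTEGER,
#   review_length INTEGER -- length of the review text
# )
#
# NOTE(review): the notebook stores F.mean("num_rating") in `ratings`, which is
# an average rather than a whole number -- confirm that INTEGER is intended.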

In [8]:
# Table name and save mode as set by this cell.
mode = "overwrite"
table = "coffee_rating"

# Derive "label" from review_text: capture group 1 is the text that follows the
# first digits/digits/digits token (presumably a date -- confirm against the
# data) and the single whitespace character after it.
df_rs = df_rs.withColumn(
    "label",
    F.regexp_extract("review_text", r'\d+/\d+/\d+(?:\s)(.*)', 1),
)

# Show the first row to check the new column.
df_rs.head()

In [9]:
# Destination table and save mode for the JDBC write below.
mode = "overwrite"
table = "review_ratings"

# One row per distinct label: the mean of num_rating as "ratings", plus the
# character length of the label as "review_length". The result is written to
# the database over JDBC.
(
    df_rs
    .groupBy("label")
    .agg(F.mean("num_rating").alias("ratings"))
    .withColumn("review_length", F.length("label"))
    .write
    .jdbc(url=jdbcUrl, table=table, mode=mode, properties=connectionProperties)
)