<a href="https://colab.research.google.com/github/pksX01/PySpark_Tutorials/blob/main/Working_with_Hive_and_PySpark_in_Google_Cloud_Dataproc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Echo `sc` so the notebook renders its repr.
# NOTE(review): presumably the SparkContext pre-created by the Dataproc PySpark kernel; it is not defined in this notebook.
sc

In [None]:
# Echo `spark` so the notebook renders its repr.
# NOTE(review): presumably the SparkSession pre-created by the kernel; it is not defined in this notebook.
spark

In [None]:
# Load the stroke dataset from GCS as CSV, treating the first line as the header.
# No schema is supplied or inferred, so every column is read as a string.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('gs://datsets-for-big-data/stroke_data/healthcare-dataset-stroke-data.csv')
)

In [None]:
# Preview the first 5 rows to check that the header and columns were parsed as expected.
df.show(5)

+-----+------+---+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender|age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+---+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male| 67|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female| 61|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
|31112|  Male| 80|           0|            1|         Yes|      Private|         Rural|           105.92|32.5|   never smoked|     1|
|60182|Female| 49|           0|            0|         Yes|      Private|         Urban|           171.23|34.4|         smokes|     1|
| 1665|Female| 79|           1|            0|         Yes|Self

In [None]:
# Project the raw frame down to the four columns that are exported in the next cell.
new_df = df.select(
    ['id', 'gender', 'age', 'stroke']
)

In [None]:
# Write the selected columns as CSV (with a header row) to /user/spark/sample_stroke_data.
# mode('overwrite') keeps this cell re-runnable: without an explicit mode the writer
# raises an error when the target path already exists, which breaks any second run
# of the notebook.
new_df.write.mode('overwrite').option('header', 'true').csv('/user/spark/sample_stroke_data')

                                                                                

**Working with Hive Tables**

In [None]:
# List the databases visible to this Spark session.
spark.sql("show databases").show()

+----------+
| namespace|
+----------+
|   default|
|   finance|
|healthcare|
+----------+



In [None]:
# Load every column of the `finance.stocks` table into a DataFrame via Spark SQL.
stocks_df = spark.sql("select * from finance.stocks")

In [None]:
# Preview the first 5 rows. The captured output includes a row where every column
# is null; the aggregation cell removes such rows with dropna().
stocks_df.show(5)

+------------+------+------+------+------+-------+-------+
|trading_date|  open|  high|   low| close| volume|openint|
+------------+------+------+------+------+-------+-------+
|        null|  null|  null|  null|  null|   null|   null|
|  2010-07-21|24.333|24.333|23.946|23.946|43321.0|      0|
|  2010-07-22|24.644|24.644|24.362|24.487|18031.0|      0|
|  2010-07-23|24.759|24.759|24.314|24.507| 8897.0|      0|
|  2010-07-26|24.624|24.624|24.449|24.595|19443.0|      0|
+------------+------+------+------+------+-------+-------+
only showing top 5 rows



In [None]:
import pyspark.sql.functions as f
# Per-year mean of each price column.
# dropna() first discards every row that contains a null in any column, then the
# year is extracted from `trading_date` (parsed as yyyy-MM-dd) and used as the group key.
stocks_avg_df = (
    stocks_df
    .dropna()
    .withColumn('year', f.year(f.to_date('trading_date', 'yyyy-MM-dd')))
    .groupBy('year')
    .agg(
        f.avg('open').alias('average_open'),
        f.avg('close').alias('average_close'),
        f.avg('low').alias('average_low'),
        f.avg('high').alias('average_high'),
    )
)

In [None]:
# Display the per-year averages computed in the previous cell.
stocks_avg_df.show()

+----+------------------+----------------+------------------+------------------+
|year|      average_open|   average_close|       average_low|      average_high|
+----+------------------+----------------+------------------+------------------+
|2010|26.865070866272514|26.8716991728386| 26.77238933596991|26.936309814453125|
|2011|29.679443492060123|29.6788086766782|29.605217394621477|29.751608740765114|
+----+------------------+----------------+------------------+------------------+



In [None]:
%%writefile stocks_transformation.py

import sys
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

db = sys.argv[1]
tbl = sys.argv[2]

spark = SparkSession.builder.appName("Transformations on Stocks data").enableHiveSupport().getOrCreate()

stocks_df = spark.sql("select * from {}.{}".format(db, tbl))

transformed_stocks_df = stocks_df.dropna().withColumn('year', f.year(f.to_date('trading_date', 'yyyy-MM-dd'))).groupBy('year')\
        .agg(
            f.avg('open').alias('average_open'),\
            f.avg('close').alias('average_close'),\
            f.avg('low').alias('average_low'),\
            f.avg('high').alias('average_high'),\
        )

transformed_stocks_df.write.mode("overwrite").saveAsTable("finance.transformed_stocks")

Overwriting stocks_transformation.py


In [None]:
from google.cloud import storage


def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket and print a confirmation.

    Args:
        bucket_name: ID of the target GCS bucket, e.g. "your-bucket-name".
        source_file_name: Path of the local file to upload, e.g. "local/path/to/file".
        destination_blob_name: ID of the GCS object to create, e.g. "storage-object-name".

    Returns:
        None.
    """
    # The client is created with no explicit arguments; the bucket and blob
    # handles are then looked up by name before the upload call.
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    confirmation = "File {} uploaded to {}.".format(source_file_name, destination_blob_name)
    print(confirmation)


In [None]:
# Upload the local stocks_transformation.py (written by the %%writefile cell) to the
# bucket under python_files/.
# NOTE(review): presumably so it can be submitted as a Dataproc PySpark job — confirm.
upload_blob('datsets-for-big-data', 'stocks_transformation.py', 'python_files/stocks_transformation.py')

File stocks_transformation.py uploaded to python_files/stocks_transformation.py.
