In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,date_format,datediff,to_timestamp

In [5]:
spark = SparkSession.builder.appName("learning").master('local').getOrCreate()
sc = spark.sparkContext

## *****PySpark DF Question******
## Filename: covid19.txt
## 1) Convert both the date columns to (yyyy-MM-dd) format.
## 2) Have an extra column and populate the number of days between these two days.
## 3) Partition the data by year columns and write to HDFS in parquet/avro/json format.

In [9]:
covid_df = spark.read.format('csv')\
                     .option('delimiter',',')\
                     .option('header','True')\
                     .option('inferSchema','True')\
                     .load('file:///home/saif/LFS/datasets/covid19.txt')
covid_df.printSchema()
covid_df.show(5)


root
 |-- D: string (nullable = true)
 |-- Y: integer (nullable = true)
 |-- Dt: string (nullable = true)
 |-- Wkd: string (nullable = true)
 |-- CM: string (nullable = true)
 |-- C: string (nullable = true)
 |-- Com: string (nullable = true)
 |-- TM: string (nullable = true)
 |-- M: string (nullable = true)
 |-- V: integer (nullable = true)
 |-- CL: long (nullable = true)

+-------+----+----------+---------+----------+---+---+---+---+---------+---------+
|      D|   Y|        Dt|      Wkd|        CM|  C|Com| TM|  M|        V|       CL|
+-------+----+----------+---------+----------+---+---+---+---+---------+---------+
|Exports|2015|31/01/2015| Saturday|01/02/2020|All|All|All|  $|257000000|257000000|
|Exports|2015|01/02/2015|   Sunday|02/02/2020|All|All|All|  $|123000000|380000000|
|Exports|2015|02/02/2015|   Monday|03/02/2020|All|All|All|  $|176000000|556000000|
|Exports|2015|03/02/2015|  Tuesday|04/02/2020|All|All|All|  $|115000000|671000000|
|Exports|2015|04/02/2015|Wednesday|05/02/2

In [41]:
## Melwin please Note : is the Date is in strinf format we need to most likely to convert using 
## to_timestamp
## Please refer the below example

In [19]:
covid_df = covid_df.select(
                col('D'),col('Y'),col('Wkd'),col('C'),col('Com'),col('TM'),col('M'),col('V'),col('CL'),
                date_format(to_timestamp(col('Dt'),'dd/MM/yyyy'),'yyyy-MM-dd').alias('Dt'),
                date_format(to_timestamp(col('CM'),'dd/MM/yyyy'),'yyyy-MM-dd').alias('CM')
                )
covid_df.show(5)

+-------+----+---------+---+---+---+---+---------+---------+----------+----------+
|      D|   Y|      Wkd|  C|Com| TM|  M|        V|       CL|        Dt|        CM|
+-------+----+---------+---+---+---+---+---------+---------+----------+----------+
|Exports|2015| Saturday|All|All|All|  $|257000000|257000000|2015-01-31|2020-02-01|
|Exports|2015|   Sunday|All|All|All|  $|123000000|380000000|2015-02-01|2020-02-02|
|Exports|2015|   Monday|All|All|All|  $|176000000|556000000|2015-02-02|2020-02-03|
|Exports|2015|  Tuesday|All|All|All|  $|115000000|671000000|2015-02-03|2020-02-04|
|Exports|2015|Wednesday|All|All|All|  $| 74000000|746000000|2015-02-04|2020-02-05|
+-------+----+---------+---+---+---+---+---------+---------+----------+----------+
only showing top 5 rows



In [21]:
df = covid_df.select(col('*'), 
    datediff(col('CM'),col('Dt')).alias('diffdate')
                   )

df.show(5)

+-------+----+---------+---+---+---+---+---------+---------+----------+----------+--------+
|      D|   Y|      Wkd|  C|Com| TM|  M|        V|       CL|        Dt|        CM|diffdate|
+-------+----+---------+---+---+---+---+---------+---------+----------+----------+--------+
|Exports|2015| Saturday|All|All|All|  $|257000000|257000000|2015-01-31|2020-02-01|    1827|
|Exports|2015|   Sunday|All|All|All|  $|123000000|380000000|2015-02-01|2020-02-02|    1827|
|Exports|2015|   Monday|All|All|All|  $|176000000|556000000|2015-02-02|2020-02-03|    1827|
|Exports|2015|  Tuesday|All|All|All|  $|115000000|671000000|2015-02-03|2020-02-04|    1827|
|Exports|2015|Wednesday|All|All|All|  $| 74000000|746000000|2015-02-04|2020-02-05|    1827|
+-------+----+---------+---+---+---+---+---------+---------+----------+----------+--------+
only showing top 5 rows



In [23]:
df.write.option("header",True) \
        .partitionBy("Y") \
        .mode("overwrite") \
        .json("hdfs://localhost:9000/user/saif/HFS/Output/df_op/covid18_json")

In [24]:
df.write.option("header",True) \
        .partitionBy("Y") \
        .mode("overwrite") \
        .parquet("hdfs://localhost:9000/user/saif/HFS/Output/df_op/covid18_parquet")

In [25]:
df.write.option("header", True) \
        .partitionBy("Y") \
        .mode("overwrite") \
        .format("com.databricks.spark.avro") \
        .save("hdfs://localhost:9000/user/saif/HFS/Output/df_op/covid18_avro")

## *****PySpark DF Question*****
## Filename: txns
## Metadata: txnno: String, txndate: String, custno: String, amount: String, category: String, product: String, city: String, state: String, spendby: String
## 1) Read the file & show 5 records.
## 2) Separate txndate in date, month, year Column. Convert date column into words (e.g. 01 --> Sunday, 02 --> Monday).
## 3) Find the sum of amount daywise.
## 4) Write the output data in json format.
## Output Columns:
## Date(Words),Sum(Amount)

In [51]:
txn_df = spark.read.format('csv')\
              .option('delimiter',',')\
              .option('header','True')\
              .option('inferSchema','True')\
              .load('file:///home/saif/LFS/datasets/txns')
txn_df.printSchema()
txn_df.show(5)

root
 |-- txnno: integer (nullable = true)
 |-- txndate: string (nullable = true)
 |-- custno: integer (nullable = true)
 |-- amount: double (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- spendby: string (nullable = true)

+-----+----------+-------+------+------------------+--------------------+-----------+----------+-------+
|txnno|   txndate| custno|amount|          category|             product|       city|     state|spendby|
+-----+----------+-------+------+------------------+--------------------+-----------+----------+-------+
|    0|06-26-2011|4007024| 40.33|Exercise & Fitness|Cardio Machine Ac...|Clarksville| Tennessee| credit|
|    1|05-26-2011|4006742|198.44|Exercise & Fitness|Weightlifting Gloves| Long Beach|California| credit|
|    2|06-01-2011|4009775|  5.58|Exercise & Fitness|Weightlifting Mac...|    Anaheim|California| credit|
|    3|06-05-2011|4

In [52]:
from pyspark.sql.functions import year, month,col,to_timestamp,dayofmonth,date_format

In [53]:
txn_df.select(
    col('txndate'),
    year(to_timestamp(col('txndate'),'MM-dd-yyyy')).alias('Year'),
    month(to_timestamp(col('txndate'),'MM-dd-yyyy')).alias('month'),
    dayofmonth(to_timestamp(col('txndate'),'MM-dd-yyyy')).alias('day'),
    date_format(to_timestamp(col('txndate'),'MM-dd-yyyy'),'EEEE').alias('weekday')
).show(5)

+----------+----+-----+---+---------+
|   txndate|Year|month|day|  weekday|
+----------+----+-----+---+---------+
|06-26-2011|2011|    6| 26|   Sunday|
|05-26-2011|2011|    5| 26| Thursday|
|06-01-2011|2011|    6|  1|Wednesday|
|06-05-2011|2011|    6|  5|   Sunday|
|12-17-2011|2011|   12| 17| Saturday|
+----------+----+-----+---+---------+
only showing top 5 rows



In [54]:
txn_df= txn_df.select(
    col('*'),
    year(to_timestamp(col('txndate'),'MM-dd-yyyy')).alias('Year'),
    month(to_timestamp(col('txndate'),'MM-dd-yyyy')).alias('month'),
    dayofmonth(to_timestamp(col('txndate'),'MM-dd-yyyy')).alias('day'),
    date_format(to_timestamp(col('txndate'),'MM-dd-yyyy'),'EEEE').alias('weekday')
).groupby('weekday').sum()

In [55]:
txn_df.show(5)

+---------+----------+-----------+------------------+---------+----------+--------+
|  weekday|sum(txnno)|sum(custno)|       sum(amount)|sum(Year)|sum(month)|sum(day)|
+---------+----------+-----------+------------------+---------+----------+--------+
|Wednesday| 656193507|54651928480|1398153.0799999963| 27442106|     88416|  217559|
|  Tuesday| 660684231|55289483879|1413094.4900000046| 27761855|     89376|  211615|
|   Friday| 653784437|54640840104|1387614.2200000007| 27436073|     90435|  212183|
| Thursday| 657094875|54924088380| 1410064.849999999| 27578854|     90044|  215351|
| Saturday| 656714364|54668300900|1394970.8200000047| 27450150|     87696|  209580|
+---------+----------+-----------+------------------+---------+----------+--------+
only showing top 5 rows



In [56]:
txn_df.write.option("header",True) \
        .mode("overwrite") \
        .json("hdfs://localhost:9000/user/saif/HFS/Output/df_op/txns_json")