In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Build (or reuse) a SparkSession running on the local master, with the
# spark-xml jar registered through `spark.jars`.
spark = (
    SparkSession.builder
    .appName("learning")
    .master("local")
    .config('spark.jars', '/home/saif/LFS/jars/spark-xml_2.12-0.5.0.jar')
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

## *****PySpark DF Question*****
## 1) Find the employee count & cost to company for each group consisting of dept, cadre, and state?
## Filename: sales.txt

In [3]:
# Load the comma-delimited sales file: column names come from the header row
# and column types are inferred by Spark.
sales_df = (
    spark.read.format("csv")
    .options(delimiter=',', header='True', inferSchema='True')
    .load('file:///home/saif/LFS/datasets/sales.txt')
)

# Inspect the inferred schema and the first five rows (values not truncated).
sales_df.printSchema()
sales_df.show(5, truncate=False)

root
 |-- dept: string (nullable = true)
 |-- cadre: string (nullable = true)
 |-- costToCompany: integer (nullable = true)
 |-- state: string (nullable = true)

+-----+-------+-------------+-----+
|dept |cadre  |costToCompany|state|
+-----+-------+-------------+-----+
|Sales|Trainee|12000        |UK   |
|Sales|Lead   |32000        |AUS  |
|Sales|Lead   |32000        |NY   |
|Sales|Lead   |32000        |IND  |
|Sales|Lead   |32000        |AUS  |
+-----+-------+-------------+-----+
only showing top 5 rows



In [4]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum

In [5]:
# Total number of rows in sales_df (quick sanity check on the load).
sales_df.count()

11

In [6]:
# Group once by (dept, cadre, state) and reuse the grouping for both aggregates.
by_dept_cadre_state = sales_df.groupby('dept', 'cadre', 'state')

# Number of rows (employees) in each group.
by_dept_cadre_state.count().show()

# Total cost to company in each group, renamed from Spark's default
# "sum(costToCompany)" column name.
(
    by_dept_cadre_state
    .sum("costToCompany")
    .withColumnRenamed("sum(costToCompany)", "sumCostToCompany")
    .show()
)

+---------+---------+-----+-----+
|     dept|    cadre|state|count|
+---------+---------+-----+-----+
|Marketing|Associate|  IND|    2|
|    Sales|  Trainee|   UK|    1|
|    Sales|     Lead|   NY|    3|
|    Sales|     Lead|  IND|    2|
|       HR|  Manager|  IND|    1|
|    Sales|     Lead|  AUS|    2|
+---------+---------+-----+-----+

+---------+---------+-----+----------------+
|     dept|    cadre|state|sumCostToCompany|
+---------+---------+-----+----------------+
|Marketing|Associate|  IND|           36000|
|    Sales|  Trainee|   UK|           12000|
|    Sales|     Lead|   NY|           96000|
|    Sales|     Lead|  IND|           64000|
|       HR|  Manager|  IND|           58000|
|    Sales|     Lead|  AUS|           64000|
+---------+---------+-----+----------------+



<h2>
*****PySpark DF Question*****<br>
Filenames: txns<br>
1) Filter category by Gymnastics and Team Sports in one DF.<br>
2) Filter spendby by credit and save in another Df.<br>
3) Join these two DFs using DSL.<br>
4) Write the data to HDFS in XML format with root tag "txn" and row tag "records".<br>
</h2>

In [7]:
# Load the comma-delimited transactions file: column names come from the
# header row and column types are inferred by Spark.
txns_df = (
    spark.read.format("csv")
    .options(header='True', inferSchema='True', delimiter=',')
    .load('file:///home/saif/LFS/datasets/txns')
)

# Inspect the inferred schema and preview the first five rows.
txns_df.printSchema()
txns_df.show(5)

root
 |-- txnno: integer (nullable = true)
 |-- txndate: string (nullable = true)
 |-- custno: integer (nullable = true)
 |-- amount: double (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- spendby: string (nullable = true)

+-----+----------+-------+------+------------------+--------------------+-----------+----------+-------+
|txnno|   txndate| custno|amount|          category|             product|       city|     state|spendby|
+-----+----------+-------+------+------------------+--------------------+-----------+----------+-------+
|    0|06-26-2011|4007024| 40.33|Exercise & Fitness|Cardio Machine Ac...|Clarksville| Tennessee| credit|
|    1|05-26-2011|4006742|198.44|Exercise & Fitness|Weightlifting Gloves| Long Beach|California| credit|
|    2|06-01-2011|4009775|  5.58|Exercise & Fitness|Weightlifting Mac...|    Anaheim|California| credit|
|    3|06-05-2011|4

In [8]:
# Keep only transactions whose category is Gymnastics or Team Sports.
# Each comparison is parenthesized because `|` binds tighter than `==`.
is_gymnastics = (col("category") == 'Gymnastics')
is_team_sports = (col("category") == 'Team Sports')
df1 = txns_df.where(is_gymnastics | is_team_sports)

# Preview the filtered rows.
df1.show(5)

+-----+----------+-------+------+-----------+----------------+--------------+----------+-------+
|txnno|   txndate| custno|amount|   category|         product|          city|     state|spendby|
+-----+----------+-------+------+-----------+----------------+--------------+----------+-------+
|    3|06-05-2011|4002199|198.19| Gymnastics|Gymnastics Rings|     Milwaukee| Wisconsin| credit|
|    4|12-17-2011|4002613| 98.81|Team Sports|    Field Hockey|   Nashville  | Tennessee| credit|
|   13|03-13-2011|4003268| 107.8|Team Sports|    Field Hockey|    Honolulu  |    Hawaii| credit|
|   14|02-25-2011|4004613| 36.81| Gymnastics| Vaulting Horses|   Los Angeles|California| credit|
|   18|11-18-2011|4002444| 88.65|Team Sports|        Baseball|Salt Lake City|      Utah| credit|
+-----+----------+-------+------+-----------+----------------+--------------+----------+-------+
only showing top 5 rows



In [9]:
# Melwin's note: each individual condition must be wrapped in "( )".

# The example below will not work, because Python's `|` operator binds more tightly than `==`:
# txns_df.where( col("category") == 'Gymnastics' | col("category") == 'Team Sports')   # throws an error

# So the parentheses around each condition are mandatory:
#  txns_df.where( (col("category") == 'Gymnastics') | (col("category") == 'Team Sports'))

In [10]:
# Keep only transactions that were paid by credit.
paid_by_credit = col("spendby") == 'credit'
df2 = txns_df.where(paid_by_credit)

# Preview the filtered rows.
df2.show(5)

+-----+----------+-------+------+------------------+--------------------+-----------+----------+-------+
|txnno|   txndate| custno|amount|          category|             product|       city|     state|spendby|
+-----+----------+-------+------+------------------+--------------------+-----------+----------+-------+
|    0|06-26-2011|4007024| 40.33|Exercise & Fitness|Cardio Machine Ac...|Clarksville| Tennessee| credit|
|    1|05-26-2011|4006742|198.44|Exercise & Fitness|Weightlifting Gloves| Long Beach|California| credit|
|    2|06-01-2011|4009775|  5.58|Exercise & Fitness|Weightlifting Mac...|    Anaheim|California| credit|
|    3|06-05-2011|4002199|198.19|        Gymnastics|    Gymnastics Rings|  Milwaukee| Wisconsin| credit|
|    4|12-17-2011|4002613| 98.81|       Team Sports|        Field Hockey|Nashville  | Tennessee| credit|
+-----+----------+-------+------+------------------+--------------------+-----------+----------+-------+
only showing top 5 rows



In [11]:
# Inner-join the category-filtered rows (df1) with the credit-filtered rows (df2)
# on transaction number, keeping only df1's columns.
#
# df1 and df2 are both derived from txns_df, so `df1.txnno == df2.txnno` refers to
# the same underlying column on both sides and only works through Spark's
# self-join auto-resolution. Explicit aliases make each side of the join
# unambiguous and do not depend on that behavior.
condition = (col("txn_category.txnno") == col("txn_credit.txnno"))
df3 = (
    df1.alias("txn_category")
    .join(df2.alias("txn_credit"), condition, "inner")
    # Same nine columns, in the same order, as df1 (the left side).
    .select("txn_category.*")
)
df3.show(5)

+-----+----------+-------+------+-----------+----------------+--------------+----------+-------+
|txnno|   txndate| custno|amount|   category|         product|          city|     state|spendby|
+-----+----------+-------+------+-----------+----------------+--------------+----------+-------+
|    3|06-05-2011|4002199|198.19| Gymnastics|Gymnastics Rings|     Milwaukee| Wisconsin| credit|
|    4|12-17-2011|4002613| 98.81|Team Sports|    Field Hockey|   Nashville  | Tennessee| credit|
|   13|03-13-2011|4003268| 107.8|Team Sports|    Field Hockey|    Honolulu  |    Hawaii| credit|
|   14|02-25-2011|4004613| 36.81| Gymnastics| Vaulting Horses|   Los Angeles|California| credit|
|   18|11-18-2011|4002444| 88.65|Team Sports|        Baseball|Salt Lake City|      Utah| credit|
+-----+----------+-------+------+-----------+----------------+--------------+----------+-------+
only showing top 5 rows



In [12]:
    # Write the joined result to HDFS as XML through the spark-xml data source,
    # overwriting any existing output at the target path.
    (
        df3.write.format("com.databricks.spark.xml")
        .mode('overwrite')
        .option("rootTag", "txn")
        .option("rowTag", "records")
        .save("hdfs://localhost:9000/user/saif/HFS/Input/txns_xml")
    )