In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split

In [3]:
spark = SparkSession.builder.appName("learning").master("local").getOrCreate()
sc = spark.sparkContext

## *****PySpark DF Question*****
## 1) Input:
## Col1
## Saif=A
## Ram=B
## Ram=B
## Mitali=C
## Mitali=C
## Mitali=C
## 
## Output: 
## Key	Value
## Saif	1
## Ram	2
## Mitali 	3

In [4]:
data = ['Saif=A','Ram=B','Ram=B','Mitali=C','Mitali=C','Mitali=C']

df = spark.createDataFrame(data,'string').toDF('Col')
df.printSchema()
df.show()

root
 |-- Col: string (nullable = true)

+--------+
|     Col|
+--------+
|  Saif=A|
|   Ram=B|
|   Ram=B|
|Mitali=C|
|Mitali=C|
|Mitali=C|
+--------+



In [11]:
df.select(col('*'), split(df['Col'],'=').getItem(0).alias('Name'),
                    split(df['Col'],'=').getItem(1).alias('Section'))\
                    .groupby('Name').count().orderBy('count')\
                    .withColumnRenamed('count','value')\
                    .show()

+------+-----+
|  Name|value|
+------+-----+
|  Saif|    1|
|   Ram|    2|
|Mitali|    3|
+------+-----+



## *****PySpark DF Question*****
## 1) You are provided with sales.txt file at location /home/saif/LFS/cca175/. 
## 
## Output: 
## Find the employee count & cost to company for each group consisting of dept, cadre, and 
## state. Compress the output using gzip compression & write the data of costToCompany 
## greater than 50000 to location /user/saif/HFS/CCA_175/Output in a single file. 

In [13]:
sales_df = spark.read.format("csv")\
                     .option('header','True')\
                     .option('delimiter',',')\
                     .option('inferSchema','True')\
                     .load('file:///home/saif/LFS/datasets/sales.txt')

sales_df.printSchema()
sales_df.show(5)

root
 |-- dept: string (nullable = true)
 |-- cadre: string (nullable = true)
 |-- costToCompany: integer (nullable = true)
 |-- state: string (nullable = true)

+-----+-------+-------------+-----+
| dept|  cadre|costToCompany|state|
+-----+-------+-------------+-----+
|Sales|Trainee|        12000|   UK|
|Sales|   Lead|        32000|  AUS|
|Sales|   Lead|        32000|   NY|
|Sales|   Lead|        32000|  IND|
|Sales|   Lead|        32000|  AUS|
+-----+-------+-------------+-----+
only showing top 5 rows



In [23]:
sales_df.count()

11

In [22]:
sales_df.groupby('dept').sum().show()
sales_df.groupby('cadre').sum().show()
sales_df.groupby('state').sum().show()

+---------+------------------+
|     dept|sum(costToCompany)|
+---------+------------------+
|    Sales|            236000|
|       HR|             58000|
|Marketing|             36000|
+---------+------------------+

+---------+------------------+
|    cadre|sum(costToCompany)|
+---------+------------------+
|Associate|             36000|
|     Lead|            224000|
|  Trainee|             12000|
|  Manager|             58000|
+---------+------------------+

+-----+------------------+
|state|sum(costToCompany)|
+-----+------------------+
|  AUS|             64000|
|   NY|             96000|
|   UK|             12000|
|  IND|            158000|
+-----+------------------+



In [21]:
sales_df.select(col('dept'),col('cadre'),col('state'),col('costToCompany'))\
        .groupby(['dept','cadre','state'])\
        .sum('costToCompany').withColumnRenamed('sum(costToCompany)','CostToCompany_Sum')\
        .show()

+---------+---------+-----+-----------------+
|     dept|    cadre|state|CostToCompany_Sum|
+---------+---------+-----+-----------------+
|Marketing|Associate|  IND|            36000|
|    Sales|  Trainee|   UK|            12000|
|    Sales|     Lead|   NY|            96000|
|    Sales|     Lead|  IND|            64000|
|       HR|  Manager|  IND|            58000|
|    Sales|     Lead|  AUS|            64000|
+---------+---------+-----+-----------------+



In [24]:
'''
Compress the output using gzip compression & write the data of costToCompany  
greater than 50000 to location /user/saif/HFS/CCA_175/Output in a single file.
'''
df2 = sales_df.filter(col("costToCompany") > 50000)
df2.show()

+----+-------+-------------+-----+
|dept|  cadre|costToCompany|state|
+----+-------+-------------+-----+
|  HR|Manager|        58000|  IND|
+----+-------+-------------+-----+



In [25]:
df2.write.format('csv')\
         .mode('overwrite')\
        .option("codec", "org.apache.hadoop.io.compress.GzipCodec")\
        .save('hdfs://localhost:9000/user/saif/HFS/Output/df_op/30thdec_gzip_format')