In [None]:
# Locate the local Spark installation and make the pyspark package importable.
# This has to run before any `pyspark` import in the cells that follow.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Start (or reuse) a SparkSession that runs locally on two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("DF Operations")
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,FloatType,DateType,TimestampType,ArrayType

In [None]:
# Explicit schema for the transactions file; every field is nullable (the default).
txnSchema = (
    StructType()
    .add("txn_id", IntegerType())
    .add("txn_date", DateType())
    .add("cid", StringType())
    .add("amount", FloatType())
    .add("prod_cat", StringType())
    .add("prod", StringType())
    .add("city", StringType())
    .add("state", StringType())
    .add("mode", StringType())
)

In [None]:
# Load the headered transactions CSV with the explicit schema.
# PERMISSIVE mode keeps malformed records (unparseable fields become null)
# instead of failing the read; txn_date values are parsed with the "M-dd-y" pattern.
txnDF = spark.read.csv(
    "d:\\data\\txn_with_header.txt",
    schema=txnSchema,
    header=True,
    mode="PERMISSIVE",
    dateFormat="M-dd-y",
)

In [9]:
# Preview the first five rows to check that the schema and date format were applied.
txnDF.show(5)

+------+----------+-------+------+------------------+--------------------+----------+----------+------+
|txn_id|  txn_date|    cid|amount|          prod_cat|                prod|      city|     state|  mode|
+------+----------+-------+------+------------------+--------------------+----------+----------+------+
|     1|2011-05-26|4006742| 98.44|Exercise & Fitness|Weightlifting Gloves|Long Beach|California|credit|
|     2|2011-06-01|4009775|  5.58|Exercise & Fitness|Weightlifting Mac...|   Anaheim|California|credit|
|     3|2011-06-05|4002199|198.19|        Gymnastics|    Gymnastics Rings| Milwaukee| Wisconsin|credit|
|     4|2011-12-17|4002613| 98.81|       Team Sports|        Field Hockey| Nashville| Tennessee|credit|
|     5|2011-02-14|4007591|193.63|Outdoor Recreation|Camping & Backpac...|   Chicago|  Illinois|credit|
+------+----------+-------+------+------------------+--------------------+----------+----------+------+
only showing top 5 rows



In [10]:
# 1. Rename column prod -> Product
# 2. Change the type of txn_id to String
# 3. Add a new column Year (extracted from txn_date)
# 4. Add a new column Trans_Category:
#        T1 if amount < 20
#        T2 if amount < 50
#        T3 otherwise


In [42]:
from pyspark.sql.functions import col,year,month,when,lit,concat,to_date

ImportError: cannot import name 'make_date' from 'pyspark.sql.functions' (D:\spark-2.4.7-bin-hadoop2.7\python\pyspark\sql\functions.py)

In [27]:
# Derive the reporting columns from the raw transactions:
#   * prod is renamed to Product
#   * txn_id is cast from integer to string
#   * Year and Month are extracted from txn_date
#   * Trans_Category buckets the amount: T1 (< 20), T2 (< 50), T3 (everything else)
txnDFnew = (
    txnDF
    .withColumnRenamed("prod", "Product")
    .withColumn("txn_id", col("txn_id").cast("string"))
    .withColumn("Year", year("txn_date"))
    .withColumn("Month", month("txn_date"))
    .withColumn(
        "Trans_Category",
        when(col("amount") < 20, "T1")
        .when(col("amount") < 50, "T2")
        .otherwise("T3"),
    )
)

# Month-wise sales

In [28]:
# Confirm the rename, the txn_id cast to string, and the three added columns.
txnDFnew.printSchema()

root
 |-- txn_id: string (nullable = true)
 |-- txn_date: date (nullable = true)
 |-- cid: string (nullable = true)
 |-- amount: float (nullable = true)
 |-- prod_cat: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Trans_Category: string (nullable = false)



In [29]:
# Spot-check the derived Year, Month and Trans_Category values on the first two rows.
txnDFnew.show(2)

+------+----------+-------+------+------------------+--------------------+----------+----------+------+----+-----+--------------+
|txn_id|  txn_date|    cid|amount|          prod_cat|             Product|      city|     state|  mode|Year|Month|Trans_Category|
+------+----------+-------+------+------------------+--------------------+----------+----------+------+----+-----+--------------+
|     1|2011-05-26|4006742| 98.44|Exercise & Fitness|Weightlifting Gloves|Long Beach|California|credit|2011|    5|            T3|
|     2|2011-06-01|4009775|  5.58|Exercise & Fitness|Weightlifting Mac...|   Anaheim|California|credit|2011|    6|            T1|
+------+----------+-------+------+------------------+--------------------+----------+----------+------+----+-----+--------------+
only showing top 2 rows



In [31]:
# Exercise
# Build a small DataFrame of names whose day / month / year parts are held as strings.
# Note: the ("Abdul", "23", "5", "81") row is present twice.
data = [
    ("Sagar", "12", "3", "2002"),
    ("Abdul", "23", "5", "81"),
    ("John", "12", "12", "6"),
    ("Rosy", "7", "8", "63"),
    ("Abdul", "23", "5", "81"),
    ("Vijay", "2", "5", "21"),
]

# Column names are supplied directly; the column types are inferred from the tuples.
df = spark.createDataFrame(data, ["name", "day", "month", "year"])

df.printSchema()

root
 |-- name: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- year: string (nullable = true)



In [44]:
# Build a date-of-joining column (doj) from the string day / month / year parts.
#   1. Cast year to an integer so it can be compared and shifted.
#   2. Expand short years: values below 24 become 20xx, values below 100 become 19xx;
#      anything else (already a full year) passes through unchanged.
#   3. Join the parts as "day/month/year" and parse them with the 'd/M/y' pattern.
#   4. Drop the helper columns and remove duplicate rows.
df_new = (
    df
    .withColumn("year", col("year").cast(IntegerType()))
    .withColumn(
        "year",
        when(col("year") < 24, col("year") + 2000)
        .when(col("year") < 100, col("year") + 1900)
        .otherwise(col("year")),
    )
    .withColumn(
        "doj",
        to_date(
            concat(col("day"), lit("/"), col("month"), lit("/"), col("year")),
            "d/M/y",
        ),
    )
    .drop("day", "month", "year")
    .dropDuplicates()
)

# Display the result; the assignment above produces no cell output on its own.
df_new.show()

+-----+----------+
| name|       doj|
+-----+----------+
| Rosy|1963-08-07|
|Abdul|1981-05-23|
|Vijay|2021-05-02|
| John|2006-12-12|
|Sagar|2002-03-12|
+-----+----------+



In [None]:
# Normalise the year (values below 24 -> 20xx, below 100 -> 19xx) and add doj while
# keeping the original day / month / year columns for comparison.
#
# make_date is not provided by the pyspark.sql.functions module of Spark 2.4.7 and is
# not imported in this notebook, so calling it raises NameError. The date is assembled
# from a "day/month/year" string with concat + to_date instead.
(
    df
    .withColumn("year", col("year").cast(IntegerType()))
    .withColumn(
        "year",
        when(col("year") < 24, col("year") + 2000)
        .when(col("year") < 100, col("year") + 1900)
        .otherwise(col("year")),
    )
    .withColumn(
        "doj",
        to_date(
            concat(col("day"), lit("/"), col("month"), lit("/"), col("year")),
            "d/M/y",
        ),
    )
    .show()
)